Installing and Using PyLucene
Lucene is a subproject of the Apache Software Foundation's Jakarta project: an open-source full-text search toolkit. It is not a complete full-text search engine but an architecture for one, providing a complete query engine and indexing engine, plus partial text-analysis engines (for two Western languages, English and German).
Lucene's goal is to give software developers a simple, easy-to-use toolkit for adding full-text search to a target system, or for building a complete full-text search engine on top of it. It is an indexing and search library rather than a finished application, and it can build indexes that record term position information.
There are two main ways to use Lucene: write your own program against the class library, or use a third-party program built on Lucene, such as Solr.
Many search engines borrow from Lucene's implementation; Whoosh, a search tool for Python, is also modeled on Lucene. PyLucene is the Python version of Lucene, "compiled" with JCC. The installation and usage steps are as follows:
Installation
- PyLucene needs a Java environment, so install Java first and set JAVA_HOME
- Download PyLucene 6.5 and extract it
- Enter the pylucene-6.5 directory, then run:
$ pushd jcc
<edit setup.py to match your environment> # mainly check that the Java directory is correct
$ python setup.py build
$ sudo python setup.py install
$ popd
<edit Makefile to match your environment> # see the appendix below for the exact changes
$ make
$ make test (look for failures)
$ sudo make install
- To verify the installation, start Python and run
import lucene
If the import succeeds, the installation worked.
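Importing the module alone is not enough to call Lucene classes; the embedded JVM has to be started first. A quick smoke test (initVM and VERSION are the same entry points used by the code below):
import lucene
lucene.initVM()              # start the embedded JVM; call this once per process
print 'lucene', lucene.VERSION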
Usage
When using PyLucene, consult the official API documentation as well as the Java API docs; PyLucene's interface closely mirrors Lucene's.
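For orientation, here is a minimal end-to-end sketch before the full module: it indexes one document and searches it. The path /tmp/demo_index is a placeholder, and the exact hits depend on how SmartChineseAnalyzer segments the text.
import lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.cn.smart import SmartChineseAnalyzer
from org.apache.lucene.document import Document, Field, TextField
from org.apache.lucene.index import DirectoryReader, IndexWriter, IndexWriterConfig
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import FSDirectory

lucene.initVM()
directory = FSDirectory.open(Paths.get('/tmp/demo_index'))  # placeholder path
writer = IndexWriter(directory, IndexWriterConfig(SmartChineseAnalyzer()))
doc = Document()
doc.add(Field('question', u'劳动者解除劳动合同', TextField.TYPE_STORED))
writer.addDocument(doc)
writer.close()

reader = DirectoryReader.open(directory)
searcher = IndexSearcher(reader)
query = QueryParser('question', SmartChineseAnalyzer()).parse(u'解除')
for hit in searcher.search(query, 10).scoreDocs:
    print searcher.doc(hit.doc).get('question')
reader.close()
The complete module used in our service follows.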
#!/usr/bin/env python
# coding: utf-8
import time
import os
import logging
import shutil
import lucene
from org.apache.lucene.analysis.cn.smart import SmartChineseAnalyzer
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.index import DirectoryReader, PostingsEnum,\
SlowCompositeReaderWrapper, IndexWriter, IndexWriterConfig,\
IndexOptions, MultiFields
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import DocIdSetIterator, IndexSearcher,\
ScoreDoc, TopDocs
from org.apache.lucene.store import FSDirectory
from org.apache.lucene.util import BytesRef, BytesRefIterator
from java.nio.file import Paths
from flask import current_app as app
from service import tabledef
BASE_PATH = os.path.dirname(os.path.abspath(__file__))
INDEX_DIR = os.path.join(BASE_PATH, tabledef.IndexSettings.path)
logger = logging.getLogger(tabledef.IndexSettings.logger_name)
def separateQuery(query):
    '''Segment a query string with SmartChineseAnalyzer.
    Args:
        query: str
    Return:
        word_list: list
    '''
    # use an empty default field so query.toString() prints bare tokens
    docContentField = ''
    parser = QueryParser(docContentField, SmartChineseAnalyzer())
    query = parser.parse(query)
    word_list = query.toString().split(' ')
    return word_list
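# Example (hypothetical; the actual segmentation depends on the analyzer's
# dictionary): separateQuery(u'解除劳动合同') might return
# [u'解除', u'劳动', u'合同'].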
def getQueryPositionsAndFreqs(query, field, reader):
    '''Return the positions of each query word in the indexed documents.
    Args:
        query: unicode
        field: unicode, the field in the index
        reader: DirectoryReader opened on the index
    Return:
        dict: {word: {document_id: [position(int) list]}}
        dict: {word_order: {docid: freq} for documents containing this word}
        list: docids of documents containing any query word
    '''
    positions = {}
    freqs = {}
    doc_list = set()
    words = separateQuery(query)
    logger.debug('open index: %s', reader)
    atomic_reader = SlowCompositeReaderWrapper.wrap(reader)
    field_reader = atomic_reader.terms(field)
    if field_reader is None:
        raise KeyError("Cannot find the field: %s" % field)
    terms_enum = field_reader.iterator()
    for word_order, word in enumerate(words):
        # clear the previous word's results before each new search
        search_positions = {}
        id_freq = {}
        logger.debug('search word: %s', word)
        if_found = terms_enum.seekExact(BytesRef(word))
        # This check is required: if the word is not in the index, the enum
        # would be positioned on some other term, easily yielding results for
        # a word that has none. See
        # http://lucene.apache.org/core/6_5_0/core/index.html
        if not if_found:
            freqs[word_order] = {}
            continue
        docs = terms_enum.postings(None, PostingsEnum.POSITIONS)
        if not docs:
            freqs[word_order] = {}
            continue
        docid = docs.nextDoc()
        while docid != DocIdSetIterator.NO_MORE_DOCS:
            doc_list.add(docid)
            position_list = []
            freq = docs.freq()
            id_freq[docid] = freq
            for i in range(freq):
                position_list.append(docs.nextPosition())
            search_positions[docid] = position_list
            docid = docs.nextDoc()
        freqs[word_order] = id_freq
        positions[word] = search_positions
    return positions, freqs, list(doc_list)
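# Hypothetical return shape for a two-word query matching document 0, where
# the first word occurs twice and the second once:
#   positions == {u'劳动': {0: [1, 5]}, u'合同': {0: [2]}}
#   freqs     == {0: {0: 2}, 1: {0: 1}}
#   doc_list  == [0]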
def getDocStats(field, reader):
    '''Return document statistics for a field in the index.
    Args:
        field: unicode, the field in the index
        reader: DirectoryReader opened on the index
    Return:
        doc_num: int
        doc_length_map: dict, {docid: doc_len}
        avg_doc_length: float
    '''
    total_dl = 0
    doc_length_map = {}
    doc_num = reader.numDocs()
    for docid in range(doc_num):
        doc_len = 0
        term_num = 0
        # term vectors must have been stored at index time (see createIndex)
        terms = reader.getTermVector(docid, field)
        if terms and terms.size() > 0:
            terms_enum = terms.iterator()
            for term in BytesRefIterator.cast_(terms_enum):
                freq = terms_enum.totalTermFreq()
                doc_len += freq
                term_num += 1
        total_dl += doc_len
        doc_length_map[docid] = doc_len
    avg_doc_length = total_dl * 1.0 / doc_num
    return doc_num, doc_length_map, avg_doc_length
def getTermStats(field, reader):
    '''Find all terms of a field in the reader.
    Args:
        field: str, the field to search
        reader: DirectoryReader opened on the index
    Return:
        dict: {term(str): doc_freq(int), ...}
    Raise:
        KeyError: when the field cannot be found in the index'''
    result = {}
    fields = MultiFields.getFields(reader)
    terms = fields.terms(field)
    if not terms:
        raise KeyError("Cannot find the field: %s" % field)
    iterator = terms.iterator()
    for term in BytesRefIterator.cast_(iterator):
        term_name = term.utf8ToString()
        doc_freq = iterator.docFreq()
        result[term_name] = doc_freq
    return result
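# Example result (hypothetical): {u'劳动': 3, u'合同': 5}. Note that docFreq
# counts the number of documents containing the term, not total occurrences.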
def getIndexPath(index_dir):
    """Build the index directory path from index_dir.
    """
    if index_dir:
        index_dir = os.path.join(
            INDEX_DIR, tabledef.IndexSettings.sub_path + index_dir)
    else:
        index_dir = INDEX_DIR
    return index_dir
class CrterIndex(object):
    def __init__(self):
        # only one JVM may exist per process, so reuse a cached handle
        if 'vm' in app.cache:
            # app.cache['vm'].attachCurrentThread()
            logger.debug('javaVM exists: %s', app.cache['vm'])
        else:
            vm = lucene.initVM()
            app.cache['vm'] = vm
            logger.debug('init javaVM: %s', vm)
    def createIndex(self, data, index_dir=None):
        '''Create an index from a list of dicts.
        Args:
            data: [{key: value}, ...], dict list; each key becomes a field
            index_dir: str, index path, default is INDEX_DIR
        Return:
            None
        '''
        index_dir = getIndexPath(index_dir)
        logger.debug('create index: %s', index_dir)
        index = FSDirectory.open(Paths.get(index_dir))
        analyzer = SmartChineseAnalyzer()
        config = IndexWriterConfig(analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(index, config)
        # store term vectors with offsets and positions so that getDocStats
        # and getQueryPositionsAndFreqs can read them back later
        ft = FieldType()
        ft.setStored(True)
        ft.setTokenized(True)
        ft.setStoreTermVectors(True)
        ft.setStoreTermVectorOffsets(True)
        ft.setStoreTermVectorPositions(True)
        ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        for dict_data in data:
            doc = Document()
            # logger.debug('add field: %s', dict_data)
            for key in dict_data:
                # logger.debug('field data: %s', key)
                doc.add(Field(key, unicode(dict_data[key]), ft))
            writer.addDocument(doc)
        writer.commit()
        writer.close()
        index.close()
    def removeIndex(self, index_dir=None):
        """Remove the index directory and recreate it empty.
        Args:
            index_dir: str, default is None, which means INDEX_DIR
        Return:
            None"""
        index_dir = getIndexPath(index_dir)
        try:
            shutil.rmtree(index_dir)
            os.mkdir(index_dir)
            logger.debug('remove all files in index: %s', index_dir)
        except Exception as e:
            logger.warning(e)
if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    # data = [{"id": "592e42958fb0fe10a2816719", "question": "劳动者解除劳动合同的经济补偿金", "answer": "劳动者自身原因离职的除非用人单位同意支付经济补偿金,否则在这种情况下法律并没有规定劳动者主动解除劳动合同也应获经济补偿金"},
    #         {"id": "592e42958fb0fe10a281671a", "question": "劳动者被迫解除劳动合同", "answer": "被迫解除是因为用人单位有法定情形损害劳动者权益时,劳动者被迫提出的解除劳动合同;"},
    #         {"id": "592e42958fb0fe10a281671b", "question": "劳动者主动解除劳动合同", "answer": "主动解除是指劳动者由于个人原因选择离开"}]
    # CrterIndex().createIndex(data)
    index = FSDirectory.open(Paths.get(INDEX_DIR))
    reader = DirectoryReader.open(index)
    result = getQueryPositionsAndFreqs(u'刑法', 'question', reader)
    logger.debug('getQueryPositions: %s', result)
    result = getTermStats('question', reader)
    logger.debug('getTermStats: %s', result)
    result = getDocStats('question', reader)
    logger.debug('getDocStats: %s', result)
    reader.close()
    index.close()
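One operational note: JCC allows only one JVM per process, and every thread other than the one that called initVM must attach itself to the JVM before making any Lucene call (this is why --shared must stay in the Makefile; see the appendix). A minimal sketch, assuming the handle returned by lucene.initVM() was stashed in app.cache as in CrterIndex above:
vm = app.cache['vm']      # the object returned by lucene.initVM()
vm.attachCurrentThread()  # required once in each new thread before Lucene calls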
Appendix
Makefile changes
Adjust these according to your system; for example:
76 # Linux (Debian Jessie 64-bit, Python 2.7.9, Oracle Java 1.8)
77 # Be sure to also set JDK['linux2'] in jcc's setup.py to the JAVA_HOME value
78 # used below for ANT (and rebuild jcc after changing it).
79 PREFIX_PYTHON=/usr # verified working on CentOS, together with line 81
80 ANT=JAVA_HOME=/usr/lib/jvm/java-8-oracle /usr/bin/ant # adjust to your JAVA_HOME
81 PYTHON=$(PREFIX_PYTHON)/bin/python
82 JCC=$(PYTHON) -m jcc --shared # keep --shared if threads are to share the JVM
83 NUM_FILES=8
# To use SmartChineseAnalyzer for Chinese segmentation, also make these changes:
137 #JARS+=$(SMARTCN_JAR) # smart chinese analyzer; uncomment this line
318 --exclude org.apache.lucene.sandbox.queries.regex.JakartaRegexpCapabilities \ # already in the original file
319 --exclude org.apache.lucene.analysis.cn.smart.AnalyzerProfile \ # the line to add
Verify that the analyzer was added successfully:
$ python
import lucene
from org.apache.lucene.analysis.cn.smart import SmartChineseAnalyzer # a successful import means it was added
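To check that segmentation actually works, a quick sketch reusing the empty-default-field trick from separateQuery above (the printed tokens depend on the analyzer's dictionary):
import lucene
lucene.initVM()
from org.apache.lucene.analysis.cn.smart import SmartChineseAnalyzer
from org.apache.lucene.queryparser.classic import QueryParser
parser = QueryParser('', SmartChineseAnalyzer())
print parser.parse(u'劳动合同的解除').toString()  # prints space-separated tokens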