I'm using PyLucene to build and search an inverted text index. I built this class (don't be put off by the Python code; PyLucene exposes the same API as the Java library):
import os, re, sys, lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, IndexOptions
from org.apache.lucene.document import Document, Field, StringField, TextField, StoredField, FieldType
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
class LuceneCtrl():
    """Thin wrapper around a Lucene index stored in a filesystem directory.

    A fresh ``IndexWriter`` / ``DirectoryReader`` is opened for every call
    and closed again before the call returns.
    """

    def __init__(self, index_dir):
        """Start the JVM if needed and prepare the directory and analyzer.

        Args:
            index_dir: Path of the directory that holds the index.
        """
        # JCC rejects a second initVM() call that passes vmargs once a JVM
        # is running, so only start it when no VM exists yet. This lets
        # more than one LuceneCtrl be created in the same process.
        if lucene.getVMEnv() is None:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.index_dir = index_dir
        self.dir_wrapper = SimpleFSDirectory(Paths.get(self.index_dir))
        # Wrap the standard analyzer with a token limit of 5000.
        self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 5000)

    def index_documents(self, docs):
        """Add documents to the index and commit them.

        Args:
            docs: Iterable of ``(content, id)`` pairs. ``content`` goes into
                the stored ``content`` text field, ``id`` into the stored
                ``id`` string field.
        """
        writer = IndexWriter(self.dir_wrapper,
                             IndexWriterConfig(self.analyzer))
        try:
            for content, doc_id in docs:
                doc = Document()
                # TextField.TYPE_STORED: the class has no ``TermsField``
                # attribute, so the original lookup raised AttributeError.
                doc.add(Field("content", content, TextField.TYPE_STORED))
                doc.add(Field("id", doc_id, StringField.TYPE_STORED))
                writer.addDocument(doc)
            writer.commit()
        finally:
            # Close even when adding a document fails; otherwise the writer
            # (and the index write lock it holds) stays open.
            writer.close()

    def query_index(self, query_terms, n_top=10):
        """Search the ``content`` field and print every hit.

        Query terms are combined with AND unless the query says otherwise.
        Each hit is printed as a dict mapping field name to string value.

        Args:
            query_terms: Query string handed to the classic ``QueryParser``.
            n_top: Maximum number of hits to retrieve.
        """
        reader = DirectoryReader.open(self.dir_wrapper)
        try:
            searcher = IndexSearcher(reader)
            parser = QueryParser("content", self.analyzer)
            parser.setDefaultOperator(QueryParser.Operator.AND)
            query = parser.parse(query_terms)
            for score_doc in searcher.search(query, n_top).scoreDocs:
                doc = searcher.doc(score_doc.doc)
                table = {field.name(): field.stringValue()
                         for field in doc.getFields()}
                print(table)
        finally:
            # Release the reader even if parsing or searching raised.
            reader.close()
I am new to Lucene and I was wondering how efficient it is to create the writer and reader each time the index_documents and query_index functions are run. Couldn't I store more state in the class? I tried to save the reader and writer as attributes, but that crashes the process.
EDIT: the final class I'm using
import os, re, sys, lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, IndexOptions
from org.apache.lucene.document import Document, Field, StringField, TextField, StoredField, FieldType
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
class LuceneCtrl():
    """Lucene index wrapper that keeps one writer and one reader open.

    The ``IndexWriter`` is created once in ``__init__``. The
    ``DirectoryReader`` is opened lazily on the first query and replaced
    only when ``openIfChanged`` reports a newer view of the index.

    Call :meth:`close` when finished, or use the instance in a ``with``
    statement, so the writer and reader are released.
    """

    def __init__(self, index_dir):
        """Start the JVM if needed and open the index writer.

        Args:
            index_dir: Path of the directory that holds the index.
        """
        # JCC rejects a second initVM() call that passes vmargs once a JVM
        # is running, so only start it when no VM exists yet.
        if lucene.getVMEnv() is None:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.index_dir = index_dir
        self.dir_wrapper_reader = SimpleFSDirectory(Paths.get(self.index_dir))
        self.dir_wrapper_writer = SimpleFSDirectory(Paths.get(self.index_dir))
        # Wrap the standard analyzer with a token limit of 5000.
        self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 5000)
        # Reader and searcher are created on the first query.
        self.reader = None
        self.searcher = None
        self.writer_config = IndexWriterConfig(self.analyzer)
        self.writer = IndexWriter(self.dir_wrapper_writer, self.writer_config)

    def index_documents(self, docs):
        """Add documents through the shared writer and commit them.

        Args:
            docs: Iterable of ``(text, id)`` pairs. ``text`` goes into the
                stored ``text`` field, ``id`` into the stored ``id`` field.
        """
        for text, id_ in docs:
            doc = Document()
            doc.add(Field("text", text, TextField.TYPE_STORED))
            doc.add(Field("id", id_, StringField.TYPE_STORED))
            self.writer.addDocument(doc)
        self.writer.commit()

    def query_index(self, tokens, operator='AND', n_top=10):
        """Search the ``text`` field and return the top hits.

        Args:
            tokens: Query string handed to the classic ``QueryParser``.
            operator: ``'and'`` or ``'+'`` (case-insensitive) makes AND the
                default operator; any other value selects OR.
            n_top: Maximum number of hits to retrieve.

        Returns:
            The ``scoreDocs`` of the search result.
        """
        if self.reader is None:
            self.reader = DirectoryReader.open(self.dir_wrapper_reader)
            self.searcher = IndexSearcher(self.reader)
        else:
            refreshed = DirectoryReader.openIfChanged(self.reader)
            if refreshed is not None:
                # Swap in the new reader first, then close the stale one so
                # it is not leaked on every refresh.
                stale = self.reader
                self.reader = refreshed
                self.searcher = IndexSearcher(refreshed)
                stale.close()

        parser = QueryParser("text", self.analyzer)
        if operator.lower() in ('and', '+'):
            parser.setDefaultOperator(QueryParser.Operator.AND)
        else:
            parser.setDefaultOperator(QueryParser.Operator.OR)
        query = parser.parse(tokens)
        return self.searcher.search(query, n_top).scoreDocs

    def close(self):
        """Close the reader and the writer; safe to call more than once."""
        if self.reader is not None:
            self.reader.close()
            self.reader = None
            self.searcher = None
        if self.writer is not None:
            self.writer.close()
            self.writer = None

    def __enter__(self):
        """Return the instance for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the index resources when leaving the ``with`` block."""
        self.close()