0

I'm using PyLucene to build and search an inverted text index. I built the following class (don't be put off by the Python code; PyLucene exposes the same functions as the Java API):

import os, re, sys, lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, IndexOptions
from org.apache.lucene.document import Document, Field, StringField, TextField, StoredField, FieldType
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser

class LuceneCtrl():
    """Small helper around a Lucene index stored on disk.

    A new ``IndexWriter`` is opened for every call to ``index_documents``
    and a new ``DirectoryReader`` for every call to ``query_index``; both
    are released before the method returns, including on failure.
    """

    def __init__(self, index_dir):
        """Start the JVM and prepare the directory and analyzer.

        Args:
            index_dir: Filesystem path of the index directory.
        """
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.index_dir = index_dir
        self.dir_wrapper = SimpleFSDirectory(Paths.get(self.index_dir))
        # StandardAnalyzer wrapped so that at most 5000 tokens per field
        # are analyzed.
        self.analyzer = StandardAnalyzer()
        self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 5000)

    def index_documents(self, docs):
        """Add documents to the index and commit them.

        Args:
            docs: Iterable of ``(content, id)`` pairs of strings. ``content``
                is stored in the analyzed "content" field, ``id`` in the
                "id" field.
        """
        writer_config = IndexWriterConfig(self.analyzer)
        writer = IndexWriter(self.dir_wrapper, writer_config)
        try:
            for content, doc_id in docs:
                doc = Document()
                doc.add(Field("content", content, TextField.TYPE_STORED))
                doc.add(Field("id", doc_id, StringField.TYPE_STORED))
                writer.addDocument(doc)
            writer.commit()
        except Exception:
            # rollback() discards the uncommitted documents and closes the
            # writer, so the index write lock is not left behind.
            writer.rollback()
            raise
        else:
            writer.close()

    def query_index(self, query_terms, n_top=10):
        """Run a query against the "content" field and print each hit.

        Terms are combined with AND by default. Every hit is printed as a
        dict mapping field names to their stored string values.

        Args:
            query_terms: Query string in the classic QueryParser syntax.
            n_top: Maximum number of hits to retrieve.
        """
        reader = DirectoryReader.open(self.dir_wrapper)
        try:
            searcher = IndexSearcher(reader)
            parser = QueryParser("content", self.analyzer)
            parser.setDefaultOperator(QueryParser.Operator.AND)
            query = parser.parse(query_terms)
            scoreDocs = searcher.search(query, n_top).scoreDocs
            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                table = {field.name(): field.stringValue()
                         for field in doc.getFields()}
                print(table)
        finally:
            # Always release the reader, even if parsing or searching fails.
            reader.close()

I am new to Lucene and I was wondering how efficient it is to create the writer and reader each time the index_documents and query_index functions are run. Couldn't I store more state in the class? I tried to save the reader and writer as attributes, but it crashes the process.

EDIT: the final class I'm using

import os, re, sys, lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, IndexOptions
from org.apache.lucene.document import Document, Field, StringField, TextField, StoredField, FieldType
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser

class LuceneCtrl():
    """Helper around a Lucene index that keeps its writer and reader open.

    The ``IndexWriter`` is created once in ``__init__`` and reused for every
    ``index_documents`` call. The ``DirectoryReader`` is opened lazily on the
    first query and refreshed only when the index has changed. Call
    ``close()`` (or use the instance as a context manager) to release them.
    """

    def __init__(self, index_dir):
        """Start the JVM and open a writer on the index.

        Args:
            index_dir: Filesystem path of the index directory.
        """
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.index_dir = index_dir
        self.dir_wrapper_reader = SimpleFSDirectory(Paths.get(self.index_dir))
        self.dir_wrapper_writer = SimpleFSDirectory(Paths.get(self.index_dir))
        # StandardAnalyzer wrapped so that at most 5000 tokens per field
        # are analyzed.
        self.analyzer = StandardAnalyzer()
        self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 5000)
        # Reader and searcher are created on the first query.
        self.reader = None
        self.searcher = None
        self.writer_config = IndexWriterConfig(self.analyzer)
        self.writer = IndexWriter(self.dir_wrapper_writer, self.writer_config)

    def index_documents(self, docs):
        """Add documents to the index and commit them.

        Args:
            docs: Iterable of ``(text, id)`` pairs of strings. ``text`` is
                stored in the analyzed "text" field, ``id`` in the "id" field.
        """
        for text, id_ in docs:
            doc = Document()
            doc.add(Field("text", text, TextField.TYPE_STORED))
            doc.add(Field("id", id_, StringField.TYPE_STORED))
            self.writer.addDocument(doc)
        # Commit without closing so the writer can be reused.
        self.writer.commit()

    def query_index(self, tokens, operator='AND',n_top=10):
        """Search the "text" field and return the top hits.

        Args:
            tokens: Query string in the classic QueryParser syntax.
            operator: ``'and'`` or ``'+'`` (case-insensitive) combines terms
                with AND; any other value combines them with OR.
            n_top: Maximum number of hits to retrieve.

        Returns:
            The ``scoreDocs`` array of the search result.
        """
        if self.reader is None:
            self.reader = DirectoryReader.open(self.dir_wrapper_reader)
            self.searcher = IndexSearcher(self.reader)
        else:
            # openIfChanged returns null (None) when the index is unchanged.
            new_reader = DirectoryReader.openIfChanged(self.reader)
            if new_reader is not None:
                old_reader = self.reader
                self.reader = new_reader
                self.searcher = IndexSearcher(new_reader)
                # The superseded reader is no longer used; close it so its
                # file handles are released.
                old_reader.close()
        parser = QueryParser("text", self.analyzer)
        if operator.lower() in ['and', '+']:
            parser.setDefaultOperator(QueryParser.Operator.AND)
        else:
            parser.setDefaultOperator(QueryParser.Operator.OR)
        query = parser.parse(tokens)
        scoreDocs = self.searcher.search(query, n_top).scoreDocs
        return scoreDocs

    def close(self):
        """Close the reader and the writer if they are open.

        Safe to call more than once. The instance must not be used for
        indexing afterwards, since the writer is gone.
        """
        if self.reader is not None:
            self.reader.close()
            self.reader = None
            self.searcher = None
        if self.writer is not None:
            self.writer.close()
            self.writer = None

    def __enter__(self):
        """Return the instance so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release the reader and writer when the ``with`` block ends."""
        self.close()
        return False
user3091275
  • 1,013
  • 2
  • 11
  • 27
  • 1
    You are right to wonder, opening readers and writers is expensive. You should definitely be keeping the same one open for multiple reads/writes. – femtoRgon Jul 20 '17 at 20:20
  • Thanks for your comment! I guess it is not possible to keep a reader and a writer open simultaneously at all times? I'm thinking about defining a default reader mode, where a writer is ready at all times, only to be closed when we want to write something. – user3091275 Jul 21 '17 at 08:38
  • 1
    No, it is possible to have both open, and you don't need to close a writer after writing (just commit without closing), and readers only need to be reopened to see new changes (see [DirectoryReader.openIfChanged](https://lucene.apache.org/core/6_5_0/core/org/apache/lucene/index/DirectoryReader.html#openIfChanged-org.apache.lucene.index.DirectoryReader-)). Don't know what you were doing that was causing issues with it. – femtoRgon Jul 23 '17 at 07:05

0 Answers0