I have used PyQT5 to create the UI of a search engine. I have been able to let the application open a folder containing a pdf file, create an index file, search within it and export the results as txt and excel file. Running the code attached will do it.
I wanted to give the user also an immediate view of the results via Qtablewidget. The results of the search are store as dictionary to self.data.
Below is represented how the widget read the variable for a specific example.
Has you can see the widget read the number of column and rows of the variable but does not fill the cells.
The error I have experienced related to the fact that I queried the self.data as if it was a list lists rather than a dictionary of lists.
This is the code that is complete code.
from PyQt5 import QtCore, QtGui, QtWidgets, QtWidgets
from PyQt5.QtWidgets import QHeaderView, QTableWidgetItem
import os, os.path
import glob
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
import pdftotext
from whoosh import index
from whoosh.fields import Schema, TEXT, ID, STORED
from whoosh.analysis import RegexTokenizer
from whoosh.analysis import StopFilter
from whoosh import scoring
from whoosh.index import open_dir
from whoosh import qparser
from whoosh import highlight
import pandas as pd
class Ui_MainWindow(object):
def setupUi(self, MainWindow):
MainWindow.setObjectName("MainWindow")
MainWindow.resize(1126, 879)
self.centralwidget = QtWidgets.QWidget(MainWindow)
self.centralwidget.setObjectName("centralwidget")
self.pushButton = QtWidgets.QPushButton(self.centralwidget)
self.pushButton.setGeometry(QtCore.QRect(40, 30, 100, 30))
self.pushButton.setObjectName("pushButton")
self.pushButton_2 = QtWidgets.QPushButton(self.centralwidget)
self.pushButton_2.setGeometry(QtCore.QRect(180, 30, 120, 30))
self.pushButton_2.setObjectName("pushButton_2")
self.pushButton_3 = QtWidgets.QPushButton(self.centralwidget)
self.pushButton_3.setGeometry(QtCore.QRect(620, 30, 80, 30))
self.pushButton_3.setObjectName("pushButton_3")
self.lineEdit = QtWidgets.QLineEdit(self.centralwidget)
self.lineEdit.setGeometry(QtCore.QRect(380, 60, 191, 21))
self.lineEdit.setObjectName("lineEdit")
self.lineEdit_2 = QtWidgets.QLineEdit(self.centralwidget)
self.lineEdit_2.setGeometry(QtCore.QRect(40, 90, 50, 21))
self.lineEdit_2.setObjectName("lineEdit_2")
self.label = QtWidgets.QLabel(self.centralwidget)
self.label.setGeometry(QtCore.QRect(380, 30, 50, 35))
font = QtGui.QFont()
font.setPointSize(10)
self.label.setFont(font)
self.label.setObjectName("label")
self.label2 = QtWidgets.QLabel(self.centralwidget)
self.label2.setGeometry(QtCore.QRect(40, 70, 150, 16))
font = QtGui.QFont()
font.setPointSize(10)
self.label2.setFont(font)
self.label2.setObjectName("label")
self.tableWidget = QtWidgets.QTableWidget(self.centralwidget)
self.tableWidget.setGeometry(QtCore.QRect(0, 120, 1121, 721))
self.tableWidget.setObjectName("tableWidget")
#self.tableWidget.setColumnCount(2)
MainWindow.setCentralWidget(self.centralwidget)
self.menubar = QtWidgets.QMenuBar(MainWindow)
self.menubar.setGeometry(QtCore.QRect(0, 0, 1126, 21))
self.menubar.setObjectName("menubar")
MainWindow.setMenuBar(self.menubar)
self.statusbar = QtWidgets.QStatusBar(MainWindow)
self.statusbar.setObjectName("statusbar")
MainWindow.setStatusBar(self.statusbar)
self.retranslateUi(MainWindow)
QtCore.QMetaObject.connectSlotsByName(MainWindow)
self.pushButton.clicked.connect(self.open_directory)
self.pushButton_2.clicked.connect(self.createindex)
self.pushButton_3.clicked.connect(self.export)
self.lineEdit.returnPressed.connect(self.search)
self.lineEdit.returnPressed.connect(self.datatable)
def datatable(self):
data = self.data
numrows = len(self.data)
numcols = len(self.data[0])
self.tableWidget.setColumnCount(numcols)
self.tableWidget.setRowCount(numrows)
for row in range(numrows):
for column in range(numcols):
self.tableWidget.setItem(row, column, QTableWidgetItem((self.data[row][column])))
def open_directory(self):
self.dialog = QtWidgets.QFileDialog()
self.folder_path = self.dialog.getExistingDirectory(None, "Select Folder")
return self.folder_path
def createindex(self):
os.chdir(self.folder_path)
self.mypdfiles = glob.glob("*.pdf")
#creation of folder for splitted files
MYDIR = ("Splitted")
CHECK_FOLDER = os.path.isdir(MYDIR)
if not CHECK_FOLDER:
os.makedirs(MYDIR)
# save split downloaded file and save into new folder
for self.file in self.mypdfiles:
self.fname = os.path.splitext(os.path.basename(self.file))[0]
self.pdf = PdfFileReader(self.file)
for self.page in range(self.pdf.getNumPages()):
self.pdfwrite = PdfFileWriter()
self.pdfwrite.addPage(self.pdf.getPage(self.page))
self.outputfilename = '{}_page_{}.pdf'.format(self.fname, self.page+1)
with open(os.path.join("./Splitted", self.outputfilename), 'wb') as out:
self.pdfwrite.write(out)
print('Created: {}'.format(self.outputfilename))
#set working directory
os.chdir(self.folder_path + "/Splitted")
self.spltittedfiles = glob.glob("*.pdf")
MYDIR = ("Txt")
CHECK_FOLDER = os.path.isdir(MYDIR)
if not CHECK_FOLDER:
os.makedirs(MYDIR)
# Load your PDF
for self.file in self.spltittedfiles:
with open(self.file, "rb") as f:
self.pdf = pdftotext.PDF(f)
#creation of folder for splitted files
# Save all text to a txt file.
with open(os.path.join("./TXT", os.path.splitext(os.path.basename(self.file))[0] + ".txt") , 'w', encoding = 'utf-8') as f:
f.write("\n\n".join(self.pdf))
f.close()
os.chdir(self.folder_path)
MYDIR = ("indexdir")
CHECK_FOLDER = os.path.isdir(MYDIR)
if not CHECK_FOLDER:
os.makedirs(MYDIR)
self.my_analyzer = RegexTokenizer()| StopFilter(lang = "en")
self.schema = Schema(title=TEXT(stored=True),path=ID(stored=True),
content=TEXT(analyzer = self.my_analyzer),
textdata=TEXT(stored=True))
# set an index writer to add document as per schema
self.ix = index.create_in("indexdir",self.schema)
self.writer = self.ix.writer()
self.filepaths = [os.path.join("./Splitted/Txt",i) for i in os.listdir("./Splitted/Txt")]
for path in self.filepaths:
self.fp = open(path, "r", encoding='utf-8')
self.text = self.fp.read()
self.writer.add_document(title = os.path.splitext(os.path.basename(path))[0] , path=path, content=self.text,textdata=self.text)
self.fp.close()
self.writer.commit()
def search(self):
os.chdir(self.folder_path)
self.ix = open_dir("indexdir")
MYDIR = ("Results")
CHECK_FOLDER = os.path.isdir(MYDIR)
if not CHECK_FOLDER:
os.makedirs(MYDIR)
self.text = self.lineEdit.text()
self.query_str = self.text
self.query = qparser.QueryParser("textdata", schema = self.ix.schema)
self.q = self.query.parse(self.query_str)
self.topN = self.lineEdit_2.text()
if self.lineEdit_2.text() == "":
self.topN = 1000
else:
self.topN = int(self.lineEdit_2.text())
self.data=[]
with self.ix.searcher() as searcher:
self.results = searcher.search(self.q, terms=True, limit=self.topN)
Upper = highlight.UppercaseFormatter()
self.results.formatter = Upper
my_cf = highlight.ContextFragmenter(maxchars=500, surround=300)
self.results.fragmenter = my_cf
for self.i in self.results:
self.data.append({"title": self.i['title'], "text": self.i.highlights('textdata'), "score": str(self.i.score)})
pd.DataFrame(self.data).to_excel("data.xlsx")
def export(self):
with self.ix.searcher() as searcher:
self.results = searcher.search(self.q, terms=True, limit= None)
Upper = highlight.UppercaseFormatter()
self.results.formatter = Upper
my_cf = highlight.ContextFragmenter(maxchars=500, surround=300)
self.results.fragmenter = my_cf
self.countrow = len(self.results)
for self.i in self.results:
with open(os.path.join(self.folder_path, self.text + ".txt"), 'a', encoding="utf-8") as f:
print("Title {}".format(self.i['title']), "Text {}".format(self.i.highlights('textdata')), file=f)
def retranslateUi(self, MainWindow):
_translate = QtCore.QCoreApplication.translate
MainWindow.setWindowTitle(_translate("MainWindow", "Search Text"))
self.pushButton.setText(_translate("MainWindow", "Select Folder"))
self.pushButton_2.setText(_translate("MainWindow", "Create Database"))
self.pushButton_3.setText(_translate("MainWindow", "Export"))
self.label.setText(_translate("MainWindow", "Search"))
self.label2.setText(_translate("MainWindow", "Top Results"))
if __name__ == "__main__":
import sys
app = QtWidgets.QApplication(sys.argv)
MainWindow = QtWidgets.QMainWindow()
ui = Ui_MainWindow()
ui.setupUi(MainWindow)
MainWindow.show()
sys.exit(app.exec_())