I am stuck on the last bit of the code to create a small search engine. So far I have been able to let users do some actions as select a folder where the files to search are stored, create an index, search for keyword and then export excerpt of the text around the keyword to a txt file. This is the layout
And this is the code I have used:
from PyQt5 import QtCore, QtGui, QtWidgets, QtWidgets
from PyQt5.QtWidgets import QHeaderView
import os, os.path
import glob
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
import pdftotext
from whoosh import index
from whoosh.fields import Schema, TEXT, ID, STORED
from whoosh.analysis import RegexTokenizer
from whoosh.analysis import StopFilter
from whoosh import scoring
from whoosh.index import open_dir
from whoosh import qparser
class Ui_MainWindow(object):
def setupUi(self, MainWindow):
MainWindow.setObjectName("MainWindow")
MainWindow.resize(1126, 879)
self.centralwidget = QtWidgets.QWidget(MainWindow)
self.centralwidget.setObjectName("centralwidget")
self.pushButton = QtWidgets.QPushButton(self.centralwidget)
self.pushButton.setGeometry(QtCore.QRect(40, 30, 100, 30))
self.pushButton.setObjectName("pushButton")
self.pushButton_2 = QtWidgets.QPushButton(self.centralwidget)
self.pushButton_2.setGeometry(QtCore.QRect(180, 30, 120, 30))
self.pushButton_2.setObjectName("pushButton_2")
self.pushButton_3 = QtWidgets.QPushButton(self.centralwidget)
self.pushButton_3.setGeometry(QtCore.QRect(620, 30, 80, 30))
self.pushButton_3.setObjectName("pushButton_3")
self.lineEdit = QtWidgets.QLineEdit(self.centralwidget)
self.lineEdit.setGeometry(QtCore.QRect(380, 60, 191, 21))
self.lineEdit.setObjectName("lineEdit")
self.lineEdit_2 = QtWidgets.QLineEdit(self.centralwidget)
self.lineEdit_2.setGeometry(QtCore.QRect(40, 90, 50, 21))
self.lineEdit_2.setObjectName("lineEdit_2")
self.label = QtWidgets.QLabel(self.centralwidget)
self.label.setGeometry(QtCore.QRect(380, 30, 50, 35))
font = QtGui.QFont()
font.setPointSize(10)
self.label.setFont(font)
self.label.setObjectName("label")
self.label2 = QtWidgets.QLabel(self.centralwidget)
self.label2.setGeometry(QtCore.QRect(40, 70, 150, 16))
font = QtGui.QFont()
font.setPointSize(10)
self.label2.setFont(font)
self.label2.setObjectName("label")
self.tableView = QtWidgets.QTableView(self.centralwidget)
self.tableView.setGeometry(QtCore.QRect(0, 120, 1121, 721))
self.tableView.setObjectName("tableView")
self.horizontal_header = self.tableView.horizontalHeader()
self.vertical_header = self.tableView.verticalHeader()
self.horizontal_header.setSectionResizeMode(
QHeaderView.ResizeToContents
)
self.vertical_header.setSectionResizeMode(
QHeaderView.ResizeToContents
)
self.horizontal_header.setStretchLastSection(True)
self.tableView.showGrid()
self.tableView.wordWrap()
MainWindow.setCentralWidget(self.centralwidget)
self.menubar = QtWidgets.QMenuBar(MainWindow)
self.menubar.setGeometry(QtCore.QRect(0, 0, 1126, 21))
self.menubar.setObjectName("menubar")
MainWindow.setMenuBar(self.menubar)
self.statusbar = QtWidgets.QStatusBar(MainWindow)
self.statusbar.setObjectName("statusbar")
MainWindow.setStatusBar(self.statusbar)
self.retranslateUi(MainWindow)
QtCore.QMetaObject.connectSlotsByName(MainWindow)
self.pushButton.clicked.connect(self.open_directory)
self.pushButton_2.clicked.connect(self.createindex)
self.pushButton_3.clicked.connect(self.export)
self.lineEdit.returnPressed.connect(self.search)
def open_directory(self):
self.dialog = QtWidgets.QFileDialog()
self.folder_path = self.dialog.getExistingDirectory(None, "Select Folder")
return self.folder_path
def createindex(self):
os.chdir(self.folder_path)
self.mypdfiles = glob.glob("*.pdf")
#creation of folder for splitted files
MYDIR = ("Splitted")
CHECK_FOLDER = os.path.isdir(MYDIR)
if not CHECK_FOLDER:
os.makedirs(MYDIR)
# save split downloaded file and save into new folder
for self.file in self.mypdfiles:
self.fname = os.path.splitext(os.path.basename(self.file))[0]
self.pdf = PdfFileReader(self.file)
for self.page in range(self.pdf.getNumPages()):
self.pdfwrite = PdfFileWriter()
self.pdfwrite.addPage(self.pdf.getPage(self.page))
self.outputfilename = '{}_page_{}.pdf'.format(self.fname, self.page+1)
with open(os.path.join("./Splitted", self.outputfilename), 'wb') as out:
self.pdfwrite.write(out)
print('Created: {}'.format(self.outputfilename))
#set working directory
os.chdir(self.folder_path + "/Splitted")
self.spltittedfiles = glob.glob("*.pdf")
MYDIR = ("Txt")
CHECK_FOLDER = os.path.isdir(MYDIR)
if not CHECK_FOLDER:
os.makedirs(MYDIR)
# Load your PDF
for self.file in self.spltittedfiles:
with open(self.file, "rb") as f:
self.pdf = pdftotext.PDF(f)
#creation of folder for splitted files
# Save all text to a txt file.
with open(os.path.join("./TXT", os.path.splitext(os.path.basename(self.file))[0] + ".txt") , 'w', encoding = 'utf-8') as f:
f.write("\n\n".join(self.pdf))
f.close()
os.chdir(self.folder_path)
MYDIR = ("indexdir")
CHECK_FOLDER = os.path.isdir(MYDIR)
if not CHECK_FOLDER:
os.makedirs(MYDIR)
self.my_analyzer = RegexTokenizer()| StopFilter(lang = "en")
self.schema = Schema(title=TEXT(stored=True),path=ID(stored=True),
content=TEXT(analyzer = self.my_analyzer),
textdata=TEXT(stored=True))
# set an index writer to add document as per schema
self.ix = index.create_in("indexdir",self.schema)
self.writer = self.ix.writer()
self.filepaths = [os.path.join("./Splitted/Txt",i) for i in os.listdir("./Splitted/Txt")]
for path in self.filepaths:
self.fp = open(path, "r", encoding='utf-8')
self.text = self.fp.read()
self.writer.add_document(title = os.path.splitext(os.path.basename(path))[0] , path=path, content=self.text,textdata=self.text)
self.fp.close()
self.writer.commit()
def search(self):
os.chdir(self.folder_path)
self.ix = open_dir("indexdir")
MYDIR = ("Results")
CHECK_FOLDER = os.path.isdir(MYDIR)
if not CHECK_FOLDER:
os.makedirs(MYDIR)
self.text = self.lineEdit.text()
self.query_str = self.text
self.query = qparser.QueryParser("textdata", schema = self.ix.schema)
self.q = self.query.parse(self.query_str)
self.topN = self.lineEdit_2.text()
if self.lineEdit_2.text() == "":
self.topN = 1000
else:
self.topN = int(self.lineEdit_2.text())
with self.ix.searcher(weighting=scoring.Frequency) as searcher:
self.results = searcher.search(self.q, terms=True, limit=self.topN)
for self.i in range(self.topN):
print(self.results[self.i]['title'], self.results[self.i]['textdata'])
def export(self):
with self.ix.searcher(weighting=scoring.Frequency) as searcher:
self.results = searcher.search(self.q, terms=True, limit= None)
for self.i in range(self.topN):
with open(os.path.join(self.folder_path, self.text + ".txt"), 'a') as f:
print(self.results[self.i]['title'], self.results[self.i]['textdata'], file=f)
def retranslateUi(self, MainWindow):
_translate = QtCore.QCoreApplication.translate
MainWindow.setWindowTitle(_translate("MainWindow", "Search Text"))
self.pushButton.setText(_translate("MainWindow", "Select Folder"))
self.pushButton_2.setText(_translate("MainWindow", "Create Database"))
self.pushButton_3.setText(_translate("MainWindow", "Export"))
self.label.setText(_translate("MainWindow", "Search"))
self.label2.setText(_translate("MainWindow", "Top Results"))
if __name__ == "__main__":
import sys
app = QtWidgets.QApplication(sys.argv)
MainWindow = QtWidgets.QMainWindow()
ui = Ui_MainWindow()
ui.setupUi(MainWindow)
MainWindow.show()
sys.exit(app.exec_())
What I want to do now is to show the results also in the table. I have been trying to understand how to "send" the returned value of the search function to the table and how to show it. It should have two columns: File_Page and Content and as many rows as top results selected. Each row should then show the file with hit and the text of interest like this:
So far I have been able to just set to parameter of the table, but not much more. Is there any mean to let the table how the results without pushing any other button? As I have so far understood, is it possible to trigger the same function from different places of the code but I did not find the opposite, that is to activate two functions with just one signal
I have found lots of examples but none suits the objective. I am still learning how to use Python and I have never used C++.