-2

I have used PyQT5 to create the UI of a search engine. I have been able to let the application open a folder containing a pdf file, create an index file, search within it and export the results as txt and excel file. Running the code attached will do it. I wanted to give the user also an immediate view of the results via Qtablewidget. The results of the search are store as dictionary to self.data. Below is represented how the widget read the variable for a specific example. enter image description here

Has you can see the widget read the number of column and rows of the variable but does not fill the cells.

The error I have experienced related to the fact that I queried the self.data as if it was a list lists rather than a dictionary of lists.

This is the code that is complete code.

from PyQt5 import QtCore, QtGui, QtWidgets, QtWidgets
from PyQt5.QtWidgets import QHeaderView, QTableWidgetItem
import os, os.path
import glob
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
import pdftotext
from whoosh import index
from whoosh.fields import Schema, TEXT, ID, STORED
from whoosh.analysis import RegexTokenizer
from whoosh.analysis import StopFilter
from whoosh import scoring 
from whoosh.index import open_dir
from whoosh import qparser
from whoosh import highlight
import pandas as pd


            
class Ui_MainWindow(object):
    def setupUi(self, MainWindow):
        MainWindow.setObjectName("MainWindow")
        MainWindow.resize(1126, 879)
        self.centralwidget = QtWidgets.QWidget(MainWindow)
        self.centralwidget.setObjectName("centralwidget")
        self.pushButton = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton.setGeometry(QtCore.QRect(40, 30, 100, 30))
        self.pushButton.setObjectName("pushButton")
        self.pushButton_2 = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton_2.setGeometry(QtCore.QRect(180, 30, 120, 30))
        self.pushButton_2.setObjectName("pushButton_2")
        self.pushButton_3 = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton_3.setGeometry(QtCore.QRect(620, 30, 80, 30))
        self.pushButton_3.setObjectName("pushButton_3")
        self.lineEdit = QtWidgets.QLineEdit(self.centralwidget)
        self.lineEdit.setGeometry(QtCore.QRect(380, 60, 191, 21))
        self.lineEdit.setObjectName("lineEdit")
        self.lineEdit_2 = QtWidgets.QLineEdit(self.centralwidget)
        self.lineEdit_2.setGeometry(QtCore.QRect(40, 90, 50, 21))
        self.lineEdit_2.setObjectName("lineEdit_2")
        self.label = QtWidgets.QLabel(self.centralwidget)
        self.label.setGeometry(QtCore.QRect(380, 30, 50, 35))
        font = QtGui.QFont()
        font.setPointSize(10)
        self.label.setFont(font)
        self.label.setObjectName("label")
        self.label2 = QtWidgets.QLabel(self.centralwidget)
        self.label2.setGeometry(QtCore.QRect(40, 70, 150, 16))
        font = QtGui.QFont()
        font.setPointSize(10)
        self.label2.setFont(font)
        self.label2.setObjectName("label")
        self.tableWidget = QtWidgets.QTableWidget(self.centralwidget)
        self.tableWidget.setGeometry(QtCore.QRect(0, 120, 1121, 721))
        self.tableWidget.setObjectName("tableWidget")
        #self.tableWidget.setColumnCount(2)
        
        MainWindow.setCentralWidget(self.centralwidget)
        self.menubar = QtWidgets.QMenuBar(MainWindow)
        self.menubar.setGeometry(QtCore.QRect(0, 0, 1126, 21))
        self.menubar.setObjectName("menubar")
        MainWindow.setMenuBar(self.menubar)
        self.statusbar = QtWidgets.QStatusBar(MainWindow)
        self.statusbar.setObjectName("statusbar")
        MainWindow.setStatusBar(self.statusbar)

        self.retranslateUi(MainWindow)
        QtCore.QMetaObject.connectSlotsByName(MainWindow)
        
        self.pushButton.clicked.connect(self.open_directory)
        self.pushButton_2.clicked.connect(self.createindex)
        self.pushButton_3.clicked.connect(self.export)
        self.lineEdit.returnPressed.connect(self.search)
        self.lineEdit.returnPressed.connect(self.datatable)
    
    def datatable(self): 
              
        data = self.data
        numrows = len(self.data)
        numcols = len(self.data[0])
        self.tableWidget.setColumnCount(numcols)
        self.tableWidget.setRowCount(numrows)
        for row in range(numrows):
            for column in range(numcols):
               self.tableWidget.setItem(row, column, QTableWidgetItem((self.data[row][column])))        
        
            
    def open_directory(self):
        self.dialog = QtWidgets.QFileDialog()
        self.folder_path = self.dialog.getExistingDirectory(None, "Select Folder")
        return self.folder_path
    
    def createindex(self):
        os.chdir(self.folder_path)
        self.mypdfiles = glob.glob("*.pdf")

#creation of folder for splitted files
        MYDIR = ("Splitted")
        CHECK_FOLDER = os.path.isdir(MYDIR)
        if not CHECK_FOLDER:
            os.makedirs(MYDIR)
    
# save split downloaded file and save into new folder
        for self.file in self.mypdfiles:
            self.fname = os.path.splitext(os.path.basename(self.file))[0]
            self.pdf = PdfFileReader(self.file)
            for self.page in range(self.pdf.getNumPages()):
                self.pdfwrite = PdfFileWriter()
                self.pdfwrite.addPage(self.pdf.getPage(self.page))
                self.outputfilename = '{}_page_{}.pdf'.format(self.fname, self.page+1)
                with open(os.path.join("./Splitted", self.outputfilename), 'wb') as out:
                     self.pdfwrite.write(out)
                     
        print('Created: {}'.format(self.outputfilename))
        
#set working directory 
        os.chdir(self.folder_path + "/Splitted")

        self.spltittedfiles = glob.glob("*.pdf")
        MYDIR = ("Txt")
        CHECK_FOLDER = os.path.isdir(MYDIR)
        if not CHECK_FOLDER:
            os.makedirs(MYDIR)
# Load your PDF
        for self.file in self.spltittedfiles:
            with open(self.file, "rb") as f:
                self.pdf = pdftotext.PDF(f)
        
#creation of folder for splitted files

            
# Save all text to a txt file.
            with open(os.path.join("./TXT", os.path.splitext(os.path.basename(self.file))[0] + ".txt") , 'w', encoding = 'utf-8') as f:
                f.write("\n\n".join(self.pdf))
            f.close()
        
        os.chdir(self.folder_path)
        MYDIR = ("indexdir")
        CHECK_FOLDER = os.path.isdir(MYDIR)
        if not CHECK_FOLDER:
            os.makedirs(MYDIR)
    
        self.my_analyzer = RegexTokenizer()| StopFilter(lang = "en")
        self.schema = Schema(title=TEXT(stored=True),path=ID(stored=True), 
                        content=TEXT(analyzer = self.my_analyzer),
                        textdata=TEXT(stored=True))
 
# set an index writer to add document as per schema
        self.ix = index.create_in("indexdir",self.schema)
        self.writer = self.ix.writer()
    
        self.filepaths = [os.path.join("./Splitted/Txt",i) for i in os.listdir("./Splitted/Txt")]
        for path in self.filepaths:
            self.fp = open(path, "r", encoding='utf-8')
            self.text = self.fp.read()
            self.writer.add_document(title = os.path.splitext(os.path.basename(path))[0] , path=path, content=self.text,textdata=self.text)
            self.fp.close()
        self.writer.commit()
        
    def search(self):
        
        os.chdir(self.folder_path)
        self.ix = open_dir("indexdir")
        MYDIR = ("Results")
        CHECK_FOLDER = os.path.isdir(MYDIR)
        if not CHECK_FOLDER:
            os.makedirs(MYDIR) 
        self.text = self.lineEdit.text()
        self.query_str = self.text
        self.query = qparser.QueryParser("textdata", schema = self.ix.schema)
        self.q = self.query.parse(self.query_str)
        self.topN = self.lineEdit_2.text()
        if self.lineEdit_2.text() == "":
            self.topN = 1000           
        else:
            self.topN = int(self.lineEdit_2.text())
        
        self.data=[]
        with self.ix.searcher() as searcher:
            self.results = searcher.search(self.q, terms=True, limit=self.topN)
            Upper = highlight.UppercaseFormatter()
            self.results.formatter = Upper
            my_cf = highlight.ContextFragmenter(maxchars=500, surround=300)
            self.results.fragmenter = my_cf
            for self.i in self.results:
                self.data.append({"title": self.i['title'], "text": self.i.highlights('textdata'), "score": str(self.i.score)})
        pd.DataFrame(self.data).to_excel("data.xlsx") 
              
    def export(self):
        
        with self.ix.searcher() as searcher:
            self.results = searcher.search(self.q, terms=True, limit= None)
            Upper = highlight.UppercaseFormatter()
            self.results.formatter = Upper
            my_cf = highlight.ContextFragmenter(maxchars=500, surround=300)
            self.results.fragmenter = my_cf
            self.countrow = len(self.results)
            for self.i in self.results:
                with open(os.path.join(self.folder_path, self.text + ".txt"), 'a', encoding="utf-8") as f:
                    print("Title {}".format(self.i['title']), "Text {}".format(self.i.highlights('textdata')), file=f)
                   
    

                
    
    def retranslateUi(self, MainWindow):
        _translate = QtCore.QCoreApplication.translate
        MainWindow.setWindowTitle(_translate("MainWindow", "Search Text"))
        self.pushButton.setText(_translate("MainWindow", "Select Folder"))
        self.pushButton_2.setText(_translate("MainWindow", "Create Database"))
        self.pushButton_3.setText(_translate("MainWindow", "Export"))
        self.label.setText(_translate("MainWindow", "Search"))
        self.label2.setText(_translate("MainWindow", "Top Results"))



if __name__ == "__main__":
    import sys
    app = QtWidgets.QApplication(sys.argv)
    MainWindow = QtWidgets.QMainWindow()
    ui = Ui_MainWindow()
    ui.setupUi(MainWindow)
    MainWindow.show()
    sys.exit(app.exec_())

1 Answers1

0

I was dealing with a list of dictionaries that I needed to unnest. I have slightly modified the datatable function:

def datatable(self): 
              
        data = self.data
        numrows = len(self.data)
        numcols = len(self.data[0])
        self.tableWidget.setColumnCount(numcols)
        self.tableWidget.setRowCount(numrows)
        self.tableWidget.setHorizontalHeaderLabels((list(self.data[0].keys())))
        for row in range(numrows):
            for column in range(numcols):
                item = (list(self.data[row].values())[column])
                self.tableWidget.setItem(row, column, QTableWidgetItem(item))