I have a program that performs OCR using Tesseract (via pytesseract) with a PyQt5 user interface.
thread.py
class SearchThread(QThread):
    """Worker thread that searches every PDF under a folder for a query.

    ``data`` is a dict read with the keys ``'folder'`` (directory to scan)
    and ``'query'`` (text handed to ``PDF.search``).

    Signals:
        signal(str): emitted with the path of each matching PDF, or with an
            ``'Error : ...'`` message if the search raised an exception.
        finished(): the signal inherited from ``QThread``; Qt emits it when
            ``run`` returns, so it is neither redeclared nor emitted by hand.
    """

    signal = pyqtSignal(str)

    def __init__(self, data):
        # QThread already derives from QObject, so one cooperative
        # super().__init__() initialises the whole Qt hierarchy.
        super().__init__()
        self.data = data

    def search(self):
        """Emit ``signal`` with the path of every PDF that matches the query."""
        query = self.data['query']
        for pdf in PDF.get_pdf_files(self.data['folder']):
            if pdf.search(query):
                self.signal.emit(pdf.pdf_path)

    def run(self):
        """Thread entry point: run the search and report failures via ``signal``."""
        try:
            self.search()
        except Exception as e:
            # Report the failure to the GUI instead of re-raising: an exception
            # escaping run() would never reach the emit below.
            self.signal.emit(f'Error : {str(e)}')
util.py
class PDF:
    """A PDF file on disk whose pages can be OCR-searched for keywords."""

    def __init__(self, pdf_path):
        """Store the path of the PDF this object represents."""
        self.pdf_path = pdf_path

    def search(self, query):
        """Return whether any keyword of *query* occurs in the PDF's OCR text.

        *query* is split on whitespace; a single keyword found on a single
        page is enough for a match. Pages are converted to images with
        ``pdf2image`` and read with ``pytesseract`` using the ``'ara'``
        language data.

        Returns:
            True on the first page containing a keyword, otherwise False
            (including when *query* contains no keywords at all).
        """
        # Split once instead of once per page, and skip the expensive
        # conversion/OCR entirely when there is nothing to look for.
        keywords = query.split()
        if not keywords:
            return False

        pytesseract.pytesseract.tesseract_cmd = OCR_EXEC_PATH
        pytesseract_config = f'--tessdata-dir "{OCR_DATA_PATH}"'

        # NOTE(review): the OCR step runs the executable at OCR_EXEC_PATH, and
        # pdf2image presumably launches an external converter as well. In a
        # windowed (--noconsole) Windows build such child processes may each
        # open a console window unless started with hidden-window flags --
        # TODO confirm against the installed library versions.
        for image in pdf2image.convert_from_path(self.pdf_path):
            text = pytesseract.image_to_string(image, lang='ara', config=pytesseract_config)
            if any(keyword in text for keyword in keywords):
                return True
        return False

    @staticmethod
    def get_pdf_files(folder):
        """Return a ``PDF`` for every ``*.pdf`` file found recursively under *folder*.

        Each ``PDF.pdf_path`` is the resolved path converted to ``str``.
        """
        path = pathlib.Path(folder)
        return [PDF(str(file.resolve())) for file in path.glob('**/*.pdf')]
ui.py
class UI(QMainWindow):
    """Main window: reads a query and folder and runs the search in a thread."""

    def __init__(self):
        super(UI, self).__init__()
        self.search_thread = None

        import os
        import sys
        try:
            # sys._MEIPASS is the PyInstaller bundle's unpack directory;
            # switching to it lets relative resource paths resolve when frozen.
            os.chdir(sys._MEIPASS)
        except (AttributeError, OSError):
            # AttributeError: not running from a bundle (plain Python run).
            # OSError: the directory could not be entered. Either way this is
            # best-effort, so keep the current working directory.
            pass

        loadUi(DESIGNER_FILE, self)
        ...  # remaining setup was elided ("....") in the original snippet
        self.show()

    def on_search_btn_click(self):
        """Start a new search unless one is already running."""
        if self.search_thread and self.search_thread.isRunning():
            return

        data = {'query': self.query.text().lower().strip(), 'folder': self.folder.text()}
        self.search_thread = SearchThread(data)
        self.search_thread.signal.connect(self.on_signal_received)
        self.search_thread.finished.connect(self.on_finished)
        self.search_thread.start()

    def on_signal_received(self, value):
        """Add a string emitted by the search thread to the results."""
        self.add_item(value)

    def on_finished(self, value=None):
        """Disable the cancel button once the search thread has finished.

        The connected ``finished`` signal carries no arguments, so ``value``
        must be optional; it is accepted only for backward compatibility.
        """
        self.cancel_btn.setEnabled(False)
main.py
if __name__ == '__main__':
    app = QApplication(argv)
    # UI.__init__ calls show() itself; the name only keeps the window alive
    # for the duration of the event loop.
    window = UI()
    # Propagate the event loop's return code as the process exit status
    # instead of discarding it (equivalent to sys.exit, without a new import).
    raise SystemExit(app.exec_())
build.bat
REM Build main.py into a single-file executable named ArchiveSearch, written
REM to the current directory, with the "data" folder bundled as "data".
REM --noconsole builds a windowed app that has no console window of its own.
REM NOTE(review): PyInstaller releases before 6.0 reportedly expect ';' rather
REM than ':' as the --add-data separator on Windows -- confirm for the
REM installed version.
pyinstaller --noconsole --onefile --name ArchiveSearch --distpath . --add-data="data:data" main.py
When the program runs as an exe, a console window opens and closes in less than a second every time a PDF is processed. When it is run directly from Python, no console window appears and everything works fine.