diff --git a/bananaSPLIT/conf/conftest.json b/bananaSPLIT/conf/conftest.json new file mode 100644 index 0000000..6961807 --- /dev/null +++ b/bananaSPLIT/conf/conftest.json @@ -0,0 +1,109 @@ +{ + "version": "v1.1a", + "paths": { + "lastUsed": "defaults.json", + "configurationPath": "D:/Emanuele/Documenti/workspace/bananaSPLIT/bananaSPLIT/conf/", + "INworkPath": "D:/Emanuele/Documenti/workspace/bananaSPLIT/TestFiles/File nuovo formato", + "OUTworkPath": "H:/", + "fileList": [ + "_ Women, Drugs And Depression.txt", + "_96 Election Represents Last Hurrah of Generational Politics.txt", + "_A 60_S GENERATION MADE BY MYTH-MAKERS.txt", + "_MIDDLE AGE_ NO END TO AN UPWARD CREEP.txt", + "A DEMOCRATIC SOCIETY IN NEEDOF A MILITARY.txt", + "A Dog Who Taught the Lesson of Hope.txt", + "A Fading Bohemia, Gritty but Beloved.txt" + ] + }, + "docStruct": { + "fileVersNew": true, + "language": { + "dateWords": [ + "January", + "February", + "March", + "April", + "May", + "June", + "July", + "August", + "September", + "October", + "November", + "December" + ], + "headWords": [ + "BYLINE:", + "SECTION:", + "LENGTH:", + "DATELINE:", + "HIGHLIGHT:", + "Email:" + ], + "tailWords": [ + "Newstex ID:", + "NOTES:", + "LANGUAGE:", + "GRAPHIC:", + "TYPE:", + "URL:", + "LOAD-DATE:", + "PUBLICATION-TYPE:", + "DOCUMENT-TYPE:", + "CHARTS:", + "JOURNAL-CODE:" + ] + }, + "docSep": "\\s*Copyright [(0-9)]+", + "beginOfDocument": "\\s*Body", + "endOfDocument": "\\s*End of Document", + "dateFormat": "{month} {day:d}, {year:d}{}", + "outPrefix": "PRE", + "outSuffix": "SUF", + "outExt": ".txt", + "outDateType": "jpn", + "outNameFormat": "PRE+{docnum}+{year:04d}{month:02d}{day:02d}+{title}+SUF.txt", + "outDate": true, + "outTitle": true, + "outNumber": true, + "numberPos": 1, + "datePos": 2, + "titlePos": 3, + "maxTitleLen": 10, + "outNameSep": "+", + "customSep": "=", + "dateFormats": { + "jpn": "{year:04d}{month:02d}{day:02d}", + "it": "{day:02d}{month:02d}{year:04d}", + "usa": "{month:02d}{year:04d}{day:02d}" + } + }, + "settings": { + "encoding": "ansi", + "monthPosition": 0, + "getNewsPaperName": true, + "nameNotFoundStr": "ND", + "includeTitle": true, + "removeDuplicates": true, + "showSkipped": false, + "showRemovedDuplicates": false, + "loadTXT": true, + "loadDOCX": false, + "removeOldFiles": true, + "saveSeparateFiles": true, + "saveBodyFile": true, + "saveBodyNumber": true, + "delLF": false, + "delWordBreak": true, + "delChars": [ + "'", + "@", + "#", + "$", + "%", + "^", + "&" + ] + }, + "name": "_ Women, Drugs And Depression.txt" +} \ No newline at end of file diff --git a/bananaSPLIT/conf/defaults.json b/bananaSPLIT/conf/defaults.json index 7fd02d7..c1d2c1e 100644 --- a/bananaSPLIT/conf/defaults.json +++ b/bananaSPLIT/conf/defaults.json @@ -24,16 +24,15 @@ "NYT 2009.txt", "NYT 2013.txt", "NYT 2015.txt", - "NYT 2017.txt", - "README" + "NYT 2017.txt" ] }, "docStruct": { "fileVersNew": true, - "language": "English", + "language": "Italiano", "docSep": "\\s*Copyright [(0-9)]+", - "beginOfDocument":"\\s*Body", - "endOfDocument":"\\s*End of Document", + "beginOfDocument": "\\s*Body", + "endOfDocument": "\\s*End of Document", "dateFormat": "{month} {day:d}, {year:d}{}", "outPrefix": "PRE", "outSuffix": "SUF", diff --git a/bananaSPLIT/convert.py b/bananaSPLIT/convert.py index 51fadf0..c3d6abe 100644 --- a/bananaSPLIT/convert.py +++ b/bananaSPLIT/convert.py @@ -12,8 +12,8 @@ if __name__ == '__main__': print(f) try: txt=docx2txt.process(f) - with open(f.replace('.docx', '.txt'), 'w') as fp: - fp.write(txt) + with open(f.replace('.docx', '.txt'), 'wb') as fp: + fp.write(txt.encode('ansi')) fp.close() except Exception as e: print(e) diff --git a/bananaSPLIT/guimain.py b/bananaSPLIT/guimain.py index 8345a44..ccac8f8 100644 --- a/bananaSPLIT/guimain.py +++ b/bananaSPLIT/guimain.py @@ -92,11 +92,11 @@ class bananaMain(PyQt5.QtWidgets.QMainWindow): def fillFileList(self): fl = QDir(self.conf.getParam('paths','INworkPath')) fl.setNameFilters(['*.txt','*.TXT']) - fl.entryList(QDir.NoDotAndDotDot | QDir.Files) + fl = fl.entryList(QDir.NoDotAndDotDot | QDir.Files) self.ui.lst_files.clear() for f in fl: self.ui.lst_files.addItem(PyQt5.QtWidgets.QListWidgetItem(f)) - self.conf.setParam(('paths', 'fileList'), fl) + self.conf.setParam(('paths', 'fileList'), fl) def nextTab(self): self.ui.wgt_main.setCurrentIndex(self.ui.wgt_main.currentIndex()+1) @@ -455,11 +455,12 @@ class bananaSelezOut(PyQt5.QtWidgets.QWidget): # costruisco i thread tDict={} try: - for f in splconf['paths']['fileList']: + for f in [splconf['paths']['fileList'][0]]: splconf['name']=f tDict[f] = bananaSPLITTER(fileParams=copy.deepcopy(splconf), logger=self.log) + tDict[f].run() #tDict[f].sendStatus.connect(updateState) - QThreadPool.globalInstance().start(tDict[f]) + #QThreadPool.globalInstance().start(tDict[f]) except Exception as e: self.log.error(f"Impossibile avviare lo splitter: {e}") pass diff --git a/bananaSPLIT/libbananasplit/libsplit.py b/bananaSPLIT/libbananasplit/libsplit.py index 3a5f11e..903d9c3 100644 --- a/bananaSPLIT/libbananasplit/libsplit.py +++ b/bananaSPLIT/libbananasplit/libsplit.py @@ -3,19 +3,18 @@ Created on 2 nov 2019 @author: Emanuele Trabattoni ''' -from PyQt5.QtCore import QThread, QRunnable -from PyQt5.Qt import pyqtSignal +from PyQt5.QtCore import QRunnable, QObject, pyqtSignal from slugify import slugify -import time, parse, re, copy, os, json +import time, parse, re, copy, os,json import traceback -class bananaSPLITTER(QRunnable): +class bananaSPLITTER(): #sendStatus = pyqtSignal(str) def __init__(self, fileParams=None, logger=None): - QRunnable.__init__(self) + #QRunnable.__init__(self) self.fileParams = fileParams self.log = logger self.rawFile = None @@ -23,7 +22,7 @@ class bananaSPLITTER(QRunnable): self.contentList = list() self.bodyCounter=0 self.duplicateNumber=0 - #self.log.debug(f"Configurazione: \n {json.dumps(fileParams, indent=2)}") + self.log.debug(f"Configurazione: \n {json.dumps(fileParams, indent=2)}") if fileParams is not None: self.log.info("Sto operando sul file: {}..".format(self.fileParams['name'])) self.paths = self.fileParams['paths'] @@ -104,7 +103,7 @@ class bananaSPLITTER(QRunnable): for ll in self.rawFile: for c in self.settings['delChars']: ll = ll.replace(c,'') - if ll not in ['\n', '\r']: + if ll not in ['\n', '\r', '\r\n']: tempContent.append(ll) self.rawFile = copy.deepcopy(tempContent) return True @@ -174,7 +173,7 @@ class bananaSPLITTER(QRunnable): elif self.status == 'head': tempContent = list() #doppio check per trovare línizio del corpo documento - if re.match(self.docStruct['beginOfDocument'],l,re.i): + if re.match(self.docStruct['beginOfDocument'],l): self.status='body' if lineWords[0] not in self.docStruct['language']['headWords']: #se la prima parola non e' tra quelle di inizio tempBody.append(l) # vuol dire che ho trovato l'articolo e aggiungo la prima riga al contenuto del documento diff --git a/bananaSPLIT/libtestmain.py b/bananaSPLIT/libtestmain.py index 05b1779..09788f8 100644 --- a/bananaSPLIT/libtestmain.py +++ b/bananaSPLIT/libtestmain.py @@ -3,30 +3,16 @@ Created on 1 dic 2019 @author: Emanuele Trabattoni ''' -import os -from glob import glob -from copy import deepcopy +import json from libsplit import bananaSPLITTER -from libconfload import bananaCONF from libfancylogger import fancyLogger if __name__ == "__main__": - print("CWD-> "+os.getcwd()) - logger = fancyLogger(fileLog = False) - confl = bananaCONF(workdir=r"./libbananasplit", logger=logger) - confl.open() - - confl.use("testEN.json") - splconf = confl.getParams("splitter") - splist = [] - os.chdir(splconf["paths"]["INworkPath"]) - for f in glob("*.txt"): - splconf["name"] = f - logger.info("-"*80) - splitter = bananaSPLITTER(fileParams=deepcopy(splconf), logger=logger) - splist.append(splitter) - splitter.start() - splitter.join() + logger = fancyLogger(filepath=r"./conf/loggerconf.json",fileLog=False) + fp = open('./conf/conftest.json', 'r') + splitter = bananaSPLITTER(fileParams=json.load(fp), logger=logger) + fp.close() + splitter.run() logger.info("\n"+"="*50+"\n\tFINITO!!!\n"+"="*50)