non riesco a provare le modifiche, il debugger di python mi prende per

il culo
This commit is contained in:
2020-03-30 14:52:09 +02:00
parent 989aa755dd
commit 6d43c564b8
6 changed files with 133 additions and 39 deletions

View File

@@ -0,0 +1,109 @@
{
"version": "v1.1a",
"paths": {
"lastUsed": "defaults.json",
"configurationPath": "D:/Emanuele/Documenti/workspace/bananaSPLIT/bananaSPLIT/conf/",
"INworkPath": "D:/Emanuele/Documenti/workspace/bananaSPLIT/TestFiles/File nuovo formato",
"OUTworkPath": "H:/",
"fileList": [
"_ Women, Drugs And Depression.txt",
"_96 Election Represents Last Hurrah of Generational Politics.txt",
"_A 60_S GENERATION MADE BY MYTH-MAKERS.txt",
"_MIDDLE AGE_ NO END TO AN UPWARD CREEP.txt",
"A DEMOCRATIC SOCIETY IN NEEDOF A MILITARY.txt",
"A Dog Who Taught the Lesson of Hope.txt",
"A Fading Bohemia, Gritty but Beloved.txt"
]
},
"docStruct": {
"fileVersNew": true,
"language": {
"dateWords": [
"January",
"February",
"March",
"April",
"May",
"June",
"July",
"August",
"September",
"October",
"November",
"December"
],
"headWords": [
"BYLINE:",
"SECTION:",
"LENGTH:",
"DATELINE:",
"HIGHLIGHT:",
"Email:"
],
"tailWords": [
"Newstex ID:",
"NOTES:",
"LANGUAGE:",
"GRAPHIC:",
"TYPE:",
"URL:",
"LOAD-DATE:",
"PUBLICATION-TYPE:",
"DOCUMENT-TYPE:",
"CHARTS:",
"JOURNAL-CODE:"
]
},
"docSep": "\\s*Copyright [(0-9)]+",
"beginOfDocument": "\\s*Body",
"endOfDocument": "\\s*End of Document",
"dateFormat": "{month} {day:d}, {year:d}{}",
"outPrefix": "PRE",
"outSuffix": "SUF",
"outExt": ".txt",
"outDateType": "jpn",
"outNameFormat": "PRE+{docnum}+{year:04d}{month:02d}{day:02d}+{title}+SUF.txt",
"outDate": true,
"outTitle": true,
"outNumber": true,
"numberPos": 1,
"datePos": 2,
"titlePos": 3,
"maxTitleLen": 10,
"outNameSep": "+",
"customSep": "=",
"dateFormats": {
"jpn": "{year:04d}{month:02d}{day:02d}",
"it": "{day:02d}{month:02d}{year:04d}",
"usa": "{month:02d}{year:04d}{day:02d}"
}
},
"settings": {
"encoding": "ansi",
"monthPosition": 0,
"getNewsPaperName": true,
"nameNotFoundStr": "ND",
"includeTitle": true,
"removeDuplicates": true,
"showSkipped": false,
"showRemovedDuplicates": false,
"loadTXT": true,
"loadDOCX": false,
"removeOldFiles": true,
"saveSeparateFiles": true,
"saveBodyFile": true,
"saveBodyNumber": true,
"delLF": false,
"delWordBreak": true,
"delChars": [
"'",
"@",
"#",
"$",
"%",
"^",
"&"
]
},
"name": "_ Women, Drugs And Depression.txt"
}

View File

@@ -24,16 +24,15 @@
"NYT 2009.txt", "NYT 2009.txt",
"NYT 2013.txt", "NYT 2013.txt",
"NYT 2015.txt", "NYT 2015.txt",
"NYT 2017.txt", "NYT 2017.txt"
"README"
] ]
}, },
"docStruct": { "docStruct": {
"fileVersNew": true, "fileVersNew": true,
"language": "English", "language": "Italiano",
"docSep": "\\s*Copyright [(0-9)]+", "docSep": "\\s*Copyright [(0-9)]+",
"beginOfDocument":"\\s*Body", "beginOfDocument": "\\s*Body",
"endOfDocument":"\\s*End of Document", "endOfDocument": "\\s*End of Document",
"dateFormat": "{month} {day:d}, {year:d}{}", "dateFormat": "{month} {day:d}, {year:d}{}",
"outPrefix": "PRE", "outPrefix": "PRE",
"outSuffix": "SUF", "outSuffix": "SUF",

View File

@@ -12,8 +12,8 @@ if __name__ == '__main__':
print(f) print(f)
try: try:
txt=docx2txt.process(f) txt=docx2txt.process(f)
with open(f.replace('.docx', '.txt'), 'w') as fp: with open(f.replace('.docx', '.txt'), 'wb') as fp:
fp.write(txt) fp.write(txt.encode('ansi'))
fp.close() fp.close()
except Exception as e: except Exception as e:
print(e) print(e)

View File

@@ -92,11 +92,11 @@ class bananaMain(PyQt5.QtWidgets.QMainWindow):
def fillFileList(self): def fillFileList(self):
fl = QDir(self.conf.getParam('paths','INworkPath')) fl = QDir(self.conf.getParam('paths','INworkPath'))
fl.setNameFilters(['*.txt','*.TXT']) fl.setNameFilters(['*.txt','*.TXT'])
fl.entryList(QDir.NoDotAndDotDot | QDir.Files) fl = fl.entryList(QDir.NoDotAndDotDot | QDir.Files)
self.ui.lst_files.clear() self.ui.lst_files.clear()
for f in fl: for f in fl:
self.ui.lst_files.addItem(PyQt5.QtWidgets.QListWidgetItem(f)) self.ui.lst_files.addItem(PyQt5.QtWidgets.QListWidgetItem(f))
self.conf.setParam(('paths', 'fileList'), fl) self.conf.setParam(('paths', 'fileList'), fl)
def nextTab(self): def nextTab(self):
self.ui.wgt_main.setCurrentIndex(self.ui.wgt_main.currentIndex()+1) self.ui.wgt_main.setCurrentIndex(self.ui.wgt_main.currentIndex()+1)
@@ -455,11 +455,12 @@ class bananaSelezOut(PyQt5.QtWidgets.QWidget):
# costruisco i thread # costruisco i thread
tDict={} tDict={}
try: try:
for f in splconf['paths']['fileList']: for f in [splconf['paths']['fileList'][0]]:
splconf['name']=f splconf['name']=f
tDict[f] = bananaSPLITTER(fileParams=copy.deepcopy(splconf), logger=self.log) tDict[f] = bananaSPLITTER(fileParams=copy.deepcopy(splconf), logger=self.log)
tDict[f].run()
#tDict[f].sendStatus.connect(updateState) #tDict[f].sendStatus.connect(updateState)
QThreadPool.globalInstance().start(tDict[f]) #QThreadPool.globalInstance().start(tDict[f])
except Exception as e: except Exception as e:
self.log.error(f"Impossibile avviare lo splitter: {e}") self.log.error(f"Impossibile avviare lo splitter: {e}")
pass pass

View File

@@ -3,19 +3,18 @@ Created on 2 nov 2019
@author: Emanuele Trabattoni @author: Emanuele Trabattoni
''' '''
from PyQt5.QtCore import QThread, QRunnable from PyQt5.QtCore import QRunnable, QObject, pyqtSignal
from PyQt5.Qt import pyqtSignal
from slugify import slugify from slugify import slugify
import time, parse, re, copy, os, json import time, parse, re, copy, os,json
import traceback import traceback
class bananaSPLITTER(QRunnable): class bananaSPLITTER():
#sendStatus = pyqtSignal(str) #sendStatus = pyqtSignal(str)
def __init__(self, fileParams=None, logger=None): def __init__(self, fileParams=None, logger=None):
QRunnable.__init__(self) #QRunnable.__init__(self)
self.fileParams = fileParams self.fileParams = fileParams
self.log = logger self.log = logger
self.rawFile = None self.rawFile = None
@@ -23,7 +22,7 @@ class bananaSPLITTER(QRunnable):
self.contentList = list() self.contentList = list()
self.bodyCounter=0 self.bodyCounter=0
self.duplicateNumber=0 self.duplicateNumber=0
#self.log.debug(f"Configurazione: \n {json.dumps(fileParams, indent=2)}") self.log.debug(f"Configurazione: \n {json.dumps(fileParams, indent=2)}")
if fileParams is not None: if fileParams is not None:
self.log.info("Sto operando sul file: {}..".format(self.fileParams['name'])) self.log.info("Sto operando sul file: {}..".format(self.fileParams['name']))
self.paths = self.fileParams['paths'] self.paths = self.fileParams['paths']
@@ -104,7 +103,7 @@ class bananaSPLITTER(QRunnable):
for ll in self.rawFile: for ll in self.rawFile:
for c in self.settings['delChars']: for c in self.settings['delChars']:
ll = ll.replace(c,'') ll = ll.replace(c,'')
if ll not in ['\n', '\r']: if ll not in ['\n', '\r', '\r\n']:
tempContent.append(ll) tempContent.append(ll)
self.rawFile = copy.deepcopy(tempContent) self.rawFile = copy.deepcopy(tempContent)
return True return True
@@ -174,7 +173,7 @@ class bananaSPLITTER(QRunnable):
elif self.status == 'head': elif self.status == 'head':
tempContent = list() tempContent = list()
#doppio check per trovare línizio del corpo documento #doppio check per trovare línizio del corpo documento
if re.match(self.docStruct['beginOfDocument'],l,re.i): if re.match(self.docStruct['beginOfDocument'],l):
self.status='body' self.status='body'
if lineWords[0] not in self.docStruct['language']['headWords']: #se la prima parola non e' tra quelle di inizio if lineWords[0] not in self.docStruct['language']['headWords']: #se la prima parola non e' tra quelle di inizio
tempBody.append(l) # vuol dire che ho trovato l'articolo e aggiungo la prima riga al contenuto del documento tempBody.append(l) # vuol dire che ho trovato l'articolo e aggiungo la prima riga al contenuto del documento

View File

@@ -3,30 +3,16 @@ Created on 1 dic 2019
@author: Emanuele Trabattoni @author: Emanuele Trabattoni
''' '''
import os import json
from glob import glob
from copy import deepcopy
from libsplit import bananaSPLITTER from libsplit import bananaSPLITTER
from libconfload import bananaCONF
from libfancylogger import fancyLogger from libfancylogger import fancyLogger
if __name__ == "__main__": if __name__ == "__main__":
print("CWD-> "+os.getcwd()) logger = fancyLogger(filepath=r"./conf/loggerconf.json",fileLog=False)
logger = fancyLogger(fileLog = False) fp = open('./conf/conftest.json', 'r')
confl = bananaCONF(workdir=r"./libbananasplit", logger=logger) splitter = bananaSPLITTER(fileParams=json.load(fp), logger=logger)
confl.open() fp.close()
splitter.run()
confl.use("testEN.json")
splconf = confl.getParams("splitter")
splist = []
os.chdir(splconf["paths"]["INworkPath"])
for f in glob("*.txt"):
splconf["name"] = f
logger.info("-"*80)
splitter = bananaSPLITTER(fileParams=deepcopy(splconf), logger=logger)
splist.append(splitter)
splitter.start()
splitter.join()
logger.info("\n"+"="*50+"\n\tFINITO!!!\n"+"="*50) logger.info("\n"+"="*50+"\n\tFINITO!!!\n"+"="*50)