Non riesco a provare le modifiche, il debugger di Python mi prende per il culo
This commit is contained in:
2020-03-30 14:52:09 +02:00
parent 989aa755dd
commit 6d43c564b8
6 changed files with 133 additions and 39 deletions

View File

@@ -0,0 +1,109 @@
{
"version": "v1.1a",
"paths": {
"lastUsed": "defaults.json",
"configurationPath": "D:/Emanuele/Documenti/workspace/bananaSPLIT/bananaSPLIT/conf/",
"INworkPath": "D:/Emanuele/Documenti/workspace/bananaSPLIT/TestFiles/File nuovo formato",
"OUTworkPath": "H:/",
"fileList": [
"_ Women, Drugs And Depression.txt",
"_96 Election Represents Last Hurrah of Generational Politics.txt",
"_A 60_S GENERATION MADE BY MYTH-MAKERS.txt",
"_MIDDLE AGE_ NO END TO AN UPWARD CREEP.txt",
"A DEMOCRATIC SOCIETY IN NEEDOF A MILITARY.txt",
"A Dog Who Taught the Lesson of Hope.txt",
"A Fading Bohemia, Gritty but Beloved.txt"
]
},
"docStruct": {
"fileVersNew": true,
"language": {
"dateWords": [
"January",
"February",
"March",
"April",
"May",
"June",
"July",
"August",
"September",
"October",
"November",
"December"
],
"headWords": [
"BYLINE:",
"SECTION:",
"LENGTH:",
"DATELINE:",
"HIGHLIGHT:",
"Email:"
],
"tailWords": [
"Newstex ID:",
"NOTES:",
"LANGUAGE:",
"GRAPHIC:",
"TYPE:",
"URL:",
"LOAD-DATE:",
"PUBLICATION-TYPE:",
"DOCUMENT-TYPE:",
"CHARTS:",
"JOURNAL-CODE:"
]
},
"docSep": "\\s*Copyright [(0-9)]+",
"beginOfDocument": "\\s*Body",
"endOfDocument": "\\s*End of Document",
"dateFormat": "{month} {day:d}, {year:d}{}",
"outPrefix": "PRE",
"outSuffix": "SUF",
"outExt": ".txt",
"outDateType": "jpn",
"outNameFormat": "PRE+{docnum}+{year:04d}{month:02d}{day:02d}+{title}+SUF.txt",
"outDate": true,
"outTitle": true,
"outNumber": true,
"numberPos": 1,
"datePos": 2,
"titlePos": 3,
"maxTitleLen": 10,
"outNameSep": "+",
"customSep": "=",
"dateFormats": {
"jpn": "{year:04d}{month:02d}{day:02d}",
"it": "{day:02d}{month:02d}{year:04d}",
"usa": "{month:02d}{year:04d}{day:02d}"
}
},
"settings": {
"encoding": "ansi",
"monthPosition": 0,
"getNewsPaperName": true,
"nameNotFoundStr": "ND",
"includeTitle": true,
"removeDuplicates": true,
"showSkipped": false,
"showRemovedDuplicates": false,
"loadTXT": true,
"loadDOCX": false,
"removeOldFiles": true,
"saveSeparateFiles": true,
"saveBodyFile": true,
"saveBodyNumber": true,
"delLF": false,
"delWordBreak": true,
"delChars": [
"'",
"@",
"#",
"$",
"%",
"^",
"&"
]
},
"name": "_ Women, Drugs And Depression.txt"
}

View File

@@ -24,16 +24,15 @@
"NYT 2009.txt",
"NYT 2013.txt",
"NYT 2015.txt",
"NYT 2017.txt",
"README"
"NYT 2017.txt"
]
},
"docStruct": {
"fileVersNew": true,
"language": "English",
"language": "Italiano",
"docSep": "\\s*Copyright [(0-9)]+",
"beginOfDocument":"\\s*Body",
"endOfDocument":"\\s*End of Document",
"beginOfDocument": "\\s*Body",
"endOfDocument": "\\s*End of Document",
"dateFormat": "{month} {day:d}, {year:d}{}",
"outPrefix": "PRE",
"outSuffix": "SUF",

View File

@@ -12,8 +12,8 @@ if __name__ == '__main__':
print(f)
try:
txt=docx2txt.process(f)
with open(f.replace('.docx', '.txt'), 'w') as fp:
fp.write(txt)
with open(f.replace('.docx', '.txt'), 'wb') as fp:
fp.write(txt.encode('ansi'))
fp.close()
except Exception as e:
print(e)

View File

@@ -92,7 +92,7 @@ class bananaMain(PyQt5.QtWidgets.QMainWindow):
def fillFileList(self):
fl = QDir(self.conf.getParam('paths','INworkPath'))
fl.setNameFilters(['*.txt','*.TXT'])
fl.entryList(QDir.NoDotAndDotDot | QDir.Files)
fl = fl.entryList(QDir.NoDotAndDotDot | QDir.Files)
self.ui.lst_files.clear()
for f in fl:
self.ui.lst_files.addItem(PyQt5.QtWidgets.QListWidgetItem(f))
@@ -455,11 +455,12 @@ class bananaSelezOut(PyQt5.QtWidgets.QWidget):
# costruisco i thread
tDict={}
try:
for f in splconf['paths']['fileList']:
for f in [splconf['paths']['fileList'][0]]:
splconf['name']=f
tDict[f] = bananaSPLITTER(fileParams=copy.deepcopy(splconf), logger=self.log)
tDict[f].run()
#tDict[f].sendStatus.connect(updateState)
QThreadPool.globalInstance().start(tDict[f])
#QThreadPool.globalInstance().start(tDict[f])
except Exception as e:
self.log.error(f"Impossibile avviare lo splitter: {e}")
pass

View File

@@ -3,19 +3,18 @@ Created on 2 nov 2019
@author: Emanuele Trabattoni
'''
from PyQt5.QtCore import QThread, QRunnable
from PyQt5.Qt import pyqtSignal
from PyQt5.QtCore import QRunnable, QObject, pyqtSignal
from slugify import slugify
import time, parse, re, copy, os, json
import time, parse, re, copy, os,json
import traceback
class bananaSPLITTER(QRunnable):
class bananaSPLITTER():
#sendStatus = pyqtSignal(str)
def __init__(self, fileParams=None, logger=None):
QRunnable.__init__(self)
#QRunnable.__init__(self)
self.fileParams = fileParams
self.log = logger
self.rawFile = None
@@ -23,7 +22,7 @@ class bananaSPLITTER(QRunnable):
self.contentList = list()
self.bodyCounter=0
self.duplicateNumber=0
#self.log.debug(f"Configurazione: \n {json.dumps(fileParams, indent=2)}")
self.log.debug(f"Configurazione: \n {json.dumps(fileParams, indent=2)}")
if fileParams is not None:
self.log.info("Sto operando sul file: {}..".format(self.fileParams['name']))
self.paths = self.fileParams['paths']
@@ -104,7 +103,7 @@ class bananaSPLITTER(QRunnable):
for ll in self.rawFile:
for c in self.settings['delChars']:
ll = ll.replace(c,'')
if ll not in ['\n', '\r']:
if ll not in ['\n', '\r', '\r\n']:
tempContent.append(ll)
self.rawFile = copy.deepcopy(tempContent)
return True
@@ -174,7 +173,7 @@ class bananaSPLITTER(QRunnable):
elif self.status == 'head':
tempContent = list()
#doppio check per trovare l'inizio del corpo documento
if re.match(self.docStruct['beginOfDocument'],l,re.i):
if re.match(self.docStruct['beginOfDocument'],l):
self.status='body'
if lineWords[0] not in self.docStruct['language']['headWords']: #se la prima parola non e' tra quelle di inizio
tempBody.append(l) # vuol dire che ho trovato l'articolo e aggiungo la prima riga al contenuto del documento

View File

@@ -3,30 +3,16 @@ Created on 1 dic 2019
@author: Emanuele Trabattoni
'''
import os
from glob import glob
from copy import deepcopy
import json
from libsplit import bananaSPLITTER
from libconfload import bananaCONF
from libfancylogger import fancyLogger
if __name__ == "__main__":
print("CWD-> "+os.getcwd())
logger = fancyLogger(fileLog = False)
confl = bananaCONF(workdir=r"./libbananasplit", logger=logger)
confl.open()
confl.use("testEN.json")
splconf = confl.getParams("splitter")
splist = []
os.chdir(splconf["paths"]["INworkPath"])
for f in glob("*.txt"):
splconf["name"] = f
logger.info("-"*80)
splitter = bananaSPLITTER(fileParams=deepcopy(splconf), logger=logger)
splist.append(splitter)
splitter.start()
splitter.join()
logger = fancyLogger(filepath=r"./conf/loggerconf.json",fileLog=False)
fp = open('./conf/conftest.json', 'r')
splitter = bananaSPLITTER(fileParams=json.load(fp), logger=logger)
fp.close()
splitter.run()
logger.info("\n"+"="*50+"\n\tFINITO!!!\n"+"="*50)