non riesco a provare le modifiche, il debugger di python mi prende per
il culo
This commit is contained in:
109
bananaSPLIT/conf/conftest.json
Normal file
109
bananaSPLIT/conf/conftest.json
Normal file
@@ -0,0 +1,109 @@
|
||||
{
|
||||
"version": "v1.1a",
|
||||
"paths": {
|
||||
"lastUsed": "defaults.json",
|
||||
"configurationPath": "D:/Emanuele/Documenti/workspace/bananaSPLIT/bananaSPLIT/conf/",
|
||||
"INworkPath": "D:/Emanuele/Documenti/workspace/bananaSPLIT/TestFiles/File nuovo formato",
|
||||
"OUTworkPath": "H:/",
|
||||
"fileList": [
|
||||
"_ Women, Drugs And Depression.txt",
|
||||
"_96 Election Represents Last Hurrah of Generational Politics.txt",
|
||||
"_A 60_S GENERATION MADE BY MYTH-MAKERS.txt",
|
||||
"_MIDDLE AGE_ NO END TO AN UPWARD CREEP.txt",
|
||||
"A DEMOCRATIC SOCIETY IN NEEDOF A MILITARY.txt",
|
||||
"A Dog Who Taught the Lesson of Hope.txt",
|
||||
"A Fading Bohemia, Gritty but Beloved.txt"
|
||||
]
|
||||
},
|
||||
"docStruct": {
|
||||
"fileVersNew": true,
|
||||
"language": {
|
||||
"dateWords": [
|
||||
"January",
|
||||
"February",
|
||||
"March",
|
||||
"April",
|
||||
"May",
|
||||
"June",
|
||||
"July",
|
||||
"August",
|
||||
"September",
|
||||
"October",
|
||||
"November",
|
||||
"December"
|
||||
],
|
||||
"headWords": [
|
||||
"BYLINE:",
|
||||
"SECTION:",
|
||||
"LENGTH:",
|
||||
"DATELINE:",
|
||||
"HIGHLIGHT:",
|
||||
"Email:"
|
||||
],
|
||||
"tailWords": [
|
||||
"Newstex ID:",
|
||||
"NOTES:",
|
||||
"LANGUAGE:",
|
||||
"GRAPHIC:",
|
||||
"TYPE:",
|
||||
"URL:",
|
||||
"LOAD-DATE:",
|
||||
"PUBLICATION-TYPE:",
|
||||
"DOCUMENT-TYPE:",
|
||||
"CHARTS:",
|
||||
"JOURNAL-CODE:"
|
||||
]
|
||||
},
|
||||
"docSep": "\\s*Copyright [(0-9)]+",
|
||||
"beginOfDocument": "\\s*Body",
|
||||
"endOfDocument": "\\s*End of Document",
|
||||
"dateFormat": "{month} {day:d}, {year:d}{}",
|
||||
"outPrefix": "PRE",
|
||||
"outSuffix": "SUF",
|
||||
"outExt": ".txt",
|
||||
"outDateType": "jpn",
|
||||
"outNameFormat": "PRE+{docnum}+{year:04d}{month:02d}{day:02d}+{title}+SUF.txt",
|
||||
"outDate": true,
|
||||
"outTitle": true,
|
||||
"outNumber": true,
|
||||
"numberPos": 1,
|
||||
"datePos": 2,
|
||||
"titlePos": 3,
|
||||
"maxTitleLen": 10,
|
||||
"outNameSep": "+",
|
||||
"customSep": "=",
|
||||
"dateFormats": {
|
||||
"jpn": "{year:04d}{month:02d}{day:02d}",
|
||||
"it": "{day:02d}{month:02d}{year:04d}",
|
||||
"usa": "{month:02d}{year:04d}{day:02d}"
|
||||
}
|
||||
},
|
||||
"settings": {
|
||||
"encoding": "ansi",
|
||||
"monthPosition": 0,
|
||||
"getNewsPaperName": true,
|
||||
"nameNotFoundStr": "ND",
|
||||
"includeTitle": true,
|
||||
"removeDuplicates": true,
|
||||
"showSkipped": false,
|
||||
"showRemovedDuplicates": false,
|
||||
"loadTXT": true,
|
||||
"loadDOCX": false,
|
||||
"removeOldFiles": true,
|
||||
"saveSeparateFiles": true,
|
||||
"saveBodyFile": true,
|
||||
"saveBodyNumber": true,
|
||||
"delLF": false,
|
||||
"delWordBreak": true,
|
||||
"delChars": [
|
||||
"'",
|
||||
"@",
|
||||
"#",
|
||||
"$",
|
||||
"%",
|
||||
"^",
|
||||
"&"
|
||||
]
|
||||
},
|
||||
"name": "_ Women, Drugs And Depression.txt"
|
||||
}
|
||||
@@ -24,16 +24,15 @@
|
||||
"NYT 2009.txt",
|
||||
"NYT 2013.txt",
|
||||
"NYT 2015.txt",
|
||||
"NYT 2017.txt",
|
||||
"README"
|
||||
"NYT 2017.txt"
|
||||
]
|
||||
},
|
||||
"docStruct": {
|
||||
"fileVersNew": true,
|
||||
"language": "English",
|
||||
"language": "Italiano",
|
||||
"docSep": "\\s*Copyright [(0-9)]+",
|
||||
"beginOfDocument":"\\s*Body",
|
||||
"endOfDocument":"\\s*End of Document",
|
||||
"beginOfDocument": "\\s*Body",
|
||||
"endOfDocument": "\\s*End of Document",
|
||||
"dateFormat": "{month} {day:d}, {year:d}{}",
|
||||
"outPrefix": "PRE",
|
||||
"outSuffix": "SUF",
|
||||
|
||||
@@ -12,8 +12,8 @@ if __name__ == '__main__':
|
||||
print(f)
|
||||
try:
|
||||
txt=docx2txt.process(f)
|
||||
with open(f.replace('.docx', '.txt'), 'w') as fp:
|
||||
fp.write(txt)
|
||||
with open(f.replace('.docx', '.txt'), 'wb') as fp:
|
||||
fp.write(txt.encode('ansi'))
|
||||
fp.close()
|
||||
except Exception as e:
|
||||
print(e)
|
||||
|
||||
@@ -92,7 +92,7 @@ class bananaMain(PyQt5.QtWidgets.QMainWindow):
|
||||
def fillFileList(self):
|
||||
fl = QDir(self.conf.getParam('paths','INworkPath'))
|
||||
fl.setNameFilters(['*.txt','*.TXT'])
|
||||
fl.entryList(QDir.NoDotAndDotDot | QDir.Files)
|
||||
fl = fl.entryList(QDir.NoDotAndDotDot | QDir.Files)
|
||||
self.ui.lst_files.clear()
|
||||
for f in fl:
|
||||
self.ui.lst_files.addItem(PyQt5.QtWidgets.QListWidgetItem(f))
|
||||
@@ -455,11 +455,12 @@ class bananaSelezOut(PyQt5.QtWidgets.QWidget):
|
||||
# costruisco i thread
|
||||
tDict={}
|
||||
try:
|
||||
for f in splconf['paths']['fileList']:
|
||||
for f in [splconf['paths']['fileList'][0]]:
|
||||
splconf['name']=f
|
||||
tDict[f] = bananaSPLITTER(fileParams=copy.deepcopy(splconf), logger=self.log)
|
||||
tDict[f].run()
|
||||
#tDict[f].sendStatus.connect(updateState)
|
||||
QThreadPool.globalInstance().start(tDict[f])
|
||||
#QThreadPool.globalInstance().start(tDict[f])
|
||||
except Exception as e:
|
||||
self.log.error(f"Impossibile avviare lo splitter: {e}")
|
||||
pass
|
||||
|
||||
@@ -3,19 +3,18 @@ Created on 2 nov 2019
|
||||
|
||||
@author: Emanuele Trabattoni
|
||||
'''
|
||||
from PyQt5.QtCore import QThread, QRunnable
|
||||
from PyQt5.Qt import pyqtSignal
|
||||
from PyQt5.QtCore import QRunnable, QObject, pyqtSignal
|
||||
|
||||
from slugify import slugify
|
||||
import time, parse, re, copy, os, json
|
||||
import time, parse, re, copy, os,json
|
||||
import traceback
|
||||
|
||||
class bananaSPLITTER(QRunnable):
|
||||
class bananaSPLITTER():
|
||||
|
||||
#sendStatus = pyqtSignal(str)
|
||||
|
||||
def __init__(self, fileParams=None, logger=None):
|
||||
QRunnable.__init__(self)
|
||||
#QRunnable.__init__(self)
|
||||
self.fileParams = fileParams
|
||||
self.log = logger
|
||||
self.rawFile = None
|
||||
@@ -23,7 +22,7 @@ class bananaSPLITTER(QRunnable):
|
||||
self.contentList = list()
|
||||
self.bodyCounter=0
|
||||
self.duplicateNumber=0
|
||||
#self.log.debug(f"Configurazione: \n {json.dumps(fileParams, indent=2)}")
|
||||
self.log.debug(f"Configurazione: \n {json.dumps(fileParams, indent=2)}")
|
||||
if fileParams is not None:
|
||||
self.log.info("Sto operando sul file: {}..".format(self.fileParams['name']))
|
||||
self.paths = self.fileParams['paths']
|
||||
@@ -104,7 +103,7 @@ class bananaSPLITTER(QRunnable):
|
||||
for ll in self.rawFile:
|
||||
for c in self.settings['delChars']:
|
||||
ll = ll.replace(c,'')
|
||||
if ll not in ['\n', '\r']:
|
||||
if ll not in ['\n', '\r', '\r\n']:
|
||||
tempContent.append(ll)
|
||||
self.rawFile = copy.deepcopy(tempContent)
|
||||
return True
|
||||
@@ -174,7 +173,7 @@ class bananaSPLITTER(QRunnable):
|
||||
elif self.status == 'head':
|
||||
tempContent = list()
|
||||
#doppio check per trovare l'inizio del corpo documento
|
||||
if re.match(self.docStruct['beginOfDocument'],l,re.i):
|
||||
if re.match(self.docStruct['beginOfDocument'],l):
|
||||
self.status='body'
|
||||
if lineWords[0] not in self.docStruct['language']['headWords']: #se la prima parola non e' tra quelle di inizio
|
||||
tempBody.append(l) # vuol dire che ho trovato l'articolo e aggiungo la prima riga al contenuto del documento
|
||||
|
||||
@@ -3,30 +3,16 @@ Created on 1 dic 2019
|
||||
|
||||
@author: Emanuele Trabattoni
|
||||
'''
|
||||
import os
|
||||
from glob import glob
|
||||
from copy import deepcopy
|
||||
import json
|
||||
from libsplit import bananaSPLITTER
|
||||
from libconfload import bananaCONF
|
||||
from libfancylogger import fancyLogger
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("CWD-> "+os.getcwd())
|
||||
logger = fancyLogger(fileLog = False)
|
||||
confl = bananaCONF(workdir=r"./libbananasplit", logger=logger)
|
||||
confl.open()
|
||||
|
||||
confl.use("testEN.json")
|
||||
splconf = confl.getParams("splitter")
|
||||
splist = []
|
||||
os.chdir(splconf["paths"]["INworkPath"])
|
||||
for f in glob("*.txt"):
|
||||
splconf["name"] = f
|
||||
logger.info("-"*80)
|
||||
splitter = bananaSPLITTER(fileParams=deepcopy(splconf), logger=logger)
|
||||
splist.append(splitter)
|
||||
splitter.start()
|
||||
splitter.join()
|
||||
logger = fancyLogger(filepath=r"./conf/loggerconf.json",fileLog=False)
|
||||
fp = open('./conf/conftest.json', 'r')
|
||||
splitter = bananaSPLITTER(fileParams=json.load(fp), logger=logger)
|
||||
fp.close()
|
||||
splitter.run()
|
||||
|
||||
logger.info("\n"+"="*50+"\n\tFINITO!!!\n"+"="*50)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user