non riesco a provare le modifiche, il debugger di python mi prende per
il culo
This commit is contained in:
109
bananaSPLIT/conf/conftest.json
Normal file
109
bananaSPLIT/conf/conftest.json
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
{
|
||||||
|
"version": "v1.1a",
|
||||||
|
"paths": {
|
||||||
|
"lastUsed": "defaults.json",
|
||||||
|
"configurationPath": "D:/Emanuele/Documenti/workspace/bananaSPLIT/bananaSPLIT/conf/",
|
||||||
|
"INworkPath": "D:/Emanuele/Documenti/workspace/bananaSPLIT/TestFiles/File nuovo formato",
|
||||||
|
"OUTworkPath": "H:/",
|
||||||
|
"fileList": [
|
||||||
|
"_ Women, Drugs And Depression.txt",
|
||||||
|
"_96 Election Represents Last Hurrah of Generational Politics.txt",
|
||||||
|
"_A 60_S GENERATION MADE BY MYTH-MAKERS.txt",
|
||||||
|
"_MIDDLE AGE_ NO END TO AN UPWARD CREEP.txt",
|
||||||
|
"A DEMOCRATIC SOCIETY IN NEEDOF A MILITARY.txt",
|
||||||
|
"A Dog Who Taught the Lesson of Hope.txt",
|
||||||
|
"A Fading Bohemia, Gritty but Beloved.txt"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"docStruct": {
|
||||||
|
"fileVersNew": true,
|
||||||
|
"language": {
|
||||||
|
"dateWords": [
|
||||||
|
"January",
|
||||||
|
"February",
|
||||||
|
"March",
|
||||||
|
"April",
|
||||||
|
"May",
|
||||||
|
"June",
|
||||||
|
"July",
|
||||||
|
"August",
|
||||||
|
"September",
|
||||||
|
"October",
|
||||||
|
"November",
|
||||||
|
"December"
|
||||||
|
],
|
||||||
|
"headWords": [
|
||||||
|
"BYLINE:",
|
||||||
|
"SECTION:",
|
||||||
|
"LENGTH:",
|
||||||
|
"DATELINE:",
|
||||||
|
"HIGHLIGHT:",
|
||||||
|
"Email:"
|
||||||
|
],
|
||||||
|
"tailWords": [
|
||||||
|
"Newstex ID:",
|
||||||
|
"NOTES:",
|
||||||
|
"LANGUAGE:",
|
||||||
|
"GRAPHIC:",
|
||||||
|
"TYPE:",
|
||||||
|
"URL:",
|
||||||
|
"LOAD-DATE:",
|
||||||
|
"PUBLICATION-TYPE:",
|
||||||
|
"DOCUMENT-TYPE:",
|
||||||
|
"CHARTS:",
|
||||||
|
"JOURNAL-CODE:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"docSep": "\\s*Copyright [(0-9)]+",
|
||||||
|
"beginOfDocument": "\\s*Body",
|
||||||
|
"endOfDocument": "\\s*End of Document",
|
||||||
|
"dateFormat": "{month} {day:d}, {year:d}{}",
|
||||||
|
"outPrefix": "PRE",
|
||||||
|
"outSuffix": "SUF",
|
||||||
|
"outExt": ".txt",
|
||||||
|
"outDateType": "jpn",
|
||||||
|
"outNameFormat": "PRE+{docnum}+{year:04d}{month:02d}{day:02d}+{title}+SUF.txt",
|
||||||
|
"outDate": true,
|
||||||
|
"outTitle": true,
|
||||||
|
"outNumber": true,
|
||||||
|
"numberPos": 1,
|
||||||
|
"datePos": 2,
|
||||||
|
"titlePos": 3,
|
||||||
|
"maxTitleLen": 10,
|
||||||
|
"outNameSep": "+",
|
||||||
|
"customSep": "=",
|
||||||
|
"dateFormats": {
|
||||||
|
"jpn": "{year:04d}{month:02d}{day:02d}",
|
||||||
|
"it": "{day:02d}{month:02d}{year:04d}",
|
||||||
|
"usa": "{month:02d}{year:04d}{day:02d}"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"settings": {
|
||||||
|
"encoding": "ansi",
|
||||||
|
"monthPosition": 0,
|
||||||
|
"getNewsPaperName": true,
|
||||||
|
"nameNotFoundStr": "ND",
|
||||||
|
"includeTitle": true,
|
||||||
|
"removeDuplicates": true,
|
||||||
|
"showSkipped": false,
|
||||||
|
"showRemovedDuplicates": false,
|
||||||
|
"loadTXT": true,
|
||||||
|
"loadDOCX": false,
|
||||||
|
"removeOldFiles": true,
|
||||||
|
"saveSeparateFiles": true,
|
||||||
|
"saveBodyFile": true,
|
||||||
|
"saveBodyNumber": true,
|
||||||
|
"delLF": false,
|
||||||
|
"delWordBreak": true,
|
||||||
|
"delChars": [
|
||||||
|
"'",
|
||||||
|
"@",
|
||||||
|
"#",
|
||||||
|
"$",
|
||||||
|
"%",
|
||||||
|
"^",
|
||||||
|
"&"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"name": "_ Women, Drugs And Depression.txt"
|
||||||
|
}
|
||||||
@@ -24,16 +24,15 @@
|
|||||||
"NYT 2009.txt",
|
"NYT 2009.txt",
|
||||||
"NYT 2013.txt",
|
"NYT 2013.txt",
|
||||||
"NYT 2015.txt",
|
"NYT 2015.txt",
|
||||||
"NYT 2017.txt",
|
"NYT 2017.txt"
|
||||||
"README"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"docStruct": {
|
"docStruct": {
|
||||||
"fileVersNew": true,
|
"fileVersNew": true,
|
||||||
"language": "English",
|
"language": "Italiano",
|
||||||
"docSep": "\\s*Copyright [(0-9)]+",
|
"docSep": "\\s*Copyright [(0-9)]+",
|
||||||
"beginOfDocument":"\\s*Body",
|
"beginOfDocument": "\\s*Body",
|
||||||
"endOfDocument":"\\s*End of Document",
|
"endOfDocument": "\\s*End of Document",
|
||||||
"dateFormat": "{month} {day:d}, {year:d}{}",
|
"dateFormat": "{month} {day:d}, {year:d}{}",
|
||||||
"outPrefix": "PRE",
|
"outPrefix": "PRE",
|
||||||
"outSuffix": "SUF",
|
"outSuffix": "SUF",
|
||||||
|
|||||||
@@ -12,8 +12,8 @@ if __name__ == '__main__':
|
|||||||
print(f)
|
print(f)
|
||||||
try:
|
try:
|
||||||
txt=docx2txt.process(f)
|
txt=docx2txt.process(f)
|
||||||
with open(f.replace('.docx', '.txt'), 'w') as fp:
|
with open(f.replace('.docx', '.txt'), 'wb') as fp:
|
||||||
fp.write(txt)
|
fp.write(txt.encode('ansi'))
|
||||||
fp.close()
|
fp.close()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
print(e)
|
||||||
|
|||||||
@@ -92,7 +92,7 @@ class bananaMain(PyQt5.QtWidgets.QMainWindow):
|
|||||||
def fillFileList(self):
|
def fillFileList(self):
|
||||||
fl = QDir(self.conf.getParam('paths','INworkPath'))
|
fl = QDir(self.conf.getParam('paths','INworkPath'))
|
||||||
fl.setNameFilters(['*.txt','*.TXT'])
|
fl.setNameFilters(['*.txt','*.TXT'])
|
||||||
fl.entryList(QDir.NoDotAndDotDot | QDir.Files)
|
fl = fl.entryList(QDir.NoDotAndDotDot | QDir.Files)
|
||||||
self.ui.lst_files.clear()
|
self.ui.lst_files.clear()
|
||||||
for f in fl:
|
for f in fl:
|
||||||
self.ui.lst_files.addItem(PyQt5.QtWidgets.QListWidgetItem(f))
|
self.ui.lst_files.addItem(PyQt5.QtWidgets.QListWidgetItem(f))
|
||||||
@@ -455,11 +455,12 @@ class bananaSelezOut(PyQt5.QtWidgets.QWidget):
|
|||||||
# costruisco i thread
|
# costruisco i thread
|
||||||
tDict={}
|
tDict={}
|
||||||
try:
|
try:
|
||||||
for f in splconf['paths']['fileList']:
|
for f in [splconf['paths']['fileList'][0]]:
|
||||||
splconf['name']=f
|
splconf['name']=f
|
||||||
tDict[f] = bananaSPLITTER(fileParams=copy.deepcopy(splconf), logger=self.log)
|
tDict[f] = bananaSPLITTER(fileParams=copy.deepcopy(splconf), logger=self.log)
|
||||||
|
tDict[f].run()
|
||||||
#tDict[f].sendStatus.connect(updateState)
|
#tDict[f].sendStatus.connect(updateState)
|
||||||
QThreadPool.globalInstance().start(tDict[f])
|
#QThreadPool.globalInstance().start(tDict[f])
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.log.error(f"Impossibile avviare lo splitter: {e}")
|
self.log.error(f"Impossibile avviare lo splitter: {e}")
|
||||||
pass
|
pass
|
||||||
|
|||||||
@@ -3,19 +3,18 @@ Created on 2 nov 2019
|
|||||||
|
|
||||||
@author: Emanuele Trabattoni
|
@author: Emanuele Trabattoni
|
||||||
'''
|
'''
|
||||||
from PyQt5.QtCore import QThread, QRunnable
|
from PyQt5.QtCore import QRunnable, QObject, pyqtSignal
|
||||||
from PyQt5.Qt import pyqtSignal
|
|
||||||
|
|
||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
import time, parse, re, copy, os, json
|
import time, parse, re, copy, os,json
|
||||||
import traceback
|
import traceback
|
||||||
|
|
||||||
class bananaSPLITTER(QRunnable):
|
class bananaSPLITTER():
|
||||||
|
|
||||||
#sendStatus = pyqtSignal(str)
|
#sendStatus = pyqtSignal(str)
|
||||||
|
|
||||||
def __init__(self, fileParams=None, logger=None):
|
def __init__(self, fileParams=None, logger=None):
|
||||||
QRunnable.__init__(self)
|
#QRunnable.__init__(self)
|
||||||
self.fileParams = fileParams
|
self.fileParams = fileParams
|
||||||
self.log = logger
|
self.log = logger
|
||||||
self.rawFile = None
|
self.rawFile = None
|
||||||
@@ -23,7 +22,7 @@ class bananaSPLITTER(QRunnable):
|
|||||||
self.contentList = list()
|
self.contentList = list()
|
||||||
self.bodyCounter=0
|
self.bodyCounter=0
|
||||||
self.duplicateNumber=0
|
self.duplicateNumber=0
|
||||||
#self.log.debug(f"Configurazione: \n {json.dumps(fileParams, indent=2)}")
|
self.log.debug(f"Configurazione: \n {json.dumps(fileParams, indent=2)}")
|
||||||
if fileParams is not None:
|
if fileParams is not None:
|
||||||
self.log.info("Sto operando sul file: {}..".format(self.fileParams['name']))
|
self.log.info("Sto operando sul file: {}..".format(self.fileParams['name']))
|
||||||
self.paths = self.fileParams['paths']
|
self.paths = self.fileParams['paths']
|
||||||
@@ -104,7 +103,7 @@ class bananaSPLITTER(QRunnable):
|
|||||||
for ll in self.rawFile:
|
for ll in self.rawFile:
|
||||||
for c in self.settings['delChars']:
|
for c in self.settings['delChars']:
|
||||||
ll = ll.replace(c,'')
|
ll = ll.replace(c,'')
|
||||||
if ll not in ['\n', '\r']:
|
if ll not in ['\n', '\r', '\r\n']:
|
||||||
tempContent.append(ll)
|
tempContent.append(ll)
|
||||||
self.rawFile = copy.deepcopy(tempContent)
|
self.rawFile = copy.deepcopy(tempContent)
|
||||||
return True
|
return True
|
||||||
@@ -174,7 +173,7 @@ class bananaSPLITTER(QRunnable):
|
|||||||
elif self.status == 'head':
|
elif self.status == 'head':
|
||||||
tempContent = list()
|
tempContent = list()
|
||||||
#doppio check per trovare línizio del corpo documento
|
#doppio check per trovare línizio del corpo documento
|
||||||
if re.match(self.docStruct['beginOfDocument'],l,re.i):
|
if re.match(self.docStruct['beginOfDocument'],l):
|
||||||
self.status='body'
|
self.status='body'
|
||||||
if lineWords[0] not in self.docStruct['language']['headWords']: #se la prima parola non e' tra quelle di inizio
|
if lineWords[0] not in self.docStruct['language']['headWords']: #se la prima parola non e' tra quelle di inizio
|
||||||
tempBody.append(l) # vuol dire che ho trovato l'articolo e aggiungo la prima riga al contenuto del documento
|
tempBody.append(l) # vuol dire che ho trovato l'articolo e aggiungo la prima riga al contenuto del documento
|
||||||
|
|||||||
@@ -3,30 +3,16 @@ Created on 1 dic 2019
|
|||||||
|
|
||||||
@author: Emanuele Trabattoni
|
@author: Emanuele Trabattoni
|
||||||
'''
|
'''
|
||||||
import os
|
import json
|
||||||
from glob import glob
|
|
||||||
from copy import deepcopy
|
|
||||||
from libsplit import bananaSPLITTER
|
from libsplit import bananaSPLITTER
|
||||||
from libconfload import bananaCONF
|
|
||||||
from libfancylogger import fancyLogger
|
from libfancylogger import fancyLogger
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
print("CWD-> "+os.getcwd())
|
logger = fancyLogger(filepath=r"./conf/loggerconf.json",fileLog=False)
|
||||||
logger = fancyLogger(fileLog = False)
|
fp = open('./conf/conftest.json', 'r')
|
||||||
confl = bananaCONF(workdir=r"./libbananasplit", logger=logger)
|
splitter = bananaSPLITTER(fileParams=json.load(fp), logger=logger)
|
||||||
confl.open()
|
fp.close()
|
||||||
|
splitter.run()
|
||||||
confl.use("testEN.json")
|
|
||||||
splconf = confl.getParams("splitter")
|
|
||||||
splist = []
|
|
||||||
os.chdir(splconf["paths"]["INworkPath"])
|
|
||||||
for f in glob("*.txt"):
|
|
||||||
splconf["name"] = f
|
|
||||||
logger.info("-"*80)
|
|
||||||
splitter = bananaSPLITTER(fileParams=deepcopy(splconf), logger=logger)
|
|
||||||
splist.append(splitter)
|
|
||||||
splitter.start()
|
|
||||||
splitter.join()
|
|
||||||
|
|
||||||
logger.info("\n"+"="*50+"\n\tFINITO!!!\n"+"="*50)
|
logger.info("\n"+"="*50+"\n\tFINITO!!!\n"+"="*50)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user