14 Commits

Author SHA1 Message Date
cf173843d2 prima del checkout di gui-devel 2019-12-13 15:31:15 +01:00
d8808ddf5b funziona la pulizia automatica delle cartelle prima del salvataggio,
credo sia pronto per il merge
2019-12-13 15:27:31 +01:00
b5ee32e924 primo svuotamento automatico della cartella di uscita 2019-12-13 14:39:45 +01:00
b69463a303 eliminato log inutile del numero file 2019-12-13 14:28:12 +01:00
82710a73a3 corretta numerazione sequenziale dei file 2019-12-13 14:26:07 +01:00
275101eed3 Il lancio di tanti thread concorrenti fa casino, il numero incrementale
del file non viene resettato per ogni thread (inspiegabile)
2019-12-13 12:34:02 +01:00
3826c7d2d1 Ritorno a threading, il multiprocess vero per ora e' troppo difficile da
gestire dal punto di vista del logging
2019-12-13 12:18:46 +01:00
377251b59f aggiornato gitignore 2019-12-05 23:10:00 +01:00
428946e39a altre modifiche per uso del modulo multiprocessing, rimane il problema
del logging dai thread
2019-12-05 23:09:08 +01:00
c5662c8397 uso di multiprocessing al posto di threading per una concorrenza vera
serve? da decidere
2019-12-05 22:55:49 +01:00
e3307d8db5 correzioni procedura salvataggio file e cartelle 2019-12-05 20:26:50 +01:00
01dd92e4da file separati vengono isolati in cartelle differenti 2019-12-02 20:12:00 +01:00
8628c3dbfb risolti i problemi di parsing su file di esempio semplificato 2019-12-01 18:29:09 +01:00
d69a3d0628 semplificato file di test 2019-12-01 17:54:49 +01:00
7 changed files with 91 additions and 12064 deletions

1
.gitignore vendored
View File

@@ -3,3 +3,4 @@
/.pydevproject /.pydevproject
/org.eclipse.core.resources.prefs /org.eclipse.core.resources.prefs
bananaSPLIT/build bananaSPLIT/build
/TestFiles/

File diff suppressed because it is too large Load Diff

View File

@@ -1 +1,2 @@
*.py[ocd] *.py[ocd]
/bananaSPLIT.exe.spec

View File

@@ -3,9 +3,8 @@ Created on 2 nov 2019
@author: Emanuele Trabattoni @author: Emanuele Trabattoni
''' '''
from libfancylogger import fancyLogger
from slugify.slugify import slugify from slugify.slugify import slugify
import threading, time, parse, re, copy, os import time, parse, re, copy, os, threading
class bananaSPLITTER(threading.Thread): class bananaSPLITTER(threading.Thread):
def __init__(self, fileParams=None, logger=None): def __init__(self, fileParams=None, logger=None):
@@ -23,6 +22,7 @@ class bananaSPLITTER(threading.Thread):
self.docStruct = self.fileParams['docStruct'] self.docStruct = self.fileParams['docStruct']
self.settings = self.fileParams['settings'] self.settings = self.fileParams['settings']
self.fileName = self.fileParams['name'] self.fileName = self.fileParams['name']
self.outPath = self.paths['OUTworkPath']+slugify(self.fileName)
self.beginTime = time.time() self.beginTime = time.time()
pass pass
else: else:
@@ -31,9 +31,11 @@ class bananaSPLITTER(threading.Thread):
def run(self): def run(self):
self.log.info("Nuovo SPLITTER su file: {}".format(self.fileName)) self.log.info("Nuovo SPLITTER su file: {}".format(self.fileName))
try:
self.openFile() self.openFile()
self.remEmptyLines() self.remEmptyLines()
self.splitFile() self.splitFile()
if self.settings['removeDuplicates']: if self.settings['removeDuplicates']:
self.log.info("Controllo se ci sono dei duplicati..") self.log.info("Controllo se ci sono dei duplicati..")
self.removeDuplicates() self.removeDuplicates()
@@ -41,15 +43,37 @@ class bananaSPLITTER(threading.Thread):
for idx, ff in enumerate(self.fileList): for idx, ff in enumerate(self.fileList):
ff['duplicate']=False ff['duplicate']=False
self.fileList[idx]=ff self.fileList[idx]=ff
print('Salto il controllo dei duplicati..') self.log.warn('Salto il controllo dei duplicati..')
# se il parse e la rimozione dei duplicati e' andata bene
# preparo e inizio il salvataggio
if os.path.exists(self.outPath):
if self.settings['removeOldFiles']:
os.chdir(self.outPath)
for f in os.listdir(self.outPath):
os.remove(f)
else:
raise FileExistsError("Non posso sovrascrivere i vecchi file, eliminali manualmente!")
else:
os.mkdir(self.outPath)
os.chdir(self.outPath)
if self.settings['saveSeparateFiles']: if self.settings['saveSeparateFiles']:
self.saveSeparate() self.saveSeparate()
if self.settings['saveBodyFile']: if self.settings['saveBodyFile']:
self.saveBody() self.saveBody()
self.log.info("L'elaborazione del file ha richiesto {:4.2f} sec".format(time.time()-self.beginTime))
except UnicodeDecodeError as ee:
self.log.critical("Il file [{}] contiene caratteri non compatibili con la codifica scelta! [{}]"
.format(self.fileParams['name'],ee))
except FileExistsError as fe:
self.log.critical(fe)
except BaseException as ee:
self.log.warning(ee)
pass pass
def openFile(self): def openFile(self):
try: try:
os.chdir(self.paths["INworkPath"])
self.log.info("Carico il contenuto..") self.log.info("Carico il contenuto..")
fp = open(self.fileParams['name'], mode='r', encoding=self.settings['encoding']) fp = open(self.fileParams['name'], mode='r', encoding=self.settings['encoding'])
self.rawFile = fp.readlines() self.rawFile = fp.readlines()
@@ -57,6 +81,7 @@ class bananaSPLITTER(threading.Thread):
except IOError as e: except IOError as e:
self.log.critical("Impossibile aprire il file: {}! [{}]".format(self.fileName,e)) self.log.critical("Impossibile aprire il file: {}! [{}]".format(self.fileName,e))
raise BaseException("OpenFile") raise BaseException("OpenFile")
os.rmdir(self.outPath)
pass pass
def remEmptyLines(self): def remEmptyLines(self):
@@ -77,6 +102,8 @@ class bananaSPLITTER(threading.Thread):
def splitFile(self): #porting del codice dal programma originale def splitFile(self): #porting del codice dal programma originale
self.log.info("Individuo il contenuto..") self.log.info("Individuo il contenuto..")
self.bodyCounter=0
self.duplicateNumber=0
docNumber = 0 docNumber = 0
docSkipped = 0 docSkipped = 0
docDate = {} docDate = {}
@@ -107,7 +134,7 @@ class bananaSPLITTER(threading.Thread):
# ricerco la data # ricerco la data
if (lineWords[self.settings['monthPosition']]).capitalize() in self.docStruct['dateWords']: if (lineWords[self.settings['monthPosition']]).capitalize() in self.docStruct['dateWords']:
try: try:
docDate=parse.parse(self.docParams['dateFormat'],l).named docDate=parse.parse(self.docStruct['dateFormat'],l).named
docDate['month']=docDate['month'].lstrip().rstrip().capitalize() docDate['month']=docDate['month'].lstrip().rstrip().capitalize()
docDate['month']=self.docStruct['dateWords'].index(docDate['month'])+1 docDate['month']=self.docStruct['dateWords'].index(docDate['month'])+1
title = '' title = ''
@@ -121,11 +148,11 @@ class bananaSPLITTER(threading.Thread):
newsPaperName = self.settings['nameNotFoundStr'] newsPaperName = self.settings['nameNotFoundStr']
except: except:
self.log.warn("E' successo qualcosa mentre stavo cercando il nome della pubblicazione,\ self.log.warn("E' successo qualcosa mentre stavo cercando il nome della pubblicazione,\
controlla i file di uscita! \n\t[{}]".format(prevLine.strip())) controlla i file di uscita! [{}]".format(prevLine.strip()))
else: else:
newsPaperName = self.settings['nameNotFoundStr'] newsPaperName = self.settings['nameNotFoundStr']
except: except:
self.log.warn("Ho trovato una riga ambigua.. potrebbe essere una data ma non so: \n\t[{}]". format(l.strip('\r\n'))) self.log.warn("Ho trovato una riga ambigua.. potrebbe essere una data ma non so: [{}]". format(l.strip('\r\n')))
pass pass
elif lineWords[0] in self.docStruct['headWords']: elif lineWords[0] in self.docStruct['headWords']:
#cambio stato e inizializzo un nuovo documento da riempire #cambio stato e inizializzo un nuovo documento da riempire
@@ -201,10 +228,12 @@ class bananaSPLITTER(threading.Thread):
ff['duplicate'] = True ff['duplicate'] = True
self.fileList[idx]=ff self.fileList[idx]=ff
self.duplicateNumber+=1 self.duplicateNumber+=1
self.log.info("Ho rimosso {} duplicati di {} articoli..\n". format(self.duplicateNumber, len(duplicateList))) self.log.info("Ho rimosso {} duplicati di {} articoli..". format(self.duplicateNumber, len(duplicateList)))
pass pass
def saveSeparate(self): def saveSeparate(self):
outFileCounter = 0
self.paths['OUTworkPath']=self.paths['OUTworkPath']+slugify(self.fileName)
self.log.info("Salvo gli articoli in file separati...") self.log.info("Salvo gli articoli in file separati...")
self.log.debug("Persorso: {0}".format(self.paths['OUTworkPath'].format('nomeFile'))) self.log.debug("Persorso: {0}".format(self.paths['OUTworkPath'].format('nomeFile')))
for ff in self.fileList: for ff in self.fileList:
@@ -212,36 +241,36 @@ class bananaSPLITTER(threading.Thread):
if ff['duplicate'] == False: if ff['duplicate'] == False:
fName=self.paths['OUTnameFormat'].format(title=slugify(ff['title'][:self.settings['maxTitleLen']]),\ fName=self.paths['OUTnameFormat'].format(title=slugify(ff['title'][:self.settings['maxTitleLen']]),\
filename=slugify(self.fileName),\ filename=slugify(self.fileName),\
docnum=self.bodyCounter,\ docnum=outFileCounter,\
papername=ff['newsPaperName'].strip(),\ papername=ff['newsPaperName'].strip(),\
**ff['date']) **ff['date'])
out=open(self.paths['OUTworkPath']+'{0}'.format(fName),'wb') out=open('{0}'.format(fName),'wb')
if self.settings['includeTitle']: if self.settings['includeTitle']:
ff['content'] = ff['title']+os.linesep+ff['content'] ff['content'] = ff['title']+os.linesep+ff['content']
out.write(ff['content'].encode(self.settings['encoding'])) out.write(ff['content'].encode(self.settings['encoding']))
out.close() out.close()
self.bodyCounter+=1 outFileCounter+=1
except IOError as e: except IOError as e:
self.log.error("Qualcosa e\' andato storto, non riesco a scrivere il file: {}".format(e)) self.log.error("Qualcosa e\' andato storto, non riesco a scrivere il file: {}".format(e))
continue continue
if outFileCounter < self.bodyCounter:
raise BaseException("Ho salvato meno file rispetto a quelli trovati!")
pass pass
def saveBody(self): def saveBody(self):
print('Salvo gli articoli in un singolo file vicino agli originali...') self.log.info('Salvo gli articoli in un singolo file vicino agli originali...')
print ('Persorso: {0}'.format(self.paths['OUTworkPath'].format('nomeFile'))) self.log.debug('Persorso: {0}'.format(self.outPath))
os.chdir(self.outPath)
try: try:
fName=slugify(self.fileName) fName=slugify(self.fileName)
fName='BODYFILE_{0}_{1}.txt'.format(self.fileCounter,fName[:self.settings['maxTitleLen']]) fName='BODYFILE_{0}.txt'.format(fName[:self.settings['maxTitleLen']])
fileContent = os.linesep.join([cc['content'] for cc in self.fileList]) fileContent = os.linesep.join([cc['content'] for cc in self.fileList])
out=open(self.paths['OUTworkPath']+'{0}'.format(fName),'wb') out=open('{0}'.format(fName),'wb')
out.write(fileContent.encode(self.settings['encoding'])) out.write(fileContent.encode(self.settings['encoding']))
out.close() out.close()
except IOError as e: except IOError as e:
print("OOPS! Qualcosa e\' andato storto, non riesco a scrivere il file: {}".format(e)) self.log.error("Qualcosa e\' andato storto, non riesco a scrivere il file: {}".format(e))
pass pass
if __name__ == "__main__":
logg = fancyLogger(name="LibSplit")
spp = bananaSPLITTER(fileParams="testfile.txt", logger=logg)

View File

@@ -9,7 +9,7 @@
"name": "", "name": "",
"paths": { "paths": {
"INworkPath": "D:\\Emanuele\\Documenti\\workspace\\bananaSPLIT\\TestFiles\\", "INworkPath": "D:\\Emanuele\\Documenti\\workspace\\bananaSPLIT\\TestFiles\\",
"OUTworkPath": "D:\\Test\\Separati\\", "OUTworkPath": "H:\\",
"OUTnameFormat": "TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt" "OUTnameFormat": "TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt"
}, },
"docStruct": { "docStruct": {
@@ -58,8 +58,8 @@
"nameNotFoundStr": "ND", "nameNotFoundStr": "ND",
"includeTitle": true, "includeTitle": true,
"removeDuplicates": true, "removeDuplicates": true,
"showSkipped": false, "showSkipped": true,
"showRemovedDuplicates": true, "showRemovedDuplicates": false,
"maxTitleLen": 32, "maxTitleLen": 32,
"loadTXT": true, "loadTXT": true,
"loadDOCX": false, "loadDOCX": false,

View File

@@ -9,7 +9,7 @@
"name": "", "name": "",
"paths": { "paths": {
"INworkPath": "D:\\Emanuele\\Documenti\\workspace\\bananaSPLIT\\TestFiles\\", "INworkPath": "D:\\Emanuele\\Documenti\\workspace\\bananaSPLIT\\TestFiles\\",
"OUTworkPath": "D:\\Test\\Separati\\", "OUTworkPath": "H:\\",
"OUTnameFormat": "TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt" "OUTnameFormat": "TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt"
}, },
"docStruct": { "docStruct": {

View File

@@ -4,23 +4,31 @@ Created on 1 dic 2019
@author: Emanuele Trabattoni @author: Emanuele Trabattoni
''' '''
import os import os
from glob import glob
from copy import deepcopy
from libsplit import bananaSPLITTER from libsplit import bananaSPLITTER
from libconfload import bananaCONF from libconfload import bananaCONF
from libfancylogger import fancyLogger from libfancylogger import fancyLogger
print("CWD-> "+os.getcwd()) if __name__ == "__main__":
logger = fancyLogger(fileLog = False) print("CWD-> "+os.getcwd())
confl = bananaCONF(workdir=r"./libbananasplit", logger=logger) logger = fancyLogger(fileLog = False)
confl.open() confl = bananaCONF(workdir=r"./libbananasplit", logger=logger)
confl.open()
confl.use("testEN.json")
splconf = confl.getParams("splitter")
splconf["name"] = splconf["paths"]["INworkPath"]+"GUARDIAN 1989.txt"
splitter = bananaSPLITTER(fileParams=splconf, logger=logger)
splitter.start()
splitter.join()
confl.use("testEN.json")
splconf = confl.getParams("splitter")
splist = []
os.chdir(splconf["paths"]["INworkPath"])
for f in glob("*.txt"):
splconf["name"] = f
logger.info("-"*80)
splitter = bananaSPLITTER(fileParams=deepcopy(splconf), logger=logger)
splist.append(splitter)
splitter.start()
splitter.join()
logger.info("\n"+"="*50+"\n\tFINITO!!!\n"+"="*50)