From 04a32c7bd148aaba55eb533cf5e5fd63e56d3514 Mon Sep 17 00:00:00 2001 From: Emanuele Date: Sun, 24 Nov 2019 12:32:23 +0100 Subject: [PATCH] weaponizzate le funzioni di salva file singolo e body --- bananaSPLIT/libbabanasplit/libsplit.py | 71 +++++++++++++++++--- bananaSPLIT/libbabanasplit/testSettings.json | 10 +-- 2 files changed, 67 insertions(+), 14 deletions(-) diff --git a/bananaSPLIT/libbabanasplit/libsplit.py b/bananaSPLIT/libbabanasplit/libsplit.py index 7ba750b..1ed46fd 100644 --- a/bananaSPLIT/libbabanasplit/libsplit.py +++ b/bananaSPLIT/libbabanasplit/libsplit.py @@ -4,7 +4,7 @@ Created on 2 nov 2019 @author: Emanuele Trabattoni ''' from libbabanasplit.libfancylogger import fancyLogger -import threading, time, sys, parse, re, copy +import threading, time, parse, re, copy, slugify, os class bananaSPLITTER(threading.Thread): @@ -14,9 +14,10 @@ class bananaSPLITTER(threading.Thread): self.rawFile = None self.status = "first" self.fileList = list() - + self.bodyCounter=0 if fileParams is not None: self.log.info("Sto operando sul file: {}..".format(self.fileParams['name'])) + self.paths = self.fileParams['paths'] self.docStruct = self.fileParams['docStruct'] self.settings = self.fileParams['settings'] self.fileName = self.fileParams['name'] @@ -38,8 +39,7 @@ class bananaSPLITTER(threading.Thread): self.rawFile = fp.readlines() fp.close() except IOError as e: - self.log.critical("Impossibile aprire il file: {}!". - format(self.fileName)) + self.log.critical("Impossibile aprire il file: {}! [{}]".format(self.fileName,e)) raise BaseException("OpenFile") pass @@ -63,7 +63,6 @@ class bananaSPLITTER(threading.Thread): self.log.info("Individuo il contenuto..") docNumber = 0 docSkipped = 0 - bodyCounter = 0 docDate = {} prevLine = '' newsPaperName = '' @@ -155,22 +154,76 @@ class bananaSPLITTER(threading.Thread): newDoc['content']=copy.deepcopy(''.join(tempBody)) self.fileList.append(copy.deepcopy(newDoc)) tempBody=list() - bodyCounter +=1 + self.bodyCounter +=1 pass else: self.log.critical("Stato Interno Sconosciuto") prevLine=l #salva sempre e comunque il contenuto della linea precedente pass #ricerca terminata, espongo i risultati - self.log.info("Nel file ho trovato {0} articoli..".format(bodyCounter)) + self.log.info("Nel file ho trovato {0} articoli..".format(self.bodyCounter)) if docSkipped > 0: - self.log.warning('Attentione, LexisNexis ne ha saltati {} !!!'.format(docSkipped)) + self.log.warning("Attentione, LexisNexis ne ha saltati {} !!!".format(docSkipped)) + pass pass - + + def removeDuplicates(self): + self.log.info("Controllo se ci sono dei duplicati..") + titleList=[] + duplicateList=[] + duplicateNumber=0 + for idx, ff in enumerate(self.fileList): + if ff['title'] not in titleList: + titleList.append(ff['title']) + ff['duplicate']=False + self.fileList[idx]=ff + pass + else: + if ff['title'] not in duplicateList: + duplicateList.append(ff['title']) + if self.settings['showRemovedDuplicates']: + self.log.info("Duplicato: {}".format(ff['title'].strip())) + ff['duplicate'] = True + self.fileList[idx]=ff + duplicateNumber+=1 + self.log.info("Ho rimosso {} duplicati di {} articoli..\n". format(duplicateNumber, len(duplicateList))) + pass + def saveSeparate(self): + self.log.info("Salvo gli articoli in file separati...") + self.log.debug("Persorso: {0}".format(self.paths['OUTworkPath'].format('nomeFile'))) + for ff in self.fileList: + try: + if ff['duplicate'] == False: + fName=self.paths['OUTnameFormat'].format(title=slugify(ff['title'][:self.settings['maxTitleLen']]),\ + filename=slugify(self.fileName),\ + docnum=self.bodyCounter,\ + papername=ff['newsPaperName'].strip(),\ + **ff['date']) + out=open(self.paths['OUTworkPath']+'{0}'.format(fName),'wb') + if self.settings['includeTitle']: + ff['content'] = ff['title']+os.linesep+ff['content'] + out.write(ff['content'].encode(self.settings['encoding'])) + out.close() + self.bodyCounter+=1 + except IOError as e: + self.log.error("Qualcosa e\' andato storto, non riesco a scrivere il file: {}".format(e)) + continue pass def saveBody(self): + print('Salvo gli articoli in un singolo file vicino agli originali...') + print ('Persorso: {0}'.format(self.paths['OUTworkPath'].format('nomeFile'))) + try: + fName=slugify(self.fileName) + fName='BODYFILE_{0}_{1}.txt'.format(self.fileCounter,fName[:self.settings['maxTitleLen']]) + fileContent = os.linesep.join([cc['content'] for cc in self.fileList]) + out=open(self.paths['OUTworkPath']+'{0}'.format(fName),'wb') + out.write(fileContent.encode(self.settings['encoding'])) + out.close() + except IOError as e: + print("OOPS! Qualcosa e\' andato storto, non riesco a scrivere il file: {}".format(e)) + continue pass if __name__ == "__main__": diff --git a/bananaSPLIT/libbabanasplit/testSettings.json b/bananaSPLIT/libbabanasplit/testSettings.json index c8f9f4e..de830a2 100644 --- a/bananaSPLIT/libbabanasplit/testSettings.json +++ b/bananaSPLIT/libbabanasplit/testSettings.json @@ -1,16 +1,16 @@ { "version": "v1.1a", - "global": { - "INworkPath": "D:\\Test\\", - "OUTworkPath": "D:\\Test\\Separati\\" - }, "logger": { "logFile": "D:\\Test\\bananaSPLIT.log", "logFormat": "%(asctime)s|%(levelname)-8s| %(message)-50s", "logTimeFormat": "%m-%d %H:%M:%S" }, "splitter": { - "OUTnameFormat": "TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt", + "paths": { + "INworkPath": "D:\\Test\\", + "OUTworkPath": "D:\\Test\\Separati\\", + "OUTnameFormat": "TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt" + }, "docStruct": { "docSep": "\\s*Copyright [(0-9)]+", "dateFormat": "{month} {day:d}, {year:d}{}",