diff --git a/bananaSPLIT/libbananasplit/libsplit.py b/bananaSPLIT/libbananasplit/libsplit.py index 34e6a8d..b521259 100644 --- a/bananaSPLIT/libbananasplit/libsplit.py +++ b/bananaSPLIT/libbananasplit/libsplit.py @@ -52,6 +52,8 @@ class bananaSPLITTER(threading.Thread): self.log.critical("Il file [{}] contiene caratteri non compatibili con la codifica scelta! [{}]" .format(self.fileParams['name'],ee)) os.rmdir(self.outPath) + except BaseException as ee: + self.log.critical(ee) pass def openFile(self): @@ -85,6 +87,8 @@ class bananaSPLITTER(threading.Thread): def splitFile(self): #porting del codice dal programma originale self.log.info("Individuo il contenuto..") + self.bodyCounter=0 + self.duplicateNumber=0 docNumber = 0 docSkipped = 0 docDate = {} @@ -179,6 +183,7 @@ class bananaSPLITTER(threading.Thread): newDoc['content']=copy.deepcopy(''.join(tempBody)) self.fileList.append(copy.deepcopy(newDoc)) tempBody=list() + self.log.info(self.fileName + " - {}".format(self.bodyCounter)) self.bodyCounter +=1 pass else: @@ -214,6 +219,7 @@ class bananaSPLITTER(threading.Thread): def saveSeparate(self): os.chdir(self.outPath) + outFileCounter = 0 self.paths['OUTworkPath']=self.paths['OUTworkPath']+slugify(self.fileName) self.log.info("Salvo gli articoli in file separati...") self.log.debug("Persorso: {0}".format(self.paths['OUTworkPath'].format('nomeFile'))) @@ -222,7 +228,7 @@ class bananaSPLITTER(threading.Thread): if ff['duplicate'] == False: fName=self.paths['OUTnameFormat'].format(title=slugify(ff['title'][:self.settings['maxTitleLen']]),\ filename=slugify(self.fileName),\ - docnum=self.bodyCounter,\ + docnum=outFileCounter,\ papername=ff['newsPaperName'].strip(),\ **ff['date']) out=open('{0}'.format(fName),'wb') @@ -230,10 +236,12 @@ class bananaSPLITTER(threading.Thread): ff['content'] = ff['title']+os.linesep+ff['content'] out.write(ff['content'].encode(self.settings['encoding'])) out.close() - self.bodyCounter+=1 + outFileCounter+=1 except IOError as e: self.log.error("Qualcosa e\' andato storto, non riesco a scrivere il file: {}".format(e)) continue + if outFileCounter < self.bodyCounter: + raise BaseException("Ho salvato meno file rispetto a quelli trovati!") pass def saveBody(self): diff --git a/bananaSPLIT/libbananasplit/testEN.json b/bananaSPLIT/libbananasplit/testEN.json index 7c3c712..b3de8d2 100644 --- a/bananaSPLIT/libbananasplit/testEN.json +++ b/bananaSPLIT/libbananasplit/testEN.json @@ -57,9 +57,9 @@ "getNewsPaperName": true, "nameNotFoundStr": "ND", "includeTitle": true, - "removeDuplicates": true, + "removeDuplicates": false, "showSkipped": true, - "showRemovedDuplicates": true, + "showRemovedDuplicates": false, "maxTitleLen": 32, "loadTXT": true, "loadDOCX": false,