diff --git a/bananaSPLIT/libbananasplit/libsplit.py b/bananaSPLIT/libbananasplit/libsplit.py index 4c5d1cb..0dd08d8 100644 --- a/bananaSPLIT/libbananasplit/libsplit.py +++ b/bananaSPLIT/libbananasplit/libsplit.py @@ -23,7 +23,9 @@ class bananaSPLITTER(threading.Thread): self.docStruct = self.fileParams['docStruct'] self.settings = self.fileParams['settings'] self.fileName = self.fileParams['name'] + self.outPath = self.paths['OUTworkPath']+slugify(self.fileName) self.beginTime = time.time() + os.mkdir(self.outPath) pass else: self.log.critical("Non e' stato fornito il nome di alcun file da splittare!") @@ -31,35 +33,37 @@ class bananaSPLITTER(threading.Thread): def run(self): self.log.info("Nuovo SPLITTER su file: {}".format(self.fileName)) - self.openFile() - self.remEmptyLines() - self.splitFile() - if self.settings['removeDuplicates']: - self.log.info("Controllo se ci sono dei duplicati..") - self.removeDuplicates() - else: - for idx, ff in enumerate(self.fileList): - ff['duplicate']=False - self.fileList[idx]=ff - print('Salto il controllo dei duplicati..') - if self.settings['saveSeparateFiles']: - self.saveSeparate() - if self.settings['saveBodyFile']: - self.saveBody() + try: + self.openFile() + self.remEmptyLines() + self.splitFile() + if self.settings['removeDuplicates']: + self.log.info("Controllo se ci sono dei duplicati..") + self.removeDuplicates() + else: + for idx, ff in enumerate(self.fileList): + ff['duplicate']=False + self.fileList[idx]=ff + print('Salto il controllo dei duplicati..') + if self.settings['saveSeparateFiles']: + self.saveSeparate() + if self.settings['saveBodyFile']: + self.saveBody() + except UnicodeDecodeError as ee: + self.log.critical("Il file [{}] contiene caratteri non compatibili con la codifica scelta! [{}]" + .format(self.fileParams['name'],ee)) pass def openFile(self): try: + os.chdir(self.paths["INworkPath"]) self.log.info("Carico il contenuto..") fp = open(self.fileParams['name'], mode='r', encoding=self.settings['encoding']) self.rawFile = fp.readlines() fp.close() except IOError as e: self.log.critical("Impossibile aprire il file: {}! [{}]".format(self.fileName,e)) - raise BaseException("OpenFile") - except UnicodeDecodeError as ee: - self.log.critical("Il file [{}] contiene caratteri non compatibili con la codifica scelta! [{}]" - .format(self.fileParams['name'],ee)) + raise BaseException("OpenFile") pass def remEmptyLines(self): @@ -204,11 +208,11 @@ class bananaSPLITTER(threading.Thread): ff['duplicate'] = True self.fileList[idx]=ff self.duplicateNumber+=1 - self.log.info("Ho rimosso {} duplicati di {} articoli..\n". format(self.duplicateNumber, len(duplicateList))) + self.log.info("Ho rimosso {} duplicati di {} articoli..". format(self.duplicateNumber, len(duplicateList))) pass def saveSeparate(self): - os.mkdir(self.paths['OUTworkPath']+slugify(self.fileName)) + os.chdir(self.outPath) self.paths['OUTworkPath']=self.paths['OUTworkPath']+slugify(self.fileName) self.log.info("Salvo gli articoli in file separati...") self.log.debug("Persorso: {0}".format(self.paths['OUTworkPath'].format('nomeFile'))) @@ -220,7 +224,7 @@ class bananaSPLITTER(threading.Thread): docnum=self.bodyCounter,\ papername=ff['newsPaperName'].strip(),\ **ff['date']) - out=open(self.paths['OUTworkPath']+'{0}'.format(fName),'wb') + out=open('{0}'.format(fName),'wb') if self.settings['includeTitle']: ff['content'] = ff['title']+os.linesep+ff['content'] out.write(ff['content'].encode(self.settings['encoding'])) @@ -232,8 +236,9 @@ class bananaSPLITTER(threading.Thread): pass def saveBody(self): - print('Salvo gli articoli in un singolo file vicino agli originali...') - print ('Persorso: {0}'.format(self.paths['OUTworkPath'].format('nomeFile'))) + self.log.info('Salvo gli articoli in un singolo file vicino agli originali...') + os.chdir(self.outPath) + print ('Persorso: {0}'.format(self.outPath)) try: fName=slugify(self.fileName) fName='BODYFILE_{0}.txt'.format(fName[:self.settings['maxTitleLen']]) diff --git a/bananaSPLIT/libbananasplit/testEN.json b/bananaSPLIT/libbananasplit/testEN.json index aa500ed..049bb91 100644 --- a/bananaSPLIT/libbananasplit/testEN.json +++ b/bananaSPLIT/libbananasplit/testEN.json @@ -9,7 +9,7 @@ "name": "", "paths": { "INworkPath": "D:\\Emanuele\\Documenti\\workspace\\bananaSPLIT\\TestFiles\\", - "OUTworkPath": "D:\\Test\\Separati\\", + "OUTworkPath": "H:\\", "OUTnameFormat": "TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt" }, "docStruct": { diff --git a/bananaSPLIT/libbananasplit/testITA.json b/bananaSPLIT/libbananasplit/testITA.json index 873a3a6..2829cca 100644 --- a/bananaSPLIT/libbananasplit/testITA.json +++ b/bananaSPLIT/libbananasplit/testITA.json @@ -9,7 +9,7 @@ "name": "", "paths": { "INworkPath": "D:\\Emanuele\\Documenti\\workspace\\bananaSPLIT\\TestFiles\\", - "OUTworkPath": "D:\\Test\\Separati\\", + "OUTworkPath": "H:\\", "OUTnameFormat": "TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt" }, "docStruct": { diff --git a/bananaSPLIT/libtestmain.py b/bananaSPLIT/libtestmain.py index 34e5b98..8b0d7fb 100644 --- a/bananaSPLIT/libtestmain.py +++ b/bananaSPLIT/libtestmain.py @@ -5,6 +5,7 @@ Created on 1 dic 2019 ''' import os from glob import glob +from copy import deepcopy from libsplit import bananaSPLITTER from libconfload import bananaCONF from libfancylogger import fancyLogger @@ -16,16 +17,18 @@ confl.open() confl.use("testEN.json") splconf = confl.getParams("splitter") -for f in glob(splconf["paths"]["INworkPath"]+"*.txt"): +splist = [] +os.chdir(splconf["paths"]["INworkPath"]) +for f in glob("*.txt"): splconf["name"] = f - splitter = bananaSPLITTER(fileParams=splconf, logger=logger) - splitter.openFile() - splitter.remEmptyLines() - splitter.splitFile() - splitter.removeDuplicates() - splitter.saveBody() - splitter.saveSeparate() - + logger.info("-"*80) + splitter = bananaSPLITTER(fileParams=deepcopy(splconf), logger=logger) + splist.append(splitter) + splitter.start() + splitter.join() + del splitter + +logger.info("\n"+"="*50+"\n\tFINITO!!!\n"+"="*50)