Correzioni alla procedura di salvataggio di file e cartelle

This commit is contained in:
2019-12-05 20:26:50 +01:00
parent 01dd92e4da
commit e3307d8db5
4 changed files with 43 additions and 35 deletions

View File

@@ -23,7 +23,9 @@ class bananaSPLITTER(threading.Thread):
self.docStruct = self.fileParams['docStruct']
self.settings = self.fileParams['settings']
self.fileName = self.fileParams['name']
self.outPath = self.paths['OUTworkPath']+slugify(self.fileName)
self.beginTime = time.time()
os.mkdir(self.outPath)
pass
else:
self.log.critical("Non e' stato fornito il nome di alcun file da splittare!")
@@ -31,35 +33,37 @@ class bananaSPLITTER(threading.Thread):
def run(self):
self.log.info("Nuovo SPLITTER su file: {}".format(self.fileName))
self.openFile()
self.remEmptyLines()
self.splitFile()
if self.settings['removeDuplicates']:
self.log.info("Controllo se ci sono dei duplicati..")
self.removeDuplicates()
else:
for idx, ff in enumerate(self.fileList):
ff['duplicate']=False
self.fileList[idx]=ff
print('Salto il controllo dei duplicati..')
if self.settings['saveSeparateFiles']:
self.saveSeparate()
if self.settings['saveBodyFile']:
self.saveBody()
try:
self.openFile()
self.remEmptyLines()
self.splitFile()
if self.settings['removeDuplicates']:
self.log.info("Controllo se ci sono dei duplicati..")
self.removeDuplicates()
else:
for idx, ff in enumerate(self.fileList):
ff['duplicate']=False
self.fileList[idx]=ff
print('Salto il controllo dei duplicati..')
if self.settings['saveSeparateFiles']:
self.saveSeparate()
if self.settings['saveBodyFile']:
self.saveBody()
except UnicodeDecodeError as ee:
self.log.critical("Il file [{}] contiene caratteri non compatibili con la codifica scelta! [{}]"
.format(self.fileParams['name'],ee))
pass
def openFile(self):
try:
os.chdir(self.paths["INworkPath"])
self.log.info("Carico il contenuto..")
fp = open(self.fileParams['name'], mode='r', encoding=self.settings['encoding'])
self.rawFile = fp.readlines()
fp.close()
except IOError as e:
self.log.critical("Impossibile aprire il file: {}! [{}]".format(self.fileName,e))
raise BaseException("OpenFile")
except UnicodeDecodeError as ee:
self.log.critical("Il file [{}] contiene caratteri non compatibili con la codifica scelta! [{}]"
.format(self.fileParams['name'],ee))
raise BaseException("OpenFile")
pass
def remEmptyLines(self):
@@ -204,11 +208,11 @@ class bananaSPLITTER(threading.Thread):
ff['duplicate'] = True
self.fileList[idx]=ff
self.duplicateNumber+=1
self.log.info("Ho rimosso {} duplicati di {} articoli..\n". format(self.duplicateNumber, len(duplicateList)))
self.log.info("Ho rimosso {} duplicati di {} articoli..". format(self.duplicateNumber, len(duplicateList)))
pass
def saveSeparate(self):
os.mkdir(self.paths['OUTworkPath']+slugify(self.fileName))
os.chdir(self.outPath)
self.paths['OUTworkPath']=self.paths['OUTworkPath']+slugify(self.fileName)
self.log.info("Salvo gli articoli in file separati...")
self.log.debug("Persorso: {0}".format(self.paths['OUTworkPath'].format('nomeFile')))
@@ -220,7 +224,7 @@ class bananaSPLITTER(threading.Thread):
docnum=self.bodyCounter,\
papername=ff['newsPaperName'].strip(),\
**ff['date'])
out=open(self.paths['OUTworkPath']+'{0}'.format(fName),'wb')
out=open('{0}'.format(fName),'wb')
if self.settings['includeTitle']:
ff['content'] = ff['title']+os.linesep+ff['content']
out.write(ff['content'].encode(self.settings['encoding']))
@@ -232,8 +236,9 @@ class bananaSPLITTER(threading.Thread):
pass
def saveBody(self):
print('Salvo gli articoli in un singolo file vicino agli originali...')
print ('Persorso: {0}'.format(self.paths['OUTworkPath'].format('nomeFile')))
self.log.info('Salvo gli articoli in un singolo file vicino agli originali...')
os.chdir(self.outPath)
print ('Persorso: {0}'.format(self.outPath))
try:
fName=slugify(self.fileName)
fName='BODYFILE_{0}.txt'.format(fName[:self.settings['maxTitleLen']])

View File

@@ -9,7 +9,7 @@
"name": "",
"paths": {
"INworkPath": "D:\\Emanuele\\Documenti\\workspace\\bananaSPLIT\\TestFiles\\",
"OUTworkPath": "D:\\Test\\Separati\\",
"OUTworkPath": "H:\\",
"OUTnameFormat": "TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt"
},
"docStruct": {

View File

@@ -9,7 +9,7 @@
"name": "",
"paths": {
"INworkPath": "D:\\Emanuele\\Documenti\\workspace\\bananaSPLIT\\TestFiles\\",
"OUTworkPath": "D:\\Test\\Separati\\",
"OUTworkPath": "H:\\",
"OUTnameFormat": "TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt"
},
"docStruct": {

View File

@@ -5,6 +5,7 @@ Created on 1 dic 2019
'''
import os
from glob import glob
from copy import deepcopy
from libsplit import bananaSPLITTER
from libconfload import bananaCONF
from libfancylogger import fancyLogger
@@ -16,16 +17,18 @@ confl.open()
confl.use("testEN.json")
splconf = confl.getParams("splitter")
for f in glob(splconf["paths"]["INworkPath"]+"*.txt"):
splist = []
os.chdir(splconf["paths"]["INworkPath"])
for f in glob("*.txt"):
splconf["name"] = f
splitter = bananaSPLITTER(fileParams=splconf, logger=logger)
splitter.openFile()
splitter.remEmptyLines()
splitter.splitFile()
splitter.removeDuplicates()
splitter.saveBody()
splitter.saveSeparate()
logger.info("-"*80)
splitter = bananaSPLITTER(fileParams=deepcopy(splconf), logger=logger)
splist.append(splitter)
splitter.start()
splitter.join()
del splitter
logger.info("\n"+"="*50+"\n\tFINITO!!!\n"+"="*50)