correzioni procedura salvataggio file e cartelle
This commit is contained in:
@@ -23,7 +23,9 @@ class bananaSPLITTER(threading.Thread):
|
||||
self.docStruct = self.fileParams['docStruct']
|
||||
self.settings = self.fileParams['settings']
|
||||
self.fileName = self.fileParams['name']
|
||||
self.outPath = self.paths['OUTworkPath']+slugify(self.fileName)
|
||||
self.beginTime = time.time()
|
||||
os.mkdir(self.outPath)
|
||||
pass
|
||||
else:
|
||||
self.log.critical("Non e' stato fornito il nome di alcun file da splittare!")
|
||||
@@ -31,35 +33,37 @@ class bananaSPLITTER(threading.Thread):
|
||||
|
||||
def run(self):
    """Run the split pipeline for ``self.fileName`` exactly once.

    Stages, in order: ``openFile`` -> ``remEmptyLines`` -> ``splitFile`` ->
    duplicate handling -> optional ``saveSeparate`` / ``saveBody``.  Which
    optional stages run is decided by ``self.settings``:

    * ``removeDuplicates``  - call ``removeDuplicates()``; otherwise every
      entry of ``self.fileList`` is flagged ``duplicate = False``.
    * ``saveSeparateFiles`` - call ``saveSeparate()``.
    * ``saveBodyFile``      - call ``saveBody()``.

    A ``UnicodeDecodeError`` raised by any stage is logged as critical and
    swallowed, so a badly-encoded input file does not propagate out of
    this method.  Any other exception propagates unchanged.
    """
    self.log.info("Nuovo SPLITTER su file: {}".format(self.fileName))
    # The whole pipeline lives inside one guarded pass: running the stages a
    # second time outside the ``try`` would redo the work (and the saves)
    # and would bypass the decode-error handler below.
    try:
        self.openFile()
        self.remEmptyLines()
        self.splitFile()
        if self.settings['removeDuplicates']:
            self.log.info("Controllo se ci sono dei duplicati..")
            self.removeDuplicates()
        else:
            # Later stages read the 'duplicate' flag, so it must exist on
            # every entry even when the duplicate check is skipped.
            for ff in self.fileList:
                ff['duplicate'] = False
            print('Salto il controllo dei duplicati..')
        if self.settings['saveSeparateFiles']:
            self.saveSeparate()
        if self.settings['saveBodyFile']:
            self.saveBody()
    except UnicodeDecodeError as ee:
        # NOTE(review): openFile() in this file re-raises decode errors as
        # BaseException("OpenFile"), which this handler does not catch -
        # confirm which of the two layers is meant to own this error.
        self.log.critical("Il file [{}] contiene caratteri non compatibili con la codifica scelta! [{}]"
                          .format(self.fileParams['name'], ee))
|
||||
|
||||
def openFile(self):
    """Load the input file into ``self.rawFile`` as a list of lines.

    Changes the process working directory to ``self.paths["INworkPath"]``,
    then opens ``self.fileParams['name']`` in text mode with the encoding
    given by ``self.settings['encoding']`` and stores the result of
    ``readlines()`` (line endings included) in ``self.rawFile``.

    Raises:
        BaseException: with the single argument ``"OpenFile"`` when the
            directory change or the open/read fails with ``IOError``, or
            when the content cannot be decoded (``UnicodeDecodeError``).
            The original error is logged as critical and attached as
            ``__cause__``.
    """
    try:
        os.chdir(self.paths["INworkPath"])
        self.log.info("Carico il contenuto..")
        # The context manager closes the handle even when readlines() fails
        # half-way through with a decode error.
        with open(self.fileParams['name'], mode='r', encoding=self.settings['encoding']) as fp:
            self.rawFile = fp.readlines()
    except IOError as e:
        self.log.critical("Impossibile aprire il file: {}! [{}]".format(self.fileName, e))
        # BaseException is kept so existing callers see the same type;
        # chaining preserves the real cause in the traceback.
        raise BaseException("OpenFile") from e
    except UnicodeDecodeError as ee:
        self.log.critical("Il file [{}] contiene caratteri non compatibili con la codifica scelta! [{}]"
                          .format(self.fileParams['name'], ee))
        raise BaseException("OpenFile") from ee
|
||||
|
||||
def remEmptyLines(self):
|
||||
@@ -204,11 +208,11 @@ class bananaSPLITTER(threading.Thread):
|
||||
ff['duplicate'] = True
|
||||
self.fileList[idx]=ff
|
||||
self.duplicateNumber+=1
|
||||
self.log.info("Ho rimosso {} duplicati di {} articoli..\n". format(self.duplicateNumber, len(duplicateList)))
|
||||
self.log.info("Ho rimosso {} duplicati di {} articoli..". format(self.duplicateNumber, len(duplicateList)))
|
||||
pass
|
||||
|
||||
def saveSeparate(self):
|
||||
os.mkdir(self.paths['OUTworkPath']+slugify(self.fileName))
|
||||
os.chdir(self.outPath)
|
||||
self.paths['OUTworkPath']=self.paths['OUTworkPath']+slugify(self.fileName)
|
||||
self.log.info("Salvo gli articoli in file separati...")
|
||||
self.log.debug("Persorso: {0}".format(self.paths['OUTworkPath'].format('nomeFile')))
|
||||
@@ -220,7 +224,7 @@ class bananaSPLITTER(threading.Thread):
|
||||
docnum=self.bodyCounter,\
|
||||
papername=ff['newsPaperName'].strip(),\
|
||||
**ff['date'])
|
||||
out=open(self.paths['OUTworkPath']+'{0}'.format(fName),'wb')
|
||||
out=open('{0}'.format(fName),'wb')
|
||||
if self.settings['includeTitle']:
|
||||
ff['content'] = ff['title']+os.linesep+ff['content']
|
||||
out.write(ff['content'].encode(self.settings['encoding']))
|
||||
@@ -232,8 +236,9 @@ class bananaSPLITTER(threading.Thread):
|
||||
pass
|
||||
|
||||
def saveBody(self):
|
||||
print('Salvo gli articoli in un singolo file vicino agli originali...')
|
||||
print ('Persorso: {0}'.format(self.paths['OUTworkPath'].format('nomeFile')))
|
||||
self.log.info('Salvo gli articoli in un singolo file vicino agli originali...')
|
||||
os.chdir(self.outPath)
|
||||
print ('Persorso: {0}'.format(self.outPath))
|
||||
try:
|
||||
fName=slugify(self.fileName)
|
||||
fName='BODYFILE_{0}.txt'.format(fName[:self.settings['maxTitleLen']])
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
"name": "",
|
||||
"paths": {
|
||||
"INworkPath": "D:\\Emanuele\\Documenti\\workspace\\bananaSPLIT\\TestFiles\\",
|
||||
"OUTworkPath": "D:\\Test\\Separati\\",
|
||||
"OUTworkPath": "H:\\",
|
||||
"OUTnameFormat": "TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt"
|
||||
},
|
||||
"docStruct": {
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
"name": "",
|
||||
"paths": {
|
||||
"INworkPath": "D:\\Emanuele\\Documenti\\workspace\\bananaSPLIT\\TestFiles\\",
|
||||
"OUTworkPath": "D:\\Test\\Separati\\",
|
||||
"OUTworkPath": "H:\\",
|
||||
"OUTnameFormat": "TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt"
|
||||
},
|
||||
"docStruct": {
|
||||
|
||||
@@ -5,6 +5,7 @@ Created on 1 dic 2019
|
||||
'''
|
||||
import os
|
||||
from glob import glob
|
||||
from copy import deepcopy
|
||||
from libsplit import bananaSPLITTER
|
||||
from libconfload import bananaCONF
|
||||
from libfancylogger import fancyLogger
|
||||
@@ -16,16 +17,18 @@ confl.open()
|
||||
|
||||
confl.use("testEN.json")
|
||||
splconf = confl.getParams("splitter")
|
||||
for f in glob(splconf["paths"]["INworkPath"]+"*.txt"):
|
||||
splist = []
|
||||
os.chdir(splconf["paths"]["INworkPath"])
|
||||
for f in glob("*.txt"):
|
||||
splconf["name"] = f
|
||||
splitter = bananaSPLITTER(fileParams=splconf, logger=logger)
|
||||
splitter.openFile()
|
||||
splitter.remEmptyLines()
|
||||
splitter.splitFile()
|
||||
splitter.removeDuplicates()
|
||||
splitter.saveBody()
|
||||
splitter.saveSeparate()
|
||||
|
||||
logger.info("-"*80)
|
||||
splitter = bananaSPLITTER(fileParams=deepcopy(splconf), logger=logger)
|
||||
splist.append(splitter)
|
||||
splitter.start()
|
||||
splitter.join()
|
||||
del splitter
|
||||
|
||||
logger.info("\n"+"="*50+"\n\tFINITO!!!\n"+"="*50)
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user