|
|
|
|
@@ -23,7 +23,9 @@ class bananaSPLITTER(threading.Thread):
|
|
|
|
|
self.docStruct = self.fileParams['docStruct']
|
|
|
|
|
self.settings = self.fileParams['settings']
|
|
|
|
|
self.fileName = self.fileParams['name']
|
|
|
|
|
self.outPath = self.paths['OUTworkPath']+slugify(self.fileName)
|
|
|
|
|
self.beginTime = time.time()
|
|
|
|
|
os.mkdir(self.outPath)
|
|
|
|
|
pass
|
|
|
|
|
else:
|
|
|
|
|
self.log.critical("Non e' stato fornito il nome di alcun file da splittare!")
|
|
|
|
|
@@ -31,25 +33,30 @@ class bananaSPLITTER(threading.Thread):
|
|
|
|
|
|
|
|
|
|
def run(self):
    """Thread entry point: load, clean, split and save the configured file.

    The steps run exactly once, in this order:

    1. ``openFile``, ``remEmptyLines``, ``splitFile``.
    2. ``removeDuplicates`` when ``settings['removeDuplicates']`` is truthy;
       otherwise every entry of ``self.fileList`` gets ``'duplicate'`` set
       to ``False``.
    3. ``saveSeparate`` when ``settings['saveSeparateFiles']`` is truthy.
    4. ``saveBody`` when ``settings['saveBodyFile']`` is truthy.

    A ``UnicodeDecodeError`` raised by any step is logged as critical and
    swallowed, which ends the run early. Any other exception propagates to
    the caller.
    """
    self.log.info("Nuovo SPLITTER su file: {}".format(self.fileName))
    # The whole pipeline sits in a single guarded pass: a decoding failure
    # in any step must abort this file without an unhandled traceback.
    try:
        self.openFile()
        self.remEmptyLines()
        self.splitFile()

        if self.settings['removeDuplicates']:
            self.log.info("Controllo se ci sono dei duplicati..")
            self.removeDuplicates()
        else:
            # Entries are mutated in place, so no write-back into the list
            # is needed. Assumes entries are dict-like -- TODO confirm.
            for ff in self.fileList:
                ff['duplicate'] = False
            # NOTE(review): the original nesting of this message could not
            # be recovered; it is emitted once per run here.
            print('Salto il controllo dei duplicati..')

        if self.settings['saveSeparateFiles']:
            self.saveSeparate()
        if self.settings['saveBodyFile']:
            self.saveBody()
    except UnicodeDecodeError as ee:
        self.log.critical(
            "Il file [{}] contiene caratteri non compatibili con la codifica scelta! [{}]"
            .format(self.fileParams['name'], ee))
|
|
|
|
|
|
|
|
|
|
def openFile(self):
|
|
|
|
|
try:
|
|
|
|
|
os.chdir(self.paths["INworkPath"])
|
|
|
|
|
self.log.info("Carico il contenuto..")
|
|
|
|
|
fp = open(self.fileParams['name'], mode='r', encoding=self.settings['encoding'])
|
|
|
|
|
self.rawFile = fp.readlines()
|
|
|
|
|
@@ -57,9 +64,6 @@ class bananaSPLITTER(threading.Thread):
|
|
|
|
|
except IOError as e:
|
|
|
|
|
self.log.critical("Impossibile aprire il file: {}! [{}]".format(self.fileName,e))
|
|
|
|
|
raise BaseException("OpenFile")
|
|
|
|
|
except UnicodeDecodeError as ee:
|
|
|
|
|
self.log.critical("Il file [{}] contiene caratteri non compatibili con la codifica scelta! [{}]"
|
|
|
|
|
.format(self.fileParams['name'],ee))
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
def remEmptyLines(self):
|
|
|
|
|
@@ -204,11 +208,11 @@ class bananaSPLITTER(threading.Thread):
|
|
|
|
|
ff['duplicate'] = True
|
|
|
|
|
self.fileList[idx]=ff
|
|
|
|
|
self.duplicateNumber+=1
|
|
|
|
|
self.log.info("Ho rimosso {} duplicati di {} articoli..\n". format(self.duplicateNumber, len(duplicateList)))
|
|
|
|
|
self.log.info("Ho rimosso {} duplicati di {} articoli..". format(self.duplicateNumber, len(duplicateList)))
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
def saveSeparate(self):
|
|
|
|
|
os.mkdir(self.paths['OUTworkPath']+slugify(self.fileName))
|
|
|
|
|
os.chdir(self.outPath)
|
|
|
|
|
self.paths['OUTworkPath']=self.paths['OUTworkPath']+slugify(self.fileName)
|
|
|
|
|
self.log.info("Salvo gli articoli in file separati...")
|
|
|
|
|
self.log.debug("Persorso: {0}".format(self.paths['OUTworkPath'].format('nomeFile')))
|
|
|
|
|
@@ -220,7 +224,7 @@ class bananaSPLITTER(threading.Thread):
|
|
|
|
|
docnum=self.bodyCounter,\
|
|
|
|
|
papername=ff['newsPaperName'].strip(),\
|
|
|
|
|
**ff['date'])
|
|
|
|
|
out=open(self.paths['OUTworkPath']+'{0}'.format(fName),'wb')
|
|
|
|
|
out=open('{0}'.format(fName),'wb')
|
|
|
|
|
if self.settings['includeTitle']:
|
|
|
|
|
ff['content'] = ff['title']+os.linesep+ff['content']
|
|
|
|
|
out.write(ff['content'].encode(self.settings['encoding']))
|
|
|
|
|
@@ -232,8 +236,9 @@ class bananaSPLITTER(threading.Thread):
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
def saveBody(self):
|
|
|
|
|
print('Salvo gli articoli in un singolo file vicino agli originali...')
|
|
|
|
|
print ('Persorso: {0}'.format(self.paths['OUTworkPath'].format('nomeFile')))
|
|
|
|
|
self.log.info('Salvo gli articoli in un singolo file vicino agli originali...')
|
|
|
|
|
os.chdir(self.outPath)
|
|
|
|
|
print ('Persorso: {0}'.format(self.outPath))
|
|
|
|
|
try:
|
|
|
|
|
fName=slugify(self.fileName)
|
|
|
|
|
fName='BODYFILE_{0}.txt'.format(fName[:self.settings['maxTitleLen']])
|
|
|
|
|
|