weaponizzate le funzioni di salva file singolo e body
This commit is contained in:
@@ -4,7 +4,7 @@ Created on 2 nov 2019
|
||||
@author: Emanuele Trabattoni
|
||||
'''
|
||||
from libbabanasplit.libfancylogger import fancyLogger
|
||||
import threading, time, sys, parse, re, copy
|
||||
import threading, time, parse, re, copy, slugify, os
|
||||
|
||||
class bananaSPLITTER(threading.Thread):
|
||||
|
||||
@@ -14,9 +14,10 @@ class bananaSPLITTER(threading.Thread):
|
||||
self.rawFile = None
|
||||
self.status = "first"
|
||||
self.fileList = list()
|
||||
|
||||
self.bodyCounter=0
|
||||
if fileParams is not None:
|
||||
self.log.info("Sto operando sul file: {}..".format(self.fileParams['name']))
|
||||
self.paths = self.fileParams['paths']
|
||||
self.docStruct = self.fileParams['docStruct']
|
||||
self.settings = self.fileParams['settings']
|
||||
self.fileName = self.fileParams['name']
|
||||
@@ -38,8 +39,7 @@ class bananaSPLITTER(threading.Thread):
|
||||
self.rawFile = fp.readlines()
|
||||
fp.close()
|
||||
except IOError as e:
|
||||
self.log.critical("Impossibile aprire il file: {}!".
|
||||
format(self.fileName))
|
||||
self.log.critical("Impossibile aprire il file: {}! [{}]".format(self.fileName,e))
|
||||
raise BaseException("OpenFile")
|
||||
pass
|
||||
|
||||
@@ -63,7 +63,6 @@ class bananaSPLITTER(threading.Thread):
|
||||
self.log.info("Individuo il contenuto..")
|
||||
docNumber = 0
|
||||
docSkipped = 0
|
||||
bodyCounter = 0
|
||||
docDate = {}
|
||||
prevLine = ''
|
||||
newsPaperName = ''
|
||||
@@ -155,22 +154,76 @@ class bananaSPLITTER(threading.Thread):
|
||||
newDoc['content']=copy.deepcopy(''.join(tempBody))
|
||||
self.fileList.append(copy.deepcopy(newDoc))
|
||||
tempBody=list()
|
||||
bodyCounter +=1
|
||||
self.bodyCounter +=1
|
||||
pass
|
||||
else:
|
||||
self.log.critical("Stato Interno Sconosciuto")
|
||||
prevLine=l #salva sempre e comunque il contenuto della linea precedente
|
||||
pass
|
||||
#ricerca terminata, espongo i risultati
|
||||
self.log.info("Nel file ho trovato {0} articoli..".format(bodyCounter))
|
||||
self.log.info("Nel file ho trovato {0} articoli..".format(self.bodyCounter))
|
||||
if docSkipped > 0:
|
||||
self.log.warning('Attentione, LexisNexis ne ha saltati {} !!!'.format(docSkipped))
|
||||
self.log.warning("Attentione, LexisNexis ne ha saltati {} !!!".format(docSkipped))
|
||||
pass
|
||||
pass
|
||||
|
||||
|
||||
def removeDuplicates(self):
    """Flag repeated articles in ``self.fileList`` by title.

    The first entry carrying a given ``'title'`` gets ``'duplicate'`` set to
    ``False``; every later entry with the same title gets ``'duplicate'``
    set to ``True``. Entries are only flagged, never deleted from the list.
    A summary with the number of flagged entries and the number of distinct
    repeated titles is logged at the end.

    Raises:
        KeyError: if an entry has no ``'title'`` key, or if a duplicate is
            found and ``self.settings`` has no ``'showRemovedDuplicates'``.
    """
    self.log.info("Controllo se ci sono dei duplicati..")
    # Sets keep the membership tests constant-time; with lists the scan
    # grows quadratically with the number of articles.
    seenTitles = set()
    repeatedTitles = set()
    duplicateNumber = 0
    for ff in self.fileList:
        title = ff['title']
        if title not in seenTitles:
            seenTitles.add(title)
            ff['duplicate'] = False
            continue
        repeatedTitles.add(title)
        # NOTE(review): the original indentation was lost; this logs every
        # flagged occurrence. Confirm it was not meant to log each repeated
        # title only once.
        if self.settings['showRemovedDuplicates']:
            self.log.info("Duplicato: {}".format(title.strip()))
        # ff is the very object stored in self.fileList, so setting the flag
        # here is enough; no re-assignment by index is required.
        ff['duplicate'] = True
        duplicateNumber += 1
    self.log.info("Ho rimosso {} duplicati di {} articoli..\n".format(duplicateNumber, len(repeatedTitles)))
|
||||
|
||||
def saveSeparate(self):
    """Write every non-duplicate article of ``self.fileList`` to its own file.

    The file name is produced by ``self.paths['OUTnameFormat']`` with the
    fields ``title``, ``filename``, ``docnum``, ``papername`` and the keys of
    the article's ``'date'`` mapping, and is appended to
    ``self.paths['OUTworkPath']``. The text is encoded with
    ``self.settings['encoding']``; when ``self.settings['includeTitle']`` is
    true the title is written before the content, separated by
    ``os.linesep``. ``self.bodyCounter`` is incremented after each file that
    is written. An ``IOError`` is logged and the loop moves on to the next
    article.
    """
    self.log.info("Salvo gli articoli in file separati...")
    self.log.debug("Persorso: {0}".format(self.paths['OUTworkPath'].format('nomeFile')))
    for ff in self.fileList:
        # An article that was never flagged by removeDuplicates() has no
        # 'duplicate' key; treat it as unique instead of raising KeyError.
        if ff.get('duplicate', False):
            continue
        try:
            # The file imports the slugify *module*, so the function has to
            # be reached as slugify.slugify; calling the module raises
            # TypeError.
            # NOTE(review): docnum starts from the current self.bodyCounter,
            # which the parser has already incremented - confirm the
            # numbering is meant to continue from there.
            fName = self.paths['OUTnameFormat'].format(
                title=slugify.slugify(ff['title'][:self.settings['maxTitleLen']]),
                filename=slugify.slugify(self.fileName),
                docnum=self.bodyCounter,
                papername=ff['newsPaperName'].strip(),
                **ff['date'])
            # Build the text locally: writing it back into ff['content']
            # would prepend the title again on every later save.
            content = ff['content']
            if self.settings['includeTitle']:
                content = ff['title'] + os.linesep + content
            # The context manager closes the file even when encode/write
            # raises.
            with open(self.paths['OUTworkPath'] + '{0}'.format(fName), 'wb') as out:
                out.write(content.encode(self.settings['encoding']))
            self.bodyCounter += 1
        except IOError as e:
            self.log.error("Qualcosa e\' andato storto, non riesco a scrivere il file: {}".format(e))
|
||||
|
||||
def saveBody(self):
    """Write the content of all articles in ``self.fileList`` to one file.

    The ``'content'`` of every entry is joined with ``os.linesep``, encoded
    with ``self.settings['encoding']`` and written to
    ``BODYFILE_<fileCounter>_<slug>.txt`` appended to
    ``self.paths['OUTworkPath']``, where ``<slug>`` is the slugified
    ``self.fileName`` cut to ``self.settings['maxTitleLen']`` characters.
    An ``IOError`` is logged and the method returns normally. Messages go
    through ``self.log`` like in saveSeparate() rather than ``print``.
    """
    self.log.info('Salvo gli articoli in un singolo file vicino agli originali...')
    self.log.debug('Persorso: {0}'.format(self.paths['OUTworkPath'].format('nomeFile')))
    try:
        # The file imports the slugify *module*, so the function has to be
        # reached as slugify.slugify; calling the module raises TypeError.
        fName = slugify.slugify(self.fileName)
        # NOTE(review): self.fileCounter is not assigned anywhere in the
        # visible part of the class - confirm it is initialised elsewhere.
        fName = 'BODYFILE_{0}_{1}.txt'.format(self.fileCounter, fName[:self.settings['maxTitleLen']])
        # NOTE(review): entries flagged as duplicates are included here,
        # unlike saveSeparate() - confirm this is intended.
        fileContent = os.linesep.join([cc['content'] for cc in self.fileList])
        # The context manager closes the file even when the write raises.
        with open(self.paths['OUTworkPath'] + '{0}'.format(fName), 'wb') as out:
            out.write(fileContent.encode(self.settings['encoding']))
    except IOError as e:
        # The original had a `continue` here, outside of any loop, which is
        # a SyntaxError; logging the failure is all that is needed.
        self.log.error("OOPS! Qualcosa e\' andato storto, non riesco a scrivere il file: {}".format(e))
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
{
|
||||
"version": "v1.1a",
|
||||
"global": {
|
||||
"INworkPath": "D:\\Test\\",
|
||||
"OUTworkPath": "D:\\Test\\Separati\\"
|
||||
},
|
||||
"logger": {
|
||||
"logFile": "D:\\Test\\bananaSPLIT.log",
|
||||
"logFormat": "%(asctime)s|%(levelname)-8s| %(message)-50s",
|
||||
"logTimeFormat": "%m-%d %H:%M:%S"
|
||||
},
|
||||
"splitter": {
|
||||
"OUTnameFormat": "TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt",
|
||||
"paths": {
|
||||
"INworkPath": "D:\\Test\\",
|
||||
"OUTworkPath": "D:\\Test\\Separati\\",
|
||||
"OUTnameFormat": "TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt"
|
||||
},
|
||||
"docStruct": {
|
||||
"docSep": "\\s*Copyright [(0-9)]+",
|
||||
"dateFormat": "{month} {day:d}, {year:d}{}",
|
||||
|
||||
Reference in New Issue
Block a user