weaponizzate le funzioni di salva file singolo e body

This commit is contained in:
2019-11-24 12:32:23 +01:00
parent e6fb04a9e7
commit 04a32c7bd1
2 changed files with 67 additions and 14 deletions

View File

@@ -4,7 +4,7 @@ Created on 2 nov 2019
@author: Emanuele Trabattoni
'''
from libbabanasplit.libfancylogger import fancyLogger
import threading, time, sys, parse, re, copy
import threading, time, parse, re, copy, slugify, os
class bananaSPLITTER(threading.Thread):
@@ -14,9 +14,10 @@ class bananaSPLITTER(threading.Thread):
self.rawFile = None
self.status = "first"
self.fileList = list()
self.bodyCounter=0
if fileParams is not None:
self.log.info("Sto operando sul file: {}..".format(self.fileParams['name']))
self.paths = self.fileParams['paths']
self.docStruct = self.fileParams['docStruct']
self.settings = self.fileParams['settings']
self.fileName = self.fileParams['name']
@@ -38,8 +39,7 @@ class bananaSPLITTER(threading.Thread):
self.rawFile = fp.readlines()
fp.close()
except IOError as e:
self.log.critical("Impossibile aprire il file: {}!".
format(self.fileName))
self.log.critical("Impossibile aprire il file: {}! [{}]".format(self.fileName,e))
raise BaseException("OpenFile")
pass
@@ -63,7 +63,6 @@ class bananaSPLITTER(threading.Thread):
self.log.info("Individuo il contenuto..")
docNumber = 0
docSkipped = 0
bodyCounter = 0
docDate = {}
prevLine = ''
newsPaperName = ''
@@ -155,22 +154,76 @@ class bananaSPLITTER(threading.Thread):
newDoc['content']=copy.deepcopy(''.join(tempBody))
self.fileList.append(copy.deepcopy(newDoc))
tempBody=list()
bodyCounter +=1
self.bodyCounter +=1
pass
else:
self.log.critical("Stato Interno Sconosciuto")
prevLine=l #salva sempre e comunque il contenuto della linea precedente
pass
#ricerca terminata, espongo i risultati
self.log.info("Nel file ho trovato {0} articoli..".format(bodyCounter))
self.log.info("Nel file ho trovato {0} articoli..".format(self.bodyCounter))
if docSkipped > 0:
self.log.warning('Attentione, LexisNexis ne ha saltati {} !!!'.format(docSkipped))
self.log.warning("Attentione, LexisNexis ne ha saltati {} !!!".format(docSkipped))
pass
pass
def removeDuplicates(self):
    """Flag repeated articles in ``self.fileList`` by title.

    The first entry carrying a given ``'title'`` gets ``'duplicate'`` set to
    ``False``; every later entry with the same title gets ``True``. Entries
    are only flagged in place, none is deleted from the list.

    Reads ``self.settings['showRemovedDuplicates']`` to decide whether the
    flagged entries are also written to the log.
    """
    self.log.info("Controllo se ci sono dei duplicati..")
    # Sets give constant-time membership tests; only membership and the
    # number of distinct repeated titles are needed, never their order.
    seenTitles = set()
    repeatedTitles = set()
    duplicateNumber = 0
    for ff in self.fileList:
        title = ff['title']
        if title not in seenTitles:
            seenTitles.add(title)
            ff['duplicate'] = False
            continue
        repeatedTitles.add(title)
        # NOTE(review): this logs every flagged occurrence; confirm that is
        # the intent rather than logging once per distinct repeated title.
        if self.settings['showRemovedDuplicates']:
            self.log.info("Duplicato: {}".format(title.strip()))
        ff['duplicate'] = True
        duplicateNumber += 1
    self.log.info("Ho rimosso {} duplicati di {} articoli..\n".format(duplicateNumber, len(repeatedTitles)))
def saveSeparate(self):
    """Write every non-duplicate article of ``self.fileList`` to its own file.

    The file name is built from ``self.paths['OUTnameFormat']`` using the
    slugified title (cut to ``self.settings['maxTitleLen']`` characters), the
    slugified source file name, ``self.bodyCounter`` as ``docnum``, the
    stripped newspaper name and the fields of the article's ``'date'`` dict.
    The file is created under ``self.paths['OUTworkPath']`` and holds the
    article content encoded with ``self.settings['encoding']``, preceded by
    the title when ``self.settings['includeTitle']`` is set.

    An ``IOError`` while writing one article is logged and the loop moves on
    to the next article.
    """
    self.log.info("Salvo gli articoli in file separati...")
    self.log.debug("Persorso: {0}".format(self.paths['OUTworkPath'].format('nomeFile')))
    for ff in self.fileList:
        # An entry without the flag has not been through duplicate
        # detection; treat it as unique instead of failing on the key.
        if ff.get('duplicate', False):
            continue
        try:
            # The file imports the ``slugify`` module itself, so the
            # function has to be reached as an attribute of it.
            # NOTE(review): self.bodyCounter is also incremented while the
            # articles are parsed, so ``docnum`` continues from that value
            # instead of starting from zero - confirm this is intended.
            fName = self.paths['OUTnameFormat'].format(
                title=slugify.slugify(ff['title'][:self.settings['maxTitleLen']]),
                filename=slugify.slugify(self.fileName),
                docnum=self.bodyCounter,
                papername=ff['newsPaperName'].strip(),
                **ff['date'])
            # Build the text in a local so the stored article is left
            # untouched and the title is not prepended again on a later save.
            content = ff['content']
            if self.settings['includeTitle']:
                content = ff['title'] + os.linesep + content
            # Encode before opening so a failed encode leaves no empty file.
            payload = content.encode(self.settings['encoding'])
            with open(os.path.join(self.paths['OUTworkPath'], fName), 'wb') as out:
                out.write(payload)
            self.bodyCounter += 1
        except IOError as e:
            self.log.error("Qualcosa e\' andato storto, non riesco a scrivere il file: {}".format(e))
def saveBody(self):
    """Write the content of every entry of ``self.fileList`` into one file.

    The contents are joined with ``os.linesep``, encoded with
    ``self.settings['encoding']`` and written under
    ``self.paths['OUTworkPath']`` to a file named
    ``BODYFILE_<fileCounter>_<slug>.txt``, where the slug is the slugified
    source file name cut to ``self.settings['maxTitleLen']`` characters.

    An ``IOError`` while writing is logged and not re-raised.
    """
    self.log.info('Salvo gli articoli in un singolo file vicino agli originali...')
    self.log.debug('Persorso: {0}'.format(self.paths['OUTworkPath'].format('nomeFile')))
    # The file imports the ``slugify`` module itself, so the function has to
    # be reached as an attribute of it.
    fName = slugify.slugify(self.fileName)
    # NOTE(review): self.fileCounter is not assigned in the part of the class
    # visible here - confirm it is set before this method runs.
    fName = 'BODYFILE_{0}_{1}.txt'.format(self.fileCounter, fName[:self.settings['maxTitleLen']])
    fileContent = os.linesep.join(cc['content'] for cc in self.fileList)
    # Encode before opening so a failed encode leaves no empty file behind.
    payload = fileContent.encode(self.settings['encoding'])
    try:
        with open(os.path.join(self.paths['OUTworkPath'], fName), 'wb') as out:
            out.write(payload)
    except IOError as e:
        # There is no enclosing loop here, so the error is only reported.
        self.log.error("OOPS! Qualcosa e\' andato storto, non riesco a scrivere il file: {}".format(e))
if __name__ == "__main__":

View File

@@ -1,16 +1,16 @@
{
"version": "v1.1a",
"global": {
"INworkPath": "D:\\Test\\",
"OUTworkPath": "D:\\Test\\Separati\\"
},
"logger": {
"logFile": "D:\\Test\\bananaSPLIT.log",
"logFormat": "%(asctime)s|%(levelname)-8s| %(message)-50s",
"logTimeFormat": "%m-%d %H:%M:%S"
},
"splitter": {
"OUTnameFormat": "TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt",
"paths": {
"INworkPath": "D:\\Test\\",
"OUTworkPath": "D:\\Test\\Separati\\",
"OUTnameFormat": "TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt"
},
"docStruct": {
"docSep": "\\s*Copyright [(0-9)]+",
"dateFormat": "{month} {day:d}, {year:d}{}",