bananaSPLIT/bananaSPLIT/libbananasplit/libsplit.py

'''
Created on 2 nov 2019

@author: Emanuele Trabattoni
'''
from PyQt5.QtCore import QThread
from PyQt5.Qt import pyqtSignal

from slugify import slugify
import time, parse, re, copy, os, json
import traceback

class bananaSPLITTER(QThread):

	sendStatus = pyqtSignal(str)

	def __init__(self, fileParams=None, logger=None):
		"""Prepare a splitter thread for one input file.

		Args:
			fileParams: configuration dict of the file to process. The keys
				read here are 'name', 'paths', 'docStruct' and 'settings'.
				When None, a critical message is logged and the
				file-related attributes stay None.
			logger: logger-like object used for every message of this class.
				When None, a logger from the standard `logging` package is
				used so that the log calls never fail on a missing logger.
		"""
		import logging  # local import: only needed for the fallback logger

		QThread.__init__(self)
		self.fileParams = fileParams
		self.log = logger if logger is not None else logging.getLogger(__name__)
		self.rawFile = None
		self.status = "first"
		self.contentList = list()
		self.bodyCounter = 0
		self.duplicateNumber = 0
		# Create every attribute up front so that later code never hits an
		# AttributeError when the splitter was built without a file.
		self.paths = None
		self.docStruct = None
		self.settings = None
		self.fileName = None
		self.outPath = None
		self.beginTime = time.time()
		if fileParams is None:
			self.log.critical("Non e' stato fornito il nome di alcun file da splittare!")
			return
		self.log.info("Sto operando sul file: {}..".format(self.fileParams['name']))
		self.paths = self.fileParams['paths']
		self.docStruct = self.fileParams['docStruct']
		self.settings = self.fileParams['settings']
		self.fileName = self.fileParams['name']
		# Output directory: the output work path followed by the slug of the file name
		self.outPath = self.paths['OUTworkPath'] + slugify(self.fileName)

	def run(self):
		"""Thread entry point: load, clean, split and save the configured file.

		The steps are openFile, remEmptyLines, splitFile, the optional
		duplicate detection, the preparation of the output directory and
		finally saveSeparate / saveBody according to the settings.
		Exceptions derived from Exception raised by these steps are logged
		here instead of being propagated.
		"""
		self.log.info("Nuovo SPLITTER  su file: {}".format(self.fileName))
		self.sendStatus.emit(f"Inizio {self.fileName}")
		try:
			self.openFile()
			self.remEmptyLines()
			self.splitFile()

			if self.settings['removeDuplicates']:
				self.log.info("Controllo se ci sono dei duplicati..")
				self.removeDuplicates()
			else:
				# No duplicate detection requested: flag every document as unique
				for doc in self.contentList:
					doc['duplicate'] = False
				self.log.warning('Salto il controllo dei duplicati..')
			# Parsing and duplicate handling went fine: prepare the output
			# directory and start saving
			if os.path.exists(self.outPath):
				if self.settings['removeOldFiles']:
					for oldName in os.listdir(self.outPath):
						# os.listdir returns bare entry names, so the directory
						# must be joined back before removing the file
						oldPath = os.path.join(self.outPath, oldName)
						if os.path.isfile(oldPath):
							os.remove(oldPath)
				else:
					raise FileExistsError("Non posso sovrascrivere i vecchi file, eliminali manualmente!")
			else:
				os.mkdir(self.outPath)
			if self.settings['saveSeparateFiles']:
				self.saveSeparate()
			if self.settings['saveBodyFile']:
				self.saveBody()
			self.log.info("L'elaborazione del file ha richiesto {:4.2f} sec".format(time.time()-self.beginTime))

		except UnicodeDecodeError as ee:
			self.log.critical("Il file [{}] contiene caratteri non compatibili con la codifica scelta! [{}]"
							.format(self.fileParams['name'],ee))
		except FileExistsError as fe:
			self.log.critical(fe)
		except Exception as ee:
			traceback.print_exc()
			self.log.warning(ee)

	def openFile(self):
		"""Read the input file into self.rawFile as a list of lines.

		The file is paths['INworkPath'] joined with fileParams['name'] and is
		decoded with settings['encoding'].

		Raises:
			RuntimeError: when the file cannot be opened or read; the
				original I/O error is chained as the cause.
			UnicodeDecodeError: propagated unchanged when the content does
				not match the configured encoding.
		"""
		self.log.info("Carico il contenuto..")
		inPath = os.path.join(self.paths["INworkPath"], self.fileParams['name'])
		try:
			# The context manager closes the file even if readlines() fails
			with open(inPath, mode='r', encoding=self.settings['encoding']) as fp:
				self.rawFile = fp.readlines()
		except IOError as e:
			self.log.critical("Impossibile aprire il file: {}! [{}]".format(self.fileName,e))
			# An Exception subclass (not a bare BaseException) so that callers
			# with an ordinary "except Exception" handler can deal with it
			raise RuntimeError("OpenFile") from e

	def remEmptyLines(self):
		"""Strip unwanted characters and drop empty lines from self.rawFile.

		Every string listed in settings['delChars'] is removed from each
		line; a line that is then empty or consists only of a line
		terminator is discarded. self.rawFile is replaced by the cleaned
		list.

		Returns:
			True when the cleanup completed.

		Raises:
			RuntimeError: on any unexpected failure (for example when
				self.rawFile was never loaded); the original error is
				chained as the cause.
		"""
		self.log.info("Elimino righe vuote e caratteri inutili..")
		emptyLines = ('', '\n', '\r', '\r\n')
		try:
			cleaned = []
			for ll in self.rawFile:
				for c in self.settings['delChars']:
					ll = ll.replace(c, '')
				if ll not in emptyLines:
					cleaned.append(ll)
		except Exception as exc:
			self.log.error("Errore inaspettato durante l'eliminazione delle righe vuote!")
			raise RuntimeError("DelLines") from exc
		# Only replace the raw content once the whole pass succeeded
		self.rawFile = cleaned
		return True
	def splitFile(self):  # ported from the original program
		"""Scan self.rawFile and collect the documents into self.contentList.

		The lines are consumed by a small state machine kept in self.status:

		* 'first': looks for the "<current> of <total> documents" counter
		  line, for the date line (recognised by a month name found at word
		  index settings['monthPosition']), for the title lines that follow
		  the date, and for a line starting with one of the head words, which
		  opens a new document and switches to 'head'.
		* 'head': skips lines until one does not start with a head word; that
		  line becomes the first line of the body and the state is 'body'.
		* 'body': accumulates lines until one starts with a tail word or
		  matches the document separator (the latter is logged as an anomaly).
		* 'tail': waits for the document separator (or a pending anomaly),
		  then stores the finished document and goes back to 'first'.

		Each stored document is a dict with the keys 'title', 'date',
		'newsPaperName' and 'content'. self.bodyCounter counts the stored
		documents and self.duplicateNumber is reset to 0.
		"""
		self.log.info("Individuo il contenuto..")
		self.bodyCounter=0
		self.duplicateNumber=0
		docNumber = 0
		docSkipped = 0
		docDate = {}
		title = ''
		prevLine = ''
		newsPaperName = ''
		titleBegin = False
		anomaly = False
		tempBody = list()
		docSep=re.compile(self.docStruct['docSep'])

		for line in self.rawFile:  # for every line of the file
			lineWords = line.lstrip().split(' ')  # split the line into words
			if self.status == 'first':
				try:
					# Read the document counter to detect gaps in the numbering
					try:
						nn = parse.parse("{current:d} Of {total} Documents",line.strip().capitalize()).named
						if nn["current"]-docNumber!=1:
							if self.settings["showSkipped"]:
								self.log.warning("Il conto dei documenti non torna! LexisNexis ne ha saltato qualcuno!\n Precedente:{0}-Attuale:{1}".format(docNumber,nn["current"]))
							docSkipped+=1
						docNumber = nn["current"]
					except Exception:
						pass  # not a counter line: a failed parse is expected here
					# Look for the date
					if (lineWords[self.settings['monthPosition']]).capitalize() in self.docStruct['language']['dateWords']:
						try:
							docDate=parse.parse(self.docStruct['dateFormat'],line).named
							docDate['month']=docDate['month'].strip().capitalize()
							# Month name -> month number (1-based position in dateWords)
							docDate['month']=self.docStruct['language']['dateWords'].index(docDate['month'])+1
							title = ''
							titleBegin=True
							# The title starts after the date, but first try to get
							# the newspaper name from the previous line
							if self.settings['getNewsPaperName']:
								try:
									if prevLine.split(' ')[0].strip().isalpha():
										newsPaperName = prevLine.strip()
									else:
										newsPaperName = self.settings['nameNotFoundStr']
								except Exception:
									self.log.warning("E' successo qualcosa mentre stavo cercando il nome della pubblicazione,\
									controlla i file di uscita! [{}]".format(prevLine.strip()))
							else:
								newsPaperName = self.settings['nameNotFoundStr']
						except Exception:
							# The line looked like a date but could not be parsed as one
							self.log.warning("Ho trovato una riga ambigua.. potrebbe essere una data ma non so: [{}]". format(line.strip('\r\n')))
					elif lineWords[0] in self.docStruct['language']['headWords']:
						# Change state and initialise a new document to fill
						self.status = 'head'
						newDoc=dict()
						newDoc['title']=title
						newDoc['date']=docDate
						newDoc['newsPaperName'] = newsPaperName
						titleBegin=False
					else:
						if titleBegin:
							title += line.strip().capitalize()
				except IndexError:
					self.log.error("Errore inaspettato, contatta il tuo sviluppatore di fiducia!")
			elif self.status == 'head':
				if lineWords[0] not in self.docStruct['language']['headWords']:
					# The first word is not a head word: the article has started,
					# so this is the first line of its content
					tempBody.append(line)
					self.status = 'body'
			elif self.status == 'body':
				if lineWords[0] not in self.docStruct['language']['tailWords']:
					# The first word is not a tail word: still reading the article
					if self.settings['delLF']:
						tempBody.append(line.strip('\n'))
					else:
						tempBody.append(line)
				else:
					self.status = 'tail'
					anomaly = False
				if docSep.match(line) is not None:
					# A separator before any tail word: the article has no closing keywords
					self.log.warning("Ho individuato una separatore valido prima che si chiusesse l'articolo precedente, controlla i tuoi file in uscita!\n\
					L'errore dovrebbe essere intorno all'articolo {} ma non sono sicuro! \n\t\t[{}]".format(docNumber, line.strip()))
					self.status = 'tail'
					anomaly = True
			elif self.status == 'tail':
				if docSep.match(line) is not None or anomaly:
					self.status = 'first'
					anomaly = False
					if self.settings['delWordBreak']:
						# Join words hyphenated across a line break. This must act
						# on the collected body lines, otherwise the setting has
						# no effect at all.
						tempBody=[ll.replace('-\n', '') for ll in tempBody]
					newDoc['content']=''.join(tempBody)
					# Deep copy so that stored documents never share the date dict
					self.contentList.append(copy.deepcopy(newDoc))
					tempBody=list()
					self.bodyCounter +=1
			else:
				self.log.critical("Stato Interno Sconosciuto")
			prevLine=line  # always remember the previous line
		# Scan finished: report the results
		self.log.info("Nel file ho trovato {0} articoli..".format(self.bodyCounter))
		if docSkipped > 0:
			self.log.warning("Attentione, LexisNexis ne ha saltati {} !!!".format(docSkipped))

	def removeDuplicates(self):
		"""Flag the documents whose title already appeared earlier in contentList.

		Every document dict receives a boolean 'duplicate' key: False for the
		first occurrence of a title, True for each later one. Nothing is
		deleted from self.contentList. self.duplicateNumber is incremented
		once per flagged document, and each duplicated title is logged once
		when settings['showRemovedDuplicates'] is set.
		"""
		# Sets give constant-time membership tests (the titles are strings)
		seenTitles = set()
		reportedTitles = set()
		for doc in self.contentList:
			docTitle = doc['title']
			if docTitle not in seenTitles:
				seenTitles.add(docTitle)
				doc['duplicate'] = False
				continue
			if docTitle not in reportedTitles:
				reportedTitles.add(docTitle)
				if self.settings['showRemovedDuplicates']:
					self.log.info("Duplicato: {}".format(docTitle.strip()))
			doc['duplicate'] = True
			self.duplicateNumber += 1
		self.log.info("Ho rimosso {} duplicati di {} articoli..". format(self.duplicateNumber, len(reportedTitles)))

	def saveSeparate(self):
		"""Write every non-duplicate document to its own file inside self.outPath.

		The file name is built from docStruct['outNameFormat'], formatted with
		the fields title, filename, docnum and papername plus the keys of the
		document's 'date' dict. The content is encoded with
		settings['encoding']. Failures on a single document are logged and
		the loop goes on with the next one.
		"""
		outFileCounter = 0
		self.log.info("Salvo gli articoli in file separati...")
		# self.outPath already holds the output directory; the shared paths
		# dict is deliberately left untouched
		self.log.debug("Persorso: {0}".format(self.outPath))
		for ff in self.contentList:
			try:
				if ff['duplicate']:
					continue
				fName=self.docStruct['outNameFormat'].format(title=slugify(ff['title'][:self.docStruct['maxTitleLen']]),
														 filename=slugify(self.fileName),
														 docnum=outFileCounter,
														 papername=ff['newsPaperName'].strip(),
														 **ff['date'])
				if self.settings['includeTitle']:
					# NOTE: this rewrites the stored document, so later users of
					# contentList also see the title prepended
					ff['content'] = ff['title']+os.linesep+ff['content']
				payload = ff['content'].encode(self.settings['encoding'])
				# Join directory and name explicitly; a leading separator in the
				# configured name format would otherwise discard the directory
				target = os.path.join(self.outPath, fName.lstrip('/\\'))
				with open(target,'wb') as out:
					out.write(payload)
				outFileCounter+=1
			except IOError as e:
				self.log.error("Qualcosa e\' andato storto, non riesco a scrivere il file: {}".format(e))
				continue
			except KeyError as ke:
				self.log.error(f"Chiave {ke} non trovata per:{outFileCounter} {ff['title']} ")
			except Exception as ee:
				traceback.print_exc()
				self.log.error(f"Errore generale nel salvataggio: {ee}")
		# Duplicates are skipped on purpose, so only the non-duplicate
		# documents are expected on disk
		expectedFiles = sum(1 for ff in self.contentList if not ff.get('duplicate', False))
		if outFileCounter < expectedFiles:
			self.log.error("Ho salvato meno file rispetto a quelli trovati!")

	def saveBody(self):
		"""Write the content of all documents into a single file in self.outPath.

		The file is named BODYFILE_<slug>.txt, where <slug> is the slugified
		input file name cut to docStruct['maxTitleLen'] characters. The
		documents' 'content' values are joined with os.linesep and encoded
		with settings['encoding']. Errors are logged, not raised.
		"""
		self.log.info('Salvo gli articoli in un singolo file vicino agli originali...')
		self.log.debug('Persorso: {0}'.format(self.outPath))
		try:
			# Build the name from the file slug only and address the file by
			# its full path: no os.chdir, which would change the working
			# directory of the whole process from inside this thread
			baseName = slugify(self.fileName)[:self.docStruct['maxTitleLen']]
			target = os.path.join(self.outPath, 'BODYFILE_{0}.txt'.format(baseName))
			fileContent = os.linesep.join([cc['content'] for cc in self.contentList])
			payload = fileContent.encode(self.settings['encoding'])
			with open(target,'wb') as out:
				out.write(payload)
		except IOError as e:
			self.log.error("Qualcosa e\' andato storto, non riesco a scrivere il file: {}".format(e))
		except Exception as ee:
			self.log.error(f"Errore generale nel salvataggio: {ee}")