merge con il branch lib-devel

2019-12-13 16:20:34 +01:00
parent a6aeee5d89 cf173843d2
commit d2096976fb
15 changed files with 90423 additions and 2 deletions
@@ -1,6 +1,6 @@
 /.DS_Store
 /.project
 /.pydevproject
 bananaSPLIT/build
 bananaSPLIT.exe.spec
 bananaSPLIT/build
 /TestFiles/
@@ -0,0 +1 @@
 /org.eclipse.core.resources.prefs
@@ -0,0 +1 @@
 Qui si mettono i file di test per la versione GUI di bananaSPLIT
@@ -1 +1,2 @@
-*.py[ocd]
+*.py[ocd]
 /bananaSPLIT.exe.spec
@@ -0,0 +1,83 @@
 '''
 Created on 2 nov 2019
@author: Emanuele Trabattoni
 '''
 import json,os,glob,copy
 class bananaCONF(object):
 	'''
 	Load and save the JSON configuration files used by bananaSPLITTER.
 	Every "*.json" file found in the working directory is parsed by open()
 	and stored in settingsList under its file name; use() selects which of
 	them getParams(), setParams() and save() operate on.
 	'''
 	def __init__(self, workdir=None, logger=None):
 		'''
 		workdir: directory holding the configuration files. It is stored as an
 			absolute path and also becomes the current working directory.
 		logger: object exposing debug/info/error/critical. When omitted, a
 			standard library logger named "bananaCONF" is used instead.
 		'''
 		if logger is None:
 			# Local import: the module header does not import logging.
 			import logging
 			logger = logging.getLogger("bananaCONF")
 		self.log = logger
 		self.fileList = None
 		self.workdir = None
 		self.inUse = None
 		self.settingsList = dict()
 		if workdir is not None:
 			# Resolve once, so later calls still find the directory after the
 			# process has changed its working directory.
 			self.workdir = os.path.abspath(workdir)
 			os.chdir(self.workdir)
 			self.log.debug("Cerco le configurazioni in: [{}]".format(os.getcwd()))
 		else:
 			self.log.error("Non mi e' stata fornita una directory per i file di configurazione")
 	def _baseDir(self):
 		'''Return the configuration directory, falling back to the current one.'''
 		if self.workdir is not None:
 			return self.workdir
 		return os.getcwd()
 	def open(self):
 		'''
 		Parse every "*.json" file of the configuration directory.
 		A file that cannot be read or decoded is logged and skipped, the
 		remaining files are still loaded.
 		'''
 		self.log.info("Carico i file di configurazione")
 		if self.settingsList is None:
 			self.settingsList = dict()
 		baseDir = self._baseDir()
 		pattern = os.path.join(glob.escape(baseDir), "*.json")
 		self.fileList = [os.path.basename(p) for p in glob.glob(pattern)]
 		if len(self.fileList) > 0:
 			for fName in self.fileList:
 				try:
 					# "open" here is the builtin: method names are not visible
 					# as bare names inside a method body.
 					with open(os.path.join(baseDir, fName)) as fp:
 						self.settingsList[fName] = json.load(fp)
 					self.log.info("Caricato correttamente: {}".format(fName))
 				except json.JSONDecodeError as e:
 					self.log.error("Impossibile leggere la configurazione:{}\n \
 									Controlla il file a riga: {} e colonna:{}" .format(e.doc, e.lineno, e.colno))
 				except IOError as ee:
 					self.log.error("Impossibile aprire il file: {}".format(ee))
 				except Exception as eee:
 					self.log.critical("Eccezione inaspettata: {}".format(eee))
 		else:
 			self.log.error("Non ho trovato alcun file di configurazione!")
 	def reload(self):
 		'''Forget everything loaded so far and read the files again.'''
 		# Must stay a dict: open() stores the parsed files by item assignment.
 		self.settingsList = dict()
 		self.fileList = None
 		self.inUse = None
 		self.open()
 	def use(self, toUse):
 		'''Select the configuration (file name) used by the other methods.'''
 		self.inUse = toUse
 	def save(self):
 		'''
 		Write the configuration currently in use back to its JSON file.
 		Problems are logged, never raised.
 		'''
 		self.log.info("Salvo la configurazione: {}".format(self.inUse))
 		if self.inUse is None or self.inUse not in self.settingsList:
 			self.log.error("Impossibile salvare il file: {} - [{}]".format(self.inUse, "configurazione non caricata"))
 			return
 		try:
 			# Serialize before opening, so a failure cannot truncate the file.
 			payload = json.dumps(self.settingsList[self.inUse])
 			with open(os.path.join(self._baseDir(), self.inUse), "w") as f:
 				f.write(payload)
 		except (IOError, TypeError, ValueError) as e:
 			self.log.error("Impossibile salvare il file: {} - [{}]".format(self.inUse,e))
 	def getFiles(self):
 		'''Return the file names found by the last open(), or None.'''
 		return self.fileList
 	def getParams(self, k):
 		'''Return section k of the configuration in use (KeyError if missing).'''
 		return self.settingsList[self.inUse][k]
 	def setParams(self, k, v):
 		'''Replace section k of the configuration in use with a dict copy of v.'''
 		self.settingsList[self.inUse][k]=dict(v)
@@ -0,0 +1,85 @@
 '''
 Created on 2 nov 2019
@author: Emanuele Trabattoni
 '''
 import sys, os
 import json
 import logging
 import colorama
 class fancyLogger(object):
 	'''
 	Colorize the output of the python logger, for a Willy Wonka style experience.
 	Every level method prints a colorama color code on stdout, forwards the
 	message to the wrapped logging.Logger and then prints the reset code.
 	'''
 	def __init__(self, name="Logger", consoleLog=True, fileLog=True, settingsFile=None):
 		'''
 		name: name of the wrapped logging.Logger.
 		consoleLog: attach a handler writing to sys.stdout.
 		fileLog: attach a handler writing to the "logFile" of the settings.
 		settingsFile: JSON file with a "logger" section; defaults to
 			libbananasplit/testEN.json under the current working directory.
 		'''
 		if settingsFile is None:
 			# os.path.join instead of hard-coded backslashes.
 			settingsFile = os.path.join(os.getcwd(), "libbananasplit", "testEN.json")
 		with open(settingsFile) as fp:
 			settings = json.load(fp)["logger"]
 		colorama.init(convert=True)
 		self.LRED = colorama.Fore.LIGHTRED_EX
 		self.RED = colorama.Fore.RED
 		self.LYELLOW = colorama.Fore.LIGHTYELLOW_EX
 		self.YELLOW = colorama.Fore.YELLOW
 		self.LBLUE = colorama.Fore.LIGHTBLUE_EX
 		self.BLUE = colorama.Fore.BLUE
 		self.LGREEN = colorama.Fore.LIGHTGREEN_EX
 		# Was a second assignment to LGREEN that overwrote the light variant.
 		self.GREEN = colorama.Fore.GREEN
 		self.WHITE = colorama.Fore.LIGHTWHITE_EX
 		self.RST = colorama.Style.RESET_ALL
 		# Setup Logger
 		self.LOGGER = logging.getLogger(name)
 		self.LOGGER.setLevel(logging.DEBUG)
 		self.LOGGER.propagate = False
 		formatter = logging.Formatter(settings["logFormat"], settings["logTimeFormat"])
 		if fileLog:
 			# File Logging
 			fh = logging.FileHandler(settings["logFile"])
 			fh.setLevel(logging.DEBUG)
 			fh.setFormatter(formatter)
 			self.LOGGER.addHandler(fh)
 		if consoleLog:
 			# Console Logging
 			cl = logging.StreamHandler(sys.stdout)
 			cl.setLevel(logging.DEBUG)
 			cl.setFormatter(formatter)
 			self.LOGGER.addHandler(cl)
 	def _emit(self, color, logMethod, msg):
 		'''Print color, log msg through logMethod, then always reset the color.'''
 		print(color, end='')
 		try:
 			logMethod(msg)
 		finally:
 			print(self.RST, end='')
 	def debug(self, msg="Undefined Debug"):
 		'''Log msg at DEBUG level in light blue.'''
 		self._emit(self.LBLUE, self.LOGGER.debug, msg)
 	def info(self, msg="Undefined Info"):
 		'''Log msg at INFO level in light green.'''
 		self._emit(self.LGREEN, self.LOGGER.info, msg)
 	def warn(self, msg="Undefined Warning"):
 		'''Log msg at WARNING level in light yellow.'''
 		self._emit(self.LYELLOW, self.LOGGER.warning, msg)
 	def warning(self, msg="Undefined Warning"):
 		'''Alias of warn(), matching the method name of logging.Logger.'''
 		self.warn(msg)
 	def error(self, msg="Undefined Error"):
 		'''Log msg at ERROR level in light red.'''
 		self._emit(self.LRED, self.LOGGER.error, msg)
 	def critical(self, msg="Undefined Critical"):
 		'''Log msg at CRITICAL level in red.'''
 		self._emit(self.RED, self.LOGGER.critical, msg)
 	def testColors(self):
 		'''Emit one sample message per level.'''
 		self.debug("Test Debug")
 		self.info("Test Info")
 		self.warn("Test Warning")
 		self.error("Test Error")
 		self.critical("Test Critical")
@@ -0,0 +1,276 @@
 '''
 Created on 2 nov 2019
@author: Emanuele Trabattoni
 '''
 from slugify.slugify import slugify
 import time, parse, re, copy, os, threading
 class bananaSPLITTER(threading.Thread):
 	'''
 	Worker thread that splits one LexisNexis text export into its articles.
 	run() reads the file, drops empty lines, walks the lines with a small
 	state machine (first -> head -> body -> tail), optionally flags articles
 	with a repeated title as duplicates and finally writes the results.
 	The logger must expose debug/info/warn/error/critical (see fancyLogger).
 	'''
 	def __init__(self, fileParams=None, logger=None):
 		'''
 		fileParams: dict with the keys 'name', 'paths', 'docStruct', 'settings'.
 		logger: logger object used for every message of this worker.
 		'''
 		threading.Thread.__init__(self)
 		self.fileParams = fileParams
 		self.log = logger
 		self.rawFile = None
 		self.status = "first"
 		self.fileList = list()
 		self.bodyCounter=0
 		self.duplicateNumber=0
 		# Defined up front so that run() can bail out cleanly without parameters.
 		self.paths = None
 		self.docStruct = None
 		self.settings = None
 		self.fileName = None
 		self.outPath = None
 		self.beginTime = time.time()
 		if fileParams is not None:
 			self.log.info("Sto operando sul file: {}..".format(self.fileParams['name']))
 			self.paths = self.fileParams['paths']
 			self.docStruct = self.fileParams['docStruct']
 			self.settings = self.fileParams['settings']
 			self.fileName = self.fileParams['name']
 			self.outPath = self.paths['OUTworkPath']+slugify(self.fileName)
 		else:
 			self.log.critical("Non e' stato fornito il nome di alcun file da splittare!")
 	def run(self):
 		'''
 		Execute the whole pipeline for the configured file.
 		Failures are logged and end the run; nothing is raised to the caller.
 		'''
 		if self.fileParams is None:
 			self.log.critical("Non e' stato fornito il nome di alcun file da splittare!")
 			return
 		self.log.info("Nuovo SPLITTER  su file: {}".format(self.fileName))
 		try:
 			self.openFile()
 			self.remEmptyLines()
 			self.splitFile()
 			if self.settings['removeDuplicates']:
 				self.log.info("Controllo se ci sono dei duplicati..")
 				self.removeDuplicates()
 			else:
 				for ff in self.fileList:
 					ff['duplicate']=False
 				self.log.warn('Salto il controllo dei duplicati..')
 			# parsing and duplicate detection went fine: prepare the output
 			# folder and start saving
 			self._prepareOutDir()
 			if self.settings['saveSeparateFiles']:
 				self.saveSeparate()
 			if self.settings['saveBodyFile']:
 				self.saveBody()
 			self.log.info("L'elaborazione del file ha richiesto {:4.2f} sec".format(time.time()-self.beginTime))
 		except UnicodeDecodeError as ee:
 			self.log.critical("Il file [{}] contiene caratteri non compatibili con la codifica scelta! [{}]"
 							.format(self.fileParams['name'],ee))
 		except FileExistsError as fe:
 			self.log.critical(fe)
 		except Exception as ee:
 			# warn(), not warning(): it is the name used by the rest of this
 			# class and the one the project logger provides.
 			self.log.warn(ee)
 	def _prepareOutDir(self):
 		'''Create the output folder, or empty it, and make it the working directory.'''
 		# Absolute, so that the path stays valid after the chdir below.
 		self.outPath = os.path.abspath(self.outPath)
 		if os.path.exists(self.outPath):
 			if not self.settings['removeOldFiles']:
 				raise FileExistsError("Non posso sovrascrivere i vecchi file, eliminali manualmente!")
 			for entry in os.listdir(self.outPath):
 				os.remove(os.path.join(self.outPath, entry))
 		else:
 			os.mkdir(self.outPath)
 		os.chdir(self.outPath)
 	def openFile(self):
 		'''
 		Move to the input folder and read every line of the file into rawFile.
 		Raises RuntimeError("OpenFile") when the file cannot be opened; a
 		UnicodeDecodeError raised while decoding is left to the caller.
 		'''
 		try:
 			os.chdir(self.paths["INworkPath"])
 			self.log.info("Carico il contenuto..")
 			with open(self.fileParams['name'], mode='r', encoding=self.settings['encoding']) as fp:
 				self.rawFile = fp.readlines()
 		except IOError as e:
 			self.log.critical("Impossibile aprire il file: {}! [{}]".format(self.fileName,e))
 			raise RuntimeError("OpenFile") from e
 	def remEmptyLines(self):
 		'''
 		Strip the characters listed in settings['delChars'] from every line and
 		drop the lines that are left with a bare line terminator.
 		Returns True; raises RuntimeError("DelLines") on any failure.
 		'''
 		self.log.info("Elimino righe vuote e caratteri inutili..")
 		try:
 			kept = []
 			for line in self.rawFile:
 				for ch in self.settings['delChars']:
 					line = line.replace(ch,'')
 				if line not in ('\n', '\r'):
 					kept.append(line)
 		except Exception as e:
 			self.log.error("Errore inaspettato durante l'eliminazione delle righe vuote!")
 			raise RuntimeError("DelLines") from e
 		self.rawFile = kept
 		return True
 	def splitFile(self):  # ported from the original program
 		'''
 		Walk rawFile and append one dict per article to fileList.
 		Each dict has the keys 'title', 'date', 'newsPaperName' and 'content'.
 		'''
 		self.log.info("Individuo il contenuto..")
 		self.bodyCounter=0
 		self.duplicateNumber=0
 		docNumber = 0
 		docSkipped = 0
 		docDate = {}
 		title = ''
 		prevLine = ''
 		newsPaperName = ''
 		titleBegin = False
 		anomaly = False
 		newDoc = dict()
 		tempBody = list()
 		docSep=re.compile(self.docStruct['docSep'])
 		for l in self.rawFile: # for every line of the file
 			lineWords = l.lstrip().split(' ') # split the line into words
 			if self.status == 'first':
 				try:
 					# read the document number to detect gaps in the sequence;
 					# parse() returns None on lines that are not a counter
 					nn = parse.parse("{current:d} Of {total} Documents",l.strip().capitalize())
 					if nn is not None:
 						current = nn.named["current"]
 						if current-docNumber != 1:
 							if self.settings["showSkipped"]:
 								self.log.warn("Il conto dei documenti non torna! LexisNexis \
 								ne ha saltato qualcuno!\nPrecedente:{0}-Attuale:{1}".format(docNumber,current))
 							docSkipped+=1
 						docNumber = current
 					# look for the date
 					if (lineWords[self.settings['monthPosition']]).capitalize() in self.docStruct['dateWords']:
 						try:
 							parsedDate=parse.parse(self.docStruct['dateFormat'],l).named
 							monthName=parsedDate['month'].strip().capitalize()
 							parsedDate['month']=self.docStruct['dateWords'].index(monthName)+1
 							# assigned only once complete, so a failure above
 							# cannot leave a half converted date behind
 							docDate=parsedDate
 							title = ''
 							titleBegin=True
 							# the title starts after the date, the line before
 							# the date is taken as the newspaper name
 							if self.settings['getNewsPaperName']:
 								try:
 									if prevLine.split(' ')[0].strip().isalpha():
 										newsPaperName = prevLine.strip()
 									else:
 										newsPaperName = self.settings['nameNotFoundStr']
 								except Exception:
 									self.log.warn("E' successo qualcosa mentre stavo cercando il nome della pubblicazione,\
 									controlla i file di uscita! [{}]".format(prevLine.strip()))
 							else:
 								newsPaperName = self.settings['nameNotFoundStr']
 						except Exception:
 							self.log.warn("Ho trovato una riga ambigua.. potrebbe essere una data ma non so: [{}]". format(l.strip('\r\n')))
 					elif lineWords[0] in self.docStruct['headWords']:
 						# change state and start a new document to fill
 						self.status = 'head'
 						newDoc=dict()
 						newDoc['title']=title
 						newDoc['date']=docDate
 						newDoc['newsPaperName'] = newsPaperName
 						titleBegin=False
 					else:
 						if titleBegin:
 							title += l.strip().capitalize()
 				except IndexError:
 					self.log.error("Errore inaspettato, contatta il tuo sviluppatore di fiducia!")
 			elif self.status == 'head':
 				# the first line not opened by a head word is the first line
 				# of the article
 				if lineWords[0] not in self.docStruct['headWords']:
 					tempBody.append(l)
 					self.status = 'body'
 			elif self.status == 'body':
 				if not lineWords[0] in self.docStruct['tailWords']:
 					# not a closing word: still reading the article
 					if self.settings['delLF']:
 						tempBody.append(l.strip('\n'))
 					else:
 						tempBody.append(l)
 				else:
 					self.status = 'tail'
 					anomaly = False
 				if docSep.match(l) is not None: # article without closing words
 					self.log.warn("Ho individuato una separatore valido prima che si chiusesse l'articolo precedente, controlla i tuoi file in uscita!\n\
 					L'errore dovrebbe essere intorno all'articolo {} ma non sono sicuro! \n\t\t[{}]".format(docNumber, l.strip()))
 					self.status = 'tail'
 					anomaly = True
 			elif self.status == 'tail':
 				if docSep.match(l) is not None or anomaly:
 					self.status = 'first'
 					anomaly = False
 					if self.settings['delWordBreak']:
 						# applied to the collected body lines; it used to
 						# run on an unrelated, always empty list
 						tempBody=[ll.replace('-\n', '') for ll in tempBody]
 					newDoc['content']=''.join(tempBody)
 					# deep copy: the date dict may be shared with other documents
 					self.fileList.append(copy.deepcopy(newDoc))
 					tempBody=list()
 					self.bodyCounter +=1
 			else:
 				self.log.critical("Stato Interno Sconosciuto")
 			prevLine=l # always remember the previous line
 		# search finished, report the results
 		self.log.info("Nel file ho trovato {0} articoli..".format(self.bodyCounter))
 		if docSkipped > 0:
 			self.log.warn("Attentione, LexisNexis ne ha saltati {} !!!".format(docSkipped))
 	def removeDuplicates(self):
 		'''
 		Set 'duplicate' on every article: False for the first occurrence of a
 		title, True for the following ones, which are counted in duplicateNumber.
 		'''
 		seenTitles=set()
 		duplicateTitles=set()
 		for ff in self.fileList:
 			docTitle = ff['title']
 			if docTitle not in seenTitles:
 				seenTitles.add(docTitle)
 				ff['duplicate']=False
 			else:
 				if docTitle not in duplicateTitles:
 					duplicateTitles.add(docTitle)
 					if self.settings['showRemovedDuplicates']:
 						self.log.info("Duplicato: {}".format(docTitle.strip()))
 				ff['duplicate'] = True
 				self.duplicateNumber+=1
 		self.log.info("Ho rimosso {} duplicati di {} articoli..". format(self.duplicateNumber, len(duplicateTitles)))
 	def saveSeparate(self):
 		'''
 		Write every non duplicate article to its own file, named after
 		paths['OUTnameFormat'], in the current working directory (run() moves
 		into the output folder first).
 		Raises RuntimeError when some of those files could not be written.
 		'''
 		outFileCounter = 0
 		self.log.info("Salvo gli articoli in file separati...")
 		self.log.debug("Persorso: {0}".format(self.outPath))
 		toSave = [ff for ff in self.fileList if not ff['duplicate']]
 		for ff in toSave:
 			try:
 				fName=self.paths['OUTnameFormat'].format(title=slugify(ff['title'][:self.settings['maxTitleLen']]),
 														 filename=slugify(self.fileName),
 														 docnum=outFileCounter,
 														 papername=ff['newsPaperName'].strip(),
 														 **ff['date'])
 				with open('{0}'.format(fName),'wb') as out:
 					if self.settings['includeTitle']:
 						# stored back on the article, so saveBody() sees the title too
 						ff['content'] = ff['title']+os.linesep+ff['content']
 					out.write(ff['content'].encode(self.settings['encoding']))
 				outFileCounter+=1
 			except IOError as e:
 				self.log.error("Qualcosa e\' andato storto, non riesco a scrivere il file: {}".format(e))
 				continue
 		# Duplicates are skipped on purpose: compare against the articles that
 		# had to be written, not against every article found.
 		if outFileCounter < len(toSave):
 			raise RuntimeError("Ho salvato meno file rispetto a quelli trovati!")
 	def saveBody(self):
 		'''
 		Write the content of every article of fileList to a single
 		BODYFILE_<name>.txt inside the output folder.
 		'''
 		# NOTE(review): articles flagged as duplicates are written here as
 		# well - confirm this is intended.
 		self.log.info('Salvo gli articoli in un singolo file vicino agli originali...')
 		self.log.debug('Persorso: {0}'.format(self.outPath))
 		os.chdir(self.outPath)
 		try:
 			fName='BODYFILE_{0}.txt'.format(slugify(self.fileName)[:self.settings['maxTitleLen']])
 			fileContent = os.linesep.join(cc['content'] for cc in self.fileList)
 			with open('{0}'.format(fName),'wb') as out:
 				out.write(fileContent.encode(self.settings['encoding']))
 		except IOError as e:
 			self.log.error("Qualcosa e\' andato storto, non riesco a scrivere il file: {}".format(e))
@@ -0,0 +1,83 @@
 {
 	"version": "v1.1a",
 	"logger": {
 		"logFile": "D:\\Test\\bananaSPLIT.log",
 		"logFormat": "%(asctime)s|%(levelname)-8s| %(message)-50s",
 		"logTimeFormat": "%m-%d %H:%M:%S"
 	},
 	"splitter": {
 	"name": "",
 		"paths": {
 			"INworkPath": "D:\\Emanuele\\Documenti\\workspace\\bananaSPLIT\\TestFiles\\",
 			"OUTworkPath": "H:\\",
 			"OUTnameFormat": "TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt"
 		},
 		"docStruct": {
 			"docSep": "\\s*Copyright [(0-9)]+",
 			"dateFormat": "{month} {day:d}, {year:d}{}",
 			"dateWords": [
 				"January",
 				"February",
 				"March",
 				"April",
 				"May",
 				"June",
 				"July",
 				"August",
 				"September",
 				"October",
 				"November",
 				"December"
 			],
 			"headWords": [
 				"BYLINE:",
 				"SECTION:",
 				"LENGTH:",
 				"DATELINE:",
 				"HIGHLIGHT:",
 				"Email:"
 			],
 			"tailWords": [
 				"Newstex ID:",
 				"NOTES:",
 				"LANGUAGE:",
 				"GRAPHIC:",
 				"TYPE:",
 				"URL:",
 				"LOAD-DATE:",
 				"PUBLICATION-TYPE:",
 				"DOCUMENT-TYPE:",
 				"CHARTS:",
 				"JOURNAL-CODE:"
 			]
 		},
 		"settings": {
 			"encoding": "utf-8",
 			"monthPosition": 0,
 			"getNewsPaperName": true,
 			"nameNotFoundStr": "ND",
 			"includeTitle": true,
 			"removeDuplicates": true,
 			"showSkipped": true,
 			"showRemovedDuplicates": false,
 			"maxTitleLen": 32,
 			"loadTXT": true,
 			"loadDOCX": false,
 			"removeOldFiles": true,
 			"saveSeparateFiles": true,
 			"saveBodyFile": true,
 			"saveBodyNumber": true,
 			"delLF": false,
 			"delWordBreak": true,
 			"delChars": [
 				"'",
 				"@",
 				"#",
 				"$",
 				"%",
 				"^",
 				"&"
 			]
 		}
 	}
 }
@@ -0,0 +1,80 @@
 {
 	"version": "v1.1a",
 	"logger": {
 		"logFile": "D:\\Test\\bananaSPLIT.log",
 		"logFormat": "%(asctime)s|%(levelname)-8s| %(message)-50s",
 		"logTimeFormat": "%m-%d %H:%M:%S"
 	},
 	"splitter": {
 		"name": "",
 		"paths": {
 			"INworkPath": "D:\\Emanuele\\Documenti\\workspace\\bananaSPLIT\\TestFiles\\",
 			"OUTworkPath": "H:\\",
 			"OUTnameFormat": "TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt"
 		},
 		"docStruct": {
 			"docSep": "Copyright [(0-9)]+",
 			"dateFormat": "{day:d} {month} {year:d} {}",
 			"dateWords": [
 				"Gennaio",
 				"Febbraio",
 				"Marzo",
 				"Aprile",
 				"Maggio",
 				"Giugno",
 				"Luglio",
 				"Agosto",
 				"Settembre",
 				"Ottobre",
 				"Novembre",
 				"Dicembre"
 			],
 			"headWords": [
 				"BYLINE:",
 				"SECTION:",
 				"LENGTH:",
 				"DATELINE:",
 				"HIGHLIGHT:",
 				"Email:"
 			],
 			"tailWords": [
 				"LANGUAGE:",
 				"GRAPHIC:",
 				"TYPE:",
 				"URL:",
 				"LOAD-DATE:",
 				"PUBLICATION-TYPE:",
 				"DOCUMENT-TYPE:",
 				"CHARTS:"
 			]
 		},
 		"settings": {
 			"encoding": "utf-8",
 			"monthPosition": 0,
 			"getNewsPaperName": true,
 			"nameNotFoundStr": "ND",
 			"includeTitle": true,
 			"removeDuplicates": true,
 			"showSkipped": false,
 			"showRemovedDuplicates": true,
 			"maxTitleLen": 32,
 			"loadTXT": true,
 			"loadDOCX": false,
 			"removeOldFiles": true,
 			"saveSeparateFiles": true,
 			"saveBodyFile": true,
 			"saveBodyNumber": true,
 			"delLF": false,
 			"delWordBreak": true,
 			"delChars": [
 				"'",
 				"@",
 				"#",
 				"$",
 				"%",
 				"^",
 				"&"
 			]
 		}
 	}
 }
@@ -0,0 +1,35 @@
 '''
 Created on 1 dic 2019
@author: Emanuele Trabattoni
 '''
 import os
 from glob import glob
 from copy import deepcopy
 from libsplit import bananaSPLITTER
 from libconfload import bananaCONF
 from libfancylogger import fancyLogger
 if __name__ == "__main__":
 	# Demo driver: load the "splitter" section of testEN.json and run one
 	# bananaSPLITTER per "*.txt" file of the input folder, one after the other.
 	print("CWD-> {}".format(os.getcwd()))
 	log = fancyLogger(fileLog=False)
 	confLoader = bananaCONF(workdir=r"./libbananasplit", logger=log)
 	confLoader.open()
 	confLoader.use("testEN.json")
 	splitterConf = confLoader.getParams("splitter")
 	workers = []
 	inputDir = splitterConf["paths"]["INworkPath"]
 	os.chdir(inputDir)
 	separator = "-"*80
 	closingBanner = "\n{0}\n\tFINITO!!!\n{0}".format("="*50)
 	for txtName in glob("*.txt"):
 		splitterConf["name"] = txtName
 		log.info(separator)
 		# each worker gets its own copy of the parameters
 		worker = bananaSPLITTER(fileParams=deepcopy(splitterConf), logger=log)
 		workers.append(worker)
 		# start and wait immediately: files are processed sequentially
 		worker.start()
 		worker.join()
 	log.info(closingBanner)
		`@@ -0,0 +1 @@`
							`Qui si mettono i file di test per la versione GUI di bananaSPLIT`
`@@ -1 +1,2 @@`
	`*.py[ocd]`	`*.py[ocd]`
		`/bananaSPLIT.exe.spec`