merge con il branch lib-devel

2019-12-13 16:20:34 +01:00
parent a6aeee5d89 cf173843d2
commit d2096976fb
15 changed files with 90423 additions and 2 deletions
@@ -1,6 +1,6 @@
 /.DS_Store
 /.project
 /.pydevproject
-bananaSPLIT/build
 bananaSPLIT.exe.spec
+bananaSPLIT/build
 /TestFiles/
@@ -0,0 +1 @@
+/org.eclipse.core.resources.prefs
@@ -0,0 +1 @@
+Qui si mettono i file di test per la versione GUI di bananaSPLIT
@@ -1 +1,2 @@
 *.py[ocd]
+/bananaSPLIT.exe.spec
@@ -0,0 +1,83 @@
+'''
+Created on 2 nov 2019
+
+@author: Emanuele Trabattoni
+'''
+import json,os,glob,copy
+
+class bananaCONF(object):
+	'''
+	Load and save the JSON configuration files used by bananaSPLITTER.
+
+	Every ``*.json`` file found in the configuration directory is parsed and
+	kept in ``settingsList`` under its file name; ``use()`` selects which one
+	``getParams()``, ``setParams()`` and ``save()`` operate on.
+	'''
+	def __init__(self, workdir=None, logger=None):
+		'''
+		Remember the configuration directory and make it the current directory.
+
+		workdir -- directory holding the ``*.json`` files; when None an error
+		           is logged and open() searches the current directory instead.
+		logger  -- object exposing debug/info/error/critical; when None a
+		           standard ``logging`` logger is used so that logging cannot
+		           fail with an AttributeError.
+		'''
+		if logger is None:
+			import logging
+			logger = logging.getLogger(__name__)
+		self.log = logger
+		self.fileList = None
+		self.workdir = None
+		self.inUse = None
+		self.settingsList = dict()
+		if workdir is not None:
+			# Store an absolute path: a relative one stops being valid as soon
+			# as the current directory changes (including the chdir below).
+			self.workdir = os.path.abspath(workdir)
+			os.chdir(self.workdir)
+			self.log.debug("Cerco le configurazioni in: [{}]".format(os.getcwd()))
+		else:
+			self.log.error("Non mi e' stata fornita una directory per i file di configurazione")
+	
+	def open(self):
+		'''
+		Parse every ``*.json`` file of the configuration directory.
+
+		``fileList`` receives the bare file names and ``settingsList`` one
+		entry per file that could be parsed. Files that cannot be read or
+		parsed are logged and skipped.
+		'''
+		self.log.info("Carico i file di configurazione")
+		baseDir = self.workdir if self.workdir is not None else os.getcwd()
+		# glob.escape keeps special characters in the directory name literal.
+		found = glob.glob(os.path.join(glob.escape(baseDir), "*.json"))
+		self.fileList = [os.path.basename(p) for p in found]
+		if not self.fileList:
+			self.log.error("Non ho trovato alcun file di configurazione!")
+			return
+		for fName in self.fileList:
+			try:
+				# The context manager closes the file even when parsing fails.
+				with open(os.path.join(baseDir, fName)) as fp:
+					self.settingsList[fName] = json.load(fp)
+				self.log.info("Caricato correttamente: {}".format(fName))
+			except json.JSONDecodeError as e:
+				# Report the file name, not the whole document text (e.doc).
+				self.log.error("Impossibile leggere la configurazione:{}\n \
+									Controlla il file a riga: {} e colonna:{}" .format(fName, e.lineno, e.colno))
+			except IOError as ee:
+				self.log.error("Impossibile aprire il file: {}".format(ee))
+			except Exception as eee:
+				self.log.critical("Eccezione inaspettata: {}".format(eee))
+	
+	def reload(self):
+		'''
+		Forget everything that was loaded and read the directory again.
+		'''
+		# Must be an empty dict, not None: open() stores entries by key.
+		self.settingsList = dict()
+		self.fileList = None
+		self.inUse = None
+		self.open()
+	
+	def use(self, toUse):
+		'''
+		Select the configuration (by file name) used by the other methods.
+		'''
+		self.inUse = toUse
+	
+	def save(self):
+		'''
+		Write the configuration currently in use back to its JSON file.
+
+		The target is resolved against the stored configuration directory, so
+		the current directory is not changed. I/O failures are logged and not
+		raised.
+		'''
+		self.log.info("Salvo la configurazione: {}".format(self.inUse))
+		if self.inUse not in self.settingsList:
+			self.log.error("Impossibile salvare il file: {} - [{}]".format(self.inUse, "configurazione non caricata"))
+			return
+		if self.workdir is None:
+			target = self.inUse
+		else:
+			target = os.path.join(self.workdir, self.inUse)
+		# Serialise first so a failure cannot leave a truncated file behind.
+		payload = json.dumps(self.settingsList[self.inUse])
+		try:
+			with open(target, "w") as f:
+				f.write(payload)
+		except IOError as e:
+			self.log.error("Impossibile salvare il file: {} - [{}]".format(self.inUse,e))
+	
+	def getFiles(self):
+		'''
+		Return the file names found by the last open(), or None before it.
+		'''
+		return self.fileList
+	
+	def getParams(self, k):
+		'''
+		Return section ``k`` of the configuration in use (raises KeyError).
+		'''
+		return self.settingsList[self.inUse][k]
+	
+	def setParams(self, k, v):
+		'''
+		Replace section ``k`` of the configuration in use with a copy of ``v``.
+		'''
+		self.settingsList[self.inUse][k]=dict(v)
+
+	
+	
@@ -0,0 +1,85 @@
+'''
+Created on 2 nov 2019
+
+@author: Emanuele Trabattoni
+'''
+import sys, os
+import json
+import logging
+import colorama
+
+class fancyLogger(object):
+	'''
+	Colourise the output of Python's logger, for a Willy Wonka style experience.
+
+	Each level method prints a colorama colour code on stdout, forwards the
+	message to the wrapped ``logging`` logger and then prints the reset code.
+	'''
+	def __init__(self, name="Logger", consoleLog=True, fileLog=True):
+		'''
+		Configure the wrapped logger from the "logger" section of
+		``libbananasplit/testEN.json``, looked up under the current directory.
+
+		name       -- name passed to ``logging.getLogger``.
+		consoleLog -- when true, attach a handler writing to ``sys.stdout``.
+		fileLog    -- when true, attach a handler writing to the configured
+		              "logFile".
+		'''
+		# Built with os.path.join so the lookup is not tied to one separator;
+		# the context manager closes the settings file after reading it.
+		confPath = os.path.join(os.getcwd(), "libbananasplit", "testEN.json")
+		with open(confPath) as fp:
+			settings = json.load(fp)["logger"]
+		colorama.init(convert=True)
+		self.LRED = colorama.Fore.LIGHTRED_EX
+		self.RED = colorama.Fore.RED
+		self.LYELLOW = colorama.Fore.LIGHTYELLOW_EX
+		self.YELLOW = colorama.Fore.YELLOW
+		self.LBLUE = colorama.Fore.LIGHTBLUE_EX
+		self.BLUE = colorama.Fore.BLUE
+		self.LGREEN = colorama.Fore.LIGHTGREEN_EX
+		# Was a second assignment to LGREEN, which silently replaced the light
+		# shade and left GREEN undefined.
+		self.GREEN = colorama.Fore.GREEN
+		self.WHITE = colorama.Fore.LIGHTWHITE_EX
+		self.RST = colorama.Style.RESET_ALL
+		
+		# Set up the logger
+		self.LOGGER = logging.getLogger(name)
+		self.LOGGER.setLevel(logging.DEBUG)
+		self.LOGGER.propagate = False
+		FORMATTER = logging.Formatter((settings["logFormat"]), (settings["logTimeFormat"]))
+		if fileLog:
+			# File logging
+			fh = logging.FileHandler((settings["logFile"]))
+			fh.setLevel(logging.DEBUG)
+			fh.setFormatter(FORMATTER)
+			self.LOGGER.addHandler(fh)
+		if consoleLog:
+			# Console logging
+			cl= logging.StreamHandler(sys.stdout)
+			cl.setLevel(logging.DEBUG)
+			cl.setFormatter(FORMATTER)
+			self.LOGGER.addHandler(cl)
+	
+	def _emit(self, color, logMethod, msg):
+		'''
+		Print ``color``, log ``msg`` through ``logMethod``, then reset the colour.
+		'''
+		print(color, end='')
+		try:
+			logMethod(msg)
+		finally:
+			# Always restore the default colour, even if logging raises.
+			print(self.RST, end='')
+	
+	def debug(self, msg="Undefined Debug"):
+		'''Log ``msg`` at DEBUG level in light blue.'''
+		self._emit(self.LBLUE, self.LOGGER.debug, msg)
+	
+	def info(self, msg="Undefined Info"):
+		'''Log ``msg`` at INFO level in light green.'''
+		self._emit(self.LGREEN, self.LOGGER.info, msg)
+	
+	def warn(self, msg="Undefined Warning"):
+		'''Log ``msg`` at WARNING level in light yellow.'''
+		self._emit(self.LYELLOW, self.LOGGER.warning, msg)
+	
+	def warning(self, msg="Undefined Warning"):
+		'''Alias of warn(), matching the method name used by ``logging.Logger``.'''
+		self.warn(msg)
+	
+	def error(self, msg="Undefined Error"):
+		'''Log ``msg`` at ERROR level in light red.'''
+		self._emit(self.LRED, self.LOGGER.error, msg)
+	
+	def critical(self, msg="Undefined Critical"):
+		'''Log ``msg`` at CRITICAL level in red.'''
+		self._emit(self.RED, self.LOGGER.critical, msg)
+	
+	def testColors(self):
+		'''Emit one sample message per level to check the colours.'''
+		self.debug("Test Debug")
+		self.info("Test Info")
+		self.warn("Test Warning")
+		self.error("Test Error")
+		self.critical("Test Critical")
+	
@@ -0,0 +1,276 @@
+'''
+Created on 2 nov 2019
+
+@author: Emanuele Trabattoni
+'''
+from slugify.slugify import slugify
+import time, parse, re, copy, os, threading
+
+class bananaSPLITTER(threading.Thread):
+	'''
+	Worker thread that splits one LexisNexis text export into its articles.
+
+	``fileParams`` is a dict with the keys 'name' (input file name), 'paths',
+	'docStruct' and 'settings'. ``logger`` must expose info/debug/warn/error/
+	critical. The methods change the process-wide current directory with
+	``os.chdir`` and write output files relative to it.
+	'''
+	def __init__(self, fileParams=None, logger=None):
+		'''
+		Store the parameters and derive the output directory.
+
+		The output directory is ``paths['OUTworkPath']`` immediately followed
+		by the slugified file name, so 'OUTworkPath' is expected to end with a
+		path separator.
+		'''
+		threading.Thread.__init__(self)
+		self.fileParams = fileParams
+		self.log = logger
+		self.rawFile = None
+		self.status = "first"
+		self.fileList = list()
+		self.bodyCounter=0
+		self.duplicateNumber=0
+		# Defined up front so the attributes exist even without fileParams.
+		self.paths = None
+		self.docStruct = None
+		self.settings = None
+		self.fileName = None
+		self.outPath = None
+		self.beginTime = None
+		if fileParams is not None:
+			self.log.info("Sto operando sul file: {}..".format(self.fileParams['name']))
+			self.paths = self.fileParams['paths']
+			self.docStruct = self.fileParams['docStruct']
+			self.settings = self.fileParams['settings']
+			self.fileName = self.fileParams['name']
+			self.outPath = self.paths['OUTworkPath']+slugify(self.fileName)
+			self.beginTime = time.time()
+		else:
+			self.log.critical("Non e' stato fornito il nome di alcun file da splittare!")
+	
+	def run(self):
+		'''
+		Load, clean, split and save the file; every failure is logged, not raised.
+		'''
+		if self.fileParams is None:
+			# Nothing to work on: bail out instead of failing on missing attributes.
+			self.log.critical("Non e' stato fornito il nome di alcun file da splittare!")
+			return
+		self.log.info("Nuovo SPLITTER  su file: {}".format(self.fileName))
+		try:
+			self.openFile()
+			self.remEmptyLines()
+			self.splitFile()
+			
+			if self.settings['removeDuplicates']:
+				self.log.info("Controllo se ci sono dei duplicati..")
+				self.removeDuplicates()
+			else:
+				for ff in self.fileList:
+					ff['duplicate']=False
+				self.log.warn('Salto il controllo dei duplicati..')
+			# parsing and duplicate handling went fine:
+			# prepare the output directory and start saving
+			if os.path.exists(self.outPath):
+				if not self.settings['removeOldFiles']:
+					raise FileExistsError("Non posso sovrascrivere i vecchi file, eliminali manualmente!")
+				os.chdir(self.outPath)
+				# The current directory is now the output directory, so bare
+				# names resolve; sub-directories are skipped (os.remove would fail).
+				for f in os.listdir('.'):
+					if os.path.isfile(f):
+						os.remove(f)
+			else:
+				os.mkdir(self.outPath)
+				os.chdir(self.outPath)
+			if self.settings['saveSeparateFiles']:
+				self.saveSeparate()
+			if self.settings['saveBodyFile']:
+				self.saveBody()
+			self.log.info("L'elaborazione del file ha richiesto {:4.2f} sec".format(time.time()-self.beginTime))
+		
+		except UnicodeDecodeError as ee:
+			self.log.critical("Il file [{}] contiene caratteri non compatibili con la codifica scelta! [{}]"
+							.format(self.fileParams['name'],ee))
+		except FileExistsError as fe:
+			self.log.critical(fe)
+		except Exception as ee:
+			# warn() is the method the rest of this class uses on the logger.
+			self.log.warn(ee)
+		
+	def openFile(self):
+		'''
+		Change to the input directory and read the file into ``rawFile``.
+
+		Raises RuntimeError("OpenFile") when the file cannot be opened; a
+		UnicodeDecodeError from reading is left to the caller.
+		'''
+		try:
+			os.chdir(self.paths["INworkPath"])
+			self.log.info("Carico il contenuto..")
+			with open(self.fileParams['name'], mode='r', encoding=self.settings['encoding']) as fp:
+				self.rawFile = fp.readlines()
+		except IOError as e:
+			self.log.critical("Impossibile aprire il file: {}! [{}]".format(self.fileName,e))
+			raise RuntimeError("OpenFile") from e
+
+	def remEmptyLines(self):
+		'''
+		Strip the configured 'delChars' from every line and drop lines that
+		are then just a line terminator. Returns True on success and raises
+		RuntimeError("DelLines") on failure.
+		'''
+		self.log.info("Elimino righe vuote e caratteri inutili..")
+		tempContent = []
+		try:
+			for ll in self.rawFile:
+				for c in self.settings['delChars']:
+					ll = ll.replace(c,'')
+				if ll not in ('\n', '\r'):
+					tempContent.append(ll)
+		except Exception as e:
+			self.log.error("Errore inaspettato durante l'eliminazione delle righe vuote!")
+			raise RuntimeError("DelLines") from e
+		self.rawFile = tempContent
+		return True
+	
+	def splitFile(self):  # ported from the original program
+		'''
+		Walk ``rawFile`` with a small state machine and fill ``fileList``.
+
+		States: 'first' (looking for counter, date, title and head words),
+		'head' (skipping head-word lines), 'body' (collecting the article) and
+		'tail' (waiting for the document separator). Each finished article is
+		appended as a dict with 'title', 'date', 'newsPaperName' and 'content'.
+		'''
+		self.log.info("Individuo il contenuto..")
+		self.bodyCounter=0
+		self.duplicateNumber=0
+		docNumber = 0
+		docSkipped = 0
+		docDate = {}
+		title = ''
+		prevLine = ''
+		newsPaperName = ''
+		titleBegin = False
+		# Initialised here so the 'tail' branch never reads unbound names.
+		anomaly = False
+		newDoc = dict()
+		tempBody = list()
+		docSep=re.compile(self.docStruct['docSep'])
+		
+		for l in self.rawFile: # for every line of the file
+			lineWords = l.lstrip().split(' ') # split the line into words
+			if self.status == 'first':
+				# read the document number to detect gaps in the sequence
+				try:
+					try:
+						nn = parse.parse("{current:d} Of {total} Documents",l.strip().capitalize()).named
+						if nn["current"]-docNumber==1:
+							pass
+						else:
+							if self.settings["showSkipped"]:
+								self.log.warn("Il conto dei documenti non torna! LexisNexis \
+								ne ha saltato qualcuno!\nPrecedente:{0}-Attuale:{1}".format(docNumber,nn["current"]))
+							docSkipped+=1
+						docNumber = nn["current"]
+					except Exception:
+						pass # most lines are not a counter line: stay silent
+					# look for the date
+					if (lineWords[self.settings['monthPosition']]).capitalize() in self.docStruct['dateWords']:
+						try:
+							docDate=parse.parse(self.docStruct['dateFormat'],l).named
+							docDate['month']=docDate['month'].lstrip().rstrip().capitalize()
+							docDate['month']=self.docStruct['dateWords'].index(docDate['month'])+1
+							title = ''
+							titleBegin=True
+							# the title starts after the date, but first look
+							# for the newspaper name on the previous line
+							if self.settings['getNewsPaperName']:
+								try:
+									if prevLine.split(' ')[0].strip().isalpha():
+										newsPaperName = prevLine.strip()
+									else:
+										newsPaperName = self.settings['nameNotFoundStr']
+								except Exception:
+									self.log.warn("E' successo qualcosa mentre stavo cercando il nome della pubblicazione,\
+									controlla i file di uscita! [{}]".format(prevLine.strip()))
+							else:
+								newsPaperName = self.settings['nameNotFoundStr']
+						except Exception:
+							self.log.warn("Ho trovato una riga ambigua.. potrebbe essere una data ma non so: [{}]". format(l.strip('\r\n')))
+					elif lineWords[0] in self.docStruct['headWords']:
+						# change state and start a new document to fill
+						self.status = 'head'
+						newDoc=dict()
+						newDoc['title']=title
+						newDoc['date']=docDate
+						newDoc['newsPaperName'] = newsPaperName
+						titleBegin=False
+					else:
+						if titleBegin:
+							title += l.strip().capitalize()
+				except IndexError:
+					self.log.error("Errore inaspettato, contatta il tuo sviluppatore di fiducia!")
+			elif self.status == 'head':
+				if lineWords[0] not in self.docStruct['headWords']:  # first word is not a head word:
+					tempBody.append(l)					   # the article has started, keep its first line
+					self.status = 'body'
+			elif self.status == 'body':
+				if not lineWords[0] in self.docStruct['tailWords']: # first word is not a tail word:
+					if self.settings['delLF']:
+						tempBody.append(l.strip('\n'))		   # still reading the article
+					else:
+						tempBody.append(l)
+				else:
+					self.status = 'tail'
+					anomaly = False
+				if docSep.match(l) is not None: # article that ended without any tail word
+					self.log.warn("Ho individuato una separatore valido prima che si chiusesse l'articolo precedente, controlla i tuoi file in uscita!\n\
+					L'errore dovrebbe essere intorno all'articolo {} ma non sono sicuro! \n\t\t[{}]".format(docNumber, l.strip()))
+					self.status = 'tail' 
+					anomaly = True
+			elif self.status == 'tail':
+				if docSep.match(l) is not None or anomaly:
+					self.status = 'first'
+					anomaly = False
+					content = ''.join(tempBody)
+					if self.settings['delWordBreak']:
+						# Join words hyphenated across lines. This used to be
+						# applied to a list that was always empty, so the
+						# setting had no effect.
+						content = content.replace('-\n', '')
+					newDoc['content']=content
+					self.fileList.append(copy.deepcopy(newDoc))
+					tempBody=list()
+					self.bodyCounter +=1
+			else:
+				self.log.critical("Stato Interno Sconosciuto")
+			prevLine=l # always remember the previous line
+		# search finished, report the results
+		self.log.info("Nel file ho trovato {0} articoli..".format(self.bodyCounter))
+		if docSkipped > 0:
+			self.log.warn("Attentione, LexisNexis ne ha saltati {} !!!".format(docSkipped))
+
+	def removeDuplicates(self):
+		'''
+		Flag every article whose title was already seen with 'duplicate' True
+		(False otherwise) and count the flagged ones in ``duplicateNumber``.
+		'''
+		# Sets give constant-time membership tests.
+		seenTitles = set()
+		duplicatedTitles = set()
+		for ff in self.fileList:
+			if ff['title'] not in seenTitles:
+				seenTitles.add(ff['title'])
+				ff['duplicate']=False
+			else:
+				if ff['title'] not in duplicatedTitles:
+					duplicatedTitles.add(ff['title'])
+					if self.settings['showRemovedDuplicates']:
+						self.log.info("Duplicato: {}".format(ff['title'].strip()))
+				ff['duplicate'] = True
+				self.duplicateNumber+=1
+		self.log.info("Ho rimosso {} duplicati di {} articoli..". format(self.duplicateNumber, len(duplicatedTitles)))
+		
+	def saveSeparate(self):
+		'''
+		Write every non-duplicate article to its own file in the current directory.
+
+		File names come from ``paths['OUTnameFormat']``. With 'includeTitle'
+		the title is prepended to the stored content, so it also ends up in
+		the file written by saveBody(). Raises RuntimeError when fewer files
+		were written than there are non-duplicate articles.
+		'''
+		outFileCounter = 0
+		self.log.info("Salvo gli articoli in file separati...")
+		# self.outPath already is OUTworkPath + slug; the configured path is
+		# no longer modified on every call.
+		self.log.debug("Persorso: {0}".format(self.outPath))
+		for ff in self.fileList:
+			if ff['duplicate']:
+				continue
+			try:
+				fName=self.paths['OUTnameFormat'].format(title=slugify(ff['title'][:self.settings['maxTitleLen']]),
+														 filename=slugify(self.fileName),
+														 docnum=outFileCounter,
+														 papername=ff['newsPaperName'].strip(),
+														 **ff['date'])
+				with open('{0}'.format(fName),'wb') as out:
+					if self.settings['includeTitle']:
+						ff['content'] = ff['title']+os.linesep+ff['content']
+					out.write(ff['content'].encode(self.settings['encoding']))
+				outFileCounter+=1
+			except IOError as e:
+				self.log.error("Qualcosa e\' andato storto, non riesco a scrivere il file: {}".format(e))
+				continue
+		# Duplicates are skipped on purpose, so compare against the articles
+		# that were supposed to be written, not against every article found.
+		expected = sum(1 for ff in self.fileList if not ff['duplicate'])
+		if outFileCounter < expected:
+			raise RuntimeError("Ho salvato meno file rispetto a quelli trovati!")
+	
+	def saveBody(self):
+		'''
+		Write the content of every article, joined by the platform line
+		separator, to a single ``BODYFILE_<slug>.txt`` in the output directory.
+		'''
+		self.log.info('Salvo gli articoli in un singolo file vicino agli originali...')
+		self.log.debug('Persorso: {0}'.format(self.outPath))
+		os.chdir(self.outPath)
+		try:
+			fName=slugify(self.fileName)
+			fName='BODYFILE_{0}.txt'.format(fName[:self.settings['maxTitleLen']])
+			# NOTE(review): articles flagged as duplicates are included here - confirm intent
+			fileContent = os.linesep.join([cc['content'] for cc in self.fileList])
+			with open('{0}'.format(fName),'wb') as out:
+				out.write(fileContent.encode(self.settings['encoding']))
+		except IOError as e:
+			self.log.error("Qualcosa e\' andato storto, non riesco a scrivere il file: {}".format(e))
+
+	
+		
@@ -0,0 +1,83 @@
+{
+	"version": "v1.1a",
+	"logger": {
+		"logFile": "D:\\Test\\bananaSPLIT.log",
+		"logFormat": "%(asctime)s|%(levelname)-8s| %(message)-50s",
+		"logTimeFormat": "%m-%d %H:%M:%S"
+	},
+	"splitter": {
+	"name": "",
+		"paths": {
+			"INworkPath": "D:\\Emanuele\\Documenti\\workspace\\bananaSPLIT\\TestFiles\\",
+			"OUTworkPath": "H:\\",
+			"OUTnameFormat": "TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt"
+		},
+		"docStruct": {
+			"docSep": "\\s*Copyright [(0-9)]+",
+			"dateFormat": "{month} {day:d}, {year:d}{}",
+			"dateWords": [
+				"January",
+				"February",
+				"March",
+				"April",
+				"May",
+				"June",
+				"July",
+				"August",
+				"September",
+				"October",
+				"November",
+				"December"
+			],
+			"headWords": [
+				"BYLINE:",
+				"SECTION:",
+				"LENGTH:",
+				"DATELINE:",
+				"HIGHLIGHT:",
+				"Email:"
+			],
+			"tailWords": [
+				"Newstex ID:",
+				"NOTES:",
+				"LANGUAGE:",
+				"GRAPHIC:",
+				"TYPE:",
+				"URL:",
+				"LOAD-DATE:",
+				"PUBLICATION-TYPE:",
+				"DOCUMENT-TYPE:",
+				"CHARTS:",
+				"JOURNAL-CODE:"
+			]
+		},
+		"settings": {
+			"encoding": "utf-8",
+			"monthPosition": 0,
+			"getNewsPaperName": true,
+			"nameNotFoundStr": "ND",
+			"includeTitle": true,
+			"removeDuplicates": true,
+			"showSkipped": true,
+			"showRemovedDuplicates": false,
+			"maxTitleLen": 32,
+			"loadTXT": true,
+			"loadDOCX": false,
+			"removeOldFiles": true,
+			"saveSeparateFiles": true,
+			"saveBodyFile": true,
+			"saveBodyNumber": true,
+			"delLF": false,
+			"delWordBreak": true,
+			"delChars": [
+				"'",
+				"@",
+				"#",
+				"$",
+				"%",
+				"^",
+				"&"
+			]
+		}
+	}
+}
@@ -0,0 +1,80 @@
+{
+	"version": "v1.1a",
+	"logger": {
+		"logFile": "D:\\Test\\bananaSPLIT.log",
+		"logFormat": "%(asctime)s|%(levelname)-8s| %(message)-50s",
+		"logTimeFormat": "%m-%d %H:%M:%S"
+	},
+	"splitter": {
+		"name": "",
+		"paths": {
+			"INworkPath": "D:\\Emanuele\\Documenti\\workspace\\bananaSPLIT\\TestFiles\\",
+			"OUTworkPath": "H:\\",
+			"OUTnameFormat": "TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt"
+		},
+		"docStruct": {
+			"docSep": "Copyright [(0-9)]+",
+			"dateFormat": "{day:d} {month} {year:d} {}",
+			"dateWords": [
+				"Gennaio",
+				"Febbraio",
+				"Marzo",
+				"Aprile",
+				"Maggio",
+				"Giugno",
+				"Luglio",
+				"Agosto",
+				"Settembre",
+				"Ottobre",
+				"Novembre",
+				"Dicembre"
+			],
+			"headWords": [
+				"BYLINE:",
+				"SECTION:",
+				"LENGTH:",
+				"DATELINE:",
+				"HIGHLIGHT:",
+				"Email:"
+			],
+			"tailWords": [
+				"LANGUAGE:",
+				"GRAPHIC:",
+				"TYPE:",
+				"URL:",
+				"LOAD-DATE:",
+				"PUBLICATION-TYPE:",
+				"DOCUMENT-TYPE:",
+				"CHARTS:"
+			]
+		},
+		"settings": {
+			"encoding": "utf-8",
+			"monthPosition": 0,
+			"getNewsPaperName": true,
+			"nameNotFoundStr": "ND",
+			"includeTitle": true,
+			"removeDuplicates": true,
+			"showSkipped": false,
+			"showRemovedDuplicates": true,
+			"maxTitleLen": 32,
+			"loadTXT": true,
+			"loadDOCX": false,
+			"removeOldFiles": true,
+			"saveSeparateFiles": true,
+			"saveBodyFile": true,
+			"saveBodyNumber": true,
+			"delLF": false,
+			"delWordBreak": true,
+			"delChars": [
+				"'",
+				"@",
+				"#",
+				"$",
+				"%",
+				"^",
+				"&"
+			]
+		}
+	}
+}
@@ -0,0 +1,35 @@
+'''
+Created on 1 dic 2019
+
+@author: Emanuele Trabattoni
+'''
+import os
+from glob import glob
+from copy import deepcopy
+from libsplit import bananaSPLITTER
+from libconfload import bananaCONF
+from libfancylogger import fancyLogger
+
+if __name__ == "__main__":
+	# Show the directory the script was launched from.
+	print("CWD-> {}".format(os.getcwd()))
+	logger = fancyLogger(fileLog=False)
+
+	# Load every configuration file and select the English one.
+	confl = bananaCONF(workdir=r"./libbananasplit", logger=logger)
+	confl.open()
+	confl.use("testEN.json")
+	splconf = confl.getParams("splitter")
+
+	# Run one splitter per text file of the input directory, one at a time:
+	# each thread is joined before the next one is created.
+	os.chdir(splconf["paths"]["INworkPath"])
+	splist = []
+	separator = "-" * 80
+	for txtName in glob("*.txt"):
+		splconf["name"] = txtName
+		logger.info(separator)
+		# Each splitter gets its own copy of the parameters.
+		splitter = bananaSPLITTER(fileParams=deepcopy(splconf), logger=logger)
+		splist.append(splitter)
+		splitter.start()
+		splitter.join()
+
+	banner = "=" * 50
+	logger.info("\n" + banner + "\n\tFINITO!!!\n" + banner)
+
+
+
+
				`@@ -0,0 +1 @@`
				`Qui si mettono i file di test per la versione GUI di bananaSPLIT`