prima versione funzionante con il nuovo tipo di file!! yuppi!

2020-03-30 16:15:59 +02:00
parent 955ac56e34
commit 3978569e35
3 changed files with 28 additions and 22 deletions
@@ -33,6 +33,7 @@
        "December"
      ],
      "headWords": [
+      	"BODY",
        "BYLINE:",
        "SECTION:",
        "LENGTH:",
@@ -105,5 +106,5 @@
      "&"
    ]
  },
-  "name": "FILE1.txt"
+  "name": "Files(38).txt"
 }
@@ -455,11 +455,11 @@ class bananaSelezOut(PyQt5.QtWidgets.QWidget):
 		# costruisco i thread
 		tDict={}
 		try:
-			for f in [splconf['paths']['fileList'][0]]:
+			for f in splconf['paths']['fileList']:
 				splconf['name']=f
 				tDict[f] = bananaSPLITTER(fileParams=copy.deepcopy(splconf), logger=self.log)
+				tDict[f].sendStatus.connect(updateState)
 				tDict[f].run()
-				#tDict[f].sendStatus.connect(updateState)
 				#QThreadPool.globalInstance().start(tDict[f])
 		except Exception as e:
 			self.log.error(f"Impossibile avviare lo splitter: {e}")
@@ -3,18 +3,19 @@ Created on 2 nov 2019

@author: Emanuele Trabattoni
 '''
-#from PyQt5.QtCore import QRunnable, QObject, pyqtSignal
+from PyQt5.QtCore import QRunnable, QObject, pyqtSignal

 from slugify import slugify
 import time, parse, re, copy, os,json
-#import traceback
+import traceback
+from numpy.ma.core import anom

-class bananaSPLITTER():
+class bananaSPLITTER(QObject, QRunnable):
 	
-	#sendStatus = pyqtSignal(str)
+	sendStatus = pyqtSignal(str)
 	
 	def __init__(self, fileParams=None, logger=None):
-		#QRunnable.__init__(self)
+		QRunnable.__init__(self)
 		self.fileParams = fileParams
 		self.log = logger
 		self.rawFile = None
@@ -22,7 +23,7 @@ class bananaSPLITTER():
 		self.contentList = list()
 		self.bodyCounter=0
 		self.duplicateNumber=0
-		self.log.debug(f"Configurazione: \n {json.dumps(fileParams, indent=2)}")
+		#self.log.debug(f"Configurazione: \n {json.dumps(fileParams, indent=2)}")
 		if fileParams is not None:
 			self.log.info("Sto operando sul file: {}..".format(self.fileParams['name']))
 			self.paths = self.fileParams['paths']
@@ -79,7 +80,7 @@ class bananaSPLITTER():
 		except FileExistsError as fe:
 			self.log.critical(fe)
 		except Exception as ee:
-			#traceback.print_exc()
+			traceback.print_exc()
 			self.log.warn(ee)
 		pass
 		
@@ -106,6 +107,7 @@ class bananaSPLITTER():
 				if ll not in ['\n', '\r', '\r\n']:
 					tempContent.append(ll)
 			self.rawFile = copy.deepcopy(tempContent)
+			self.rawFile.append('\n')  #linea vuota finale per essere sicuri di parsare bene
 			return True
 		except:
 			self.log.error("Errore inaspettato durante l'eliminazione delle righe vuote!")
@@ -127,6 +129,7 @@ class bananaSPLITTER():
 		tempBody = list()
 		
 		for l in self.rawFile: #per ogni linea del file
+			l=l.replace('\xa0', ' ')
 			lineWords = l.lstrip().split(' ') #dividi la riga in parole
 			if self.status == 'first':
 				try:	
@@ -140,7 +143,7 @@ class bananaSPLITTER():
 						except:
 							self.log.warn("Ho trovato una riga ambigua.. potrebbe essere una data ma non so: [{}]". format(l.strip('\r\n')))
 							pass
-					elif lineWords[0] in self.docStruct['language']['headWords']:
+					elif lineWords[0].upper() in self.docStruct['language']['headWords']:
 						#cambio stato e inizializzo un nuovo documento da riempire
 						self.status = 'head'
 						newDoc=dict()
@@ -151,13 +154,13 @@ class bananaSPLITTER():
 						newsName = False
 					else:
 						if titleBegin:
-							title += l.strip().capitalize()
+							title = l.strip().capitalize()
 							titleBegin = False
 							newsName = True
 						elif newsName:
 							if self.settings['getNewsPaperName']:
 								try:
-									if l.strip().isalpha():
+									if l.strip():
 										newsPaperName = l.strip()
 									else:
 										newsPaperName = self.settings['nameNotFoundStr']
@@ -175,11 +178,16 @@ class bananaSPLITTER():
 				#doppio check per trovare l'inizio del corpo documento
 				if re.match(self.docStruct['beginOfDocument'],l):
 					self.status='body' 
-				if lineWords[0] not in self.docStruct['language']['headWords']:  #se la prima parola non e' tra quelle di inizio	
+				if lineWords[0].upper() not in self.docStruct['language']['headWords']:  #se la prima parola non e' tra quelle di inizio	
 					tempBody.append(l)					   # vuol dire che ho trovato l'articolo e aggiungo la prima riga al contenuto del documento
 					self.status = 'body'
 			elif self.status == 'body':
-				if not lineWords[0] in self.docStruct['language']['tailWords']: #se la prima parola non e' tra quelle di fine
+				if re.match(self.docStruct['endOfDocument'],l) is not None: #controlla se ci sono articoli che non hanno le parole chiave finali 
+					self.log.warn("Ho individuato una separatore valido prima che si chiusesse l'articolo precedente, controlla i tuoi file in uscita!\n\
+			L'errore dovrebbe essere intorno all'articolo {} ma non sono sicuro! \n\t\t\t[{}]".format(docNumber, l.strip()))
+					self.status = 'tail' 
+					anomaly = True
+				elif not lineWords[0].upper() in self.docStruct['language']['tailWords']: #se la prima parola non e' tra quelle di fine
 					if self.settings['delLF']:
 						tempBody.append(l.strip('\n'))		   #allora sto leggendo l'articolo
 					else:
@@ -187,16 +195,13 @@ class bananaSPLITTER():
 				else:
 					self.status = 'tail'
 					anomaly = False
-				if re.match(self.docStruct['endOfDocument'],l) is not None: #controlla se ci sono articoli che non hanno le parole chiave finali 
-					self.log.warn("Ho individuato una separatore valido prima che si chiusesse l'articolo precedente, controlla i tuoi file in uscita!\n\
-					L'errore dovrebbe essere intorno all'articolo {} ma non sono sicuro! \n\t\t[{}]".format(docNumber, l.strip()))
-					self.status = 'tail' 
-					anomaly = True
+				
 				pass
 			elif self.status == 'tail':
-				if re.match(self.docStruct['endOfDocument'],l) is not None or anomaly:
+				if anomaly or (re.match(self.docStruct['endOfDocument'],l) is not None):
 					self.status = 'first'
 					anomaly = False
+					titleBegin = True
 					if self.settings['delWordBreak']:
 						tempContent=[ll.replace('-\n', '') for ll in tempContent]
 					newDoc['content']=copy.deepcopy(''.join(tempBody))
@@ -370,7 +375,7 @@ class bananaSPLITTER():
 			except KeyError as ke:
 				self.log.error(f"Chiave {ke} non trovata per:{outFileCounter} {ff['title']} ")
 			except Exception as ee:
-				#traceback.print_exc()
+				traceback.print_exc()
 				self.log.error(f"Errore generale nel salvataggio: {ee}")
 		if outFileCounter < self.bodyCounter:
 			self.log.error("Ho salvato meno file rispetto a quelli trovati!")