prima della prima prova della nuova versione

2020-03-30 13:01:32 +02:00
parent 33fd392725
commit 83f3a1f629
2 changed files with 27 additions and 33 deletions
@@ -13,7 +13,7 @@ if __name__ == '__main__':
 		try:
 			txt=docx2txt.process(f)
 			with open(f.replace('.docx', '.txt'), 'w') as fp:
-				fp.write(txt.replace('\r\n','\n'))
+				fp.write(txt)
 				fp.close()
 		except Exception as e:
 			print(e)
@@ -120,46 +120,21 @@ class bananaSPLITTER(QRunnable):
 		title = ''
 		prevLine = ''
 		newsPaperName = ''
-		titleBegin = False
+		titleBegin = True
+		newsName = False
 		tempBody = list()
-		docSep=re.compile(self.docStruct['docSep'])
 		
 		for l in self.rawFile: #per ogni linea del file
 			lineWords = l.lstrip().split(' ') #dividi la riga in parole
 			if self.status == 'first':
-				#prendo il numero di documento per vedere se ci sono buchi
 				try:	
-					try:
-						nn = parse.parse("{current:d} Of {total} Documents",l.strip().capitalize()).named
-						if nn["current"]-docNumber==1:
-							pass
-						else:
-							if self.settings["showSkipped"]:
-								self.log.warn("Il conto dei documenti non torna! LexisNexis ne ha saltato qualcuno!\n Precedente:{0}-Attuale:{1}".format(docNumber,nn["current"]))
-							docSkipped+=1
-						docNumber = nn["current"]
-					except:
-						pass #non segnalare eccezione se il parse fallisce
 					# ricerco la data
 					if (lineWords[self.settings['monthPosition']]).capitalize() in self.docStruct['language']['dateWords']:
 						try:
 							docDate=parse.parse(self.docStruct['dateFormat'],l).named
 							docDate['month']=docDate['month'].lstrip().rstrip().capitalize()
 							docDate['month']=self.docStruct['language']['dateWords'].index(docDate['month'])+1
-							title = ''
-							titleBegin=True
 							# dopo la data inizia il titolo, ma prima si cerca il nome del giornale
-							if self.settings['getNewsPaperName']:
-								try:
-									if prevLine.split(' ')[0].strip().isalpha():
-										newsPaperName = prevLine.strip()
-									else:
-										newsPaperName = self.settings['nameNotFoundStr']
-								except:
-									self.log.warn("E' successo qualcosa mentre stavo cercando il nome della pubblicazione,\
-									controlla i file di uscita! [{}]".format(prevLine.strip()))
-							else:
-								newsPaperName = self.settings['nameNotFoundStr']
 						except:
 							self.log.warn("Ho trovato una riga ambigua.. potrebbe essere una data ma non so: [{}]". format(l.strip('\r\n')))
 							pass
@@ -170,19 +145,38 @@ class bananaSPLITTER(QRunnable):
 						newDoc['title']=title
 						newDoc['date']=docDate
 						newDoc['newsPaperName'] = newsPaperName
-						titleBegin=False
+						titleBegin = False
+						newsName = False
 					else:
 						if titleBegin:
 							title += l.strip().capitalize()
+							titleBegin = False
+							newsName = True
+						elif newsName:
+							if self.settings['getNewsPaperName']:
+								try:
+									if l.strip().isalpha():
+										newsPaperName = l.strip()
+									else:
+										newsPaperName = self.settings['nameNotFoundStr']
+								except:
+									self.log.warn("E' successo qualcosa mentre stavo cercando il nome della pubblicazione,\
+									controlla i file di uscita! [{}]".format(l.strip()))
+							else:
+								newsPaperName = self.settings['nameNotFoundStr']
+							newsName = False
+							
 				except IndexError:
 					self.log.error("Errore inaspettato, contatta il tuo sviluppatore di fiducia!")
 					pass
 			elif self.status == 'head':
 				tempContent = list()
+				#doppio check per trovare l'inizio del corpo documento
+				if re.match(self.docStruct['beginOfDocument'],l,re.i):
+					self.status='body' 
 				if lineWords[0] not in self.docStruct['language']['headWords']:  #se la prima parola non e' tra quelle di inizio	
 					tempBody.append(l)					   # vuol dire che ho trovato l'articolo e aggiungo la prima riga al contenuto del documento
-					self.status = 'body'	 
-					pass
+					self.status = 'body'
 			elif self.status == 'body':
 				if not lineWords[0] in self.docStruct['language']['tailWords']: #se la prima parola non e' tra quelle di fine
 					if self.settings['delLF']:
@@ -192,14 +186,14 @@ class bananaSPLITTER(QRunnable):
 				else:
 					self.status = 'tail'
 					anomaly = False
-				if docSep.match(l) is not None: #controlla se ci sono articoli che non hanno le parole chiave finali 
+				if re.match(self.docStruct['endOfDocument'],l) is not None: #controlla se ci sono articoli che non hanno le parole chiave finali 
 					self.log.warn("Ho individuato una separatore valido prima che si chiusesse l'articolo precedente, controlla i tuoi file in uscita!\n\
 					L'errore dovrebbe essere intorno all'articolo {} ma non sono sicuro! \n\t\t[{}]".format(docNumber, l.strip()))
 					self.status = 'tail' 
 					anomaly = True
 				pass
 			elif self.status == 'tail':
-				if docSep.match(l) is not None or anomaly:
+				if re.match(self.docStruct['endOfDocument'],l) is not None or anomaly:
 					self.status = 'first'
 					anomaly = False
 					if self.settings['delWordBreak']: