Corretta l'indentazione del titolo, aggiunti i file di configurazione alla cartella di dist

2019-10-30 17:10:16 +01:00
parent 8ec7e5e835
commit 84a38d0cb3
3 changed files with 402 additions and 277 deletions
@@ -0,0 +1,64 @@
+{
+	"INworkPath": "D:\\Test\\",
+	"OUTworkPath": "D:\\Test\\Separati\\",
+	"OUTnameFormat":"TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt",
+	"docStruct": {
+		"docSep": "\\s*Copyright [(0-9)]+",
+		"dateFormat":"{month} {day:d}, {year:d}{}",
+		"dateWords": [
+			"January",
+			"February",
+			"March",
+			"April",
+			"May",
+			"June",
+			"July",
+			"August",
+			"September",
+			"October",
+			"November",
+			"December"
+		],
+		"headWords": [
+			"BYLINE:",
+			"SECTION:",
+			"LENGTH:",
+			"DATELINE:",
+			"HIGHLIGHT:",
+			"Email:"
+		],
+		"tailWords": [
+			"Newstex ID" ,
+			"NOTES",
+			"LANGUAGE:",
+			"GRAPHIC:",
+			"TYPE:",
+			"URL:",
+			"LOAD-DATE:",
+			"PUBLICATION-TYPE:",
+			"DOCUMENT-TYPE:",
+			"CHARTS:",
+			"JOURNAL-CODE:"
+		]
+	},
+	"settings": {
+		"encoding": "utf-8",
+		"monthPosition": 0,
+		"getNewsPaperName": true,
+		"nameNotFoundStr" : "--ND--",
+		"includeTitle" : true,
+		"removeDuplicates": true,
+		"showSkipped": false,
+		"showRemovedDuplicates": true,
+		"maxTitleLen": 32,
+		"loadTXT": true,
+		"loadDOCX": false,
+		"removeOldFiles":true,
+		"saveSeparateFiles": true,
+		"saveBodyFile": true,
+		"saveBodyNumber":true,
+		"delLF": false,
+		"delWordBreak": true,
+		"delChars": "'|@|#"
+	}
+}
@@ -0,0 +1,61 @@
+{
+	"INworkPath": "C:\\Test\\",
+	"OUTworkPath": "C:\\Test\\Separati\\",
+	"OUTnameFormat":"TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{papername}_{title}.txt",
+	"docStruct": {
+		"docSep": "Copyright [(0-9)]+",
+		"dateFormat":"{day:d} {month} {year:d} {}",
+		"dateWords": [
+			"Gennaio",
+			"Febbraio",
+			"Marzo",
+			"Aprile",
+			"Maggio",
+			"Giugno",
+			"Luglio",
+			"Agosto",
+			"Settembre",
+			"Ottobre",
+			"Novembre",
+			"Dicembre"
+		],
+		"headWords": [
+			"BYLINE:",
+			"SECTION:",
+			"LENGTH:",
+			"DATELINE:",
+			"HIGHLIGHT:",
+			"Email:"
+		],
+		"tailWords": [
+			"LANGUAGE:",
+			"GRAPHIC:",
+			"TYPE:",
+			"URL:",
+			"LOAD-DATE:",
+			"PUBLICATION-TYPE:",
+			"DOCUMENT-TYPE:",
+			"CHARTS:"
+		]
+	}, 
+	"settings": {
+		"encoding": "utf-8",
+		"monthPosition": 0,
+		"getNewsPaperName": true,
+		"nameNotFoundStr" : "--ND--",
+		"includeTitle" : true,
+		"removeDuplicates": true,
+		"showSkipped": false,
+		"showRemovedDuplicates": true,
+		"maxTitleLen": 32,
+		"loadTXT": true,
+		"loadDOCX": false,
+		"removeOldFiles":true,
+		"saveSeparateFiles": true,
+		"saveBodyFile": true,
+		"saveBodyNumber":true,
+		"delLF": false,
+		"delWordBreak": true,
+		"delChars": "'|@|#"
+	}
+}
@@ -17,22 +17,22 @@ from copy import deepcopy
 from slugify import slugify
 ####### VAR GLOBALI #######

-####### FUNZIONI GLOBALI #######		
+####### FUNZIONI GLOBALI #######        
 def printTitle():
-	print(".  .   .				 ,	 \n|  | _ | _. _ ._ _  _   -+- _  *	\n|/\|(/,|(_.(_)[ | )(/,   | (_) *")
+    print(".  .   .                 ,     \n|  | _ | _. _ ._ _  _   -+- _  *    \n|/\|(/,|(_.(_)[ | )(/,   | (_) *")

-	print(Fore.LIGHTYELLOW_EX,\
-	'	_								   ___________ _	 _____ _____	 \n\
-	| |								 /  ___| ___ \ |   |_   _|_   _|		\n\
-	| |__   __ _ _ __   __ _ _ __   __ _\ `--.| |_/ / |	 | |   | |		  \n\
-	| \'_ \ / _` | \'_ \ / _` | \'_ \ / _` |`--. \  __/| |	 | |   | |	   \n\
-	| |_) | (_| | | | | (_| | | | | (_| /\__/ / |   | |_____| |_  | |		  \n\
-	|_.__/ \__,_|_| |_|\__,_|_| |_|\__,_\____/\_|   \_____/\___/  \_/')
-	
-	print ('\t\t\t\t\t\t\tVersione 0.4a\n\n', Style.RESET_ALL)
-	print('Iniziamo!!')
-	pass
-	
+    print(Fore.LIGHTYELLOW_EX,'    _                                   ___________ _     _____ _____  \n\
+    | |                                 /  ___| ___ \ |   |_   _|_   _|        \n\
+    | |__   __ _ _ __   __ _ _ __   __ _\ `--.| |_/ / |     | |   | |          \n\
+    | \'_ \ / _` | \'_ \ / _` | \'_ \ / _` |`--. \  __/| |     | |   | |       \n\
+    | |_) | (_| | | | | (_| | | | | (_| /\__/ / |   | |_____| |_  | |          \n\
+    |_.__/ \__,_|_| |_|\__,_|_| |_|\__,_\____/\_|   \_____/\___/  \_/')
+    
+    print ('\t\t\t\t\t\t\tVersione 0.4b\n\n', Style.RESET_ALL)
+    print('Iniziamo!!')
+    pass
+    
+    
 #############################################################
 ####################### MAIN ################################
 #############################################################
@@ -42,86 +42,86 @@ status={'first':0, 'head':1, 'body':2, 'tail':3}
 s=status['first']

 try:
-	conffiles = glob('*.json')
-	if len(conffiles) < 1:
-		raise 
-	fileValid = False
-	while not fileValid:
-		print ('Seleziona un file di configurazione per la lingua:')
-		for cf in enumerate(conffiles):
-			print('[{0}] - {1}'.format(cf[0],cf[1]))
-		try:
-			fn = int(input('Scrivi il numero del file e premi Invio: '))
-			if (fn > len(conffiles)-1 or fn < 0):
-				raise
-			fileValid = True
-		except:
-			print('Scusa, non ho capito bene, ricominciamo..\n')
-			fileValid = False
+    conffiles = glob('*.json')
+    if len(conffiles) < 1:
+        raise 
+    fileValid = False
+    while not fileValid:
+        print ('Seleziona un file di configurazione per la lingua:')
+        for cf in enumerate(conffiles):
+            print('[{0}] - {1}'.format(cf[0],cf[1]))
+        try:
+            fn = int(input('Scrivi il numero del file e premi Invio: '))
+            if (fn > len(conffiles)-1 or fn < 0):
+                raise
+            fileValid = True
+        except:
+            print('Scusa, non ho capito bene, ricominciamo..\n')
+            fileValid = False
 except:
-	print(Fore.LIGHTRED_EX, 'OOPS!! File di configurazione non selezionato o non presente..', Style.RESET_ALL)
-	input()
-	sys.exit()
+    print(Fore.LIGHTRED_EX, 'OOPS!! File di configurazione non selezionato o non presente..', Style.RESET_ALL)
+    input()
+    sys.exit()

 try:
-	print('\nApro il file di configurazione [{}]...'.format(conffiles[fn]))
-	fp = open(conffiles[fn],'r')
-	cfg = json.load(fp)
-	fp.close
-	try:
-		print('Carico i parametri...')
-		docParams=cfg['docStruct']
-		basePath=cfg['INworkPath']+"{0}.{1}"
-		settings=cfg['settings']
-		delChars=settings['delChars'].split('|')
-		docSep=re.compile(docParams['docSep'])
-	except:
-		print(Fore.LIGHTRED_EX,'OOPS! Qualcosa e\' andato storto, non riesco a caricare la configurazione, controlla la sintassi! :)', Style.RESET_ALL)
-		input()
-		sys.exit()
+    print('\nApro il file di configurazione [{}]...'.format(conffiles[fn]))
+    fp = open(conffiles[fn],'r')
+    cfg = json.load(fp)
+    fp.close
+    try:
+        print('Carico i parametri...')
+        docParams=cfg['docStruct']
+        basePath=cfg['INworkPath']+"{0}.{1}"
+        settings=cfg['settings']
+        delChars=settings['delChars'].split('|')
+        docSep=re.compile(docParams['docSep'])
+    except:
+        print(Fore.LIGHTRED_EX,'OOPS! Qualcosa e\' andato storto, non riesco a caricare la configurazione, controlla la sintassi! :)', Style.RESET_ALL)
+        input()
+        sys.exit()
 except IOError as e:
-	print(Fore.LIGHTRED_EX, 'OOPS! Qualcosa e\' andato storto, non trovo il file di configurazione: {}'.format(e), Style.RESET_ALL)
-	input()
-	sys.exit()
+    print(Fore.LIGHTRED_EX, 'OOPS! Qualcosa e\' andato storto, non trovo il file di configurazione: {}'.format(e), Style.RESET_ALL)
+    input()
+    sys.exit()

 files=[]
 try:
-	print('Ottengo la lista dei file da separare...')
-	if settings['loadTXT']:
-		files+=glob(basePath.format('*','txt'))
-	if settings['loadDOCX']:
-		print(Fore.LIGHTRED_EX, 'OOPS! Scusa ma non posso accontentarti, per ora non so leggere i file DOCX.. :(', Style.RESET_ALL)
-		input()
-		sys.exit()
-		#files+=glob(basePath.format('*','docx'))
-	if len(files)<=0:
-		raise 
-	pprint(files)
-	while True:
-		r=input('\nVuoi davvero bananaSPLITTARE questi documenti? [y/n]:')
-		r.strip()
-		if r=='Y' or r =='y':
-			break
-		elif r=='n' or r=='N':
-			print('OK! Nessun problema, ci vediamo dopo :)')
-			input()
-			sys.exit()
-		else:
-			print('Non ho capito la risposta, sii un po\' piu\' specifico... [y/n]\n')
-			pass
+    print('Ottengo la lista dei file da separare...')
+    if settings['loadTXT']:
+        files+=glob(basePath.format('*','txt'))
+    if settings['loadDOCX']:
+        print(Fore.LIGHTRED_EX, 'OOPS! Scusa ma non posso accontentarti, per ora non so leggere i file DOCX.. :(', Style.RESET_ALL)
+        input()
+        sys.exit()
+        #files+=glob(basePath.format('*','docx'))
+    if len(files)<=0:
+        raise 
+    pprint(files)
+    while True:
+        r=input('\nVuoi davvero bananaSPLITTARE questi documenti? [y/n]:')
+        r.strip()
+        if r=='Y' or r =='y':
+            break
+        elif r=='n' or r=='N':
+            print('OK! Nessun problema, ci vediamo dopo :)')
+            input()
+            sys.exit()
+        else:
+            print('Non ho capito la risposta, sii un po\' piu\' specifico... [y/n]\n')
+            pass
 except:
-	print(Fore.LIGHTRED_EX, 'OOPS! Qualcosa e\' andato storto, non riesco a trovare nessun file da leggere :(', Style.RESET_ALL)
-	input()
-	sys.exit()
+    print(Fore.LIGHTRED_EX, 'OOPS! Qualcosa e\' andato storto, non riesco a trovare nessun file da leggere :(', Style.RESET_ALL)
+    input()
+    sys.exit()

 try:
-	if settings['removeOldFiles']:
-		print("Rimuovo i vecchi file dalla cartella di destinazione..")
-		for x in glob(cfg['OUTworkPath']+'*.txt'):
-			os.remove(x)
+    if settings['removeOldFiles']:
+        print("Rimuovo i vecchi file dalla cartella di destinazione..")
+        for x in glob(cfg['OUTworkPath']+'*.txt'):
+            os.remove(x)
 except:
-	print(Fore.LIGHTRED_EX, 'OOPS! Qualcosa e\' andato storto, non riesco a rimuovere i vecchi file :(', Style.RESET_ALL)
-	
+    print(Fore.LIGHTRED_EX, 'OOPS! Qualcosa e\' andato storto, non riesco a rimuovere i vecchi file :(', Style.RESET_ALL)
+    
 lastTime=time.time()
 fileCounter = 1
 fileBodyCounter = 1
@@ -130,201 +130,201 @@ totSkipped=0

 # per ogni file nella lista
 for f in files:
-	s=status['first']
-	prevLine=''
-	doc={'title':'',
-		  'date':'', 
-		  'content':[]
-		  }
-	fileContent=''
-	tempContent=list()
-	fileBaseName=os.path.split(f)[1].split('.')[0]
-	fileBodyList=list()
-	fileContent = list()
-	try:
-		print()
-		print('-'*50)
-		print('Apro il file: {}'.format(fileBaseName))
-		fp = open(f,mode='r', encoding=settings['encoding'])
-		fileContent=fp.readlines() #leggi le linee del file
-		fp.close()
-	except IOError as e:
-		print(Fore.LIGHTRED_EX, 'OOPS! Qualcosa e\' andato storto, non riesco ad aprire il file: {}'.format(fileBaseName), Style.RESET_ALL)
-		continue
-		
-	print('Elimino le righe vuote...')
-	for ll in fileContent:
-		for c in delChars:
-			ll=ll.replace(c,'')
-		if ll not in  ['\n','\r']: 
-			tempContent.append(ll)			
-	fileContent=deepcopy(tempContent)
-	tempContent=list()
-	
-	print('Individuo il contenuto...')
-	docNumber = 0
-	docSkipped = 0
-	bodyCounter = 0
-	duplicateNumber = 0
-	docDate = {}
-	prevLine = ''
-	newsPaperName = ''
-	titleBegin = False
+    s=status['first']
+    prevLine=''
+    doc={'title':'',
+          'date':'', 
+          'content':[]
+          }
+    fileContent=''
+    tempContent=list()
+    fileBaseName=os.path.split(f)[1].split('.')[0]
+    fileBodyList=list()
+    fileContent = list()
+    try:
+        print()
+        print('-'*50)
+        print('Apro il file: {}'.format(fileBaseName))
+        fp = open(f,mode='r', encoding=settings['encoding'])
+        fileContent=fp.readlines() #leggi le linee del file
+        fp.close()
+    except IOError as e:
+        print(Fore.LIGHTRED_EX, 'OOPS! Qualcosa e\' andato storto, non riesco ad aprire il file: {}'.format(fileBaseName), Style.RESET_ALL)
+        continue
+        
+    print('Elimino le righe vuote...')
+    for ll in fileContent:
+        for c in delChars:
+            ll=ll.replace(c,'')
+        if ll not in  ['\n','\r']: 
+            tempContent.append(ll)            
+    fileContent=deepcopy(tempContent)
+    tempContent=list()
+    
+    print('Individuo il contenuto...')
+    docNumber = 0
+    docSkipped = 0
+    bodyCounter = 0
+    duplicateNumber = 0
+    docDate = {}
+    prevLine = ''
+    newsPaperName = ''
+    titleBegin = False

-	for l in fileContent: #per ogni linea del file
-		lineWords=l.lstrip().split(' ') #dividi la riga in parole
-		if s==status['first']:
-			try:
-				#prendo il numero di documento per vedere se ci sono buchi
-				try:
-					nn = parse.parse("{current:d} Of {total} Documents",l.strip().capitalize()).named
-					if nn["current"]-docNumber==1:
-						pass
-					else:
-						if settings["showSkipped"]:
-							print(Fore.LIGHTRED_EX,"OOPS!! Il conto dei documenti non torna! LexisNexis \
-							ne ha saltato qualcuno!\nPrecedente:{0}-Attuale:{1}".format(docNumber,nn["current"]), Style.RESET_ALL)
-						docSkipped+=1
-					docNumber = nn["current"]
-				except:
-					pass
-				if (lineWords[settings['monthPosition']]).capitalize() in docParams['dateWords']:
-					try:
-						docDate=parse.parse(docParams['dateFormat'],l).named
-						docDate['month']=docDate['month'].lstrip().rstrip().capitalize()
-						docDate['month']=docParams['dateWords'].index(docDate['month'])+1
-						title = ''
-						titleBegin=True
-						if settings['getNewsPaperName']:
-							try:
-								if prevLine.split(' ')[0].strip().isalpha():
-									newsPaperName = prevLine.strip()
-								else:
-									newsPaperName = settings['nameNotFoundStr']
-							except:
-								print(Fore.LIGHTRED_EX, "OOPS! E' successo qualcosa mentre stavo cercando il nome della pubblicazione,\
-								controlla i file di uscita! \n\t[{}]".format(prevLine.strip()), Style.RESET_ALL)
-						else:
-							newsPaperName = settings['nameNotFoundStr']
-					except:
-						print(Fore.LIGHTRED_EX, "OOPS! Ho trovato una riga ambigua.. potrebbe essere una data ma non so: \n\t[{}]". format(l.strip('\r\n')), 
-							  Style.RESET_ALL)
-				elif lineWords[0] in docParams['headWords']:
-					s=status['head']
-					newDoc=deepcopy(doc)
-					newDoc['title']=title
-					newDoc['date']=docDate
-					newDoc['newsPaperName'] = newsPaperName
-					titleBegin=False
-				else:
-					if titleBegin:
-						title += l.strip().capitalize()
-			except IndexError:
-				print (Fore.LIGHTRED_EX, 'OOPS! Errore inaspettato, contatta il tuo sviluppatore di fiducia!', Style.RESET_ALL)
-		elif s==status['head']:
-			if lineWords[0] not in docParams['headWords']:  #se la prima parola non e' tra quelle di inizio	
-				tempContent.append(l)					   # vuol dire che ho trovato l'articolo
-				s=status['body']		
-		elif s==status['body']: 
-			if not lineWords[0] in docParams['tailWords']: #se la prima parola non e' tra quelle di fine
-				if settings['delLF']:
-					tempContent.append(l.strip('\n'))		   #allora sto leggendo l'articolo
-				else:
-					tempContent.append(l)
-			else:
-				s=status['tail']
-				anomaly = False
-			if docSep.match(l) is not None: #controlla se ci sono articoli che non hanno le parole chiave finali 
-				print(Fore.YELLOW, "HEY! Ho individuato una separatore valido prima che si chiusesse l'articolo precedente, controlla i tuoi file in uscita!\n\
-				L'errore dovrebbe essere intorno all'articolo {} ma non sono sicuro! \n\t\t[{}]".format(docNumber,
-																										l.strip()), Style.RESET_ALL)
-				s=status['tail'] 
-				anomaly = True
-		elif s==status['tail']: #cerco il separatore di articoli e aggiungo quello letto alla lista
-			if docSep.match(l) is not None or anomaly:
-				s=status['first']
-				anomaly = False
-				if settings['delWordBreak']:
-					tempContent=[ll.replace('-\n', '') for ll in tempContent]
-				newDoc['content']=deepcopy(''.join(tempContent))
-				fileBodyList.append(deepcopy(newDoc))
-				tempContent=list()
-				bodyCounter +=1
-		else:
-			pass 
-		prevLine=l	 
-	pass
-	print ('Nel file ho trovato {0} articoli..'.format(bodyCounter))
-	if docSkipped > 0:
-		print (Fore.YELLOW, 'Attentione, LexisNexis ne ha saltati {} !!!'.format(docSkipped), Style.RESET_ALL)
-	print()
-	
-	if bodyCounter >= 1:
-		if settings['removeDuplicates']:
-			print ('Controllo se ci sono dei duplicati..')
-			titleList=[]
-			duplicateList=[]
-			duplicateNumber=0
-			for idx, ff in enumerate(fileBodyList):
-				if ff['title'] not in titleList:
-					titleList.append(ff['title'])
-					ff['duplicate']=False
-					fileBodyList[idx]=ff
-					pass
-				else:
-					if ff['title'] not in duplicateList:
-						duplicateList.append(ff['title'])
-						if settings['showRemovedDuplicates']:
-							print ('Duplicato: {}'.format(ff['title'].strip()))
-					ff['duplicate'] = True
-					fileBodyList[idx]=ff
-					duplicateNumber+=1
-			print ('Ho rimosso {} duplicati di {} articoli..\n'. format(duplicateNumber, len(duplicateList)))
-		else:
-			for idx, ff in enumerate(fileBodyList):
-				ff['duplicate']=False
-				fileBodyList[idx]=ff
-			print('Salto il controllo dei duplicati..')
-			pass
-		if settings['saveSeparateFiles']:
-			print ('Salvo gli articoli in file separati...')
-			print ('Persorso: {0}'.format(cfg['OUTworkPath'].format('nomeFile')))
-			for ff in fileBodyList:
-				try:
-					if ff['duplicate'] == False:
-						fileName=cfg['OUTnameFormat'].format(title=slugify(ff['title'][:settings['maxTitleLen']]),\
-															 filename=slugify(fileBaseName),\
-															 docnum=fileBodyCounter,\
-															 papername=ff['newsPaperName'].strip(),\
-															 **ff['date'])
-						out=open(cfg['OUTworkPath']+'{0}'.format(fileName),'wb')
-						if settings['includeTitle']:
-							ff['content'] = ff['title']+os.linesep+ff['content']
-						out.write(ff['content'].encode(settings['encoding']))
-						out.close()
-						fileBodyCounter+=1
-				except IOError as e:
-					print(Fore.LIGHTRED_EX, 'OOPS! Qualcosa e\' andato storto, non riesco a scrivere il file: {}'.format(e), Style.RESET_ALL)
-					continue
-		if settings['saveBodyFile']:
-			print('Salvo gli articoli in un singolo file vicino agli originali...')
-			print ('Persorso: {0}'.format(cfg['OUTworkPath'].format('nomeFile')))
-			try:
-				fileName=slugify(fileBaseName)
-				fileName='BODYFILE_{0}_{1}.txt'.format(fileCounter,fileName[:settings['maxTitleLen']])
-				fileContent = os.linesep.join([cc['content'] for cc in fileBodyList])
-				out=open(cfg['OUTworkPath']+'{0}'.format(fileName),'wb')
-				out.write(fileContent.encode(settings['encoding']))
-				out.close()
-			except IOError as e:
-				print(Fore.LIGHTRED_EX, 'OOPS! Qualcosa e\' andato storto, non riesco a scrivere il file: {}'.format(e), Style.RESET_ALL)
-				continue
-		fileCounter+=1
-		totSkipped+=docSkipped
-		totFound+=bodyCounter-duplicateNumber
-	else:
-		print('[{0}] non contiene articoli, \n controlla meglio le parole chiave! SGRUNT'.format(fileBaseName))
-	print('-'*50)
+    for l in fileContent: #per ogni linea del file
+        lineWords=l.lstrip().split(' ') #dividi la riga in parole
+        if s==status['first']:
+            try:
+                #prendo il numero di documento per vedere se ci sono buchi
+                try:
+                    nn = parse.parse("{current:d} Of {total} Documents",l.strip().capitalize()).named
+                    if nn["current"]-docNumber==1:
+                        pass
+                    else:
+                        if settings["showSkipped"]:
+                            print(Fore.LIGHTRED_EX,"OOPS!! Il conto dei documenti non torna! LexisNexis \
+                            ne ha saltato qualcuno!\nPrecedente:{0}-Attuale:{1}".format(docNumber,nn["current"]), Style.RESET_ALL)
+                        docSkipped+=1
+                    docNumber = nn["current"]
+                except:
+                    pass
+                if (lineWords[settings['monthPosition']]).capitalize() in docParams['dateWords']:
+                    try:
+                        docDate=parse.parse(docParams['dateFormat'],l).named
+                        docDate['month']=docDate['month'].lstrip().rstrip().capitalize()
+                        docDate['month']=docParams['dateWords'].index(docDate['month'])+1
+                        title = ''
+                        titleBegin=True
+                        if settings['getNewsPaperName']:
+                            try:
+                                if prevLine.split(' ')[0].strip().isalpha():
+                                    newsPaperName = prevLine.strip()
+                                else:
+                                    newsPaperName = settings['nameNotFoundStr']
+                            except:
+                                print(Fore.LIGHTRED_EX, "OOPS! E' successo qualcosa mentre stavo cercando il nome della pubblicazione,\
+                                controlla i file di uscita! \n\t[{}]".format(prevLine.strip()), Style.RESET_ALL)
+                        else:
+                            newsPaperName = settings['nameNotFoundStr']
+                    except:
+                        print(Fore.LIGHTRED_EX, "OOPS! Ho trovato una riga ambigua.. potrebbe essere una data ma non so: \n\t[{}]". format(l.strip('\r\n')), 
+                              Style.RESET_ALL)
+                elif lineWords[0] in docParams['headWords']:
+                    s=status['head']
+                    newDoc=deepcopy(doc)
+                    newDoc['title']=title
+                    newDoc['date']=docDate
+                    newDoc['newsPaperName'] = newsPaperName
+                    titleBegin=False
+                else:
+                    if titleBegin:
+                        title += l.strip().capitalize()
+            except IndexError:
+                print (Fore.LIGHTRED_EX, 'OOPS! Errore inaspettato, contatta il tuo sviluppatore di fiducia!', Style.RESET_ALL)
+        elif s==status['head']:
+            if lineWords[0] not in docParams['headWords']:  #se la prima parola non e' tra quelle di inizio    
+                tempContent.append(l)                       # vuol dire che ho trovato l'articolo
+                s=status['body']        
+        elif s==status['body']: 
+            if not lineWords[0] in docParams['tailWords']: #se la prima parola non e' tra quelle di fine
+                if settings['delLF']:
+                    tempContent.append(l.strip('\n'))           #allora sto leggendo l'articolo
+                else:
+                    tempContent.append(l)
+            else:
+                s=status['tail']
+                anomaly = False
+            if docSep.match(l) is not None: #controlla se ci sono articoli che non hanno le parole chiave finali 
+                print(Fore.YELLOW, "HEY! Ho individuato una separatore valido prima che si chiusesse l'articolo precedente, controlla i tuoi file in uscita!\n\
+                L'errore dovrebbe essere intorno all'articolo {} ma non sono sicuro! \n\t\t[{}]".format(docNumber,
+                                                                                                        l.strip()), Style.RESET_ALL)
+                s=status['tail'] 
+                anomaly = True
+        elif s==status['tail']: #cerco il separatore di articoli e aggiungo quello letto alla lista
+            if docSep.match(l) is not None or anomaly:
+                s=status['first']
+                anomaly = False
+                if settings['delWordBreak']:
+                    tempContent=[ll.replace('-\n', '') for ll in tempContent]
+                newDoc['content']=deepcopy(''.join(tempContent))
+                fileBodyList.append(deepcopy(newDoc))
+                tempContent=list()
+                bodyCounter +=1
+        else:
+            pass 
+        prevLine=l     
+    pass
+    print ('Nel file ho trovato {0} articoli..'.format(bodyCounter))
+    if docSkipped > 0:
+        print (Fore.YELLOW, 'Attentione, LexisNexis ne ha saltati {} !!!'.format(docSkipped), Style.RESET_ALL)
+    print()
+    
+    if bodyCounter >= 1:
+        if settings['removeDuplicates']:
+            print ('Controllo se ci sono dei duplicati..')
+            titleList=[]
+            duplicateList=[]
+            duplicateNumber=0
+            for idx, ff in enumerate(fileBodyList):
+                if ff['title'] not in titleList:
+                    titleList.append(ff['title'])
+                    ff['duplicate']=False
+                    fileBodyList[idx]=ff
+                    pass
+                else:
+                    if ff['title'] not in duplicateList:
+                        duplicateList.append(ff['title'])
+                        if settings['showRemovedDuplicates']:
+                            print ('Duplicato: {}'.format(ff['title'].strip()))
+                    ff['duplicate'] = True
+                    fileBodyList[idx]=ff
+                    duplicateNumber+=1
+            print ('Ho rimosso {} duplicati di {} articoli..\n'. format(duplicateNumber, len(duplicateList)))
+        else:
+            for idx, ff in enumerate(fileBodyList):
+                ff['duplicate']=False
+                fileBodyList[idx]=ff
+            print('Salto il controllo dei duplicati..')
+            pass
+        if settings['saveSeparateFiles']:
+            print ('Salvo gli articoli in file separati...')
+            print ('Persorso: {0}'.format(cfg['OUTworkPath'].format('nomeFile')))
+            for ff in fileBodyList:
+                try:
+                    if ff['duplicate'] == False:
+                        fileName=cfg['OUTnameFormat'].format(title=slugify(ff['title'][:settings['maxTitleLen']]),\
+                                                             filename=slugify(fileBaseName),\
+                                                             docnum=fileBodyCounter,\
+                                                             papername=ff['newsPaperName'].strip(),\
+                                                             **ff['date'])
+                        out=open(cfg['OUTworkPath']+'{0}'.format(fileName),'wb')
+                        if settings['includeTitle']:
+                            ff['content'] = ff['title']+os.linesep+ff['content']
+                        out.write(ff['content'].encode(settings['encoding']))
+                        out.close()
+                        fileBodyCounter+=1
+                except IOError as e:
+                    print(Fore.LIGHTRED_EX, 'OOPS! Qualcosa e\' andato storto, non riesco a scrivere il file: {}'.format(e), Style.RESET_ALL)
+                    continue
+        if settings['saveBodyFile']:
+            print('Salvo gli articoli in un singolo file vicino agli originali...')
+            print ('Persorso: {0}'.format(cfg['OUTworkPath'].format('nomeFile')))
+            try:
+                fileName=slugify(fileBaseName)
+                fileName='BODYFILE_{0}_{1}.txt'.format(fileCounter,fileName[:settings['maxTitleLen']])
+                fileContent = os.linesep.join([cc['content'] for cc in fileBodyList])
+                out=open(cfg['OUTworkPath']+'{0}'.format(fileName),'wb')
+                out.write(fileContent.encode(settings['encoding']))
+                out.close()
+            except IOError as e:
+                print(Fore.LIGHTRED_EX, 'OOPS! Qualcosa e\' andato storto, non riesco a scrivere il file: {}'.format(e), Style.RESET_ALL)
+                continue
+        fileCounter+=1
+        totSkipped+=docSkipped
+        totFound+=bodyCounter-duplicateNumber
+    else:
+        print('[{0}] non contiene articoli, \n controlla meglio le parole chiave! SGRUNT'.format(fileBaseName))
+    print('-'*50)

 print()
 print (Fore.LIGHTYELLOW_EX, 'bananaSPLIT ha concluso con successo in {0:1.3f} secondi, \n\