commit 452578c56b03bf7be2f1db384b8b9202022fbd3e Author: Emanuele Date: Wed Oct 9 19:42:17 2019 +0200 Commit iniziale per una nuova era di bananaSPLIT diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..794e2f4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/.DS_Store +/.project +/.pydevproject diff --git a/bananaSPLIT/.gitignore b/bananaSPLIT/.gitignore new file mode 100644 index 0000000..cb94557 --- /dev/null +++ b/bananaSPLIT/.gitignore @@ -0,0 +1,6 @@ +/@Archivio/ +/build/ +/dist/ +/bananaSPLIT.exe.spec\ +*.py[ocd] +/.DS_Store diff --git a/bananaSPLIT/87537783-banana-icon-cartoon-illustration-of-banana-vector-icon-for-web.xcf b/bananaSPLIT/87537783-banana-icon-cartoon-illustration-of-banana-vector-icon-for-web.xcf new file mode 100644 index 0000000..7f223e0 Binary files /dev/null and b/bananaSPLIT/87537783-banana-icon-cartoon-illustration-of-banana-vector-icon-for-web.xcf differ diff --git a/bananaSPLIT/Manuali/Pdf/bananaMANUAL-v0.2.pdf b/bananaSPLIT/Manuali/Pdf/bananaMANUAL-v0.2.pdf new file mode 100644 index 0000000..2d531e6 Binary files /dev/null and b/bananaSPLIT/Manuali/Pdf/bananaMANUAL-v0.2.pdf differ diff --git a/bananaSPLIT/Manuali/Pdf/bananaMANUAL-v0.3a.pdf b/bananaSPLIT/Manuali/Pdf/bananaMANUAL-v0.3a.pdf new file mode 100644 index 0000000..48b75f4 Binary files /dev/null and b/bananaSPLIT/Manuali/Pdf/bananaMANUAL-v0.3a.pdf differ diff --git a/bananaSPLIT/Manuali/bananaMANUAL-v0.1.docx b/bananaSPLIT/Manuali/bananaMANUAL-v0.1.docx new file mode 100644 index 0000000..d8a2848 Binary files /dev/null and b/bananaSPLIT/Manuali/bananaMANUAL-v0.1.docx differ diff --git a/bananaSPLIT/Manuali/bananaMANUAL-v0.2.docx b/bananaSPLIT/Manuali/bananaMANUAL-v0.2.docx new file mode 100644 index 0000000..9691e8d Binary files /dev/null and b/bananaSPLIT/Manuali/bananaMANUAL-v0.2.docx differ diff --git a/bananaSPLIT/Manuali/bananaMANUAL-v0.3a.docx b/bananaSPLIT/Manuali/bananaMANUAL-v0.3a.docx new file mode 100644 index 0000000..d43461d Binary files /dev/null and b/bananaSPLIT/Manuali/bananaMANUAL-v0.3a.docx differ diff --git a/bananaSPLIT/Manuali/bananaMANUAL.docx b/bananaSPLIT/Manuali/bananaMANUAL.docx new file mode 100644 index 0000000..bf02c72 Binary files /dev/null and b/bananaSPLIT/Manuali/bananaMANUAL.docx differ diff --git a/bananaSPLIT/__init__.py b/bananaSPLIT/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bananaSPLIT/banana.ico b/bananaSPLIT/banana.ico new file mode 100644 index 0000000..7f8d32a Binary files /dev/null and b/bananaSPLIT/banana.ico differ diff --git a/bananaSPLIT/bananaconfEN.json b/bananaSPLIT/bananaconfEN.json new file mode 100644 index 0000000..282714e --- /dev/null +++ b/bananaSPLIT/bananaconfEN.json @@ -0,0 +1,64 @@ +{ + "INworkPath": "D:\\Test\\", + "OUTworkPath": "D:\\Test\\Separati\\", + "OUTnameFormat":"TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt", + "docStruct": { + "docSep": "\\s*Copyright [(0-9)]+", + "dateFormat":"{month} {day:d}, {year:d}{}", + "dateWords": [ + "January", + "February", + "March", + "April", + "May", + "June", + "July", + "August", + "September", + "October", + "November", + "December" + ], + "headWords": [ + "BYLINE:", + "SECTION:", + "LENGTH:", + "DATELINE:", + "HIGHLIGHT:", + "Email:" + ], + "tailWords": [ + "Newstex ID" , + "NOTES", + "LANGUAGE:", + "GRAPHIC:", + "TYPE:", + "URL:", + "LOAD-DATE:", + "PUBLICATION-TYPE:", + "DOCUMENT-TYPE:", + "CHARTS:", + "JOURNAL-CODE:" + ] + }, + "settings": { + "encoding": "utf-8", + "monthPosition": 0, + "getNewsPaperName": true, + "nameNotFoundStr" : "--ND--", + "includeTitle" : true, + "removeDuplicates": true, + "showSkipped": false, + "showRemovedDuplicates": true, + "maxTitleLen": 32, + "loadTXT": true, + "loadDOCX": false, + "removeOldFiles":true, + "saveSeparateFiles": true, + "saveBodyFile": true, + "saveBodyNumber":true, + "delLF": false, + "delWordBreak": true, + "delChars": "'|@|#" + } +} diff --git a/bananaSPLIT/bananaconfITA.json b/bananaSPLIT/bananaconfITA.json new file mode 100644 index 0000000..8c1a07c --- /dev/null +++ b/bananaSPLIT/bananaconfITA.json @@ -0,0 +1,62 @@ +{ + "INworkPath": "C:\\Test\\", + "OUTworkPath": "C:\\Test\\Separati\\", + "OUTnameFormat":"TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{papername}_{title}.txt", + "docStruct": { + "docSep": "Copyright [(0-9)]+", + "dateFormat":"{day:d} {month} {year:d} {}", + "dateWords": [ + "Gennaio", + "Febbraio", + "Marzo", + "Aprile", + "Maggio", + "Giugno", + "Luglio", + "Agosto", + "Settembre", + "Ottobre", + "Novembre", + "Dicembre" + ], + "headWords": [ + "BYLINE:", + "SECTION:", + "LENGTH:", + "DATELINE:", + "HIGHLIGHT:", + "Email:" + ], + "tailWords": [ + "LANGUAGE:", + "GRAPHIC:", + "TYPE:", + "URL:", + "LOAD-DATE:", + "PUBLICATION-TYPE:", + "DOCUMENT-TYPE:", + "CHARTS:" + ] + }, + "settings": + "settings": { + "encoding": "utf-8", + "monthPosition": 0, + "getNewsPaperName": true, + "nameNotFoundStr" : "--ND--", + "includeTitle" : true, + "removeDuplicates": true, + "showSkipped": false, + "showRemovedDuplicates": true, + "maxTitleLen": 32, + "loadTXT": true, + "loadDOCX": false, + "removeOldFiles":true, + "saveSeparateFiles": true, + "saveBodyFile": true, + "saveBodyNumber":true, + "delLF": false, + "delWordBreak": true, + "delChars": "'|@|#" + } +} diff --git a/bananaSPLIT/main.py b/bananaSPLIT/main.py new file mode 100644 index 0000000..22db218 --- /dev/null +++ b/bananaSPLIT/main.py @@ -0,0 +1,330 @@ +''' +Created on 27 nov 2018 + +@author: Emanuele +''' +import json +import time +import os +import sys +import re +import parse + +from colorama import Fore, Style, init +from pprint import pprint +from glob import glob +from copy import deepcopy +from slugify import slugify + + +####### VAR GLOBALI ####### + +####### FUNZIONI GLOBALI ####### +def printTitle(): + print(". . . , \n| | _ | _. _ ._ _ _ -+- _ * \n|/\|(/,|(_.(_)[ | )(/, | (_) *") + + print(Fore.LIGHTYELLOW_EX,\ + ' _ ___________ _ _____ _____ \n\ + | | / ___| ___ \ | |_ _|_ _| \n\ + | |__ __ _ _ __ __ _ _ __ __ _\ `--.| |_/ / | | | | | \n\ + | \'_ \ / _` | \'_ \ / _` | \'_ \ / _` |`--. \ __/| | | | | | \n\ + | |_) | (_| | | | | (_| | | | | (_| /\__/ / | | |_____| |_ | | \n\ + |_.__/ \__,_|_| |_|\__,_|_| |_|\__,_\____/\_| \_____/\___/ \_/') + + print ('\t\t\t\t\t\t\tVersione 0.4a\n\n', Style.RESET_ALL) + print('Iniziamo!!') + pass + +############################################################# +####################### MAIN ################################ +############################################################# +init(convert=True) +printTitle() +status={'first':0, 'head':1, 'body':2, 'tail':3} +s=status['first'] + +try: + conffiles = glob('*.json') + if len(conffiles) < 1: + raise + fileValid = False + while not fileValid: + print ('Seleziona un file di configurazione per la lingua:') + for cf in enumerate(conffiles): + print('[{0}] - {1}'.format(cf[0],cf[1])) + try: + fn = int(input('Scrivi il numero del file e premi Invio: ')) + if (fn > len(conffiles)-1 or fn < 0): + raise + fileValid = True + except: + print('Scusa, non ho capito bene, ricominciamo..\n') + fileValid = False +except: + print(Fore.LIGHTRED_EX, 'OOPS!! File di configurazione non selezionato o non presente..', Style.RESET_ALL) + input() + sys.exit() + +try: + print('\nApro il file di configurazione [{}]...'.format(conffiles[fn])) + fp = open(conffiles[fn],'r') + cfg = json.load(fp) + fp.close + try: + print('Carico i parametri...') + docParams=cfg['docStruct'] + basePath=cfg['INworkPath']+"{0}.{1}" + settings=cfg['settings'] + delChars=settings['delChars'].split('|') + docSep=re.compile(docParams['docSep']) + except: + print(Fore.LIGHTRED_EX,'OOPS! Qualcosa e\' andato storto, non riesco a caricare la configurazione, controlla la sintassi! :)', Style.RESET_ALL) + input() + sys.exit() +except IOError as e: + print(Fore.LIGHTRED_EX, 'OOPS! Qualcosa e\' andato storto, non trovo il file di configurazione: {}'.format(e), Style.RESET_ALL) + input() + sys.exit() + +files=[] +try: + print('Ottengo la lista dei file da separare...') + if settings['loadTXT']: + files+=glob(basePath.format('*','txt')) + if settings['loadDOCX']: + print(Fore.LIGHTRED_EX, 'OOPS! Scusa ma non posso accontentarti, per ora non so leggere i file DOCX.. :(', Style.RESET_ALL) + input() + sys.exit() + #files+=glob(basePath.format('*','docx')) + if len(files)<=0: + raise + pprint(files) + while True: + r=input('\nVuoi davvero bananaSPLITTARE questi documenti? [y/n]:') + r.strip() + if r=='Y' or r =='y': + break + elif r=='n' or r=='N': + print('OK! Nessun problema, ci vediamo dopo :)') + input() + sys.exit() + else: + print('Non ho capito la risposta, sii un po\' piu\' specifico... [y/n]\n') + pass + if settings['removeOldFiles']: + print("Rimuovo i vecchi file dalla cartella di destinazione..") + for x in glob(cfg['OUTworkPath']+'*.txt'): + os.remove(x) +except: + print(Fore.LIGHTRED_EX, 'OOPS! Qualcosa e\' andato storto, non riesco a trovare nessun file da leggere :(', Style.RESET_ALL) + input() + sys.exit() + +lastTime=time.time() +fileCounter = 1 +fileBodyCounter = 1 +totFound=0 +totSkipped=0 + +# per ogni file nella lista +for f in files: + s=status['first'] + prevLine='' + doc={'title':'', + 'date':'', + 'content':[] + } + fileContent='' + tempContent=list() + fileBaseName=os.path.split(f)[1].split('.')[0] + fileBodyList=list() + fileContent = list() + try: + print() + print('-'*50) + print('Apro il file: {}'.format(fileBaseName)) + fp = open(f,mode='r', encoding=settings['encoding']) + fileContent=fp.readlines() #leggi le linee del file + fp.close() + except IOError as e: + print(Fore.LIGHTRED_EX, 'OOPS! Qualcosa e\' andato storto, non riesco ad aprire il file: {}'.format(fileBaseName), Style.RESET_ALL) + continue + + print('Elimino le righe vuote...') + for ll in fileContent: + for c in delChars: + ll=ll.replace(c,'') + if ll not in ['\n','\r']: + tempContent.append(ll) + fileContent=deepcopy(tempContent) + tempContent=list() + + print('Individuo il contenuto...') + docNumber = 0 + docSkipped = 0 + bodyCounter = 0 + duplicateNumber = 0 + docDate = {} + prevLine = '' + newsPaperName = '' + titleBegin = False + + for l in fileContent: #per ogni linea del file + lineWords=l.lstrip().split(' ') #dividi la riga in parole + if s==status['first']: + try: + #prendo il numero di documento per vedere se ci sono buchi + try: + nn = parse.parse("{current:d} Of {total} Documents",l.strip().capitalize()).named + if nn["current"]-docNumber==1: + pass + else: + if settings["showSkipped"]: + print(Fore.LIGHTRED_EX,"OOPS!! Il conto dei documenti non torna! LexisNexis \ + ne ha saltato qualcuno!\nPrecedente:{0}-Attuale:{1}".format(docNumber,nn["current"]), Style.RESET_ALL) + docSkipped+=1 + docNumber = nn["current"] + except: + pass + if (lineWords[settings['monthPosition']]).capitalize() in docParams['dateWords']: + try: + docDate=parse.parse(docParams['dateFormat'],l).named + docDate['month']=docDate['month'].lstrip().rstrip().capitalize() + docDate['month']=docParams['dateWords'].index(docDate['month'])+1 + title = '' + titleBegin=True + if settings['getNewsPaperName']: + try: + if prevLine.split(' ')[0].strip().isalpha(): + newsPaperName = prevLine.strip() + else: + newsPaperName = settings['nameNotFoundStr'] + except: + print(Fore.LIGHTRED_EX, "OOPS! E' successo qualcosa mentre stavo cercando il nome della pubblicazione,\ + controlla i file di uscita! \n\t[{}]".format(prevLine.strip()), Style.RESET_ALL) + else: + newsPaperName = settings['nameNotFoundStr'] + except: + print(Fore.LIGHTRED_EX, "OOPS! Ho trovato una riga ambigua.. potrebbe essere una data ma non so: \n\t[{}]". format(l.strip('\r\n')), + Style.RESET_ALL) + elif lineWords[0] in docParams['headWords']: + s=status['head'] + newDoc=deepcopy(doc) + newDoc['title']=title + newDoc['date']=docDate + newDoc['newsPaperName'] = newsPaperName + titleBegin=False + else: + if titleBegin: + title += l.strip().capitalize() + except IndexError: + print (Fore.LIGHTRED_EX, 'OOPS! Errore inaspettato, contatta il tuo sviluppatore di fiducia!', Style.RESET_ALL) + elif s==status['head']: + if lineWords[0] not in docParams['headWords']: #se la prima parola non e' tra quelle di inizio + tempContent.append(l) # vuol dire che ho trovato l'articolo + s=status['body'] + elif s==status['body']: + if not lineWords[0] in docParams['tailWords']: #se la prima parola non e' tra quelle di fine + if settings['delLF']: + tempContent.append(l.strip('\n')) #allora sto leggendo l'articolo + else: + tempContent.append(l) + else: + s=status['tail'] + anomaly = False + if docSep.match(l) is not None: #controlla se ci sono articoli che non hanno le parole chiave finali + print(Fore.YELLOW, "HEY! Ho individuato una separatore valido prima che si chiusesse l'articolo precedente, controlla i tuoi file in uscita!\n\ + L'errore dovrebbe essere intorno all'articolo {} ma non sono sicuro! \n\t\t[{}]".format(docNumber, + l.strip()), Style.RESET_ALL) + s=status['tail'] + anomaly = True + elif s==status['tail']: #cerco il separatore di articoli e aggiungo quello letto alla lista + if docSep.match(l) is not None or anomaly: + s=status['first'] + anomaly = False + if settings['delWordBreak']: + tempContent=[ll.replace('-\n', '') for ll in tempContent] + newDoc['content']=deepcopy(''.join(tempContent)) + fileBodyList.append(deepcopy(newDoc)) + tempContent=list() + bodyCounter +=1 + else: + pass + prevLine=l + pass + print ('Nel file ho trovato {0} articoli..'.format(bodyCounter)) + if docSkipped > 0: + print (Fore.YELLOW, 'Attentione, LexisNexis ne ha saltati {} !!!'.format(docSkipped), Style.RESET_ALL) + print() + + if bodyCounter >= 1: + if settings['removeDuplicates']: + print ('Controllo se ci sono dei duplicati..') + titleList=[] + duplicateList=[] + duplicateNumber=0 + for idx, ff in enumerate(fileBodyList): + if ff['title'] not in titleList: + titleList.append(ff['title']) + ff['duplicate']=False + fileBodyList[idx]=ff + pass + else: + if ff['title'] not in duplicateList: + duplicateList.append(ff['title']) + if settings['showRemovedDuplicates']: + print ('Duplicato: {}'.format(ff['title'].strip())) + ff['duplicate'] = True + fileBodyList[idx]=ff + duplicateNumber+=1 + print ('Ho rimosso {} duplicati di {} articoli..\n'. format(duplicateNumber, len(duplicateList))) + else: + for idx, ff in enumerate(fileBodyList): + ff['duplicate']=False + fileBodyList[idx]=ff + print('Salto il controllo dei duplicati..') + pass + if settings['saveSeparateFiles']: + print ('Salvo gli articoli in file separati...') + print ('Persorso: {0}'.format(cfg['OUTworkPath'].format('nomeFile'))) + for ff in fileBodyList: + try: + if ff['duplicate'] == False: + fileName=cfg['OUTnameFormat'].format(title=slugify(ff['title'][:settings['maxTitleLen']]),\ + filename=slugify(fileBaseName),\ + docnum=fileBodyCounter,\ + papername=ff['newsPaperName'].strip(),\ + **ff['date']) + out=open(cfg['OUTworkPath']+'{0}'.format(fileName),'wb') + if settings['includeTitle']: + ff['content'] = ff['title']+os.linesep+ff['content'] + out.write(ff['content'].encode(settings['encoding'])) + out.close() + fileBodyCounter+=1 + except IOError as e: + print(Fore.LIGHTRED_EX, 'OOPS! Qualcosa e\' andato storto, non riesco a scrivere il file: {}'.format(e), Style.RESET_ALL) + continue + if settings['saveBodyFile']: + print('Salvo gli articoli in un singolo file vicino agli originali...') + print ('Persorso: {0}'.format(cfg['OUTworkPath'].format('nomeFile'))) + try: + fileName=slugify(fileBaseName) + fileName='BODYFILE_{0}_{1}.txt'.format(fileCounter,fileName[:settings['maxTitleLen']]) + fileContent = os.linesep.join([cc['content'] for cc in fileBodyList]) + out=open(cfg['OUTworkPath']+'{0}'.format(fileName),'wb') + out.write(fileContent.encode(settings['encoding'])) + out.close() + except IOError as e: + print(Fore.LIGHTRED_EX, 'OOPS! Qualcosa e\' andato storto, non riesco a scrivere il file: {}'.format(e), Style.RESET_ALL) + continue + fileCounter+=1 + totSkipped+=docSkipped + totFound+=bodyCounter-duplicateNumber + else: + print('[{0}] non contiene articoli, \n controlla meglio le parole chiave! SGRUNT'.format(fileBaseName)) + print('-'*50) + +print() +print (Fore.LIGHTYELLOW_EX, 'bananaSPLIT ha concluso con successo in {0:1.3f} secondi, \n\ + trovando {1} articoli in {2} file; goditi il tuo dessert!'.format(time.time()-lastTime,totFound,len(files)), Style.RESET_ALL) +input() diff --git a/bananaSPLIT/printutils.py b/bananaSPLIT/printutils.py new file mode 100644 index 0000000..79a57f4 --- /dev/null +++ b/bananaSPLIT/printutils.py @@ -0,0 +1,5 @@ +''' +Created on 27 nov 2018 + +@author: Emanuele +'''