commit 452578c56b03bf7be2f1db384b8b9202022fbd3e
Author: Emanuele <ema.trabattoni@gmail.com>
Date:   Wed Oct 9 19:42:17 2019 +0200

    Commit iniziale per una nuova era di bananaSPLIT

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..794e2f4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+/.DS_Store
+/.project
+/.pydevproject
diff --git a/bananaSPLIT/.gitignore b/bananaSPLIT/.gitignore
new file mode 100644
index 0000000..cb94557
--- /dev/null
+++ b/bananaSPLIT/.gitignore
@@ -0,0 +1,6 @@
+/@Archivio/
+/build/
+/dist/
+/bananaSPLIT.exe.spec
+*.py[ocd]
+/.DS_Store
diff --git a/bananaSPLIT/87537783-banana-icon-cartoon-illustration-of-banana-vector-icon-for-web.xcf b/bananaSPLIT/87537783-banana-icon-cartoon-illustration-of-banana-vector-icon-for-web.xcf
new file mode 100644
index 0000000..7f223e0
Binary files /dev/null and b/bananaSPLIT/87537783-banana-icon-cartoon-illustration-of-banana-vector-icon-for-web.xcf differ
diff --git a/bananaSPLIT/Manuali/Pdf/bananaMANUAL-v0.2.pdf b/bananaSPLIT/Manuali/Pdf/bananaMANUAL-v0.2.pdf
new file mode 100644
index 0000000..2d531e6
Binary files /dev/null and b/bananaSPLIT/Manuali/Pdf/bananaMANUAL-v0.2.pdf differ
diff --git a/bananaSPLIT/Manuali/Pdf/bananaMANUAL-v0.3a.pdf b/bananaSPLIT/Manuali/Pdf/bananaMANUAL-v0.3a.pdf
new file mode 100644
index 0000000..48b75f4
Binary files /dev/null and b/bananaSPLIT/Manuali/Pdf/bananaMANUAL-v0.3a.pdf differ
diff --git a/bananaSPLIT/Manuali/bananaMANUAL-v0.1.docx b/bananaSPLIT/Manuali/bananaMANUAL-v0.1.docx
new file mode 100644
index 0000000..d8a2848
Binary files /dev/null and b/bananaSPLIT/Manuali/bananaMANUAL-v0.1.docx differ
diff --git a/bananaSPLIT/Manuali/bananaMANUAL-v0.2.docx b/bananaSPLIT/Manuali/bananaMANUAL-v0.2.docx
new file mode 100644
index 0000000..9691e8d
Binary files /dev/null and b/bananaSPLIT/Manuali/bananaMANUAL-v0.2.docx differ
diff --git a/bananaSPLIT/Manuali/bananaMANUAL-v0.3a.docx b/bananaSPLIT/Manuali/bananaMANUAL-v0.3a.docx
new file mode 100644
index 0000000..d43461d
Binary files /dev/null and b/bananaSPLIT/Manuali/bananaMANUAL-v0.3a.docx differ
diff --git a/bananaSPLIT/Manuali/bananaMANUAL.docx b/bananaSPLIT/Manuali/bananaMANUAL.docx
new file mode 100644
index 0000000..bf02c72
Binary files /dev/null and b/bananaSPLIT/Manuali/bananaMANUAL.docx differ
diff --git a/bananaSPLIT/__init__.py b/bananaSPLIT/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/bananaSPLIT/banana.ico b/bananaSPLIT/banana.ico
new file mode 100644
index 0000000..7f8d32a
Binary files /dev/null and b/bananaSPLIT/banana.ico differ
diff --git a/bananaSPLIT/bananaconfEN.json b/bananaSPLIT/bananaconfEN.json
new file mode 100644
index 0000000..282714e
--- /dev/null
+++ b/bananaSPLIT/bananaconfEN.json
@@ -0,0 +1,64 @@
+{
+	"INworkPath": "D:\\Test\\",
+	"OUTworkPath": "D:\\Test\\Separati\\",
+	"OUTnameFormat":"TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt",
+	"docStruct": {
+		"docSep": "\\s*Copyright [(0-9)]+",
+		"dateFormat":"{month} {day:d}, {year:d}{}",
+		"dateWords": [
+			"January",
+			"February",
+			"March",
+			"April",
+			"May",
+			"June",
+			"July",
+			"August",
+			"September",
+			"October",
+			"November",
+			"December"
+		],
+		"headWords": [
+			"BYLINE:",
+			"SECTION:",
+			"LENGTH:",
+			"DATELINE:",
+			"HIGHLIGHT:",
+			"Email:"
+		],
+		"tailWords": [
+			"Newstex ID" ,
+			"NOTES",
+			"LANGUAGE:",
+			"GRAPHIC:",
+			"TYPE:",
+			"URL:",
+			"LOAD-DATE:",
+			"PUBLICATION-TYPE:",
+			"DOCUMENT-TYPE:",
+			"CHARTS:",
+			"JOURNAL-CODE:"
+		]
+	},
+	"settings": {
+		"encoding": "utf-8",
+		"monthPosition": 0,
+		"getNewsPaperName": true,
+		"nameNotFoundStr" : "--ND--",
+		"includeTitle" : true,
+		"removeDuplicates": true,
+		"showSkipped": false,
+		"showRemovedDuplicates": true,
+		"maxTitleLen": 32,
+		"loadTXT": true,
+		"loadDOCX": false,
+		"removeOldFiles":true,
+		"saveSeparateFiles": true,
+		"saveBodyFile": true,
+		"saveBodyNumber":true,
+		"delLF": false,
+		"delWordBreak": true,
+		"delChars": "'|@|#"
+	}
+}
diff --git a/bananaSPLIT/bananaconfITA.json b/bananaSPLIT/bananaconfITA.json
new file mode 100644
index 0000000..8c1a07c
--- /dev/null
+++ b/bananaSPLIT/bananaconfITA.json
@@ -0,0 +1,62 @@
+{
+	"INworkPath": "C:\\Test\\",
+	"OUTworkPath": "C:\\Test\\Separati\\",
+	"OUTnameFormat":"TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{papername}_{title}.txt",
+	"docStruct": {
+		"docSep": "Copyright [(0-9)]+",
+		"dateFormat":"{day:d} {month} {year:d} {}",
+		"dateWords": [
+			"Gennaio",
+			"Febbraio",
+			"Marzo",
+			"Aprile",
+			"Maggio",
+			"Giugno",
+			"Luglio",
+			"Agosto",
+			"Settembre",
+			"Ottobre",
+			"Novembre",
+			"Dicembre"
+		],
+		"headWords": [
+			"BYLINE:",
+			"SECTION:",
+			"LENGTH:",
+			"DATELINE:",
+			"HIGHLIGHT:",
+			"Email:"
+		],
+		"tailWords": [
+			"LANGUAGE:",
+			"GRAPHIC:",
+			"TYPE:",
+			"URL:",
+			"LOAD-DATE:",
+			"PUBLICATION-TYPE:",
+			"DOCUMENT-TYPE:",
+			"CHARTS:"
+		]
+	},
+	"settings":
+	{
+		"encoding": "utf-8",
+		"monthPosition": 0,
+		"getNewsPaperName": true,
+		"nameNotFoundStr" : "--ND--",
+		"includeTitle" : true,
+		"removeDuplicates": true,
+		"showSkipped": false,
+		"showRemovedDuplicates": true,
+		"maxTitleLen": 32,
+		"loadTXT": true,
+		"loadDOCX": false,
+		"removeOldFiles":true,
+		"saveSeparateFiles": true,
+		"saveBodyFile": true,
+		"saveBodyNumber":true,
+		"delLF": false,
+		"delWordBreak": true,
+		"delChars": "'|@|#"
+	}
+}
diff --git a/bananaSPLIT/main.py b/bananaSPLIT/main.py
new file mode 100644
index 0000000..22db218
--- /dev/null
+++ b/bananaSPLIT/main.py
@@ -0,0 +1,330 @@
+'''
+Created on 27 nov 2018
+
+@author: Emanuele
+'''
+import json
+import time
+import os
+import sys
+import re
+import parse
+
+from colorama import Fore, Style, init
+from pprint import pprint
+from glob import glob
+from copy import deepcopy
+from slugify import slugify
+
+     
+####### VAR GLOBALI #######
+
+####### FUNZIONI GLOBALI #######        
+def printTitle():
+    print(".  .   .                 ,     \n|  | _ | _. _ ._ _  _   -+- _  *    \n|/\|(/,|(_.(_)[ | )(/,   | (_) *")
+
+    print(Fore.LIGHTYELLOW_EX,\
+    '    _                                   ___________ _     _____ _____     \n\
+    | |                                 /  ___| ___ \ |   |_   _|_   _|        \n\
+    | |__   __ _ _ __   __ _ _ __   __ _\ `--.| |_/ / |     | |   | |          \n\
+    | \'_ \ / _` | \'_ \ / _` | \'_ \ / _` |`--. \  __/| |     | |   | |       \n\
+    | |_) | (_| | | | | (_| | | | | (_| /\__/ / |   | |_____| |_  | |          \n\
+    |_.__/ \__,_|_| |_|\__,_|_| |_|\__,_\____/\_|   \_____/\___/  \_/')
+    
+    print ('\t\t\t\t\t\t\tVersione 0.4a\n\n', Style.RESET_ALL)
+    print('Iniziamo!!')
+    pass
+    
+#############################################################
+####################### MAIN ################################
+#############################################################
+init(convert=True)
+printTitle()
+status={'first':0, 'head':1, 'body':2, 'tail':3}
+s=status['first']
+
+try:
+    conffiles = glob('*.json')
+    if len(conffiles) < 1:
+        raise 
+    fileValid = False
+    while not fileValid:
+        print ('Seleziona un file di configurazione per la lingua:')
+        for cf in enumerate(conffiles):
+            print('[{0}] - {1}'.format(cf[0],cf[1]))
+        try:
+            fn = int(input('Scrivi il numero del file e premi Invio: '))
+            if (fn > len(conffiles)-1 or fn < 0):
+                raise
+            fileValid = True
+        except:
+            print('Scusa, non ho capito bene, ricominciamo..\n')
+            fileValid = False
+except:
+    print(Fore.LIGHTRED_EX, 'OOPS!! File di configurazione non selezionato o non presente..', Style.RESET_ALL)
+    input()
+    sys.exit()
+
+try:
+    print('\nApro il file di configurazione [{}]...'.format(conffiles[fn]))
+    fp = open(conffiles[fn],'r')
+    cfg = json.load(fp)
+    fp.close
+    try:
+        print('Carico i parametri...')
+        docParams=cfg['docStruct']
+        basePath=cfg['INworkPath']+"{0}.{1}"
+        settings=cfg['settings']
+        delChars=settings['delChars'].split('|')
+        docSep=re.compile(docParams['docSep'])
+    except:
+        print(Fore.LIGHTRED_EX,'OOPS! Qualcosa e\' andato storto, non riesco a caricare la configurazione, controlla la sintassi! :)', Style.RESET_ALL)
+        input()
+        sys.exit()
+except IOError as e:
+    print(Fore.LIGHTRED_EX, 'OOPS! Qualcosa e\' andato storto, non trovo il file di configurazione: {}'.format(e), Style.RESET_ALL)
+    input()
+    sys.exit()
+
+files=[]
+try:
+    print('Ottengo la lista dei file da separare...')
+    if settings['loadTXT']:
+        files+=glob(basePath.format('*','txt'))
+    if settings['loadDOCX']:
+        print(Fore.LIGHTRED_EX, 'OOPS! Scusa ma non posso accontentarti, per ora non so leggere i file DOCX.. :(', Style.RESET_ALL)
+        input()
+        sys.exit()
+        #files+=glob(basePath.format('*','docx'))
+    if len(files)<=0:
+        raise 
+    pprint(files)
+    while True:
+        r=input('\nVuoi davvero bananaSPLITTARE questi documenti? [y/n]:')
+        r.strip()
+        if r=='Y' or r =='y':
+            break
+        elif r=='n' or r=='N':
+            print('OK! Nessun problema, ci vediamo dopo :)')
+            input()
+            sys.exit()
+        else:
+            print('Non ho capito la risposta, sii un po\' piu\' specifico... [y/n]\n')
+            pass
+    if settings['removeOldFiles']:
+            print("Rimuovo i vecchi file dalla cartella di destinazione..")
+            for x in glob(cfg['OUTworkPath']+'*.txt'):
+                os.remove(x)
+except:
+    print(Fore.LIGHTRED_EX, 'OOPS! Qualcosa e\' andato storto, non riesco a trovare nessun file da leggere :(', Style.RESET_ALL)
+    input()
+    sys.exit()
+    
+lastTime=time.time()  # start of the timed section reported in the final summary
+fileCounter = 1  # numbers the BODYFILE_* outputs
+fileBodyCounter = 1  # 'docnum' field of the per-article output file names
+totFound=0
+totSkipped=0
+
+# for each input file: read it, split it into articles, then write the results
+for f in files:
+    s=status['first']
+    prevLine=''
+    doc={'title':'',
+          'date':'', 
+          'content':[]
+          }
+    fileContent=''
+    tempContent=list()
+    fileBaseName=os.path.split(f)[1].split('.')[0]  # NOTE(review): keeps only the text before the FIRST dot of the file name
+    fileBodyList=list()
+    fileContent = list()
+    try:
+        print()
+        print('-'*50)
+        print('Apro il file: {}'.format(fileBaseName))
+        fp = open(f,mode='r', encoding=settings['encoding'])
+        fileContent=fp.readlines() #read the lines of the file
+        fp.close()
+    except IOError as e:
+        print(Fore.LIGHTRED_EX, 'OOPS! Qualcosa e\' andato storto, non riesco ad aprire il file: {}'.format(fileBaseName), Style.RESET_ALL)
+        continue
+    # Remove the configured 'delChars' from every line and drop the lines that are then just '\n' or '\r'.
+    print('Elimino le righe vuote...')
+    for ll in fileContent:
+        for c in delChars:
+            ll=ll.replace(c,'')
+        if ll not in  ['\n','\r']: 
+            tempContent.append(ll)            
+    fileContent=deepcopy(tempContent)
+    tempContent=list()
+    # Line-by-line state machine: first -> head -> body -> tail -> first (values in 'status').
+    print('Individuo il contenuto...')
+    docNumber = 0
+    docSkipped = 0
+    bodyCounter = 0
+    duplicateNumber = 0
+    docDate = {}
+    prevLine = ''
+    newsPaperName = ''
+    titleBegin = False
+
+    for l in fileContent: #for each line of the file
+        lineWords=l.lstrip().split(' ') #split the line into words
+        if s==status['first']:
+            try:
+                #read the document number to detect gaps in the numbering
+                try:
+                    nn = parse.parse("{current:d} Of {total} Documents",l.strip().capitalize()).named
+                    if nn["current"]-docNumber==1:
+                        pass
+                    else:
+                        if settings["showSkipped"]:
+                            print(Fore.LIGHTRED_EX,"OOPS!! Il conto dei documenti non torna! LexisNexis \
+                            ne ha saltato qualcuno!\nPrecedente:{0}-Attuale:{1}".format(docNumber,nn["current"]), Style.RESET_ALL)
+                        docSkipped+=1
+                    docNumber = nn["current"]
+                except:
+                    pass
+                if (lineWords[settings['monthPosition']]).capitalize() in docParams['dateWords']:
+                    try:
+                        docDate=parse.parse(docParams['dateFormat'],l).named
+                        docDate['month']=docDate['month'].lstrip().rstrip().capitalize()
+                        docDate['month']=docParams['dateWords'].index(docDate['month'])+1
+                        title = ''
+                        titleBegin=True
+                        if settings['getNewsPaperName']:
+                            try:
+                                if prevLine.split(' ')[0].strip().isalpha():
+                                    newsPaperName = prevLine.strip()
+                                else:
+                                    newsPaperName = settings['nameNotFoundStr']
+                            except:
+                                print(Fore.LIGHTRED_EX, "OOPS! E' successo qualcosa mentre stavo cercando il nome della pubblicazione,\
+                                controlla i file di uscita! \n\t[{}]".format(prevLine.strip()), Style.RESET_ALL)
+                        else:
+                            newsPaperName = settings['nameNotFoundStr']
+                    except:
+                        print(Fore.LIGHTRED_EX, "OOPS! Ho trovato una riga ambigua.. potrebbe essere una data ma non so: \n\t[{}]". format(l.strip('\r\n')), 
+                              Style.RESET_ALL)
+                elif lineWords[0] in docParams['headWords']:
+                    s=status['head']
+                    newDoc=deepcopy(doc)
+                    newDoc['title']=title  # NOTE(review): 'title' is only assigned once a date line was parsed - confirm a head keyword can never come first
+                    newDoc['date']=docDate
+                    newDoc['newsPaperName'] = newsPaperName
+                    titleBegin=False
+                else:
+                    if titleBegin:
+                        title += l.strip().capitalize()
+            except IndexError:
+                print (Fore.LIGHTRED_EX, 'OOPS! Errore inaspettato, contatta il tuo sviluppatore di fiducia!', Style.RESET_ALL)
+        elif s==status['head']:
+            if lineWords[0] not in docParams['headWords']:  #if the first word is not one of the head keywords
+                tempContent.append(l)                       # then the article body has started
+                s=status['body']        
+        elif s==status['body']: 
+            if not lineWords[0] in docParams['tailWords']: #if the first word is not one of the tail keywords
+                if settings['delLF']:
+                    tempContent.append(l.strip('\n'))           #then this line is still part of the article
+                else:
+                    tempContent.append(l)
+            else:
+                s=status['tail']
+                anomaly = False
+            if docSep.match(l) is not None: #check for articles that lack the closing keywords
+                print(Fore.YELLOW, "HEY! Ho individuato una separatore valido prima che si chiusesse l'articolo precedente, controlla i tuoi file in uscita!\n\
+                L'errore dovrebbe essere intorno all'articolo {} ma non sono sicuro! \n\t\t[{}]".format(docNumber,
+                                                                                                        l.strip()), Style.RESET_ALL)
+                s=status['tail'] 
+                anomaly = True
+        elif s==status['tail']: #look for the article separator and append the article just read to the list
+            if docSep.match(l) is not None or anomaly:
+                s=status['first']
+                anomaly = False
+                if settings['delWordBreak']:
+                    tempContent=[ll.replace('-\n', '') for ll in tempContent]
+                newDoc['content']=deepcopy(''.join(tempContent))
+                fileBodyList.append(deepcopy(newDoc))
+                tempContent=list()
+                bodyCounter +=1
+        else:
+            pass 
+        prevLine=l     
+    pass
+    print ('Nel file ho trovato {0} articoli..'.format(bodyCounter))
+    if docSkipped > 0:
+        print (Fore.YELLOW, 'Attentione, LexisNexis ne ha saltati {} !!!'.format(docSkipped), Style.RESET_ALL)
+    print()
+    # Post-processing: flag duplicate titles (if enabled), then write the outputs selected in 'settings'.
+    if bodyCounter >= 1:
+        if settings['removeDuplicates']:
+            print ('Controllo se ci sono dei duplicati..')
+            titleList=[]
+            duplicateList=[]
+            duplicateNumber=0
+            for idx, ff in enumerate(fileBodyList):
+                if ff['title'] not in titleList:
+                    titleList.append(ff['title'])
+                    ff['duplicate']=False
+                    fileBodyList[idx]=ff
+                    pass
+                else:
+                    if ff['title'] not in duplicateList:
+                        duplicateList.append(ff['title'])
+                        if settings['showRemovedDuplicates']:
+                            print ('Duplicato: {}'.format(ff['title'].strip()))
+                    ff['duplicate'] = True
+                    fileBodyList[idx]=ff
+                    duplicateNumber+=1
+            print ('Ho rimosso {} duplicati di {} articoli..\n'. format(duplicateNumber, len(duplicateList)))
+        else:
+            for idx, ff in enumerate(fileBodyList):
+                ff['duplicate']=False
+                fileBodyList[idx]=ff
+            print('Salto il controllo dei duplicati..')
+            pass
+        if settings['saveSeparateFiles']:
+            print ('Salvo gli articoli in file separati...')
+            print ('Persorso: {0}'.format(cfg['OUTworkPath'].format('nomeFile')))
+            for ff in fileBodyList:
+                try:
+                    if ff['duplicate'] == False:
+                        fileName=cfg['OUTnameFormat'].format(title=slugify(ff['title'][:settings['maxTitleLen']]),\
+                                                             filename=slugify(fileBaseName),\
+                                                             docnum=fileBodyCounter,\
+                                                             papername=ff['newsPaperName'].strip(),\
+                                                             **ff['date'])
+                        out=open(cfg['OUTworkPath']+'{0}'.format(fileName),'wb')
+                        if settings['includeTitle']:
+                            ff['content'] = ff['title']+os.linesep+ff['content']  # NOTE(review): mutates the entry, so the body file below gets the title too - confirm intended
+                        out.write(ff['content'].encode(settings['encoding']))
+                        out.close()
+                        fileBodyCounter+=1
+                except IOError as e:
+                    print(Fore.LIGHTRED_EX, 'OOPS! Qualcosa e\' andato storto, non riesco a scrivere il file: {}'.format(e), Style.RESET_ALL)
+                    continue
+        if settings['saveBodyFile']:
+            print('Salvo gli articoli in un singolo file vicino agli originali...')
+            print ('Persorso: {0}'.format(cfg['OUTworkPath'].format('nomeFile')))
+            try:
+                fileName=slugify(fileBaseName)
+                fileName='BODYFILE_{0}_{1}.txt'.format(fileCounter,fileName[:settings['maxTitleLen']])
+                fileContent = os.linesep.join([cc['content'] for cc in fileBodyList])
+                out=open(cfg['OUTworkPath']+'{0}'.format(fileName),'wb')
+                out.write(fileContent.encode(settings['encoding']))
+                out.close()
+            except IOError as e:
+                print(Fore.LIGHTRED_EX, 'OOPS! Qualcosa e\' andato storto, non riesco a scrivere il file: {}'.format(e), Style.RESET_ALL)
+                continue  # NOTE(review): jumps to the next input file, skipping the counter updates just below
+        fileCounter+=1
+        totSkipped+=docSkipped
+        totFound+=bodyCounter-duplicateNumber
+    else:
+        print('[{0}] non contiene articoli, \n controlla meglio le parole chiave! SGRUNT'.format(fileBaseName))
+    print('-'*50)
+
+print()
+print (Fore.LIGHTYELLOW_EX, 'bananaSPLIT ha concluso con successo in {0:1.3f} secondi, \n\
+ trovando {1} articoli in {2} file; goditi il tuo dessert!'.format(time.time()-lastTime,totFound,len(files)), Style.RESET_ALL)
+input()
diff --git a/bananaSPLIT/printutils.py b/bananaSPLIT/printutils.py
new file mode 100644
index 0000000..79a57f4
--- /dev/null
+++ b/bananaSPLIT/printutils.py
@@ -0,0 +1,5 @@
+'''
+Created on 27 nov 2018
+
+@author: Emanuele
+'''