46 Commits

Author SHA1 Message Date
b6000c49fd Il test lancia correttamente i moduli ma siamo tornati agli errori di
parsing, debug necessario...
2019-12-01 17:44:14 +01:00
851dcf103d correzioni alle librerie 2019-12-01 17:17:48 +01:00
4118ed82c3 aggiornate le configurazioni con path file di test 2019-12-01 17:17:37 +01:00
435bb144ab aggiunti file di test 2019-12-01 17:17:16 +01:00
ccc06bc5f1 corretto un typo stupidissimo che mi ha fatto perdere un sacco di tempo 2019-12-01 17:04:55 +01:00
ef91f8d26a iniziano test librerie, modifiche file configurazione di test 2019-12-01 16:50:15 +01:00
fe75d99736 Inizia il testing della libreria 2019-12-01 16:21:23 +01:00
a4e550797c Aggiunta cartella TestFiles 2019-11-24 16:13:58 +01:00
6e83ce559d Cambiamenti che non mi ricordo, mi hanno distratto nel mentre 2019-11-24 12:48:30 +01:00
04a32c7bd1 weaponizzate le funzioni di salva file singolo e body 2019-11-24 12:32:23 +01:00
e6fb04a9e7 modifiche generiche 2019-11-23 15:11:44 +01:00
b53fb27a60 Porting del codice dal programma originale a libsplit
Poco Refactoring, il codice e' pressoche invariato e andra' sistemato in
futuro.
Predisposizione al multi-threading
2019-11-11 22:49:59 +01:00
0bbb1947ee Prima scrittura libSPLIT 2019-11-10 20:01:47 +01:00
7ca177f29a aggiunta getFiles 2019-11-10 19:50:48 +01:00
1e8867182f Finalizzato caricamento file
aggiunta proprieta' in use per definire il file in uso attualmente,
modificabile a runtime per ogni file ingress
2019-11-10 19:49:23 +01:00
a798f29b73 Prima versione gestore configurazioni 2019-11-10 12:49:49 +01:00
59c41cc3ba Inizio scrittura confloader
integrazione del fancylogger e settings nel file di test
2019-11-02 19:53:27 +01:00
18566dda2c modifiche varie 2019-11-02 18:43:20 +01:00
59f11a2f8b Elimina i caratteri speciali dal file logger 2019-11-02 14:32:44 +01:00
dfcbef7ae4 FancyLogger funzionante e file di configurazione test 2019-11-02 14:29:48 +01:00
0502d77c14 Primo commit della libsplit 2019-11-02 13:54:08 +01:00
b7af353ab8 finestra selezione output limiti misure 2019-10-30 22:06:52 +01:00
3f5ef6c103 finestra selezione output base 2019-10-30 22:02:06 +01:00
21e7a29c33 Merge branch 'gui-devel' of https://git.etss.it/12Parsec/bananaSPLIT.git into gui-devel 2019-10-30 22:00:30 +01:00
d01afc2ab3 finestra selezione output base 2019-10-30 21:58:52 +01:00
9537530a34 Merge branch 'gui-devel' of https://git.etss.it/12Parsec/bananaSPLIT.git into gui-devel 2019-10-30 21:49:48 +01:00
c401518db5 eliminato confGiulia 2019-10-30 21:49:32 +01:00
22413c3fb2 Main window aggiornata 2019-10-30 21:42:42 +01:00
bfbd0e8c79 ritorno alla normalità 2019-10-30 21:27:38 +01:00
af880df5b3 test commit niccolo 2019-10-30 21:23:27 +01:00
Giulia
3624d21e90 Cartella con file configurazione su PC Giulia 2019-10-30 18:27:35 +01:00
Giulia
3c1aad543e Main window. Prima versione completa. 2019-10-30 18:23:39 +01:00
Giulia
c5ee729683 Sistemato pulsanti cartelle 2019-10-30 18:09:41 +01:00
56a509a5e6 Ricompilato versione 0.4b 2019-10-30 17:26:36 +01:00
84a38d0cb3 fixata indentazione titolo, aggiunti file conf alla cartella di dist 2019-10-30 17:10:16 +01:00
8ec7e5e835 spostata l'eliminazione dei vecchi file in un altro try 2019-10-30 16:59:05 +01:00
4f05729782 spostati i file di config nella dir del progetto 2019-10-30 16:55:30 +01:00
d7321a5041 riaggiornato gitignore 2019-10-30 15:52:44 +01:00
a29aa4da91 rimossa cartella build 2019-10-30 15:52:01 +01:00
170959e529 aggiornato gitignore 2019-10-30 15:50:23 +01:00
3fa74f6ed9 rebuild exe 2019-10-30 15:47:58 +01:00
2f1236449a Corretta la rimozione dei file 2019-10-30 15:44:49 +01:00
Giulia
f43ed84d72 Prima prova.
Label cartelle sorgente e destinatario con pushbtn/icona che non so
mettere
2019-10-28 20:54:10 +01:00
4057b31203 Aggiornate Abbreviazioni 2019-10-27 19:33:43 +01:00
Giulia
0ddd8af3bc tolta icona 2019-10-27 19:22:40 +01:00
eec006f6c5 Primo commit branch interfaccia grafica 2019-10-27 19:04:18 +01:00
27 changed files with 102841 additions and 26 deletions

2
.gitignore vendored
View File

@@ -1,3 +1,5 @@
/.DS_Store /.DS_Store
/.project /.project
/.pydevproject /.pydevproject
/org.eclipse.core.resources.prefs
bananaSPLIT/build

1
.settings/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
/org.eclipse.core.resources.prefs

12012
TestFiles/GUARDIAN 1989.txt Normal file

File diff suppressed because it is too large Load Diff

13811
TestFiles/GUARDIAN 1993.txt Normal file

File diff suppressed because it is too large Load Diff

22792
TestFiles/NYT 1990.txt Normal file

File diff suppressed because it is too large Load Diff

23149
TestFiles/NYT 1994.txt Normal file

File diff suppressed because it is too large Load Diff

30024
TestFiles/NYT 1997.txt Normal file

File diff suppressed because it is too large Load Diff

1
TestFiles/README Normal file
View File

@@ -0,0 +1 @@
Qui si mettono i file di test per la versione GUI di bananaSPLIT

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 45 KiB

After

Width:  |  Height:  |  Size: 79 KiB

View File

@@ -0,0 +1,121 @@
<?xml version="1.0" encoding="UTF-8"?>
<ui version="4.0">
<class>Dialog</class>
<widget class="QDialog" name="Dialog">
<property name="geometry">
<rect>
<x>0</x>
<y>0</y>
<width>509</width>
<height>303</height>
</rect>
</property>
<property name="minimumSize">
<size>
<width>509</width>
<height>303</height>
</size>
</property>
<property name="windowTitle">
<string>Dialog</string>
</property>
<widget class="QWidget" name="verticalLayoutWidget">
<property name="geometry">
<rect>
<x>20</x>
<y>80</y>
<width>251</width>
<height>141</height>
</rect>
</property>
<layout class="QVBoxLayout" name="verticalLayout">
<item>
<widget class="QRadioButton" name="rad_MainbodyEFile">
<property name="text">
<string>Main body + File singoli</string>
</property>
</widget>
</item>
<item>
<widget class="QRadioButton" name="rad_File">
<property name="text">
<string>File singoli</string>
</property>
</widget>
</item>
<item>
<widget class="QRadioButton" name="rad_Mainbody">
<property name="text">
<string>Main body</string>
</property>
</widget>
</item>
</layout>
</widget>
<widget class="QLabel" name="lbl_indicazioni">
<property name="geometry">
<rect>
<x>30</x>
<y>20</y>
<width>401</width>
<height>41</height>
</rect>
</property>
<property name="text">
<string>Seleziona i file che il programma deve generare</string> <!-- TODO: final wording of this label still to be decided -->
</property>
</widget>
<widget class="QWidget" name="horizontalLayoutWidget">
<property name="geometry">
<rect>
<x>330</x>
<y>230</y>
<width>160</width>
<height>51</height>
</rect>
</property>
<layout class="QHBoxLayout" name="horizontalLayout">
<item>
<widget class="QPushButton" name="btn_Indietro">
<property name="minimumSize">
<size>
<width>75</width>
<height>0</height>
</size>
</property>
<property name="maximumSize">
<size>
<width>75</width>
<height>16777215</height>
</size>
</property>
<property name="text">
<string>Indietro</string>
</property>
</widget>
</item>
<item>
<widget class="QPushButton" name="btn_Avanti">
<property name="minimumSize">
<size>
<width>75</width>
<height>0</height>
</size>
</property>
<property name="maximumSize">
<size>
<width>75</width>
<height>16777215</height>
</size>
</property>
<property name="text">
<string>Avanti</string>
</property>
</widget>
</item>
</layout>
</widget>
</widget>
<resources/>
<connections/>
</ui>

View File

@@ -1,25 +1,164 @@
<ui version="4.0" > <?xml version="1.0" encoding="UTF-8"?>
<author></author> <ui version="4.0">
<comment></comment> <class>win_main</class>
<exportmacro></exportmacro> <widget class="QMainWindow" name="win_main">
<class>MainWindow</class> <property name="geometry">
<widget class="QMainWindow" name="MainWindow" >
<property name="geometry" >
<rect> <rect>
<x>0</x> <x>0</x>
<y>0</y> <y>0</y>
<width>800</width> <width>519</width>
<height>600</height> <height>391</height>
</rect> </rect>
</property> </property>
<property name="windowTitle" > <property name="windowTitle">
<string>MainWindow</string> <string>MainWindow</string>
</property> </property>
<widget class="QMenuBar" name="menubar" /> <widget class="QWidget" name="centralwidget">
<widget class="QWidget" name="centralwidget" /> <widget class="QWidget" name="verticalLayoutWidget">
<widget class="QStatusBar" name="statusbar" /> <property name="geometry">
<rect>
<x>20</x>
<y>110</y>
<width>131</width>
<height>71</height>
</rect>
</property>
<layout class="QVBoxLayout" name="verticalLayout">
<item>
<widget class="QLabel" name="lbl_cartellasorg">
<property name="font">
<font>
<pointsize>10</pointsize>
</font>
</property>
<property name="text">
<string>Cartella sorgente</string>
</property>
</widget> </widget>
<pixmapfunction></pixmapfunction> </item>
<item>
<widget class="QLabel" name="lbl_cartelladest">
<property name="font">
<font>
<pointsize>10</pointsize>
</font>
</property>
<property name="text">
<string>Cartella destinazione</string>
</property>
</widget>
</item>
</layout>
</widget>
<widget class="QWidget" name="verticalLayoutWidget_2">
<property name="geometry">
<rect>
<x>150</x>
<y>110</y>
<width>34</width>
<height>71</height>
</rect>
</property>
<layout class="QVBoxLayout" name="verticalLayout_2">
<item>
<widget class="QPushButton" name="btn_cartellasorg">
<property name="text">
<string/>
</property>
</widget>
</item>
<item>
<widget class="QPushButton" name="btn_cartelladest">
<property name="text">
<string/>
</property>
</widget>
</item>
</layout>
</widget>
<widget class="Line" name="line">
<property name="geometry">
<rect>
<x>10</x>
<y>80</y>
<width>491</width>
<height>16</height>
</rect>
</property>
<property name="orientation">
<enum>Qt::Horizontal</enum>
</property>
</widget>
<widget class="QWidget" name="horizontalLayoutWidget">
<property name="geometry">
<rect>
<x>20</x>
<y>180</y>
<width>421</width>
<height>80</height>
</rect>
</property>
<layout class="QHBoxLayout" name="horizontalLayout_2">
<item>
<widget class="QCheckBox" name="chk_predefinita">
<property name="text">
<string>Rendi predefinite le cartelle</string>
</property>
</widget>
</item>
<item>
<widget class="QPushButton" name="btn_avanti">
<property name="minimumSize">
<size>
<width>79</width>
<height>0</height>
</size>
</property>
<property name="maximumSize">
<size>
<width>79</width>
<height>16777215</height>
</size>
</property>
<property name="text">
<string>Avanti</string>
</property>
</widget>
</item>
</layout>
</widget>
<widget class="QPushButton" name="btn_opzioni">
<property name="geometry">
<rect>
<x>420</x>
<y>300</y>
<width>75</width>
<height>23</height>
</rect>
</property>
<property name="text">
<string>Opzioni</string>
</property>
</widget>
</widget>
<widget class="QMenuBar" name="menubar">
<property name="geometry">
<rect>
<x>0</x>
<y>0</y>
<width>519</width>
<height>21</height>
</rect>
</property>
<widget class="QMenu" name="menuBananaSplit">
<property name="title">
<string>BananaSplit</string>
</property>
</widget>
<addaction name="menuBananaSplit"/>
</widget>
<widget class="QStatusBar" name="statusbar"/>
</widget>
<resources/>
<connections/> <connections/>
</ui> </ui>

View File

@@ -0,0 +1,32 @@
# -*- mode: python -*-
# Build description for the bananaSPLIT executable.
# Analysis, PYZ and EXE are neither defined nor imported in this file, so they
# are presumably injected by the tool that executes the spec (PyInstaller,
# judging by the layout) -- TODO confirm.
block_cipher = None
# Entry script is main.py.
# NOTE(review): pathex and datas below are absolute paths on one developer's
# Windows machine; the build will presumably fail on any other checkout
# location or user account -- confirm and make them relative if so.
a = Analysis(['main.py'],
pathex=['D:\\Emanuele\\Documenti\\workspace\\bananaSPLIT\\bananaSPLIT'],
binaries=[],
# Ships text_unidecode's data.bin under .\text_unidecode in the bundle.
datas=[('C:\\Users\\Emanuele Trabattoni\\AppData\\Roaming\\Python\\Python37\\site-packages\\text_unidecode\\data.bin', '.\\text_unidecode')],
hiddenimports=[],
hookspath=[],
runtime_hooks=[],
excludes=[],
win_no_prefer_redirects=False,
win_private_assemblies=False,
cipher=block_cipher,
noarchive=False)
# Archive built from the pure-Python modules and zipped data collected above.
pyz = PYZ(a.pure, a.zipped_data,
cipher=block_cipher)
# Final executable: named bananaSPLIT.exe, console enabled, icon banana.ico.
exe = EXE(pyz,
a.scripts,
a.binaries,
a.zipfiles,
a.datas,
[],
name='bananaSPLIT.exe',
debug=False,
bootloader_ignore_signals=False,
strip=False,
upx=True,
runtime_tmpdir=None,
console=True , icon='banana.ico')

BIN
bananaSPLIT/dist/bananaSPLIT.exe vendored Normal file

Binary file not shown.

View File

@@ -38,7 +38,6 @@
"CHARTS:" "CHARTS:"
] ]
}, },
"settings":
"settings": { "settings": {
"encoding": "utf-8", "encoding": "utf-8",
"monthPosition": 0, "monthPosition": 0,

View File

View File

@@ -0,0 +1,83 @@
'''
Created on 2 nov 2019
@author: Emanuele Trabattoni
'''
import json,os,glob,copy
class bananaCONF(object):
    '''
    Load and save the JSON configuration files used by bananaSPLITTER.

    Every ``*.json`` file found in the working directory is parsed and kept in
    ``settingsList`` under its bare file name. One of them is selected with
    ``use()``; ``getParams()``/``setParams()`` then read and replace its
    top-level sections and ``save()`` writes it back to disk.
    '''

    def __init__(self, workdir=None, logger=None):
        '''
        :param workdir: directory holding the ``*.json`` configuration files.
                        The process working directory is changed to it.
        :param logger:  object exposing ``debug``/``info``/``error``/``critical``
                        (for instance a fancyLogger). When omitted a standard
                        library logger is used, so the default no longer crashes.
        '''
        if logger is None:
            # Local import: the module itself does not import logging.
            import logging
            logger = logging.getLogger(__name__)
        self.log = logger
        self.fileList = None
        self.workdir = None
        self.inUse = None
        self.settingsList = dict()
        if workdir is not None:
            # Resolve BEFORE chdir: a relative path such as "./libbananasplit"
            # is no longer valid once the working directory has changed.
            self.workdir = os.path.abspath(workdir)
            os.chdir(self.workdir)
            self.log.debug("Cerco le configurazioni in: [{}]".format(os.getcwd()))
        else:
            self.log.error("Non mi e' stata fornita una directory per i file di configurazione")

    def _confDir(self):
        '''Directory the configuration files live in (cwd when none was given).'''
        return self.workdir if self.workdir is not None else os.getcwd()

    def open(self):
        '''
        Parse every ``*.json`` file of the configuration directory.

        Files that cannot be read or parsed are reported through the logger and
        skipped; the remaining ones are still loaded.
        '''
        self.log.info("Carico i file di configurazione")
        confDir = self._confDir()
        pattern = os.path.join(glob.escape(confDir), "*.json")
        # Bare file names, as before, so use()/getFiles() keep their meaning.
        self.fileList = sorted(os.path.basename(p) for p in glob.glob(pattern))
        if len(self.fileList) > 0:
            for fName in self.fileList:
                try:
                    # "open" here is the builtin: method names are not visible
                    # as plain names inside other methods.
                    with open(os.path.join(confDir, fName), encoding="utf-8") as fp:
                        self.settingsList[fName] = json.load(fp)
                    self.log.info("Caricato correttamente: {}".format(fName))
                except json.JSONDecodeError as e:
                    # Report the file name, not the whole document text.
                    self.log.error("Impossibile leggere la configurazione: {}\n"
                                   "Controlla il file a riga: {} e colonna: {}".format(fName, e.lineno, e.colno))
                except IOError as ee:
                    self.log.error("Impossibile aprire il file: {}".format(ee))
                except Exception as eee:
                    self.log.critical("Eccezione inaspettata: {}".format(eee))
        else:
            self.log.error("Non ho trovato alcun file di configurazione!")

    def reload(self):
        '''Forget everything that was loaded and read the files again.'''
        # Must be a dict, not None: open() stores the parsed files into it.
        self.settingsList = dict()
        self.fileList = None
        self.inUse = None
        self.open()

    def use(self, toUse):
        '''Select, by file name, the configuration the other methods act on.'''
        self.inUse = toUse

    def save(self):
        '''Write the configuration currently in use back to its JSON file.'''
        self.log.info("Salvo la configurazione: {}".format(self.inUse))
        if self.inUse is None or self.inUse not in self.settingsList:
            self.log.error("Impossibile salvare il file: {} - [{}]".format(
                self.inUse, "configurazione non caricata"))
            return
        try:
            # Serialise first so a non-serialisable value cannot truncate the
            # existing file.
            payload = json.dumps(self.settingsList[self.inUse], indent=4)
            # Write mode is required: the default read mode rejects writes.
            with open(os.path.join(self._confDir(), self.inUse), "w", encoding="utf-8") as f:
                f.write(payload)
        except (IOError, TypeError, ValueError) as e:
            self.log.error("Impossibile salvare il file: {} - [{}]".format(self.inUse, e))

    def getFiles(self):
        '''Return the list of configuration file names found by open().'''
        return self.fileList

    def getParams(self, k):
        '''Return section ``k`` of the configuration in use (KeyError if absent).'''
        return self.settingsList[self.inUse][k]

    def setParams(self, k, v):
        '''Replace section ``k`` of the configuration in use with a dict copy of ``v``.'''
        self.settingsList[self.inUse][k] = dict(v)

View File

@@ -0,0 +1,85 @@
'''
Created on 2 nov 2019
@author: Emanuele Trabattoni
'''
import sys, os
import json
import logging
import colorama
class fancyLogger(object):
    '''
    Coloured front-end for the standard ``logging`` module.

    Each level method prints a colorama colour code, forwards the message to
    the wrapped ``logging`` logger and then prints the reset code.
    Format, time format and log file are read from the ``"logger"`` section of
    ``libbananasplit/testEN.json`` under the current working directory.
    '''

    def __init__(self, name="Logger", consoleLog=True, fileLog=True):
        '''
        :param name:       name of the underlying ``logging`` logger.
        :param consoleLog: attach a handler writing to ``sys.stdout``.
        :param fileLog:    attach a handler writing to the configured log file.
        '''
        # os.path.join instead of a literal backslash path, and a context
        # manager so the settings file is closed after reading.
        confPath = os.path.join(os.getcwd(), "libbananasplit", "testEN.json")
        with open(confPath) as confFile:
            settings = json.load(confFile)["logger"]
        colorama.init(convert=True)
        self.LRED = colorama.Fore.LIGHTRED_EX
        self.RED = colorama.Fore.RED
        self.LYELLOW = colorama.Fore.LIGHTYELLOW_EX
        self.YELLOW = colorama.Fore.YELLOW
        self.LBLUE = colorama.Fore.LIGHTBLUE_EX
        self.BLUE = colorama.Fore.BLUE
        self.LGREEN = colorama.Fore.LIGHTGREEN_EX
        # Was a second assignment to LGREEN, which discarded the light green.
        self.GREEN = colorama.Fore.GREEN
        self.WHITE = colorama.Fore.LIGHTWHITE_EX
        self.RST = colorama.Style.RESET_ALL
        # Setup Logger
        # NOTE(review): logging.getLogger returns the same object for the same
        # name, so two fancyLogger instances sharing a name attach their
        # handlers twice -- confirm whether that can happen in practice.
        self.LOGGER = logging.getLogger(name)
        self.LOGGER.setLevel(logging.DEBUG)
        self.LOGGER.propagate = False
        FORMATTER = logging.Formatter((settings["logFormat"]), (settings["logTimeFormat"]))
        if fileLog:
            # File Logging
            fh = logging.FileHandler((settings["logFile"]))
            fh.setLevel(logging.DEBUG)
            fh.setFormatter(FORMATTER)
            self.LOGGER.addHandler(fh)
        if consoleLog:
            # Console Logging
            cl = logging.StreamHandler(sys.stdout)
            cl.setLevel(logging.DEBUG)
            cl.setFormatter(FORMATTER)
            self.LOGGER.addHandler(cl)

    def debug(self, msg="Undefined Debug"):
        '''Log ``msg`` at DEBUG level, in light blue.'''
        print(self.LBLUE, end='')
        self.LOGGER.debug(msg)
        print(self.RST, end='')

    def info(self, msg="Undefined Info"):
        '''Log ``msg`` at INFO level, in light green.'''
        print(self.LGREEN, end='')
        self.LOGGER.info(msg)
        print(self.RST, end='')

    def warn(self, msg="Undefined Warning"):
        '''Log ``msg`` at WARNING level, in light yellow.'''
        print(self.LYELLOW, end='')
        self.LOGGER.warning(msg)
        print(self.RST, end='')

    def error(self, msg="Undefined Error"):
        '''Log ``msg`` at ERROR level, in light red.'''
        print(self.LRED, end='')
        self.LOGGER.error(msg)
        print(self.RST, end='')

    def critical(self, msg="Undefined Critical"):
        '''Log ``msg`` at CRITICAL level, in red.'''
        print(self.RED, end='')
        self.LOGGER.critical(msg)
        print(self.RST, end='')

    def testColors(self):
        '''Emit one message per level to check the colours by eye.'''
        self.debug("Test Debug")
        self.info("Test Info")
        self.warn("Test Warning")
        self.error("Test Error")
        self.critical("Test Critical")

View File

@@ -0,0 +1,247 @@
'''
Created on 2 nov 2019
@author: Emanuele Trabattoni
'''
from libfancylogger import fancyLogger
from slugify.slugify import slugify
import threading, time, parse, re, copy, os
class bananaSPLITTER(threading.Thread):
    '''
    Worker thread that splits one text export into its single articles.

    ``fileParams`` is the ``"splitter"`` section of a configuration file with
    ``name`` set to the path of the file to process. ``run()`` loads the file,
    drops empty lines, walks the lines with a small state machine
    (first -> head -> body -> tail) and finally writes the articles out as
    separate files and/or one body file, depending on the settings.
    '''

    def __init__(self, fileParams=None, logger=None):
        '''
        :param fileParams: dict with the keys ``name``, ``paths``,
                           ``docStruct`` and ``settings``.
        :param logger:     object exposing ``debug``/``info``/``warn``/
                           ``error``/``critical`` (e.g. a fancyLogger).
        '''
        threading.Thread.__init__(self)
        self.fileParams = fileParams
        self.log = logger
        self.rawFile = None
        self.status = "first"
        self.fileList = list()
        self.bodyCounter = 0
        self.duplicateNumber = 0
        # Progressive number used in the body-file name. It was read by
        # saveBody() without ever being defined; callers that process several
        # files can overwrite it before start().
        self.fileCounter = 1
        if fileParams is not None:
            self.log.info("Sto operando sul file: {}..".format(self.fileParams['name']))
            self.paths = self.fileParams['paths']
            self.docStruct = self.fileParams['docStruct']
            self.settings = self.fileParams['settings']
            self.fileName = self.fileParams['name']
            self.beginTime = time.time()
        else:
            self.log.critical("Non e' stato fornito il nome di alcun file da splittare!")

    def run(self):
        '''Thread body: load, clean, split, de-duplicate and save.'''
        if self.fileParams is None:
            # Nothing was configured in __init__: stop instead of failing later
            # on the missing attributes.
            self.log.critical("Non e' stato fornito il nome di alcun file da splittare!")
            return
        self.log.info("Nuovo SPLITTER su file: {}".format(self.fileName))
        self.openFile()
        self.remEmptyLines()
        self.splitFile()
        if self.settings['removeDuplicates']:
            self.log.info("Controllo se ci sono dei duplicati..")
            self.removeDuplicates()
        else:
            # The save methods expect the flag on every article.
            for ff in self.fileList:
                ff['duplicate'] = False
            self.log.info('Salto il controllo dei duplicati..')
        if self.settings['saveSeparateFiles']:
            self.saveSeparate()
        if self.settings['saveBodyFile']:
            self.saveBody()

    def openFile(self):
        '''
        Read the whole input file into ``self.rawFile`` as a list of lines.

        :raises BaseException: ``"OpenFile"`` when the file cannot be read.
        '''
        try:
            self.log.info("Carico il contenuto..")
            with open(self.fileParams['name'], mode='r', encoding=self.settings['encoding']) as fp:
                self.rawFile = fp.readlines()
        except IOError as e:
            self.log.critical("Impossibile aprire il file: {}! [{}]".format(self.fileName, e))
            raise BaseException("OpenFile")

    def remEmptyLines(self):
        '''
        Strip the configured ``delChars`` from every line and drop the lines
        that are left with nothing but a line terminator.

        :returns: ``True`` on success.
        :raises BaseException: ``"DelLines"`` on any unexpected error.
        '''
        self.log.info("Elimino righe vuote e caratteri inutili..")
        try:
            cleaned = []
            for ll in self.rawFile:
                for c in self.settings['delChars']:
                    ll = ll.replace(c, '')
                if ll not in ('\n', '\r'):
                    cleaned.append(ll)
            self.rawFile = cleaned
            return True
        except Exception:
            self.log.error("Errore inaspettato durante l'eliminazione delle righe vuote!")
            raise BaseException("DelLines")

    def splitFile(self):  # ported from the original program
        '''
        Walk the cleaned lines and fill ``self.fileList`` with one dict per
        article (``title``, ``date``, ``newsPaperName``, ``content``).
        '''
        self.log.info("Individuo il contenuto..")
        docNumber = 0
        docSkipped = 0
        docDate = {}
        title = ''
        prevLine = ''
        newsPaperName = ''
        titleBegin = False
        anomaly = False
        newDoc = dict()
        tempBody = list()
        docSep = re.compile(self.docStruct['docSep'])
        for l in self.rawFile:  # for every line of the file
            lineWords = l.lstrip().split(' ')  # split the line into words
            if self.status == 'first':
                try:
                    # Read the document number to detect gaps in the sequence.
                    counter = parse.parse("{current:d} Of {total} Documents", l.strip().capitalize())
                    if counter is not None:  # a failed parse is not an error
                        nn = counter.named
                        if nn["current"] - docNumber != 1:
                            if self.settings["showSkipped"]:
                                self.log.warn("Il conto dei documenti non torna! LexisNexis "
                                              "ne ha saltato qualcuno!\nPrecedente:{0}-Attuale:{1}".format(docNumber, nn["current"]))
                            docSkipped += 1
                        docNumber = nn["current"]
                    # Look for the date line.
                    if (lineWords[self.settings['monthPosition']]).capitalize() in self.docStruct['dateWords']:
                        try:
                            # The format lives in docStruct; the old code read
                            # a non-existent self.docParams, so every date was
                            # reported as ambiguous.
                            parsedDate = parse.parse(self.docStruct['dateFormat'], l).named
                            monthName = parsedDate['month'].strip().capitalize()
                            parsedDate['month'] = self.docStruct['dateWords'].index(monthName) + 1
                            # Commit only a fully converted date.
                            docDate = parsedDate
                            title = ''
                            titleBegin = True
                            # The title starts after the date, but first try to
                            # take the newspaper name from the previous line.
                            if self.settings['getNewsPaperName']:
                                try:
                                    if prevLine.split(' ')[0].strip().isalpha():
                                        newsPaperName = prevLine.strip()
                                    else:
                                        newsPaperName = self.settings['nameNotFoundStr']
                                except Exception:
                                    self.log.warn("E' successo qualcosa mentre stavo cercando il nome della pubblicazione,"
                                                  " controlla i file di uscita! \n\t[{}]".format(prevLine.strip()))
                            else:
                                newsPaperName = self.settings['nameNotFoundStr']
                        except Exception:
                            self.log.warn("Ho trovato una riga ambigua.. potrebbe essere una data ma non so: \n\t[{}]".format(l.strip('\r\n')))
                    elif lineWords[0] in self.docStruct['headWords']:
                        # Change state and start a new document to fill.
                        self.status = 'head'
                        newDoc = dict()
                        newDoc['title'] = title
                        newDoc['date'] = docDate
                        newDoc['newsPaperName'] = newsPaperName
                        titleBegin = False
                    else:
                        if titleBegin:
                            title += l.strip().capitalize()
                except IndexError:
                    self.log.error("Errore inaspettato, contatta il tuo sviluppatore di fiducia!")
            elif self.status == 'head':
                if lineWords[0] not in self.docStruct['headWords']:
                    # First word is not a header keyword: the article starts
                    # here, so this is the first line of its content.
                    tempBody.append(l)
                    self.status = 'body'
            elif self.status == 'body':
                if lineWords[0] not in self.docStruct['tailWords']:
                    # Not a closing keyword: still reading the article.
                    if self.settings['delLF']:
                        tempBody.append(l.strip('\n'))
                    else:
                        tempBody.append(l)
                else:
                    self.status = 'tail'
                    anomaly = False
                if docSep.match(l) is not None:
                    # A separator showed up before any closing keyword.
                    self.log.warn("Ho individuato una separatore valido prima che si chiusesse l'articolo precedente, controlla i tuoi file in uscita!\n"
                                  "L'errore dovrebbe essere intorno all'articolo {} ma non sono sicuro! \n\t\t[{}]".format(docNumber, l.strip()))
                    self.status = 'tail'
                    anomaly = True
            elif self.status == 'tail':
                if docSep.match(l) is not None or anomaly:
                    self.status = 'first'
                    anomaly = False
                    if self.settings['delWordBreak']:
                        # Must act on the collected body; the old code filtered
                        # an always-empty list, so the option had no effect.
                        tempBody = [ll.replace('-\n', '') for ll in tempBody]
                    newDoc['content'] = ''.join(tempBody)
                    self.fileList.append(copy.deepcopy(newDoc))
                    tempBody = list()
                    self.bodyCounter += 1
            else:
                self.log.critical("Stato Interno Sconosciuto")
            prevLine = l  # always remember the previous line
        # Search finished, report the results.
        self.log.info("Nel file ho trovato {0} articoli..".format(self.bodyCounter))
        if docSkipped > 0:
            self.log.warn("Attentione, LexisNexis ne ha saltati {} !!!".format(docSkipped))

    def removeDuplicates(self):
        '''
        Flag as ``duplicate`` every article whose title was already seen.
        Nothing is deleted: the save methods read the flag.
        '''
        seenTitles = set()
        duplicatedTitles = set()
        for ff in self.fileList:
            if ff['title'] not in seenTitles:
                seenTitles.add(ff['title'])
                ff['duplicate'] = False
            else:
                duplicatedTitles.add(ff['title'])
                if self.settings['showRemovedDuplicates']:
                    self.log.info("Duplicato: {}".format(ff['title'].strip()))
                ff['duplicate'] = True
                self.duplicateNumber += 1
        self.log.info("Ho rimosso {} duplicati di {} articoli..\n".format(self.duplicateNumber, len(duplicatedTitles)))

    def saveSeparate(self):
        '''Write every non-duplicate article to its own file in OUTworkPath.'''
        self.log.info("Salvo gli articoli in file separati...")
        self.log.debug("Persorso: {0}".format(self.paths['OUTworkPath'].format('nomeFile')))
        for ff in self.fileList:
            try:
                if ff['duplicate'] == False:
                    # NOTE(review): docnum continues from the number of articles
                    # counted by splitFile() rather than restarting from 1 --
                    # confirm this numbering is intended.
                    fName = self.paths['OUTnameFormat'].format(
                        title=slugify(ff['title'][:self.settings['maxTitleLen']]),
                        filename=slugify(self.fileName),
                        docnum=self.bodyCounter,
                        papername=ff['newsPaperName'].strip(),
                        **ff['date'])
                    with open(self.paths['OUTworkPath'] + '{0}'.format(fName), 'wb') as out:
                        if self.settings['includeTitle']:
                            # Stored back on the article, so saveBody() sees the
                            # title as well (unchanged from the original).
                            ff['content'] = ff['title'] + os.linesep + ff['content']
                        out.write(ff['content'].encode(self.settings['encoding']))
                    self.bodyCounter += 1
            except IOError as e:
                self.log.error("Qualcosa e\' andato storto, non riesco a scrivere il file: {}".format(e))
                continue
            except (KeyError, ValueError) as e:
                # E.g. an article without a parsed date cannot fill the name
                # format: skip it instead of killing the thread.
                self.log.error("Qualcosa e\' andato storto, non riesco a scrivere il file: {}".format(e))
                continue

    def saveBody(self):
        '''Write the content of all articles into one BODYFILE in OUTworkPath.'''
        self.log.info('Salvo gli articoli in un singolo file vicino agli originali...')
        self.log.debug('Persorso: {0}'.format(self.paths['OUTworkPath'].format('nomeFile')))
        try:
            fName = slugify(self.fileName)
            fName = 'BODYFILE_{0}_{1}.txt'.format(self.fileCounter, fName[:self.settings['maxTitleLen']])
            fileContent = os.linesep.join([cc['content'] for cc in self.fileList])
            with open(self.paths['OUTworkPath'] + '{0}'.format(fName), 'wb') as out:
                out.write(fileContent.encode(self.settings['encoding']))
        except IOError as e:
            self.log.error("OOPS! Qualcosa e\' andato storto, non riesco a scrivere il file: {}".format(e))
if __name__ == "__main__":
    # Manual smoke test: build a splitter without starting it.
    # fileParams must be the "splitter" section of a configuration (a dict with
    # name/paths/docStruct/settings); passing a bare file name made __init__
    # fail with a TypeError. The section is read from the same file the logger
    # loads its own settings from.
    import json
    logg = fancyLogger(name="LibSplit")
    with open(os.path.join(os.getcwd(), "libbananasplit", "testEN.json")) as confFile:
        testParams = json.load(confFile)["splitter"]
    testParams["name"] = "testfile.txt"
    spp = bananaSPLITTER(fileParams=testParams, logger=logg)

View File

@@ -0,0 +1,83 @@
{
"version": "v1.1a",
"logger": {
"logFile": "D:\\Test\\bananaSPLIT.log",
"logFormat": "%(asctime)s|%(levelname)-8s| %(message)-50s",
"logTimeFormat": "%m-%d %H:%M:%S"
},
"splitter": {
"name": "",
"paths": {
"INworkPath": "D:\\Emanuele\\Documenti\\workspace\\bananaSPLIT\\TestFiles\\",
"OUTworkPath": "D:\\Test\\Separati\\",
"OUTnameFormat": "TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt"
},
"docStruct": {
"docSep": "\\s*Copyright [(0-9)]+",
"dateFormat": "{month} {day:d}, {year:d}{}",
"dateWords": [
"January",
"February",
"March",
"April",
"May",
"June",
"July",
"August",
"September",
"October",
"November",
"December"
],
"headWords": [
"BYLINE:",
"SECTION:",
"LENGTH:",
"DATELINE:",
"HIGHLIGHT:",
"Email:"
],
"tailWords": [
"Newstex ID:",
"NOTES:",
"LANGUAGE:",
"GRAPHIC:",
"TYPE:",
"URL:",
"LOAD-DATE:",
"PUBLICATION-TYPE:",
"DOCUMENT-TYPE:",
"CHARTS:",
"JOURNAL-CODE:"
]
},
"settings": {
"encoding": "utf-8",
"monthPosition": 0,
"getNewsPaperName": true,
"nameNotFoundStr": "ND",
"includeTitle": true,
"removeDuplicates": true,
"showSkipped": false,
"showRemovedDuplicates": true,
"maxTitleLen": 32,
"loadTXT": true,
"loadDOCX": false,
"removeOldFiles": true,
"saveSeparateFiles": true,
"saveBodyFile": true,
"saveBodyNumber": true,
"delLF": false,
"delWordBreak": true,
"delChars": [
"'",
"@",
"#",
"$",
"%",
"^",
"&"
]
}
}
}

View File

@@ -0,0 +1,80 @@
{
"version": "v1.1a",
"logger": {
"logFile": "D:\\Test\\bananaSPLIT.log",
"logFormat": "%(asctime)s|%(levelname)-8s| %(message)-50s",
"logTimeFormat": "%m-%d %H:%M:%S"
},
"splitter": {
"name": "",
"paths": {
"INworkPath": "D:\\Emanuele\\Documenti\\workspace\\bananaSPLIT\\TestFiles\\",
"OUTworkPath": "D:\\Test\\Separati\\",
"OUTnameFormat": "TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt"
},
"docStruct": {
"docSep": "Copyright [(0-9)]+",
"dateFormat": "{day:d} {month} {year:d} {}",
"dateWords": [
"Gennaio",
"Febbraio",
"Marzo",
"Aprile",
"Maggio",
"Giugno",
"Luglio",
"Agosto",
"Settembre",
"Ottobre",
"Novembre",
"Dicembre"
],
"headWords": [
"BYLINE:",
"SECTION:",
"LENGTH:",
"DATELINE:",
"HIGHLIGHT:",
"Email:"
],
"tailWords": [
"LANGUAGE:",
"GRAPHIC:",
"TYPE:",
"URL:",
"LOAD-DATE:",
"PUBLICATION-TYPE:",
"DOCUMENT-TYPE:",
"CHARTS:"
]
},
"settings": {
"encoding": "utf-8",
"monthPosition": 0,
"getNewsPaperName": true,
"nameNotFoundStr": "ND",
"includeTitle": true,
"removeDuplicates": true,
"showSkipped": false,
"showRemovedDuplicates": true,
"maxTitleLen": 32,
"loadTXT": true,
"loadDOCX": false,
"removeOldFiles": true,
"saveSeparateFiles": true,
"saveBodyFile": true,
"saveBodyNumber": true,
"delLF": false,
"delWordBreak": true,
"delChars": [
"'",
"@",
"#",
"$",
"%",
"^",
"&"
]
}
}
}

View File

@@ -0,0 +1,27 @@
'''
Created on 1 dic 2019
@author: Emanuele Trabattoni
'''
import os
from libsplit import bananaSPLITTER
from libconfload import bananaCONF
from libfancylogger import fancyLogger
# Manual end-to-end run of the library on one of the bundled test files.
print("CWD-> "+os.getcwd())

# Console-only logger shared by the configuration loader and the splitter.
log = fancyLogger(fileLog = False)

# Load every configuration found in ./libbananasplit and select the English one.
loader = bananaCONF(workdir=r"./libbananasplit", logger=log)
loader.open()
loader.use("testEN.json")

# Point the splitter section at the input file, next to the other test files.
params = loader.getParams("splitter")
inputDir = params["paths"]["INworkPath"]
params["name"] = inputDir+"GUARDIAN 1989.txt"

# Run the splitter thread and wait for it to finish.
worker = bananaSPLITTER(fileParams=params, logger=log)
worker.start()
worker.join()

View File

@@ -15,26 +15,24 @@ from pprint import pprint
from glob import glob from glob import glob
from copy import deepcopy from copy import deepcopy
from slugify import slugify from slugify import slugify
####### VAR GLOBALI ####### ####### VAR GLOBALI #######
####### FUNZIONI GLOBALI ####### ####### FUNZIONI GLOBALI #######
def printTitle(): def printTitle():
print(". . . , \n| | _ | _. _ ._ _ _ -+- _ * \n|/\|(/,|(_.(_)[ | )(/, | (_) *") print(". . . , \n| | _ | _. _ ._ _ _ -+- _ * \n|/\|(/,|(_.(_)[ | )(/, | (_) *")
print(Fore.LIGHTYELLOW_EX,\ print(Fore.LIGHTYELLOW_EX,' _ ___________ _ _____ _____ \n\
' _ ___________ _ _____ _____ \n\
| | / ___| ___ \ | |_ _|_ _| \n\ | | / ___| ___ \ | |_ _|_ _| \n\
| |__ __ _ _ __ __ _ _ __ __ _\ `--.| |_/ / | | | | | \n\ | |__ __ _ _ __ __ _ _ __ __ _\ `--.| |_/ / | | | | | \n\
| \'_ \ / _` | \'_ \ / _` | \'_ \ / _` |`--. \ __/| | | | | | \n\ | \'_ \ / _` | \'_ \ / _` | \'_ \ / _` |`--. \ __/| | | | | | \n\
| |_) | (_| | | | | (_| | | | | (_| /\__/ / | | |_____| |_ | | \n\ | |_) | (_| | | | | (_| | | | | (_| /\__/ / | | |_____| |_ | | \n\
|_.__/ \__,_|_| |_|\__,_|_| |_|\__,_\____/\_| \_____/\___/ \_/') |_.__/ \__,_|_| |_|\__,_|_| |_|\__,_\____/\_| \_____/\___/ \_/')
print ('\t\t\t\t\t\t\tVersione 0.4a\n\n', Style.RESET_ALL) print ('\t\t\t\t\t\t\tVersione 0.4b\n\n', Style.RESET_ALL)
print('Iniziamo!!') print('Iniziamo!!')
pass pass
############################################################# #############################################################
####################### MAIN ################################ ####################### MAIN ################################
############################################################# #############################################################
@@ -111,14 +109,18 @@ try:
else: else:
print('Non ho capito la risposta, sii un po\' piu\' specifico... [y/n]\n') print('Non ho capito la risposta, sii un po\' piu\' specifico... [y/n]\n')
pass pass
except:
print(Fore.LIGHTRED_EX, 'OOPS! Qualcosa e\' andato storto, non riesco a trovare nessun file da leggere :(', Style.RESET_ALL)
input()
sys.exit()
try:
if settings['removeOldFiles']: if settings['removeOldFiles']:
print("Rimuovo i vecchi file dalla cartella di destinazione..") print("Rimuovo i vecchi file dalla cartella di destinazione..")
for x in glob(cfg['OUTworkPath']+'*.txt'): for x in glob(cfg['OUTworkPath']+'*.txt'):
os.remove(x) os.remove(x)
except: except:
print(Fore.LIGHTRED_EX, 'OOPS! Qualcosa e\' andato storto, non riesco a trovare nessun file da leggere :(', Style.RESET_ALL) print(Fore.LIGHTRED_EX, 'OOPS! Qualcosa e\' andato storto, non riesco a rimuovere i vecchi file :(', Style.RESET_ALL)
input()
sys.exit()
lastTime=time.time() lastTime=time.time()
fileCounter = 1 fileCounter = 1

64
bananaconfEN.json Normal file
View File

@@ -0,0 +1,64 @@
{
"INworkPath": "D:\\Test\\",
"OUTworkPath": "D:\\Test\\Separati\\",
"OUTnameFormat":"TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{title}.txt",
"docStruct": {
"docSep": "\\s*Copyright [(0-9)]+",
"dateFormat":"{month} {day:d}, {year:d}{}",
"dateWords": [
"January",
"February",
"March",
"April",
"May",
"June",
"July",
"August",
"September",
"October",
"November",
"December"
],
"headWords": [
"BYLINE:",
"SECTION:",
"LENGTH:",
"DATELINE:",
"HIGHLIGHT:",
"Email:"
],
"tailWords": [
"Newstex ID" ,
"NOTES",
"LANGUAGE:",
"GRAPHIC:",
"TYPE:",
"URL:",
"LOAD-DATE:",
"PUBLICATION-TYPE:",
"DOCUMENT-TYPE:",
"CHARTS:",
"JOURNAL-CODE:"
]
},
"settings": {
"encoding": "utf-8",
"monthPosition": 0,
"getNewsPaperName": true,
"nameNotFoundStr" : "--ND--",
"includeTitle" : true,
"removeDuplicates": true,
"showSkipped": false,
"showRemovedDuplicates": true,
"maxTitleLen": 32,
"loadTXT": true,
"loadDOCX": false,
"removeOldFiles":true,
"saveSeparateFiles": true,
"saveBodyFile": true,
"saveBodyNumber":true,
"delLF": false,
"delWordBreak": true,
"delChars": "'|@|#"
}
}

61
bananaconfITA.json Normal file
View File

@@ -0,0 +1,61 @@
{
"INworkPath": "C:\\Test\\",
"OUTworkPath": "C:\\Test\\Separati\\",
"OUTnameFormat":"TEST_{docnum}_{year:04d}{month:02d}{day:02d}_{papername}_{title}.txt",
"docStruct": {
"docSep": "Copyright [(0-9)]+",
"dateFormat":"{day:d} {month} {year:d} {}",
"dateWords": [
"Gennaio",
"Febbraio",
"Marzo",
"Aprile",
"Maggio",
"Giugno",
"Luglio",
"Agosto",
"Settembre",
"Ottobre",
"Novembre",
"Dicembre"
],
"headWords": [
"BYLINE:",
"SECTION:",
"LENGTH:",
"DATELINE:",
"HIGHLIGHT:",
"Email:"
],
"tailWords": [
"LANGUAGE:",
"GRAPHIC:",
"TYPE:",
"URL:",
"LOAD-DATE:",
"PUBLICATION-TYPE:",
"DOCUMENT-TYPE:",
"CHARTS:"
]
},
"settings": {
"encoding": "utf-8",
"monthPosition": 0,
"getNewsPaperName": true,
"nameNotFoundStr" : "--ND--",
"includeTitle" : true,
"removeDuplicates": true,
"showSkipped": false,
"showRemovedDuplicates": true,
"maxTitleLen": 32,
"loadTXT": true,
"loadDOCX": false,
"removeOldFiles":true,
"saveSeparateFiles": true,
"saveBodyFile": true,
"saveBodyNumber":true,
"delLF": false,
"delWordBreak": true,
"delChars": "'|@|#"
}
}