3 Commits

3 changed files with 299 additions and 14 deletions

View File

@@ -0,0 +1,278 @@
<?xml version="1.0" encoding="UTF-8"?>
<ui version="4.0">
<class>selezout</class>
<widget class="QWidget" name="selezout">
<property name="geometry">
<rect>
<x>0</x>
<y>0</y>
<width>577</width>
<height>400</height>
</rect>
</property>
<property name="windowTitle">
<string>Seleziona Lingua e Output</string>
</property>
<layout class="QGridLayout" name="gridLayout_2">
<item row="5" column="0">
<layout class="QVBoxLayout" name="verticalLayout">
<property name="sizeConstraint">
<enum>QLayout::SetMaximumSize</enum>
</property>
<item>
<widget class="QRadioButton" name="rad_mainbodyEFile">
<property name="font">
<font>
<pointsize>10</pointsize>
</font>
</property>
<property name="text">
<string>Bodyfile + File singoli</string>
</property>
<property name="checked">
<bool>true</bool>
</property>
</widget>
</item>
<item>
<widget class="QRadioButton" name="rad_mainbody">
<property name="font">
<font>
<pointsize>10</pointsize>
</font>
</property>
<property name="text">
<string>Bodyfile</string>
</property>
</widget>
</item>
<item>
<widget class="QRadioButton" name="rad_file">
<property name="font">
<font>
<pointsize>10</pointsize>
</font>
</property>
<property name="text">
<string>File singoli</string>
</property>
</widget>
</item>
</layout>
</item>
<item row="9" column="1">
<spacer name="verticalSpacer">
<property name="orientation">
<enum>Qt::Vertical</enum>
</property>
<property name="sizeHint" stdset="0">
<size>
<width>20</width>
<height>40</height>
</size>
</property>
</spacer>
</item>
<item row="1" column="0" colspan="4">
<widget class="Line" name="line_2">
<property name="orientation">
<enum>Qt::Horizontal</enum>
</property>
</widget>
</item>
<item row="0" column="0">
<widget class="QLabel" name="lbl_lingua">
<property name="font">
<font>
<pointsize>10</pointsize>
</font>
</property>
<property name="text">
<string>Seleziona la lingua degli articoli</string>
</property>
</widget>
</item>
<item row="4" column="0" colspan="4">
<widget class="Line" name="line">
<property name="orientation">
<enum>Qt::Horizontal</enum>
</property>
</widget>
</item>
<item row="7" column="0" colspan="4">
<widget class="Line" name="line_3">
<property name="orientation">
<enum>Qt::Horizontal</enum>
</property>
</widget>
</item>
<item row="8" column="0" colspan="4">
<layout class="QGridLayout" name="gridLayout">
<item row="0" column="0">
<widget class="QCheckBox" name="chk_removeDuplicates">
<property name="font">
<font>
<pointsize>10</pointsize>
</font>
</property>
<property name="text">
<string>Tenta rimozione duplicati</string>
</property>
</widget>
</item>
<item row="0" column="1">
<widget class="QCheckBox" name="chk_includeTitle">
<property name="font">
<font>
<pointsize>10</pointsize>
</font>
</property>
<property name="text">
<string>Includi titolo all'interno del file</string>
</property>
</widget>
</item>
<item row="1" column="0">
<widget class="QCheckBox" name="chk_removeBreakWord">
<property name="font">
<font>
<pointsize>10</pointsize>
</font>
</property>
<property name="text">
<string>Rimuovi interruzioni parola (a capo)</string>
</property>
</widget>
</item>
<item row="1" column="1">
<widget class="QCheckBox" name="chk_cleaDestFolder">
<property name="font">
<font>
<pointsize>10</pointsize>
</font>
</property>
<property name="text">
<string>Svuota cartella di destinazione</string>
</property>
</widget>
</item>
<item row="2" column="0" colspan="2">
<layout class="QHBoxLayout" name="horizontalLayout">
<item>
<widget class="QLabel" name="label">
<property name="font">
<font>
<pointsize>10</pointsize>
</font>
</property>
<property name="text">
<string>Caratteri speciali da rimuovere </string>
</property>
</widget>
</item>
<item>
<widget class="QLineEdit" name="lin_specialChars"/>
</item>
</layout>
</item>
</layout>
</item>
<item row="9" column="3">
<widget class="QPushButton" name="btn_split">
<property name="minimumSize">
<size>
<width>75</width>
<height>0</height>
</size>
</property>
<property name="maximumSize">
<size>
<width>75</width>
<height>16777215</height>
</size>
</property>
<property name="font">
<font>
<pointsize>10</pointsize>
</font>
</property>
<property name="text">
<string>SPLITTA!</string>
</property>
</widget>
</item>
<item row="3" column="0">
<widget class="QLabel" name="lbl_indicazioni">
<property name="sizePolicy">
<sizepolicy hsizetype="Preferred" vsizetype="Minimum">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="font">
<font>
<pointsize>10</pointsize>
</font>
</property>
<property name="text">
<string>Seleziona il tipo di file da salvare</string>
</property>
</widget>
</item>
<item row="6" column="0">
<widget class="QLabel" name="lbl_indicazioni_2">
<property name="sizePolicy">
<sizepolicy hsizetype="Preferred" vsizetype="Minimum">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="font">
<font>
<pointsize>10</pointsize>
</font>
</property>
<property name="text">
<string>Altre Opzioni </string>
</property>
</widget>
</item>
<item row="9" column="2">
<widget class="QPushButton" name="btn_indietro">
<property name="minimumSize">
<size>
<width>75</width>
<height>0</height>
</size>
</property>
<property name="maximumSize">
<size>
<width>75</width>
<height>16777215</height>
</size>
</property>
<property name="font">
<font>
<pointsize>10</pointsize>
</font>
</property>
<property name="text">
<string>Indietro</string>
</property>
</widget>
</item>
<item row="2" column="0">
<widget class="QComboBox" name="cmb_lingua">
<property name="frame">
<bool>true</bool>
</property>
<property name="modelColumn">
<number>0</number>
</property>
</widget>
</item>
</layout>
</widget>
<resources/>
<connections/>
</ui>

View File

@@ -437,6 +437,13 @@ class bananaSelezOut(PyQt5.QtWidgets.QWidget):
#unisco i pezzi
splconf['docStruct']['outNameFormat'] = splconf['docStruct']['outNameSep'].join(nametemp)+splconf['docStruct']['outExt']
# costruisco i thread
tDict={}
for f in splconf['paths']['fileList']:
splconf['name']=f
tDict[f] = bananaSPLITTER(fileParams=copy.deepcopy(splconf), logger=self.log)
pass

View File

@@ -20,7 +20,7 @@ class bananaSPLITTER(QThread):
self.log = logger
self.rawFile = None
self.status = "first"
self.fileList = list()
self.contentList = list()
self.bodyCounter=0
self.duplicateNumber=0
if fileParams is not None:
@@ -47,9 +47,9 @@ class bananaSPLITTER(QThread):
self.log.info("Controllo se ci sono dei duplicati..")
self.removeDuplicates()
else:
for idx, ff in enumerate(self.fileList):
for idx, ff in enumerate(self.contentList):
ff['duplicate']=False
self.fileList[idx]=ff
self.contentList[idx]=ff
self.log.warn('Salto il controllo dei duplicati..')
# se il parse e la rimozione dei duplicati e' andata bene
# preparo e inizio il salvataggio
@@ -139,11 +139,11 @@ class bananaSPLITTER(QThread):
except:
pass #non segnalare eccezione se il parse fallisce
# ricerco la data
if (lineWords[self.settings['monthPosition']]).capitalize() in self.docStruct['dateWords']:
if (lineWords[self.settings['monthPosition']]).capitalize() in self.docStruct['language']['dateWords']:
try:
docDate=parse.parse(self.docStruct['dateFormat'],l).named
docDate['month']=docDate['month'].lstrip().rstrip().capitalize()
docDate['month']=self.docStruct['dateWords'].index(docDate['month'])+1
docDate['month']=self.docStruct['language']['dateWords'].index(docDate['month'])+1
title = ''
titleBegin=True
# dopo la data inizia il titolo, ma prima si cerca il nome del giornale
@@ -161,7 +161,7 @@ class bananaSPLITTER(QThread):
except:
self.log.warn("Ho trovato una riga ambigua.. potrebbe essere una data ma non so: [{}]". format(l.strip('\r\n')))
pass
elif lineWords[0] in self.docStruct['headWords']:
elif lineWords[0] in self.docStruct['language']['headWords']:
#cambio stato e inizializzo un nuovo documento da riempire
self.status = 'head'
newDoc=dict()
@@ -177,12 +177,12 @@ class bananaSPLITTER(QThread):
pass
elif self.status == 'head':
tempContent = list()
if lineWords[0] not in self.docStruct['headWords']: #se la prima parola non e' tra quelle di inizio
if lineWords[0] not in self.docStruct['language']['headWords']: #se la prima parola non e' tra quelle di inizio
tempBody.append(l) # vuol dire che ho trovato l'articolo e aggiungo la prima riga al contenuto del documento
self.status = 'body'
pass
elif self.status == 'body':
if not lineWords[0] in self.docStruct['tailWords']: #se la prima parola non e' tra quelle di fine
if not lineWords[0] in self.docStruct['language']['tailWords']: #se la prima parola non e' tra quelle di fine
if self.settings['delLF']:
tempBody.append(l.strip('\n')) #allora sto leggendo l'articolo
else:
@@ -203,7 +203,7 @@ class bananaSPLITTER(QThread):
if self.settings['delWordBreak']:
tempContent=[ll.replace('-\n', '') for ll in tempContent]
newDoc['content']=copy.deepcopy(''.join(tempBody))
self.fileList.append(copy.deepcopy(newDoc))
self.contentList.append(copy.deepcopy(newDoc))
tempBody=list()
self.bodyCounter +=1
pass
@@ -221,11 +221,11 @@ class bananaSPLITTER(QThread):
def removeDuplicates(self):
titleList=[]
duplicateList=[]
for idx, ff in enumerate(self.fileList):
for idx, ff in enumerate(self.contentList):
if ff['title'] not in titleList:
titleList.append(ff['title'])
ff['duplicate']=False
self.fileList[idx]=ff
self.contentList[idx]=ff
pass
else:
if ff['title'] not in duplicateList:
@@ -233,7 +233,7 @@ class bananaSPLITTER(QThread):
if self.settings['showRemovedDuplicates']:
self.log.info("Duplicato: {}".format(ff['title'].strip()))
ff['duplicate'] = True
self.fileList[idx]=ff
self.contentList[idx]=ff
self.duplicateNumber+=1
self.log.info("Ho rimosso {} duplicati di {} articoli..". format(self.duplicateNumber, len(duplicateList)))
pass
@@ -243,7 +243,7 @@ class bananaSPLITTER(QThread):
self.paths['OUTworkPath']=self.paths['OUTworkPath']+slugify(self.fileName)
self.log.info("Salvo gli articoli in file separati...")
self.log.debug("Persorso: {0}".format(self.paths['OUTworkPath'].format('nomeFile')))
for ff in self.fileList:
for ff in self.contentList:
try:
if ff['duplicate'] == False:
fName=self.paths['OUTnameFormat'].format(title=slugify(ff['title'][:self.settings['maxTitleLen']]),\
@@ -271,7 +271,7 @@ class bananaSPLITTER(QThread):
try:
fName=slugify(self.fileName)
fName='BODYFILE_{0}.txt'.format(fName[:self.settings['maxTitleLen']])
fileContent = os.linesep.join([cc['content'] for cc in self.fileList])
fileContent = os.linesep.join([cc['content'] for cc in self.contentList])
out=open('{0}'.format(fName),'wb')
out.write(fileContent.encode(self.settings['encoding']))
out.close()