224 lines
8.6 KiB
Python
224 lines
8.6 KiB
Python
# Class that resolves abbreviations ("short forms"); the data comes from Abkuerzungen.txt
|
|
|
|
import hickle as hkl
|
|
|
|
import FASTsearch
|
|
|
|
class SolveShorts(object):
    """Append bracketed explanations to abbreviations in tokenised sentences.

    Two hickle databases are loaded on construction.  Abbreviations are
    looked up through a ``FASTsearch`` index, which has to be initialised
    with :meth:`load_DB_into_FASTsearch` before
    :meth:`ExplainShortsInSentencesWithBrackets` is called.
    """

    # Tokens that were wrongly recognised as abbreviations (e.g. 'er');
    # they are never looked up.
    _FALSE_POSITIVES = ('Er', 'er', 'ab', 'Ab', 'so', 'da', 'an', 'mit')

    # Longest run of consecutive '.'-terminated tokens that is looked up
    # as one phrase.
    _MAX_PHRASE_TOKENS = 5

    def __init__(self, hklDatabaseDir_Shorts, hklDatabaseDir_Shorts_All):
        """Load both abbreviation databases from their hickle files.

        Args:
            hklDatabaseDir_Shorts: Path of the hickle dump holding only the
                abbreviation keys (stored as ``self.ShortsDB``; not read by
                any other method of this class).
            hklDatabaseDir_Shorts_All: Path of the hickle dump holding the
                full entries; ``entry[1]`` is used as the list of
                explanation words.
        """
        self.ShortsDB_All = hkl.load(hklDatabaseDir_Shorts_All)
        self.ShortsDB = hkl.load(hklDatabaseDir_Shorts)

    def create_hklDB_from_csv(self, csvDbDir,
                              shortsAllOut='hkldbShorts_All.hkl',
                              shortsOut='hkldbShorts.hkl'):
        """Convert the abbreviation CSV into the two hickle databases.

        Every non-blank input line must be a Python literal of the form
        ``['d.h.', n] , ['das', 'heißt']``.

        Args:
            csvDbDir: Path of the input file.
            shortsAllOut: Target of the full dump; each entry is
                ``[['d.h.', n], ['das', 'heißt']]``.
            shortsOut: Target of the key-only dump; each entry is
                ``['d.h.']``.

        Returns:
            The string ``'done'``.

        Raises:
            ValueError, SyntaxError: If a line is not a valid literal.
        """
        # Local import, like the spacy import further down in this class.
        import ast

        entries = []
        # NOTE(review): the file is opened with the platform default
        # encoding, as before; pass an explicit encoding if the data file
        # is known to be UTF-8.
        with open(csvDbDir) as lines:
            for line in lines:
                line = line.strip()
                if not line:
                    # A trailing blank line used to abort the whole import.
                    continue
                # SECURITY: this used to be eval(line), which executes
                # arbitrary code found in the data file.  literal_eval only
                # accepts plain literals, which is all the format needs.
                entries.append(list(ast.literal_eval(line)))

        keys = [[entry[0][0]] for entry in entries]

        hkl.dump(entries, shortsAllOut, mode='w', compression='gzip')
        hkl.dump(keys, shortsOut, mode='w', compression='gzip')

        return 'done'

    def load_DB_into_FASTsearch(self,
                                shortsDb='hkldbShorts.hkl',
                                bowModel='bagofwordshkldbShorts.pkl',
                                oneZeroDb='DataBaseOneZeroshkldbShorts.hkl',
                                spacyModel='de_core_news_sm'):
        """Set up the search index (``self.fsearch1``) and parser (``self.nlp``).

        The defaults are the file and model names that were previously
        hard-coded, so calling the method without arguments behaves as
        before.

        Args:
            shortsDb: Key-only hickle database handed to ``FASTsearch``.
            bowModel: First argument of ``Load_BoW_Model``.
            oneZeroDb: Second argument of ``Load_BoW_Model``.
            spacyModel: Name of the spaCy model to load.
        """
        self.fsearch1 = FASTsearch.FASTsearch(shortsDb)
        # The bag-of-words model is loaded from disk, not regenerated here.
        self.fsearch1.Load_BoW_Model(bowModel, oneZeroDb)

        # Imported lazily so the class can be used for database creation
        # without spaCy being installed.
        import spacy
        self.nlp = spacy.load(spacyModel)

    @staticmethod
    def _split_multi_dot_token(word):
        """Split *word* after every '.', e.g. 'z.B.' -> ['z.', 'B.']."""
        *dotted, remainder = word.split('.')
        pieces = [piece + '.' for piece in dotted]
        if remainder:
            # Text after the last '.' used to be silently dropped
            # ('3.5.2020' lost '2020'); keep it as its own token.
            pieces.append(remainder)
        return pieces

    @classmethod
    def _dotted_run_span(cls, tokens, start):
        """Return the offset of the last token of the dotted run at *start*.

        The run is the sequence of consecutive tokens ending in '.' that
        begins at ``tokens[start]``.  0 is returned when the run is a single
        token, and also when it is longer than ``_MAX_PHRASE_TOKENS``.
        """
        limit = cls._MAX_PHRASE_TOKENS
        run = 0
        for token in tokens[start:start + limit + 1]:
            if not token.endswith('.'):
                break
            run += 1
        return run - 1 if 1 <= run <= limit else 0

    def ExplainShortsInSentencesWithBrackets(self, sentences):
        """Insert '(explanation)' tokens behind recognised abbreviations.

        For every sentence:

        1. Tokens containing more than one '.' are split after each '.'
           ('z.B.' -> 'z.', 'B.').  The last token is treated as if it had
           one more trailing '.'.
        2. Candidate tokens are looked up in ``self.fsearch1``.  A token
           ending in '.' is looked up together with the directly following
           '.'-terminated tokens, joined by single spaces.
        3. When the second element of the returned match is 1, the words in
           ``self.ShortsDB_All[match[0]][1]`` are joined and inserted in
           brackets right behind the matched tokens.

        Args:
            sentences: Iterable of sentences, each a list of string tokens.
                The lists are modified in place.

        Returns:
            A list containing the same (now modified) sentence lists.
        """
        outsentences = []

        for sentence in sentences:
            # Parsed before the split below, exactly as before; token
            # positions of `doc` and `sentence` may therefore differ.
            doc = self.nlp(' '.join(sentence))

            # Step 1: split multi-dot tokens by position.  The old code
            # located tokens with sentence.index(), which always finds the
            # first occurrence and corrupted sentences that contain the
            # same abbreviation twice.
            last = len(sentence) - 1
            expanded = []
            for position, token in enumerate(sentence):
                probe = token + '.' if position == last else token
                if probe.count('.') > 1:
                    expanded.extend(self._split_multi_dot_token(probe))
                else:
                    expanded.append(token)
            sentence[:] = expanded

            # Step 2: look up candidates.  Matches are recorded as
            # (explanation words, index to insert at) in ascending order.
            explanations = []
            covered_until = 0
            for n, token in enumerate(sentence):
                if n < covered_until:
                    # Already part of a previously matched phrase.
                    continue
                if not token or token in self._FALSE_POSITIVES:
                    continue

                ends_with_dot = token.endswith('.')
                if n != 0 and not ends_with_dot:
                    # NOTE(review): a two-character slice of dep_ can never
                    # equal the three-character 'ART', so this is always
                    # False and dot-less tokens are only looked up at
                    # position 0.  Kept as is to preserve results; the
                    # intended attribute needs confirming.  The length guard
                    # avoids indexing past the end of `doc`.
                    prev_is_article = (n - 1 < len(doc)
                                       and doc[n - 1].dep_[:2] == 'ART')
                    if not prev_is_article:
                        continue

                span = self._dotted_run_span(sentence, n) if ends_with_dot else 0
                phrase = ' '.join(sentence[n:n + span + 1])
                _, match = self.fsearch1.search_with_highest_multiplikation_Output(phrase, 1)

                # presumably match is (database index, score) and a score
                # of 1 means an exact hit -- TODO confirm against FASTsearch
                if match[1] == 1:
                    covered_until = n + span + 1
                    explanations.append((self.ShortsDB_All[match[0]][1],
                                         covered_until))

            # Step 3: insert from the back so earlier indices stay valid.
            # Every start index is recorded at most once, so no
            # de-duplication is needed.
            for words, insert_at in reversed(explanations):
                sentence.insert(insert_at, '(' + ' '.join(words) + ')')

            outsentences.append(sentence)

        return outsentences
|
|
|
|
|
|
|
|
|