|
|
- # Class that resolves short forms (abbreviations); the data comes from Abkuerzungen.txt
-
- import hickle as hkl
-
- import FASTsearch
-
class SolveShorts(object):
    """Annotate abbreviations ("shorts") in tokenised sentences with their long form.

    The lookup table is a hickle dump whose rows have the shape
    ``[[short, n], [word, word, ...]]`` (see ``create_hklDB_from_csv``).
    Matching is delegated to a ``FASTsearch`` index, which has to be loaded
    with ``load_DB_into_FASTsearch`` before
    ``ExplainShortsInSentencesWithBrackets`` is called.
    """

    # Words that were falsely recognised as abbreviations (e.g. "er");
    # they are never looked up.
    _NEVER_SHORTS = frozenset(
        ['Er', 'er', 'ab', 'Ab', 'so', 'da', 'an', 'mit', 'Am', 'am'])

    # Longest run of consecutive dot-terminated tokens that is still looked
    # up as one multi-part abbreviation.
    _MAX_SHORT_PARTS = 5

    def __init__(self, hklDatabaseDir_Shorts, hklDatabaseDir_Shorts_All):
        """Load both abbreviation databases from their hickle files.

        Args:
            hklDatabaseDir_Shorts: path of the hickle file stored in
                ``self.ShortsDB`` (presumably the short-forms-only dump
                written by ``create_hklDB_from_csv`` -- confirm with caller).
            hklDatabaseDir_Shorts_All: path of the hickle file stored in
                ``self.ShortsDB_All``; its rows are indexed as
                ``row[1] -> list of explanation words`` by this class.
        """
        self.ShortsDB_All = hkl.load(hklDatabaseDir_Shorts_All)
        self.ShortsDB = hkl.load(hklDatabaseDir_Shorts)

    def create_hklDB_from_csv(self, csvDbDir):
        """Convert the abbreviation text file into the two hickle dumps.

        Input: a text file in which every non-blank line is a Python literal
        of the form ``['d.h.', n], ['das', 'heißt']``.

        Output: ``hkldbShorts_All.hkl`` (all rows, each as
        ``[['d.h.', n], ['das', 'heißt']]``) and ``hkldbShorts.hkl`` (one
        ``['d.h.']`` entry per row). Both paths are relative, so the files
        land in the current working directory.

        Args:
            csvDbDir: path of the input file (read as UTF-8).

        Returns:
            The string ``'done'``.

        Raises:
            ValueError, SyntaxError: if a line is not a valid Python literal.
        """
        # Local import keeps this method self-contained.
        import ast

        ShortsDB_All = []
        with open(csvDbDir, encoding='utf-8') as lines:
            for line in lines:
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) carry no entry.
                    continue
                # NOTE(review): this used to be eval(line), which executes
                # arbitrary code found in the data file. literal_eval accepts
                # the same list/tuple/str/number literals without running code.
                ShortsDB_All.append(list(ast.literal_eval(line)))

        # The search index only needs the short form itself.
        hkldbShorts = [[entry[0][0]] for entry in ShortsDB_All]

        hkl.dump(ShortsDB_All, 'hkldbShorts_All.hkl', mode='w', compression='gzip')
        hkl.dump(hkldbShorts, 'hkldbShorts.hkl', mode='w', compression='gzip')

        return 'done'

    def load_DB_into_FASTsearch(self):
        """Load the search index, its bag-of-words model and the spaCy model.

        Sets ``self.fsearch1`` (a ``FASTsearch`` over ``hkldbShorts.hkl`` with
        the pre-generated model files ``bagofwordshkldbShorts.pkl`` and
        ``DataBaseOneZeroshkldbShorts.hkl``) and ``self.nlp`` (the
        ``de_core_news_sm`` spaCy pipeline). All file names are relative to
        the current working directory.
        """
        self.fsearch1 = FASTsearch.FASTsearch('hkldbShorts.hkl')

        # The model is expected to exist already; generating it would be
        # self.fsearch1.Gen_BoW_Model(3000, "word", punctuation = True)
        self.fsearch1.Load_BoW_Model('bagofwordshkldbShorts.pkl', 'DataBaseOneZeroshkldbShorts.hkl')

        # Imported lazily so that spaCy is only required once the databases
        # are actually loaded. ``self.nlp`` is kept as a public attribute;
        # the other methods of this class do not read it.
        import spacy
        self.nlp = spacy.load('de_core_news_sm')

    @staticmethod
    def _split_multi_dot_words(sentence):
        """Split tokens containing several dots into dot-terminated parts, in place.

        ``'z.B.'`` becomes ``'z.', 'B.'``. The last token of the sentence is
        counted as if it had one more trailing dot, so ``'z.B'`` at the end of
        a sentence is split into ``'z.', 'B.'`` as well (and a final token
        that already ends in a dot yields an extra ``'.'`` part). Text after
        the last dot is kept as its own part instead of being dropped.
        """
        last = len(sentence) - 1
        expanded = []
        for position, original_word in enumerate(sentence):
            probe = original_word + '.' if position == last else original_word
            if probe.count('.') > 1:
                *dotted, remainder = probe.split('.')
                expanded.extend(piece + '.' for piece in dotted)
                if remainder:
                    expanded.append(remainder)
            else:
                expanded.append(original_word)
        # Slice assignment keeps the caller's list object (in-place contract).
        sentence[:] = expanded

    @classmethod
    def _dotted_run_extent(cls, sentence, start):
        """Return how many extra tokens after ``start`` belong to the same short.

        A run is a sequence of consecutive tokens ending in ``'.'``. The
        result is the offset of the run's last token, or 0 when the token at
        ``start`` has no trailing dot or the run does not end within
        ``_MAX_SHORT_PARTS`` tokens.
        """
        if not sentence[start].endswith('.'):
            return 0
        last = len(sentence) - 1
        for offset in range(min(len(sentence) - start, cls._MAX_SHORT_PARTS)):
            here = start + offset
            if sentence[here].endswith('.') and (
                    here == last or not sentence[here + 1].endswith('.')):
                return offset
        return 0

    def ExplainShortsInSentencesWithBrackets(self, sentences):
        """Insert ``'(long form)'`` tokens behind recognised abbreviations.

        Each sentence is a list of word strings (expected without the
        sentence-final period) and is modified in place:

        1. tokens with several dots are split (``'z.B.'`` -> ``'z.', 'B.'``);
        2. the first token and every dot-terminated token is looked up via
           ``self.fsearch1``; consecutive dot-terminated tokens are looked up
           together as one space-joined phrase (``'z. B.'``);
        3. for every hit the explanation words from ``self.ShortsDB_All`` are
           joined with spaces, wrapped in brackets and inserted directly
           behind the abbreviation.

        The search result is used as ``(_, match)`` where ``match[0]`` is the
        row index into ``self.ShortsDB_All`` and ``match[1] == 1`` marks a
        hit (assumed from how the values are consumed here -- confirm against
        ``FASTsearch.search_with_highest_multiplikation_Output``).

        Args:
            sentences: iterable of token lists.

        Returns:
            A list containing the same (now annotated) sentence lists.
        """
        outsentences = []
        for sentence in sentences:
            self._split_multi_dot_words(sentence)

            explanations = []   # (insert position, explanation words), ascending
            covered_until = 0   # tokens before this index belong to a found short

            for n, word in enumerate(sentence):
                if n < covered_until:
                    continue
                if word in self._NEVER_SHORTS:
                    continue
                # Only the first token and dot-terminated tokens are looked up.
                # NOTE(review): the original additionally tested
                # ``doc[n - 1].dep_[:2] != 'ART'`` on a spaCy parse. A
                # two-character slice can never equal 'ART', so that test was
                # always true and could only raise IndexError once splitting
                # made the sentence longer than the parsed doc. Presumably
                # "previous token is an article" (``tag_ == 'ART'``) was
                # intended; enabling it would change results, so it is left
                # out here.
                if n != 0 and not word.endswith('.'):
                    continue

                extra = self._dotted_run_extent(sentence, n)
                phrase = ' '.join(sentence[n:n + extra + 1])
                _, match = self.fsearch1.search_with_highest_multiplikation_Output(phrase, 1)

                if match[1] == 1:
                    end = n + extra + 1
                    explanations.append((end, self.ShortsDB_All[match[0]][1]))
                    covered_until = end

            # Every start index yields at most one entry, so no duplicate
            # removal is needed. Insert back to front so that earlier insert
            # positions stay valid.
            for end, words in reversed(explanations):
                sentence.insert(end, '(' + ' '.join(words) + ')')

            outsentences.append(sentence)
        return outsentences
-
-
-
-
|