2020-08-16 19:36:44 +02:00
|
|
|
|
|
|
|
|
|
|
|
import spacy
|
|
|
|
import nltk
|
|
|
|
from nltk.stem.snowball import SnowballStemmer
|
|
|
|
|
|
|
|
import hickle as hkl
|
|
|
|
import FASTsearch
|
|
|
|
|
|
|
|
# Module-level German Snowball stemmer. Nothing in the visible part of this
# file reads this name; FremdWB.__init__ creates its own instance.
stemmer = SnowballStemmer("german")
|
|
|
|
|
|
|
|
|
|
|
|
class FremdWB(object):
    """Append dictionary entries to German sentences for the words they contain.

    The dictionary (presumably a "Fremdwoerterbuch", i.e. a foreign-word
    dictionary -- inferred from the naming only) is kept as two parallel
    lists: ``hkldbFremd_WB1`` holds ``[headword]`` items and
    ``hkldbFremd_WB2`` holds ``[entry text]`` items at the same index.
    Lookups go through two ``FASTsearch`` indexes built from the hickle
    dumps of those lists.

    Typical use: ``create_hklDB_from_csv`` once to build the dumps, then
    ``load_DB_into_FASTsearch`` and ``fremdEintragAppend``.
    """

    def __init__(self, hklDatabaseDir_Fremd_WB, hklDatabaseDir_Fremd_WB_All):
        """Load the optional full database, the spaCy model and a stemmer.

        Args:
            hklDatabaseDir_Fremd_WB: Not used by this constructor; kept so
                existing callers keep working.
            hklDatabaseDir_Fremd_WB_All: Path of a hickle dump of the full
                database, or ``None`` to skip loading it.
        """
        # Always define the attribute so later reads cannot fail with
        # AttributeError when no dump path was given.
        self.Fremd_WBDB_All = None
        if hklDatabaseDir_Fremd_WB_All is not None:
            self.Fremd_WBDB_All = hkl.load(hklDatabaseDir_Fremd_WB_All)

        self.nlp = spacy.load('de_core_news_sm')

        # Not read by any method of this class; kept because external code
        # may access the attribute.
        self.stemmer = SnowballStemmer("german")

    @staticmethod
    def _read_csv_entries(csvDbDir):
        """Parse every non-blank line of *csvDbDir* into a list and return them."""
        import ast

        entries = []
        with open(csvDbDir) as lines:
            for line in lines:
                text = line.strip()
                if not text:
                    # A blank line is not an expression; skip it instead of
                    # failing the whole import.
                    continue
                # SECURITY: this used to be eval(line), which executes
                # arbitrary code found in the file. literal_eval accepts
                # only Python literals (tuples, lists, strings, numbers...)
                # and raises ValueError/SyntaxError for anything else.
                entries.append(list(ast.literal_eval(text)))
        return entries

    def create_hklDB_from_csv(self, csvDbDir, StemOrNot):
        """Build the hickle dumps of the dictionary from a text file.

        Each non-blank line of the file must be a Python literal of a pair
        ``(first, second)`` where ``first[0]`` is the headword and
        ``second[0]`` is the entry text.

        Writes ``hkldbFremd_WB_All.hkl``, ``hkldbFremd_WB1.hkl`` and
        ``hkldbFremd_WB2.hkl`` into the current working directory and
        prints progress to stdout.

        Args:
            csvDbDir: Path of the input file.
            StemOrNot: Not used; kept for backward compatibility.

        Returns:
            The string ``'done'``.

        Raises:
            ValueError, SyntaxError: If a line is not a valid Python literal.
        """
        self.Fremd_WBDB_All = self._read_csv_entries(csvDbDir)

        self.hkldbFremd_WB1 = []
        self.hkldbFremd_WB2 = []

        for counter, entry in enumerate(self.Fremd_WBDB_All, start=1):
            # Progress output for large inputs.
            if counter % 1000 == 0:
                print(counter)

            self.hkldbFremd_WB1.append([entry[0][0]])
            self.hkldbFremd_WB2.append([entry[1][0]])

        print('creating the hkl dump of Fremd_WBDBAll')
        hkl.dump(self.Fremd_WBDB_All, 'hkldbFremd_WB_All.hkl', mode='w', compression='lzf')
        print('done..')

        print('Creating the hkl dump of Fremd_WBDB 1')
        hkl.dump(self.hkldbFremd_WB1, 'hkldbFremd_WB1.hkl', mode='w', compression='lzf')

        print('Creating the hkl dump of Fremd_WBDB 2')
        hkl.dump(self.hkldbFremd_WB2, 'hkldbFremd_WB2.hkl', mode='w', compression='lzf')

        return 'done'

    def load_DB_into_FASTsearch(self):
        """Load both dictionary dumps and prepare their FASTsearch indexes.

        Reads ``hkldbFremd_WB1.hkl`` and ``hkldbFremd_WB2.hkl`` from the
        current working directory, creates one ``FASTsearch`` object per
        dump, generates a bag-of-words model for each and then loads the
        model files by name.
        """
        self.hkldbFremd_WB1 = hkl.load('hkldbFremd_WB1.hkl')
        self.hkldbFremd_WB2 = hkl.load('hkldbFremd_WB2.hkl')

        self.fsearch1 = FASTsearch.FASTsearch('hkldbFremd_WB1.hkl')
        self.fsearch2 = FASTsearch.FASTsearch('hkldbFremd_WB2.hkl')

        self.fsearch1.Gen_BoW_Model(50000, "word", punctuation = False)
        self.fsearch2.Gen_BoW_Model(50000, "word", punctuation = False)

        # Presumably these files are the ones Gen_BoW_Model has just
        # written -- TODO confirm against FASTsearch.
        self.fsearch1.Load_BoW_Model('bagofwordshkldbFremd_WB1.pkl', 'DataBaseOneZeroshkldbFremd_WB1.hkl')
        self.fsearch2.Load_BoW_Model('bagofwordshkldbFremd_WB2.pkl', 'DataBaseOneZeroshkldbFremd_WB2.hkl')

    def _lookup_entries(self, sentence):
        """Return the tokenised dictionary entries for the words of *sentence*."""
        doc = self.nlp(' '.join(sentence))

        # Only tokens whose tag starts with V, N or A are looked up.
        # startswith() treats an empty tag as "no match" instead of raising.
        candidates = [
            token.text for token in doc
            if token.tag_.startswith(('V', 'N', 'A'))
        ]

        entries = []
        for word in candidates:
            _, matchindex = self.fsearch1.search_with_highest_multiplikation_Output(word, 1)
            best = matchindex[0]

            headword = self.hkldbFremd_WB1[best][0].split()
            entry = self.hkldbFremd_WB2[best][0].split()

            # The search always returns its best candidate; accept it only
            # when the first headword token equals the word exactly.
            if headword and headword[0] == word:
                entries.append(entry)
        return entries

    def fremdEintragAppend(self, sentences, punctuations):
        """Insert matching dictionary entries after each sentence.

        Every input sentence is copied to the output. Each dictionary entry
        found for one of its words follows it as an additional sentence,
        unless the same entry was already emitted earlier in this call. A
        trailing ``'.'`` on an entry's last token is removed and a ``'.'``
        is inserted into *punctuations* at the entry's output position.

        Processing is best effort: if a sentence cannot be processed it is
        still emitted, without the entries that were not reached.

        Args:
            sentences: Iterable of sentences, each a list of token strings.
            punctuations: List of punctuation marks, modified in place.
                Assumed to be aligned one-to-one with *sentences*.

        Returns:
            Tuple ``(outsentences, punctuations)``; the second element is
            the list object that was passed in.
        """
        import logging

        log = logging.getLogger(__name__)

        outsentences = []
        # Entries already emitted, stored as tuples for O(1) membership tests.
        seen_entries = set()

        for sentence in sentences:
            sentence_added = False
            try:
                entries = self._lookup_entries(sentence)

                outsentences.append(sentence)
                sentence_added = True

                for entry in entries:
                    if not entry:
                        # Empty dictionary text: nothing to append.
                        continue
                    if entry[-1].endswith('.'):
                        entry[-1] = entry[-1][:-1]

                    key = tuple(entry)
                    if key in seen_entries:
                        continue
                    seen_entries.add(key)

                    outsentences.append(entry)
                    # Insert at the entry's own output index. Using the
                    # running count of input sentences would ignore
                    # entries inserted for earlier sentences and shift
                    # every later mark by that amount.
                    punctuations.insert(len(outsentences) - 1, '.')
            except Exception:
                log.debug('could not process %r in FremdWB', sentence, exc_info=True)
                # Track "already appended" with a flag. Inspecting
                # outsentences[-1] would itself raise IndexError while the
                # output is still empty. Marks inserted so far are kept on
                # purpose: their entries stay in outsentences, so removing
                # them would break the alignment.
                if not sentence_added:
                    outsentences.append(sentence)

        return outsentences, punctuations
|
|
|
|
|
|
|
|
|