import logging

import spacy
import nltk
from nltk.stem.snowball import SnowballStemmer
import hickle as hkl
import FASTsearch

logger = logging.getLogger(__name__)

stemmer = SnowballStemmer("german")


class FremdWB(object):
    """Look up words of a sentence in a foreign-word dictionary database.

    The database is kept in three hickle files written to / read from the
    current working directory:

    * ``hkldbFremd_WB_All.hkl`` - every parsed row of the source file,
    * ``hkldbFremd_WB1.hkl``    - first column (the words that are looked up),
    * ``hkldbFremd_WB2.hkl``    - second column (the entry emitted on a match).
    """

    def __init__(self, hklDatabaseDir_Fremd_WB, hklDatabaseDir_Fremd_WB_All):
        """Load the optional full database, the spaCy model and the stemmer.

        Args:
            hklDatabaseDir_Fremd_WB: Accepted for interface compatibility;
                it is not read by this constructor.
            hklDatabaseDir_Fremd_WB_All: Path of the hickle dump holding the
                full database, or ``None`` to skip loading it. When ``None``,
                ``self.Fremd_WBDB_All`` is left unset.
        """
        if hklDatabaseDir_Fremd_WB_All is not None:
            self.Fremd_WBDB_All = hkl.load(hklDatabaseDir_Fremd_WB_All)

        self.nlp = spacy.load('de_core_news_sm')
        self.stemmer = SnowballStemmer("german")

    def create_hklDB_from_csv(self, csvDbDir, StemOrNot):
        """Build the three hickle dumps from a text file of Python literals.

        Every line of ``csvDbDir`` is evaluated and converted to a list. The
        first element of item 0 and the first element of item 1 of each row
        are collected into the two lookup lists. Progress is printed every
        1000 rows.

        Args:
            csvDbDir: Path of the input file, one row expression per line.
            StemOrNot: Accepted for interface compatibility; not used here.

        Returns:
            The string ``'done'``.
        """
        self.Fremd_WBDB_All = []
        with open(csvDbDir) as lines:
            for line in lines:
                # SECURITY: eval() executes arbitrary code contained in the
                # file. Only run this on trusted input. If the rows are pure
                # literals, ast.literal_eval would be a safer choice.
                self.Fremd_WBDB_All.append(list(eval(line)))

        self.hkldbFremd_WB1 = []
        self.hkldbFremd_WB2 = []
        for counter, entry in enumerate(self.Fremd_WBDB_All, start=1):
            if counter % 1000 == 0:
                print(counter)
            # Each lookup row is a one-element list wrapping the text.
            self.hkldbFremd_WB1.append([entry[0][0]])
            self.hkldbFremd_WB2.append([entry[1][0]])

        print('creating the hkl dump of Fremd_WBDBAll')
        hkl.dump(self.Fremd_WBDB_All, 'hkldbFremd_WB_All.hkl',
                 mode='w', compression='lzf')
        print('done..')

        print('Creating the hkl dump of Fremd_WBDB 1')
        hkl.dump(self.hkldbFremd_WB1, 'hkldbFremd_WB1.hkl',
                 mode='w', compression='lzf')

        print('Creating the hkl dump of Fremd_WBDB 2')
        hkl.dump(self.hkldbFremd_WB2, 'hkldbFremd_WB2.hkl',
                 mode='w', compression='lzf')

        return 'done'

    def load_DB_into_FASTsearch(self):
        """Load both lookup dumps and prepare one FASTsearch index for each.

        Reads ``hkldbFremd_WB1.hkl`` and ``hkldbFremd_WB2.hkl`` from the
        current working directory, creates a ``FASTsearch`` object per file,
        generates a bag-of-words model for each and then loads the generated
        model files.
        """
        self.hkldbFremd_WB1 = hkl.load('hkldbFremd_WB1.hkl')
        self.hkldbFremd_WB2 = hkl.load('hkldbFremd_WB2.hkl')

        self.fsearch1 = FASTsearch.FASTsearch('hkldbFremd_WB1.hkl')
        self.fsearch2 = FASTsearch.FASTsearch('hkldbFremd_WB2.hkl')

        self.fsearch1.Gen_BoW_Model(50000, "word", punctuation=False)
        self.fsearch2.Gen_BoW_Model(50000, "word", punctuation=False)

        # File names below are presumably the ones Gen_BoW_Model writes;
        # verify against FASTsearch if the database names ever change.
        self.fsearch1.Load_BoW_Model('bagofwordshkldbFremd_WB1.pkl',
                                     'DataBaseOneZeroshkldbFremd_WB1.hkl')
        self.fsearch2.Load_BoW_Model('bagofwordshkldbFremd_WB2.pkl',
                                     'DataBaseOneZeroshkldbFremd_WB2.hkl')

    def fremdEintragAppend(self, sentences, punctuations):
        """Append matching dictionary entries after each sentence.

        For every sentence the tokens whose spaCy tag starts with ``V``,
        ``N`` or ``A`` are looked up through ``self.fsearch1``. When the
        first token of the best-matching row of ``self.hkldbFremd_WB1``
        equals the word, the corresponding row of ``self.hkldbFremd_WB2``
        (split into tokens, trailing ``.`` removed) is emitted right after
        the sentence - once per distinct entry over the whole call.

        Processing is best effort: if anything fails for a sentence, the
        sentence itself is still emitted exactly once and processing goes on
        with the next sentence.

        Args:
            sentences: Iterable of sentences, each a list of token strings.
            punctuations: List of punctuation marks. It is modified in place:
                a ``'.'`` is inserted for every emitted entry.

        Returns:
            Tuple ``(outsentences, punctuations)`` where ``outsentences`` is
            the list of sentences interleaved with the emitted entries and
            ``punctuations`` is the same list object that was passed in.
        """
        outsentences = []
        alleeintraege = []  # entries already emitted, to avoid duplicates

        for sentencecount, sentence in enumerate(sentences, start=1):
            # Explicit flag instead of comparing against outsentences[-1]:
            # that comparison raised IndexError on empty lists and dropped a
            # failing sentence that equalled the previous output.
            sentence_added = False
            try:
                doc = self.nlp(' '.join(sentence))
                fremds_of_sentence = [
                    word.text for word in doc
                    if word.tag_[0] in ('V', 'N', 'A')
                ]

                fremdeintraege = []
                for word in fremds_of_sentence:
                    _, matchindex = (
                        self.fsearch1
                        .search_with_highest_multiplikation_Output(word, 1)
                    )
                    fremd = self.hkldbFremd_WB1[matchindex[0]][0].split()
                    fremdeintrag = self.hkldbFremd_WB2[matchindex[0]][0].split()
                    # Only accept the hit when its headword is exactly the word.
                    if fremd[0] == word:
                        fremdeintraege.append(fremdeintrag)

                outsentences.append(sentence)
                sentence_added = True

                for eintrag in fremdeintraege:
                    if not eintrag:
                        # Empty dictionary entry: nothing to emit.
                        continue
                    if eintrag[-1].endswith('.'):
                        eintrag[-1] = eintrag[-1][:-1]
                    if eintrag not in alleeintraege:
                        outsentences.append(eintrag)
                        # NOTE(review): the index ignores entries inserted
                        # for earlier sentences; if punctuations is meant to
                        # stay parallel to outsentences this looks off -
                        # confirm against the caller before changing.
                        punctuations.insert(sentencecount, '.')
                        alleeintraege.append(eintrag)
            except Exception:
                logger.debug('could not process sentence %r in FremdWB',
                             sentence, exc_info=True)
                if not sentence_added:
                    outsentences.append(sentence)

        return outsentences, punctuations