# alpcentaur / basabuuka_prototyp



import logging

import hickle as hkl
import nltk
import spacy
from nltk.stem.snowball import SnowballStemmer

import FASTsearch
stemmer = SnowballStemmer("german")

class FremdWB(object):
    """Appends dictionary ("Fremd_WB") entries to sentences that contain a matching headword.

    The dictionary is held as two parallel lists, ``hkldbFremd_WB1`` (headwords)
    and ``hkldbFremd_WB2`` (entry texts). Each element is a one-item list holding
    a string. Lookups go through a ``FASTsearch`` index built over the headwords.
    """

    def __init__(self, hklDatabaseDir_Fremd_WB, hklDatabaseDir_Fremd_WB_All):
        """Load the optional full database, the spaCy model and the stemmer.

        Args:
            hklDatabaseDir_Fremd_WB: Unused; kept for interface compatibility.
            hklDatabaseDir_Fremd_WB_All: Path of an hkl dump to load into
                ``self.Fremd_WBDB_All``, or ``None`` to skip loading.
        """
        # Always define the attribute so later reads cannot raise AttributeError
        # when no dump path was given.
        self.Fremd_WBDB_All = []
        if hklDatabaseDir_Fremd_WB_All is not None:
            self.Fremd_WBDB_All = hkl.load(hklDatabaseDir_Fremd_WB_All)

        self.nlp = spacy.load('de_core_news_sm')
        self.stemmer = SnowballStemmer("german")

    def create_hklDB_from_csv(self, csvDbDir, StemOrNot):
        """Build the three hkl dumps from a file with one Python literal per line.

        Each non-blank line is evaluated and converted to a list; element
        ``[0][0]`` of that list becomes the headword record and ``[1][0]`` the
        entry record. The dumps are written to fixed file names in the current
        working directory.

        Args:
            csvDbDir: Path of the input file (opened with the platform default
                encoding, as before).
            StemOrNot: Unused; kept for interface compatibility.

        Returns:
            The string ``'done'``.
        """
        self.Fremd_WBDB_All = []
        with open(csvDbDir) as lines:
            for line in lines:
                if not line.strip():
                    # A blank line would make eval() raise SyntaxError.
                    continue
                # SECURITY: eval() executes arbitrary code found in the file.
                # Only feed trusted input here; if every row is a plain
                # literal, ast.literal_eval would be the safe replacement.
                self.Fremd_WBDB_All.append(list(eval(line)))

        self.hkldbFremd_WB1 = []
        self.hkldbFremd_WB2 = []
        for counter, row in enumerate(self.Fremd_WBDB_All, start=1):
            if counter % 1000 == 0:
                print(counter)  # progress indicator
            self.hkldbFremd_WB1.append([row[0][0]])
            self.hkldbFremd_WB2.append([row[1][0]])

        print('creating the hkl dump of Fremd_WBDBAll')
        hkl.dump(self.Fremd_WBDB_All, 'hkldbFremd_WB_All.hkl', mode='w', compression='lzf')
        print('done..')

        print('Creating the hkl dump of Fremd_WBDB 1')
        hkl.dump(self.hkldbFremd_WB1, 'hkldbFremd_WB1.hkl', mode='w', compression='lzf')

        print('Creating the hkl dump of Fremd_WBDB 2')
        hkl.dump(self.hkldbFremd_WB2, 'hkldbFremd_WB2.hkl', mode='w', compression='lzf')

        return 'done'

    def load_DB_into_FASTsearch(self):
        """Load both hkl dumps and prepare one FASTsearch index per dump.

        Reads ``hkldbFremd_WB1.hkl`` and ``hkldbFremd_WB2.hkl`` from the current
        working directory, then generates and loads a bag-of-words model for
        each of them.
        """
        self.hkldbFremd_WB1 = hkl.load('hkldbFremd_WB1.hkl')
        self.hkldbFremd_WB2 = hkl.load('hkldbFremd_WB2.hkl')

        self.fsearch1 = FASTsearch.FASTsearch('hkldbFremd_WB1.hkl')
        self.fsearch2 = FASTsearch.FASTsearch('hkldbFremd_WB2.hkl')

        self.fsearch1.Gen_BoW_Model(50000, "word", punctuation = False)
        self.fsearch2.Gen_BoW_Model(50000, "word", punctuation = False)

        # NOTE(review): these files are presumably the ones written by
        # Gen_BoW_Model above - confirm against FASTsearch.
        self.fsearch1.Load_BoW_Model('bagofwordshkldbFremd_WB1.pkl', 'DataBaseOneZeroshkldbFremd_WB1.hkl')
        self.fsearch2.Load_BoW_Model('bagofwordshkldbFremd_WB2.pkl', 'DataBaseOneZeroshkldbFremd_WB2.hkl')

    def _lookup_entries(self, sentence):
        """Return the tokenised dictionary entries whose headword occurs in ``sentence``."""
        doc = self.nlp(' '.join(sentence))
        # Only tokens whose tag starts with V, N or A are looked up.
        # startswith() also copes with an empty tag, where tag_[0] would raise.
        candidates = [token.text for token in doc if token.tag_.startswith(('V', 'N', 'A'))]

        entries = []
        for word in candidates:
            _, match_index = self.fsearch1.search_with_highest_multiplikation_Output(word, 1)
            best = match_index[0]
            headword = self.hkldbFremd_WB1[best][0].split()
            # The search returns its best guess; accept it only when the first
            # token of the stored headword is exactly the word from the text.
            if headword and headword[0] == word:
                entries.append(self.hkldbFremd_WB2[best][0].split())
        return entries

    def fremdEintragAppend(self, sentences, punctuations):
        """Insert matching dictionary entries after each sentence.

        Every input sentence is copied to the output exactly once, followed by
        the entries found for it that have not been emitted before. For each
        emitted entry a ``'.'`` is inserted into ``punctuations`` at the
        entry's position in the output.

        Args:
            sentences: Iterable of sentences, each a list of word strings.
            punctuations: List that is modified in place.
                NOTE(review): assumed to hold one punctuation mark per
                sentence, parallel to ``sentences`` - confirm with callers.

        Returns:
            Tuple ``(outsentences, punctuations)``; ``punctuations`` is the
            same list object that was passed in.
        """
        outsentences = []
        seen_entries = set()

        for sentence in sentences:
            # Only the lookup can fail (model, search index, malformed rows).
            # It runs before anything is mutated, so a failure needs no
            # rollback: the sentence is passed through without entries.
            try:
                entries = self._lookup_entries(sentence)
            except Exception:
                logging.getLogger(__name__).debug(
                    "FremdWB lookup failed for sentence %r", sentence, exc_info=True
                )
                entries = []

            outsentences.append(sentence)

            for entry in entries:
                if not entry:
                    continue  # blank dictionary text, nothing to append
                if entry[-1].endswith('.'):
                    # Drop the entry's own full stop; it is re-added through
                    # the punctuations list below.
                    entry[-1] = entry[-1][:-1]

                key = tuple(entry)
                if key in seen_entries:
                    continue
                seen_entries.add(key)

                outsentences.append(entry)
                # Insert at the entry's own output index so the two lists stay
                # aligned even after earlier sentences contributed entries.
                punctuations.insert(len(outsentences) - 1, '.')

        return outsentences, punctuations