# alpcentaur/basabuuka_prototyp



								import spacy

								import nltk

								from nltk.stem.snowball import SnowballStemmer


								import hickle as hkl

								import FASTsearch


								# Module-level German Snowball stemmer, created once when the module is imported.
								# FremdWB.__init__ builds its own separate instance (self.stemmer) and does not
								# read this one.
								stemmer = SnowballStemmer("german")


								class FremdWB(object):
								    """Foreign-word dictionary ("Fremdwoerterbuch") lookup for tokenised sentences.

								    The class can build hickle dumps from a CSV-like source file, load those
								    dumps into two FASTsearch indexes, and append the dictionary entry of
								    every recognised foreign word after the sentence that contains it.
								    """

								    def __init__(self, hklDatabaseDir_Fremd_WB, hklDatabaseDir_Fremd_WB_All):
								        """Load the optional full database, the spaCy model and the stemmer.

								        Args:
								            hklDatabaseDir_Fremd_WB: Accepted for interface compatibility; it is
								                not read by this constructor.
								            hklDatabaseDir_Fremd_WB_All: Path of a hickle file holding the full
								                database, or None to skip loading it.
								        """
								        # Always define the attribute so later reads do not raise
								        # AttributeError when no database path was supplied.
								        self.Fremd_WBDB_All = None
								        if hklDatabaseDir_Fremd_WB_All is not None:
								            self.Fremd_WBDB_All = hkl.load(hklDatabaseDir_Fremd_WB_All)

								        self.nlp = spacy.load('de_core_news_sm')
								        self.stemmer = SnowballStemmer("german")

								    def create_hklDB_from_csv(self, csvDbDir, StemOrNot):
								        """Parse the source file and write three hickle dumps to the working directory.

								        Every non-blank line of the file is evaluated as a Python expression
								        and must yield a sequence whose first two items are indexable:
								        ``row[0][0]`` is stored in ``hkldbFremd_WB1`` and ``row[1][0]`` in
								        ``hkldbFremd_WB2``. The files written are ``hkldbFremd_WB_All.hkl``,
								        ``hkldbFremd_WB1.hkl`` and ``hkldbFremd_WB2.hkl``.

								        Args:
								            csvDbDir: Path of the source file.
								            StemOrNot: Accepted for interface compatibility; currently unused.

								        Returns:
								            The string ``'done'``.
								        """
								        self.Fremd_WBDB_All = []
								        with open(csvDbDir) as lines:
								            for line in lines:
								                # A blank line (e.g. a trailing newline) is not an
								                # expression and would make eval() raise SyntaxError.
								                if not line.strip():
								                    continue
								                # NOTE(review): eval() executes arbitrary Python taken from
								                # the file, so only trusted files may be passed in. If every
								                # line is a plain literal, ast.literal_eval would be the
								                # safe replacement - confirm the file format first.
								                self.Fremd_WBDB_All.append(list(eval(line)))

								        self.hkldbFremd_WB1 = []
								        self.hkldbFremd_WB2 = []
								        for counter, row in enumerate(self.Fremd_WBDB_All, start=1):
								            # Progress output for large databases.
								            if counter % 1000 == 0:
								                print(counter)
								            self.hkldbFremd_WB1.append([row[0][0]])
								            self.hkldbFremd_WB2.append([row[1][0]])

								        print('creating the hkl dump of Fremd_WBDBAll')
								        hkl.dump(self.Fremd_WBDB_All, 'hkldbFremd_WB_All.hkl', mode='w', compression='lzf')
								        print('done..')

								        print('Creating the hkl dump of Fremd_WBDB 1')
								        hkl.dump(self.hkldbFremd_WB1, 'hkldbFremd_WB1.hkl', mode='w', compression='lzf')

								        print('Creating the hkl dump of Fremd_WBDB 2')
								        hkl.dump(self.hkldbFremd_WB2, 'hkldbFremd_WB2.hkl', mode='w', compression='lzf')

								        return 'done'

								    def load_DB_into_FASTsearch(self):
								        """Load both hickle dumps and prepare a FASTsearch index for each.

								        Reads ``hkldbFremd_WB1.hkl`` and ``hkldbFremd_WB2.hkl`` from the working
								        directory, then generates and loads a bag-of-words model per index.
								        The results are stored on ``self.hkldbFremd_WB1``, ``self.hkldbFremd_WB2``,
								        ``self.fsearch1`` and ``self.fsearch2``.
								        """
								        self.hkldbFremd_WB1 = hkl.load('hkldbFremd_WB1.hkl')
								        self.hkldbFremd_WB2 = hkl.load('hkldbFremd_WB2.hkl')

								        self.fsearch1 = FASTsearch.FASTsearch('hkldbFremd_WB1.hkl')
								        self.fsearch2 = FASTsearch.FASTsearch('hkldbFremd_WB2.hkl')

								        # NOTE(review): 50000 is presumably the vocabulary size limit of the
								        # bag-of-words model - confirm against FASTsearch.Gen_BoW_Model.
								        self.fsearch1.Gen_BoW_Model(50000, "word", punctuation=False)
								        self.fsearch2.Gen_BoW_Model(50000, "word", punctuation=False)

								        # NOTE(review): these files are presumably the ones written by
								        # Gen_BoW_Model above - confirm against FASTsearch.
								        self.fsearch1.Load_BoW_Model('bagofwordshkldbFremd_WB1.pkl', 'DataBaseOneZeroshkldbFremd_WB1.hkl')
								        self.fsearch2.Load_BoW_Model('bagofwordshkldbFremd_WB2.pkl', 'DataBaseOneZeroshkldbFremd_WB2.hkl')

								    def _lookup_fremd_entries(self, sentence):
								        """Return the tokenised dictionary entries for the words of one sentence."""
								        doc = self.nlp(' '.join(sentence))

								        # Only tokens whose fine-grained tag starts with V, N or A are looked
								        # up. Slicing keeps an empty tag from raising IndexError.
								        candidates = [token.text for token in doc if token.tag_[:1] in ('V', 'N', 'A')]

								        entries = []
								        for word in candidates:
								            _, match_index = self.fsearch1.search_with_highest_multiplikation_Output(word, 1)
								            row = match_index[0]
								            headword = self.hkldbFremd_WB1[row][0].split()
								            # The search always returns its best hit, so the entry is only
								            # accepted when its first token equals the word exactly.
								            if headword and headword[0] == word:
								                entries.append(self.hkldbFremd_WB2[row][0].split())
								        return entries

								    def fremdEintragAppend(self, sentences, punctuations):
								        """Insert dictionary entries after the sentences that use foreign words.

								        Every input sentence is emitted exactly once, in order. Each entry
								        found for a sentence follows it as an extra token list with its
								        trailing '.' removed, and a '.' is inserted into ``punctuations`` at
								        the position of that entry so both returned lists stay index-aligned.
								        An entry that was already emitted earlier in the call is not repeated.

								        Lookup is best effort: if it raises for a sentence, that sentence is
								        still emitted, just without entries.

								        Args:
								            sentences: Iterable of sentences, each a list of word strings.
								            punctuations: List with one punctuation mark per sentence. It is
								                modified in place and also returned.

								        Returns:
								            Tuple ``(outsentences, punctuations)``.
								        """
								        outsentences = []
								        emitted_entries = []

								        for sentence in sentences:
								            # Emit the sentence first so a failing lookup can never drop it.
								            outsentences.append(sentence)

								            try:
								                entries = self._lookup_fremd_entries(sentence)
								            except Exception:
								                # Best effort: keep the bare sentence. Nothing has been added
								                # to punctuations for it yet, so there is nothing to undo.
								                continue

								            for entry in entries:
								                # An empty database string yields no tokens to emit.
								                if not entry:
								                    continue
								                if entry[-1].endswith('.'):
								                    entry[-1] = entry[-1][:-1]
								                if entry in emitted_entries:
								                    continue
								                emitted_entries.append(entry)
								                outsentences.append(entry)
								                # Use the entry's real output position: the sentence counter
								                # does not account for entries inserted for earlier sentences.
								                punctuations.insert(len(outsentences) - 1, '.')

								        return outsentences, punctuations