|
|
-
-
- import spacy
- import nltk
- from nltk.stem.snowball import SnowballStemmer
-
- import hickle as hkl
- import FASTsearch
-
# Module-level German Snowball stemmer, built once when this module is imported.
# NOTE(review): FremdWB.__init__ creates its own instance (self.stemmer) instead
# of reusing this one; nothing in the visible code reads this global.
- stemmer = SnowballStemmer("german")
-
-
class FremdWB(object):
    """Append foreign-word dictionary entries after the sentences that use them.

    The dictionary is kept as two parallel lists: ``hkldbFremd_WB1`` holds the
    first field of every record and is what lookups are matched against,
    ``hkldbFremd_WB2`` holds the second field, which is what gets appended to
    the output. Lookups go through a ``FASTsearch`` index built over the
    first list.
    """

    def __init__(self, hklDatabaseDir_Fremd_WB, hklDatabaseDir_Fremd_WB_All):
        """Load the optional full database, the spaCy model and a stemmer.

        Args:
            hklDatabaseDir_Fremd_WB: Accepted for interface compatibility;
                this constructor does not read it.
            hklDatabaseDir_Fremd_WB_All: Path of a hickle dump holding the
                complete database, or ``None`` to skip loading it.
        """
        # Always define the attribute so that later reads see None instead of
        # failing with AttributeError when no dump path was supplied.
        self.Fremd_WBDB_All = None
        if hklDatabaseDir_Fremd_WB_All is not None:
            self.Fremd_WBDB_All = hkl.load(hklDatabaseDir_Fremd_WB_All)

        self.nlp = spacy.load('de_core_news_sm')
        self.stemmer = SnowballStemmer("german")

    def create_hklDB_from_csv(self, csvDbDir, StemOrNot):
        """Build the three hickle dumps from a text file of Python literals.

        Every non-blank line of ``csvDbDir`` is evaluated and converted to a
        list; ``record[0][0]`` goes into ``hkldbFremd_WB1`` and
        ``record[1][0]`` into ``hkldbFremd_WB2``. The results are written to
        ``hkldbFremd_WB_All.hkl``, ``hkldbFremd_WB1.hkl`` and
        ``hkldbFremd_WB2.hkl`` in the current working directory.

        Args:
            csvDbDir: Path of the input file, one record per line.
            StemOrNot: Currently unused; kept so existing callers still work.

        Returns:
            The string ``'done'``.
        """
        records = []
        with open(csvDbDir) as lines:
            for line in lines:
                # A blank line cannot be evaluated; skip it rather than
                # aborting the whole import with a SyntaxError.
                if not line.strip():
                    continue
                # SECURITY: eval() executes arbitrary expressions found in
                # the file. Only feed this method trusted input; if every
                # record is a plain literal, ast.literal_eval is the safe
                # replacement.
                records.append(list(eval(line)))
        self.Fremd_WBDB_All = records

        self.hkldbFremd_WB1 = []
        self.hkldbFremd_WB2 = []
        for counter, record in enumerate(records, start=1):
            # Coarse progress output for large inputs.
            if counter % 1000 == 0:
                print(counter)
            self.hkldbFremd_WB1.append([record[0][0]])
            self.hkldbFremd_WB2.append([record[1][0]])

        print('creating the hkl dump of Fremd_WBDBAll')
        hkl.dump(self.Fremd_WBDB_All, 'hkldbFremd_WB_All.hkl', mode='w', compression='lzf')
        print('done..')

        print('Creating the hkl dump of Fremd_WBDB 1')
        hkl.dump(self.hkldbFremd_WB1, 'hkldbFremd_WB1.hkl', mode='w', compression='lzf')

        print('Creating the hkl dump of Fremd_WBDB 2')
        hkl.dump(self.hkldbFremd_WB2, 'hkldbFremd_WB2.hkl', mode='w', compression='lzf')

        return 'done'

    def load_DB_into_FASTsearch(self):
        """Load both dumps and prepare one FASTsearch index for each.

        Reads ``hkldbFremd_WB1.hkl`` and ``hkldbFremd_WB2.hkl`` from the
        current working directory, generates a bag-of-words model for each
        index and then loads the model files by name.
        """
        self.hkldbFremd_WB1 = hkl.load('hkldbFremd_WB1.hkl')
        self.hkldbFremd_WB2 = hkl.load('hkldbFremd_WB2.hkl')

        self.fsearch1 = FASTsearch.FASTsearch('hkldbFremd_WB1.hkl')
        self.fsearch2 = FASTsearch.FASTsearch('hkldbFremd_WB2.hkl')

        self.fsearch1.Gen_BoW_Model(50000, "word", punctuation = False)
        self.fsearch2.Gen_BoW_Model(50000, "word", punctuation = False)

        # NOTE(review): these file names are presumably what Gen_BoW_Model
        # writes for the dumps above -- confirm against FASTsearch.
        self.fsearch1.Load_BoW_Model('bagofwordshkldbFremd_WB1.pkl', 'DataBaseOneZeroshkldbFremd_WB1.hkl')
        self.fsearch2.Load_BoW_Model('bagofwordshkldbFremd_WB2.pkl', 'DataBaseOneZeroshkldbFremd_WB2.hkl')

    def _fremdeintraege_for_sentence(self, sentence):
        """Return the tokenised dictionary entries matching words of ``sentence``."""
        doc = self.nlp(' '.join(sentence))

        # Only tokens whose tag starts with V, N or A are looked up. Slicing
        # instead of indexing keeps an empty tag from raising.
        candidates = [token.text for token in doc if token.tag_[:1] in ('V', 'N', 'A')]

        eintraege = []
        for word in candidates:
            _, matchindex = self.fsearch1.search_with_highest_multiplikation_Output(word, 1)
            best = matchindex[0]
            fremd = self.hkldbFremd_WB1[best][0].split()
            # The search always returns its best hit; accept it only when the
            # first token of the stored headword is exactly the word.
            if fremd and fremd[0] == word:
                eintraege.append(self.hkldbFremd_WB2[best][0].split())
        return eintraege

    def fremdEintragAppend(self, sentences, punctuations):
        """Insert dictionary entries behind the sentences that mention them.

        Each entry is emitted at most once over the whole call, with a
        trailing ``'.'`` removed from its last token.

        Args:
            sentences: Iterable of token lists, one list per sentence.
            punctuations: List of sentence-final punctuation marks, assumed
                to run parallel to ``sentences``. It is modified in place: a
                ``'.'`` is inserted at the output position of every entry so
                the returned lists stay aligned.

        Returns:
            A tuple ``(outsentences, punctuations)``: the input sentences
            interleaved with the entries, and the same ``punctuations``
            object that was passed in.
        """
        outsentences = []
        alleeintraege = []
        for sentence in sentences:
            try:
                fremdeintraege = self._fremdeintraege_for_sentence(sentence)
            except Exception:
                # Best effort: if tagging or the lookup fails, pass the
                # sentence through without entries. Nothing has been modified
                # for this sentence yet, so there is nothing to roll back.
                fremdeintraege = []

            outsentences.append(sentence)

            for eintrag in fremdeintraege:
                if not eintrag:
                    # An empty dictionary text yields no tokens to append.
                    continue
                if eintrag[-1].endswith('.'):
                    eintrag[-1] = eintrag[-1][:-1]
                if eintrag in alleeintraege:
                    continue
                outsentences.append(eintrag)
                # Insert at the entry's own output index; the input sentence
                # counter drifts as soon as an earlier sentence added entries.
                punctuations.insert(len(outsentences) - 1, '.')
                alleeintraege.append(eintrag)
        return outsentences, punctuations
-
-
|