From b7b329154ea4533b1c04f910a8fe5d8129db557c Mon Sep 17 00:00:00 2001
From: alpcentaur
Date: Thu, 27 Aug 2020 21:26:02 +0200
Subject: [PATCH] Updated - more functions for easier use

---
 FASTsearch.py | 267 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 255 insertions(+), 12 deletions(-)

diff --git a/FASTsearch.py b/FASTsearch.py
index 4858915..609db7f 100644
--- a/FASTsearch.py
+++ b/FASTsearch.py
@@ -1,5 +1,5 @@
-# The new class FASTsearch. Every DB can be represented in Lists. The Brain actually is constituted from lists. Access to all Documents the same moment.
+# The new class FASTsearch. Every DB can be represented in Lists. The Brain actually is constituted from lists. Access to all Documents almost the same moment.
 
 # TODO GPU Multithreading has to be implemented.
 
@@ -30,16 +31,88 @@ def convert_sparse_matrix_to_sparse_tensor(X):
 
+
+# The whole class is initialized with the database in [['word','word2'],[],[],[]] list format (2-dimensional); the index of a list in the matrix defines its id.
+# In every list element of the input, each document is represented by one string.
+# This list must be saved as a hkl dump and is then loaded into the database.
+
+
+def my_tokenizer(s):
+    # split documents on the literal '+' separator (str.split takes a plain string, not a regex)
+    return s.split('+')
+
 class FASTsearch(object):
 
-    def __init__(self, DatabaseDir, BoWModelDir):
+    def __init__(self, DatabaseDir):
+
+        self.DatabaseDir = DatabaseDir[:-4]
+
+        database = []
+        hkl_load = hkl.load(DatabaseDir)
+
+        for element in hkl_load:
+            #print('element', element)
+            #print('joined element', ' '.join(element))
+            database.append(' '.join(element))
 
         # input has to be hkl format
-        self.database = hkl.load(DatabaseDir).astype('float32')
+        self.database = database
+
+    def Gen_BoW_Model(self, max_features, analyzer, punctuation=False):
+
+        print("Creating the bag of words...\n")
+        from sklearn.feature_extraction.text import CountVectorizer
+
+        # Initialize the "CountVectorizer" object, which is scikit-learn's
+        # bag of words tool.
+        if punctuation == False:
+            vectorizer = CountVectorizer(analyzer=analyzer,
+                                         tokenizer=None,
+                                         preprocessor=None,
+                                         stop_words=None,
+                                         max_features=max_features)
+
+        if punctuation == True:
+            vectorizer = CountVectorizer(analyzer=analyzer,
+                                         tokenizer=my_tokenizer,
+                                         preprocessor=None,
+                                         stop_words=None,
+                                         max_features=max_features)
+
+        # token_pattern = r'(?u)\w')
+        # fit_transform() does two things: first, it fits the model and learns
+        # the vocabulary; second, it transforms our training data into feature
+        # vectors. The input to fit_transform should be a list of strings.
+        train_data_features = vectorizer.fit_transform(self.database)
+
+        joblib.dump(vectorizer, 'bagofwords' + self.DatabaseDir + '.pkl')
+
+        print('dumping the data to hkl format..')
+        hkl.dump(train_data_features, 'DataBaseOneZeros' + self.DatabaseDir + '.hkl', mode='w', compression='gzip')
+        print('done')
+
+        return vectorizer
+
+    def Load_BoW_Model(self, BoWModelDir, DatabaseOneZerosDir):
 
         # input has to be pkl format
         self.vectorizer = joblib.load(BoWModelDir)
 
+        self.dbOZ = hkl.load(DatabaseOneZerosDir).astype('float32')
+
+        return self.vectorizer
+
+    # input: the string to search for in the documents and numberofmatches, the number of best documents to get
+    # output: the best matching documents with their indexes in the searched database, plus the highest accordance as [index, score]
+
     def search(self, string , numberofmatches):
@@ -49,22 +122,23 @@ class FASTsearch(object):
 
         # Convert user input to Zeros and Ones
         user_array = []
         user_array.append(string)
 
         user_input_OnesZeros = self.vectorizer.transform(user_array)
+
         uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)
+
+
         uiOZ = uOZ[np.newaxis, :]
 
         uiOZ = uiOZ.transpose()
 
-
         sess = tf.Session()
 
         with sess.as_default():
             uiOZ_tensor = tf.constant(uiOZ)
 
-            dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(dbOZ)
+            dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)
 
             #uiOZ_tensor_sparse = tf.contrib.layers.dense_to_sparse(uiOZ_tensor, eos_token=0, outputs_collections=None, scope=None)
             #dbOZ_tensor_sparse = tf.contrib.layers.dense_to_sparse(dbOZ_tensor, eos_token=0, outputs_collections=None, scope=None)
@@ -75,7 +149,6 @@ class FASTsearch(object):
 
             wCD = np.array(wordCountDoku.eval())
 
-
         indexedwCD = []
         for n in range(len(wCD)):
             indexedwCD.append([n,wCD[n][0]])
@@ -85,13 +158,183 @@ class FASTsearch(object):
 
         best_n_documents = []
 
-        for n in range(numberofmatches):
-
-            best_n_documents.append(indexedwCD[n][0])
-
-
-        return best_n_documents
+        # score a document would reach if it contained exactly the words of the query
+        eq_number = 0
+        for number in uiOZ:
+            #print(number)
+            eq_number += number ** 2
+
+        #print(eq_number)
+
+        # walk down the sorted scores until the score matches or drops below eq_number
+        n = 0
+        done = False
+        while n < len(indexedwCD) and done == False:
+            if indexedwCD[n][1] == eq_number:
+                best_n_documents = indexedwCD[n][0]
+                done = True
+
+            elif indexedwCD[n][1] < eq_number:
+                best_n_documents = indexedwCD[max(n - 1, 0)][0]
+                done = True
+
+            n += 1
+
+        #for n in range(numberofmatches):
+
+            #best_n_documents.append([indexedwCD[n][0], indexedwCD[n][1]])
+
+        return best_n_documents, indexedwCD[0]
+
+    def search_with_highest_multiplikation_Output(self, string, numberofmatches):
+
+        # Convert user input to Zeros and Ones
+        user_array = []
+        user_array.append(string)
+
+        user_input_OnesZeros = self.vectorizer.transform(user_array)
+
+        uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)
+
+        uiOZ = uOZ[np.newaxis, :]
+
+        uiOZ = uiOZ.transpose()
+
+        sess = tf.Session()
+
+        with sess.as_default():
+
+            uiOZ_tensor = tf.constant(uiOZ)
+
+            dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)
+
+            #uiOZ_tensor_sparse = tf.contrib.layers.dense_to_sparse(uiOZ_tensor, eos_token=0, outputs_collections=None, scope=None)
+            #dbOZ_tensor_sparse = tf.contrib.layers.dense_to_sparse(dbOZ_tensor, eos_token=0, outputs_collections=None, scope=None)
+
+            #wordCountDoku = tf.matmul(uiOZ_tensor, dbOZ_tensor)
+            wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)
+
+            wCD = np.array(wordCountDoku.eval())
+
+        indexedwCD = []
+        for n in range(len(wCD)):
+            indexedwCD.append([n, wCD[n][0]])
+
+        indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)
+
+        best_n_documents = []
+
+        for n in range(numberofmatches):
+            best_n_documents.append(indexedwCD[n][0])
+
+        return best_n_documents, indexedwCD[0]
+
+    def searchPatternMatch(self, string, numberofmatches):
+
+        # Convert user input to Zeros and Ones
+        user_array = []
+        user_array.append(string)
+
+        user_input_OnesZeros = self.vectorizer.transform(user_array)
+
+        uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)
+
+        uiOZ = uOZ[np.newaxis, :]
+        uiOZ = uiOZ.transpose()
+
+        sess = tf.Session()
+
+        with sess.as_default():
+            uiOZ_tensor = tf.constant(uiOZ)
+
+            dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)
+
+            #uiOZ_tensor_sparse = tf.contrib.layers.dense_to_sparse(uiOZ_tensor, eos_token=0, outputs_collections=None, scope=None)
+            #dbOZ_tensor_sparse = tf.contrib.layers.dense_to_sparse(dbOZ_tensor, eos_token=0, outputs_collections=None, scope=None)
+
+            #wordCountDoku = tf.matmul(uiOZ_tensor, dbOZ_tensor)
+            wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)
+
+            wCD = np.array(wordCountDoku.eval())
+
+        indexedwCD = []
+        for n in range(len(wCD)):
+            indexedwCD.append([n, wCD[n][0]])
+
+        # Sort by the biggest matches
+        indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)
+
+        best_n_documents = []
+
+        best_docs_surrounding = []
+
+        # Get the score a document would reach if it contained exactly the same words as one grammar scheme (the query)
+        eq_number = 0
+        for number in uiOZ:
+            #print(number)
+            eq_number += number ** 2
+
+        print(eq_number)
+
+        # Create a new array of the closest grammar schemes, close in match score (not regarding the words themselves)
+        n = 0
+        done = False
+        while n < len(indexedwCD) and done == False:
+            #print('a', indexedwCD)
+            #print('oo', indexedwCD[n])
+            if indexedwCD[n][1] == eq_number:
+                best_docs_surrounding.append(indexedwCD[n][0])
+
+            #if indexedwCD[n][1] < eq_number:
+                #best_docs_surrounding.append(indexedwCD[n][0])
+
+            if indexedwCD[n][1] < eq_number:
+                done = True
+
+            n += 1
+
+        # For the docs in this surrounding, count the matches of word counts per word
+        # (would be much faster when using the sparse class)
+        best_docs_surrounding_new = []
+        for doc in best_docs_surrounding:
+            dok_BoW = self.dbOZ[doc].toarray()[0].astype(np.float32, copy=False)
+            Number_equal_words = 0
+            for n in range(len(uiOZ)):
+                #print(uiOZ[n])
+                #print(dok_BoW[n])
+                #print('dok_BoW', dok_BoW)
+                if uiOZ[n] == dok_BoW[n]:
+                    Number_equal_words += 1
+            best_docs_surrounding_new.append([doc, Number_equal_words])
+
+        # Sort the result again, keeping the original indexes
+        best_n_documents = sorted(best_docs_surrounding_new[::-1], key=lambda tup: tup[1], reverse=True)
+
+        #for n in range(numberofmatches):
+
+            #best_n_documents.append([indexedwCD[n][0], indexedwCD[n][1]])
+
+        return best_n_documents
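
Usage sketch (not part of the diff): a minimal example of how the methods added by this patch are meant to be chained, based only on the code above. The database file name 'mydb.hkl' and the parameter values are hypothetical; the dump file names follow the 'bagofwords<name>.pkl' / 'DataBaseOneZeros<name>.hkl' scheme used by Gen_BoW_Model, and it is assumed that FASTsearch.py is importable as a module.

    from FASTsearch import FASTsearch

    # 'mydb.hkl' is assumed to be a hickle dump of a 2-dimensional list such as
    # [['word1', 'word2'], ['word3'], ...]; the list index is the document id.
    fsearch = FASTsearch('mydb.hkl')

    # One-time step: build the bag-of-words model and the ones/zeros matrix.
    # This writes 'bagofwordsmydb.pkl' and 'DataBaseOneZerosmydb.hkl'
    # ('mydb' is DatabaseDir with its '.hkl' extension stripped).
    fsearch.Gen_BoW_Model(max_features=5000, analyzer='word')

    # Before searching, load the dumped vectorizer and matrix back in.
    fsearch.Load_BoW_Model('bagofwordsmydb.pkl', 'DataBaseOneZerosmydb.hkl')

    # Returns the indexes of the 3 best matching documents plus the top [index, score] pair.
    best_docs, top_match = fsearch.search_with_highest_multiplikation_Output('word1 word2', 3)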