# The FASTsearch class: every DB can be represented in lists (the brain itself
# is constituted from lists), giving access to all documents at almost the
# same moment.
# TODO: GPU multithreading has to be implemented.
# USAGE: learn a scikit-learn count vectorizer on a database of lines or docs.

from sklearn.externals import joblib  # in newer scikit-learn: import joblib
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import scipy as sc
import tensorflow as tf
import _pickle as cPickle
import hickle as hkl
import os


# Convert a scipy csr matrix to a tf sparse tensor so the matrix
# multiplication can run on the GPU.
def convert_sparse_matrix_to_sparse_tensor(X):
    coo = sc.sparse.coo_matrix(X)
    indices = np.mat([coo.row, coo.col]).transpose()
    return tf.SparseTensorValue(indices, coo.data, coo.shape)


# The class is initialized with the database in 2-dimensional list format,
# e.g. [['word', 'word2'], [], [], []]: the index of each inner list defines
# the document id, and every element of an inner list is one string of that
# document. This list must be saved as a hkl dump, which is then loaded as
# the database.

def my_tokenizer(s):
    # split tokens on '+' (used when punctuation is kept)
    return s.split('+')
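# The expected hkl dump can be produced in many ways; the sketch below is an
# illustration only. build_database_hkl is a hypothetical helper (not part of
# the original API) that tokenizes raw documents by whitespace into the
# 2-dimensional list format described above and dumps it with hickle,
# assuming hickle can serialize the nested list of strings.
def build_database_hkl(documents, dump_path):
    # documents: list of raw strings, one per document
    # dump_path: target file, e.g. 'MyDatabase.hkl'
    database = [doc.split() for doc in documents]  # [['word', 'word2'], ...]
    hkl.dump(database, dump_path, mode='w')
    return database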
class FASTsearch(object):

    def __init__(self, DatabaseDir):
        self.DatabaseDir = DatabaseDir[:-4]

        # input has to be in hkl format
        database = []
        hkl_load = hkl.load(DatabaseDir)
        for element in hkl_load:
            #print('element', element)
            #print('joined element', ' '.join(element))
            database.append(' '.join(element))

        self.database = database

    def Gen_BoW_Model(self, max_features, analyzer, punctuation=False):
        print("Creating the bag of words...\n")

        # Initialize the "CountVectorizer" object, which is scikit-learn's
        # bag of words tool.
        if punctuation == False:
            vectorizer = CountVectorizer(analyzer=analyzer,
                                         tokenizer=None,
                                         preprocessor=None,
                                         stop_words=None,
                                         max_features=max_features)
        else:
            vectorizer = CountVectorizer(analyzer=analyzer,
                                         tokenizer=my_tokenizer,
                                         preprocessor=None,
                                         stop_words=None,
                                         max_features=max_features)
            # token_pattern = r'(?u)\w'

        # fit_transform() does two things: first, it fits the model and learns
        # the vocabulary; second, it transforms the training data into feature
        # vectors. The input to fit_transform should be a list of strings.
        train_data_features = vectorizer.fit_transform(self.database)

        joblib.dump(vectorizer, 'bagofwords' + self.DatabaseDir + '.pkl')
        print('dumping the data to hkl format..')
        hkl.dump(train_data_features, 'DataBaseOneZeros' + self.DatabaseDir + '.hkl', mode='w', compression='gzip')
        print('done')

        return vectorizer

    def Load_BoW_Model(self, BoWModelDir, DatabaseOneZerosDir):
        # input has to be in pkl format
        self.vectorizer = joblib.load(BoWModelDir)
        self.dbOZ = hkl.load(DatabaseOneZerosDir).astype('float32')

        return self.vectorizer

    # input: the string to search for in the documents, and numberofmatches
    #        (currently unused here)
    # output: the index of the best matching document in the searched database,
    #         plus the entry with the highest accordance as [index, number]
    def search(self, string, numberofmatches):

        # Convert the user input to its bag-of-words representation
        user_array = []
        user_array.append(string)
        user_input_OnesZeros = self.vectorizer.transform(user_array)
        uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)
        uiOZ = uOZ[np.newaxis, :]
        uiOZ = uiOZ.transpose()

        sess = tf.Session()
        with sess.as_default():
            uiOZ_tensor = tf.constant(uiOZ)
            dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)

            #uiOZ_tensor_sparse = tf.contrib.layers.dense_to_sparse(uiOZ_tensor, eos_token=0, outputs_collections=None, scope=None)
            #dbOZ_tensor_sparse = tf.contrib.layers.dense_to_sparse(dbOZ_tensor, eos_token=0, outputs_collections=None, scope=None)

            #wordCountDoku = tf.matmul(uiOZ_tensor, dbOZ_tensor)
            wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)
            wCD = np.array(wordCountDoku.eval())

        indexedwCD = []
        for n in range(len(wCD)):
            indexedwCD.append([n, wCD[n][0]])

        indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)

        best_n_documents = []

        # eq_number is the score a document reaches when it contains exactly
        # the same word counts as the query (sum of squared query counts)
        eq_number = 0
        for number in uiOZ:
            #print(number)
            eq_number += number ** 2
        #print(eq_number)

        n = 0
        done = False
        while n < len(indexedwCD) and done == False:
            if indexedwCD[n][1] == eq_number:
                best_n_documents = indexedwCD[n][0]
                done = True
            elif indexedwCD[n][1] < eq_number:
                # fall back to the previous (closest higher-scoring) entry
                best_n_documents = indexedwCD[max(n - 1, 0)][0]
                done = True
            n += 1

        #for n in range(numberofmatches):
            #best_n_documents.append([indexedwCD[n][0], indexedwCD[n][1]])

        return best_n_documents, indexedwCD[0]

    # input: the string to search for and numberofmatches, the number of best documents to return
    # output: the numberofmatches best document indexes, plus the entry with the
    #         highest accordance as [index, number]
    def search_with_highest_multiplikation_Output(self, string, numberofmatches):

        # Convert the user input to its bag-of-words representation
        user_array = []
        user_array.append(string)
        user_input_OnesZeros = self.vectorizer.transform(user_array)
        uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)
        uiOZ = uOZ[np.newaxis, :]
        uiOZ = uiOZ.transpose()

        sess = tf.Session()
        with sess.as_default():
            uiOZ_tensor = tf.constant(uiOZ)
            dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)

            #wordCountDoku = tf.matmul(uiOZ_tensor, dbOZ_tensor)
            wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)
            wCD = np.array(wordCountDoku.eval())

        indexedwCD = []
        for n in range(len(wCD)):
            indexedwCD.append([n, wCD[n][0]])

        indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)

        best_n_documents = []
        for n in range(numberofmatches):
            best_n_documents.append(indexedwCD[n][0])

        return best_n_documents, indexedwCD[0]
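    # input: the string to search for and numberofmatches (currently unused here)
    # output: a list of [document index, number of vocabulary entries whose counts
    #         are identical in query and document], sorted so that the closest
    #         pattern matches come first. Only documents whose multiplication
    #         score equals the query's eq_number are considered.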
    def searchPatternMatch(self, string, numberofmatches):

        # Convert the user input to its bag-of-words representation
        user_array = []
        user_array.append(string)
        user_input_OnesZeros = self.vectorizer.transform(user_array)
        uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)
        uiOZ = uOZ[np.newaxis, :]
        uiOZ = uiOZ.transpose()

        sess = tf.Session()
        with sess.as_default():
            uiOZ_tensor = tf.constant(uiOZ)
            dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)

            #wordCountDoku = tf.matmul(uiOZ_tensor, dbOZ_tensor)
            wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)
            wCD = np.array(wordCountDoku.eval())

        indexedwCD = []
        for n in range(len(wCD)):
            indexedwCD.append([n, wCD[n][0]])

        # Sort by the biggest matches
        indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)

        best_n_documents = []
        best_docs_surrounding = []

        # eq_number is the score that results when the same words are in the
        # document as in one grammar scheme (sum of squared query counts)
        eq_number = 0
        for number in uiOZ:
            #print(number)
            eq_number += number ** 2
        print(eq_number)

        # Collect the closest grammar schemes (in the match number, not
        # regarding individual words): every document whose score equals
        # eq_number, stopping once the sorted scores drop below eq_number
        n = 0
        done = False
        while n < len(indexedwCD) and done == False:
            #print('a', indexedwCD)
            #print('oo', indexedwCD[n])
            if indexedwCD[n][1] == eq_number:
                best_docs_surrounding.append(indexedwCD[n][0])
            #if indexedwCD[n][1] < eq_number:
                #best_docs_surrounding.append(indexedwCD[n][0])
            if indexedwCD[n][1] < eq_number:
                done = True
            n += 1

        # For the documents in this surrounding, count per word how many counts
        # match exactly; this would be much faster using the sparse class
        best_docs_surrounding_new = []
        for doc in best_docs_surrounding:
            dok_BoW = self.dbOZ[doc].toarray()[0].astype(np.float32, copy=False)
            Number_equal_words = 0
            for n in range(len(uiOZ)):
                #print(uiOZ[n])
                #print(dok_BoW[n])
                #print('dok_BoW', dok_BoW)
                if uiOZ[n] == dok_BoW[n]:
                    Number_equal_words += 1
            best_docs_surrounding_new.append([doc, Number_equal_words])

        # Sort the result again, keeping the original indexes
        best_n_documents = sorted(best_docs_surrounding_new[::-1], key=lambda tup: tup[1], reverse=True)

        #for n in range(numberofmatches):
            #best_n_documents.append([indexedwCD[n][0], indexedwCD[n][1]])

        return best_n_documents
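# A minimal usage sketch (an assumption for illustration, not part of the
# original module). The file names 'MyDatabase.hkl', 'bagofwordsMyDatabase.pkl'
# and 'DataBaseOneZerosMyDatabase.hkl' are placeholders; the latter two follow
# the naming scheme that Gen_BoW_Model above uses when dumping its outputs.
if __name__ == '__main__':
    fs = FASTsearch('MyDatabase.hkl')

    # One-off step: learn and dump the bag-of-words model for this database.
    fs.Gen_BoW_Model(max_features=5000, analyzer='word')

    # Load the dumped vectorizer and the sparse count matrix, then search.
    fs.Load_BoW_Model('bagofwordsMyDatabase.pkl', 'DataBaseOneZerosMyDatabase.hkl')
    best_doc, top_entry = fs.search('example query words', 1)
    print('best document index:', best_doc, 'top [index, score]:', top_entry)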