alpcentaur
/
fastsearch



								# The new class FASTsearch. Every DB can be represented in Lists. The Brain actually is constituted from lists. Access to all Documents almost the same moment.


								# TODO GPU Multithreading has to be implemented.


								# USAGE: Learn scikit-learn count vectorizer on a database of lines or docs.


								from sklearn.externals import joblib

								from sklearn.feature_extraction.text import CountVectorizer


								import numpy as np

								import scipy as sc


								import tensorflow as tf


								import _pickle as cPickle


								import hickle as hkl


								import os


								# Define function to convert scipy csr matrix to tf tensor for working on gpu

								def convert_sparse_matrix_to_sparse_tensor(X):
								    """Convert a SciPy sparse matrix into a ``tf.SparseTensorValue``.

								    Args:
								        X: Anything ``scipy.sparse.coo_matrix`` accepts.

								    Returns:
								        ``tf.SparseTensorValue(indices, values, dense_shape)`` where
								        ``indices`` holds one ``[row, col]`` pair per stored entry,
								        ``values`` is the COO data array and ``dense_shape`` is the
								        matrix shape.
								    """
								    coo = sc.sparse.coo_matrix(X)
								    # np.column_stack replaces np.mat(...).transpose(): np.mat was removed
								    # in NumPy 2.0, and a plain ndarray carries the same (nnz, 2) pairs.
								    indices = np.column_stack((coo.row, coo.col))
								    return tf.SparseTensorValue(indices, coo.data, coo.shape)


								# FASTsearch is initialized from a database stored as a two-dimensional list, e.g. [['word', 'word2'], [], [], []]; the index of each inner list is the id of that document.

								## Each inner list holds the pieces of one document; they are joined with spaces so that every document is represented by one string.

								# The list must first be saved as an hkl (hickle) dump; FASTsearch then loads the database from that file.


								def my_tokenizer(s):
								    """Split ``s`` on the literal two-character separator backslash + plus.

								    Args:
								        s: The string to tokenize.

								    Returns:
								        The list of substrings between separators; a string without the
								        separator comes back as a single-element list.
								    """
								    # The separator is spelled with an escaped backslash: the former
								    # spelling relied on an invalid escape sequence, which newer Python
								    # versions warn about. The runtime value is unchanged.
								    # NOTE(review): str.split() does not take a regex, so a bare '+' is NOT
								    # a separator here - confirm the stored data really uses backslash-plus.
								    return s.split('\\+')


								class FASTsearch(object):
								    """Bag-of-words document search using a TensorFlow sparse matmul.

								    Judging from the methods below, the intended flow is: construct with
								    the path of an hkl dump, call ``Gen_BoW_Model`` to fit and persist the
								    vectorizer and the document matrix, call ``Load_BoW_Model`` to load
								    both, then query through one of the ``search*`` methods.

								    Attributes:
								        DatabaseDir: The constructor argument minus its last four
								            characters (presumably the ".hkl" suffix).
								        database: One space-joined string per database entry.
								        vectorizer: Fitted vectorizer, set by ``Load_BoW_Model``.
								        dbOZ: float32 document matrix, set by ``Load_BoW_Model``.
								    """

								    def __init__(self, DatabaseDir):
								        """Load the database dump and join every entry into one string.

								        Args:
								            DatabaseDir: Path of the hkl file holding the database; each
								                entry must be an iterable of strings.
								        """
								        # The last four characters are cut off so the remainder can be
								        # reused as a name stem for the files Gen_BoW_Model writes.
								        self.DatabaseDir = DatabaseDir[:-4]

								        # input has to be hkl format
								        self.database = [' '.join(element) for element in hkl.load(DatabaseDir)]

								    def Gen_BoW_Model(self, max_features, analyzer, punctuation = False):
								        """Fit a CountVectorizer on the database and persist the results.

								        Writes the fitted vectorizer to ``'bagofwords' + stem + '.pkl'``
								        and the transformed database to
								        ``'DataBaseOneZeros' + stem + '.hkl'`` (gzip-compressed), where
								        ``stem`` is ``self.DatabaseDir``.

								        Args:
								            max_features: Passed through to ``CountVectorizer``.
								            analyzer: Passed through to ``CountVectorizer``.
								            punctuation: When truthy, ``my_tokenizer`` is used as the
								                tokenizer; otherwise no custom tokenizer is set.

								        Returns:
								            The fitted vectorizer.
								        """
								        print("Creating the bag of words...\n")

								        # A single constructor call: any truthy/falsy flag now selects a
								        # tokenizer, so values such as None no longer leave the
								        # vectorizer undefined.
								        vectorizer = CountVectorizer(analyzer = analyzer,
								                                     tokenizer = my_tokenizer if punctuation else None,
								                                     preprocessor = None,
								                                     stop_words = None,
								                                     max_features = max_features)

								        # fit_transform() learns the vocabulary and turns the database
								        # strings into feature vectors in one step.
								        train_data_features = vectorizer.fit_transform(self.database)

								        joblib.dump(vectorizer, 'bagofwords' + self.DatabaseDir + '.pkl')

								        print('dumping the data to hkl format..')
								        hkl.dump(train_data_features, 'DataBaseOneZeros' + self.DatabaseDir + '.hkl', mode='w', compression='gzip')
								        print('done')

								        return vectorizer

								    def Load_BoW_Model(self, BoWModelDir, DatabaseOneZerosDir):
								        """Load a persisted vectorizer and document matrix.

								        Args:
								            BoWModelDir: Path of the joblib (.pkl) vectorizer file.
								            DatabaseOneZerosDir: Path of the hkl document-matrix file.

								        Returns:
								            The loaded vectorizer (also stored as ``self.vectorizer``).
								        """
								        # NOTE: joblib.load unpickles arbitrary objects - only load
								        # model files from a trusted source.
								        self.vectorizer = joblib.load(BoWModelDir)

								        self.dbOZ = hkl.load(DatabaseOneZerosDir).astype('float32')

								        return self.vectorizer

								    def _query_vector(self, string):
								        """Vectorize ``string`` into a float32 column, one row per feature."""
								        counts = self.vectorizer.transform([string]).toarray()[0]
								        return counts.astype(np.float32, copy=False)[:, np.newaxis]

								    def _score_documents(self, uiOZ):
								        """Return ``dbOZ @ uiOZ`` as an array with one row per document."""
								        # A private graph keeps repeated queries from piling nodes onto
								        # the global default graph, and the session context manager
								        # closes the session when the query is done.
								        with tf.Graph().as_default(), tf.Session() as sess:
								            dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)
								            wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, tf.constant(uiOZ))
								            return np.array(sess.run(wordCountDoku))

								    @staticmethod
								    def _rank_documents(wCD):
								        """Turn a score column into ``[index, score]`` pairs, best first.

								        The pairs are reversed before the stable sort, so documents with
								        equal scores appear in descending index order.
								        """
								        ranked = [[index, row[0]] for index, row in enumerate(wCD)]
								        ranked.reverse()
								        ranked.sort(key=lambda pair: pair[1], reverse=True)
								        return ranked

								    @staticmethod
								    def _self_match_score(uiOZ):
								        """Sum of squared query counts: the query's score against itself."""
								        return np.sum(np.square(uiOZ), axis=0)

								    @staticmethod
								    def _closest_document(ranked, eq_number):
								        """Pick the document whose score is closest to ``eq_number``.

								        The scan starts at the second entry, as it always has, so results
								        (including which of several tied exact matches is returned) stay
								        the same. Running off the end used to raise IndexError; now the
								        last entry, i.e. the lowest score still above ``eq_number``, is
								        returned, or ``[]`` when there are no documents.
								        """
								        for position in range(1, len(ranked)):
								            score = ranked[position][1]
								            if score == eq_number:
								                return ranked[position][0]
								            if score < eq_number:
								                return ranked[position - 1][0]
								        return ranked[-1][0] if ranked else []

								    @staticmethod
								    def _exact_matches(ranked, eq_number):
								        """Indexes of all documents scoring exactly ``eq_number``.

								        ``ranked`` must be sorted best first; the scan stops at the first
								        score below ``eq_number``. The top-ranked entry is examined too.
								        """
								        matches = []
								        for doc, score in ranked:
								            if score < eq_number:
								                break
								            if score == eq_number:
								                matches.append(doc)
								        return matches

								    def search(self, string , numberofmatches):
								        """Find the document whose score is closest to a perfect match.

								        Args:
								            string: Text to search for in the documents.
								            numberofmatches: Unused; kept for interface compatibility.

								        Returns:
								            ``(best, top)``: ``best`` is a document index (or ``[]`` if
								            nothing was selected) and ``top`` is the ``[index, score]``
								            pair with the highest score.

								        Raises:
								            IndexError: If the document matrix has no rows.
								        """
								        uiOZ = self._query_vector(string)
								        indexedwCD = self._rank_documents(self._score_documents(uiOZ))

								        eq_number = self._self_match_score(uiOZ)
								        best_n_documents = self._closest_document(indexedwCD, eq_number)

								        return best_n_documents, indexedwCD[0]

								    def search_with_highest_multiplikation_Output(self, string , numberofmatches):
								        """Return the indexes of the highest-scoring documents.

								        Args:
								            string: Text to search for in the documents.
								            numberofmatches: Maximum number of indexes to return; fewer
								                are returned when the database is smaller.

								        Returns:
								            ``(indexes, top)``: the best document indexes, best first, and
								            the ``[index, score]`` pair with the highest score.

								        Raises:
								            IndexError: If the document matrix has no rows.
								        """
								        uiOZ = self._query_vector(string)
								        indexedwCD = self._rank_documents(self._score_documents(uiOZ))

								        # max(..., 0) keeps a negative count meaning "no documents"
								        # instead of slicing from the end of the list.
								        limit = max(numberofmatches, 0)
								        best_n_documents = [doc for doc, _ in indexedwCD[:limit]]

								        return best_n_documents, indexedwCD[0]

								    def searchPatternMatch(self, string , numberofmatches):
								        """Rank exact-score documents by how many feature counts agree.

								        Documents whose score equals the query's score against itself are
								        collected; for each one, the number of features whose count
								        equals the query's count is computed.

								        Args:
								            string: Text to search for in the documents.
								            numberofmatches: Unused; kept for interface compatibility.

								        Returns:
								            A list of ``[index, equal_count]`` pairs, highest count first.
								        """
								        uiOZ = self._query_vector(string)
								        indexedwCD = self._rank_documents(self._score_documents(uiOZ))

								        # Score a document gets when it holds the same words as the query.
								        eq_number = self._self_match_score(uiOZ)

								        # Debug output kept from the original implementation.
								        print(eq_number)

								        best_docs_surrounding = self._exact_matches(indexedwCD, eq_number)

								        # Count, per candidate, the features whose counts equal the query's.
								        query_counts = uiOZ[:, 0]
								        best_docs_surrounding_new = []
								        for doc in best_docs_surrounding:
								            dok_BoW = self.dbOZ[doc].toarray()[0].astype(np.float32, copy=False)
								            Number_equal_words = int(np.count_nonzero(query_counts == dok_BoW))
								            best_docs_surrounding_new.append([doc, Number_equal_words])

								        # Reverse, then stable-sort by count so ties keep the same order
								        # as before.
								        best_docs_surrounding_new.reverse()
								        best_docs_surrounding_new.sort(key=lambda pair: pair[1], reverse=True)

								        return best_docs_surrounding_new