PluriTon/build/tfgpu-pluriton/FASTsearch.py


# The new class FASTsearch. Every DB can be represented in Lists. The Brain actually is constituted from lists. Access to all Documents almost the same moment.

# TODO GPU Multithreading has to be implemented.


# USAGE: Fit a scikit-learn CountVectorizer on a database of lines or documents.

import joblib
from sklearn.feature_extraction.text import CountVectorizer

import numpy as np
import scipy as sc

import tensorflow.compat.v1 as tf

tf.compat.v1.disable_eager_execution()

import _pickle as cPickle

import hickle as hkl

import os


# Define function to convert scipy csr matrix to tf tensor for working on gpu
def convert_sparse_matrix_to_sparse_tensor(X):
    """Convert a scipy sparse matrix into a ``tf.SparseTensorValue``.

    Args:
        X: Any input accepted by ``scipy.sparse.coo_matrix`` (the class below
            passes the bag-of-words matrix loaded from the hickle dump).

    Returns:
        A ``tf.SparseTensorValue`` whose indices are the (row, col) pairs of
        the non-zero entries, whose values are the corresponding data and
        whose dense shape is the shape of ``X``.
    """
    coo = sc.sparse.coo_matrix(X)
    # One (row, col) pair per stored entry. np.column_stack replaces the
    # former np.mat(...).transpose(): np.mat is no longer available in the
    # NumPy 2.x main namespace. Sparse tensor indices are int64 in TensorFlow,
    # so cast explicitly instead of relying on an implicit conversion.
    indices = np.column_stack((coo.row, coo.col)).astype(np.int64)
    return tf.SparseTensorValue(indices, coo.data, coo.shape)


# The class is initialized from a database given as a 2-dimensional list, e.g. [['word', 'word2'], [], [], []];
# the index of each inner list within the outer list is the id of that document.
## Each inner list is joined with spaces, so every document ends up represented by one string.
# The list must be saved as a hickle (.hkl) dump; the path of that dump is what the class loads.


def my_tokenizer(s):
    """Split ``s`` on the literal two-character separator backslash + plus.

    ``str.split`` does not interpret regular expressions, so the separator is
    the exact text "backslash followed by plus", not a single plus sign.
    NOTE(review): if splitting on a bare '+' was intended, the separator has
    to be changed - confirm against the data before doing so, because models
    fitted with this tokenizer depend on the current behavior.

    Args:
        s: The document string to tokenize.

    Returns:
        The list of substrings between separators (``[s]`` if none occurs).
    """
    # Raw string: same separator as before, without the invalid escape
    # sequence that newer Python versions warn about.
    return s.split(r'\+')

class FASTsearch(object):
    """Bag-of-words document search scored with a sparse matrix product.

    Every document of the database is turned into a row of token counts by a
    scikit-learn CountVectorizer. A query is vectorized the same way and
    multiplied with the database matrix in TensorFlow; the product is one
    score per document.

    Typical use: construct with the path of the hickle dump, call
    ``Gen_BoW_Model`` once to fit and store the model, then
    ``Load_BoW_Model`` followed by one of the search methods.
    """

    def __init__(self, DatabaseDir):
        """Load the document database from a hickle dump.

        Args:
            DatabaseDir: Path of the hickle dump. It is expected to contain an
                iterable of token lists; each list is joined with single
                spaces into one document string.
        """
        # The last four characters are dropped to obtain the name used for the
        # generated model files (assumes a 4-character suffix such as ".hkl").
        self.DatabaseDir = DatabaseDir[:-4]

        # input has to be hkl format
        self.database = [' '.join(element) for element in hkl.load(DatabaseDir)]

    def Gen_BoW_Model(self, max_features, analyzer, punctuation = False):
        """Fit a CountVectorizer on the database and store model and matrix.

        Side effects: writes ``'bagofwords' + <name> + '.pkl'`` (the fitted
        vectorizer, via joblib) and ``'DataBaseOneZeros' + <name> + '.hkl'``
        (the count matrix, via hickle), where ``<name>`` is the constructor
        path without its last four characters.

        Args:
            max_features: Forwarded to ``CountVectorizer(max_features=...)``.
            analyzer: Forwarded to ``CountVectorizer(analyzer=...)``.
            punctuation: When truthy, ``my_tokenizer`` is used as tokenizer;
                otherwise the vectorizer's default tokenization is used.

        Returns:
            The fitted CountVectorizer. Note that it is not stored on the
            instance; use ``Load_BoW_Model`` for that.
        """
        print("Creating the bag of words...\n")
        from sklearn.feature_extraction.text import CountVectorizer

        # Selecting the tokenizer in one expression guarantees that
        # ``vectorizer`` is always bound, also for non-bool flag values.
        vectorizer = CountVectorizer(analyzer = analyzer,
                                     tokenizer = my_tokenizer if punctuation else None,
                                     preprocessor = None,
                                     stop_words = None,
                                     max_features = max_features)

        # fit_transform() learns the vocabulary and then transforms the
        # documents into count vectors; its input is a list of strings.
        train_data_features = vectorizer.fit_transform(self.database)

        joblib.dump(vectorizer, 'bagofwords' + self.DatabaseDir + '.pkl')

        print('dumping the data to hkl format..')
        hkl.dump(train_data_features, 'DataBaseOneZeros' + self.DatabaseDir + '.hkl', mode='w', compression='gzip')
        print('done')

        return vectorizer

    def Load_BoW_Model(self, BoWModelDir, DatabaseOneZerosDir):
        """Load a stored vectorizer and count matrix onto the instance.

        Args:
            BoWModelDir: Path of the joblib dump of the vectorizer (pkl).
            DatabaseOneZerosDir: Path of the hickle dump of the count matrix.

        Returns:
            The loaded vectorizer (also stored as ``self.vectorizer``).
        """
        # input has to be pkl format
        self.vectorizer = joblib.load(BoWModelDir)

        # presumably a scipy sparse matrix: searchPatternMatch indexes rows
        # and calls .toarray() on them - verify against the stored dump.
        self.dbOZ = hkl.load(DatabaseOneZerosDir).astype('float32')

        return self.vectorizer

    def _score_documents(self, string):
        """Return ``(query_counts, scores)`` for the query ``string``.

        ``query_counts`` is the 1-D float32 count vector of the query;
        ``scores`` is the array produced by multiplying the database matrix
        with that vector (one row per database row).
        """
        query_counts = self.vectorizer.transform([string]).toarray()[0].astype(np.float32, copy=False)
        query_column = query_counts[:, np.newaxis]

        # A fresh graph per call keeps the default graph from accumulating a
        # copy of the database on every search; the session is closed by the
        # ``with`` block instead of being leaked.
        graph = tf.Graph()
        with graph.as_default(), tf.device('/gpu:0'):
            query_tensor = tf.constant(query_column)
            db_sparse = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)
            product = tf.sparse_tensor_dense_matmul(db_sparse, query_tensor)

        with tf.Session(graph=graph) as sess:
            scores = np.array(sess.run(product))

        return query_counts, scores

    @staticmethod
    def _rank_scores(scores):
        """Return ``[index, score]`` pairs sorted by descending score.

        The list is reversed before the stable sort, so among equal scores
        the higher index comes first.
        """
        indexed = [[n, row[0]] for n, row in enumerate(scores)]
        return sorted(indexed[::-1], key=lambda pair: pair[1], reverse=True)

    @staticmethod
    def _self_match_score(query_counts):
        """Return the product of the query count vector with itself."""
        return float(np.dot(query_counts, query_counts))

    @staticmethod
    def _closest_to_exact(ranked, eq_number):
        """Pick the document whose score is closest to ``eq_number`` from above.

        Walks ``ranked`` (descending scores) and returns the index of the
        first document scoring exactly ``eq_number``. If the scores drop below
        ``eq_number`` first, the preceding document is returned (the top one
        if there is no preceding document). Returns ``[]`` when every score is
        above ``eq_number`` or ``ranked`` is empty.
        """
        for position, (doc, score) in enumerate(ranked):
            if score == eq_number:
                return doc
            if score < eq_number:
                # max() keeps position 0 from wrapping around to the last item.
                return ranked[max(position - 1, 0)][0]
        return []

    @staticmethod
    def _exact_matches(ranked, eq_number):
        """Return the indexes of all documents scoring exactly ``eq_number``.

        Scanning stops at the first score below ``eq_number`` because
        ``ranked`` is sorted by descending score.
        """
        matches = []
        for doc, score in ranked:
            if score < eq_number:
                break
            if score == eq_number:
                matches.append(doc)
        return matches

    def search(self, string , numberofmatches):
        """Find the document whose score is closest to a full match.

        Args:
            string: The query to search for in the documents.
            numberofmatches: Unused; kept for interface compatibility.

        Returns:
            A tuple ``(best, top)``. ``best`` is a document index (or ``[]``
            if none qualifies, see ``_closest_to_exact``); ``top`` is the
            ``[index, score]`` pair with the highest score.

        Raises:
            IndexError: If scoring yields no documents at all.
        """
        query_counts, scores = self._score_documents(string)
        ranked = self._rank_scores(scores)
        eq_number = self._self_match_score(query_counts)

        # The scan starts at the top-ranked document and cannot run past the
        # end of the ranking.
        best_n_documents = self._closest_to_exact(ranked, eq_number)

        return best_n_documents, ranked[0]

    def search_with_highest_multiplikation_Output(self, string , numberofmatches):
        """Return the indexes of the highest scoring documents.

        Args:
            string: The query to search for in the documents.
            numberofmatches: Number of document indexes to return; when the
                database holds fewer documents, all of them are returned.

        Returns:
            A tuple ``(best, top)``. ``best`` is the list of document indexes
            by descending score; ``top`` is the ``[index, score]`` pair with
            the highest score.

        Raises:
            IndexError: If scoring yields no documents at all.
        """
        _, scores = self._score_documents(string)
        ranked = self._rank_scores(scores)

        # Slicing clamps to the number of available documents; max() keeps a
        # negative count from being read as "all but the last ones".
        best_n_documents = [doc for doc, _ in ranked[:max(numberofmatches, 0)]]

        return best_n_documents, ranked[0]

    def searchPatternMatch(self, string , numberofmatches):
        """Rank the exactly matching documents by position-wise agreement.

        Documents whose score equals the query's self-match score are
        collected; for each of them the number of vocabulary positions whose
        count equals the query's count is determined.

        Args:
            string: The query to search for in the documents.
            numberofmatches: Unused; kept for interface compatibility.

        Returns:
            A list of ``[index, equal_positions]`` pairs sorted by descending
            ``equal_positions`` (empty if no document matches exactly).
        """
        query_counts, scores = self._score_documents(string)

        # Sort the biggest matches
        ranked = self._rank_scores(scores)

        # Score a document gets when it shares the query's counts on every
        # token the query contains.
        eq_number = self._self_match_score(query_counts)

        print(eq_number)

        best_docs_surrounding = self._exact_matches(ranked, eq_number)

        # Count, per candidate, the vocabulary positions with identical counts.
        best_docs_surrounding_new = []
        for doc in best_docs_surrounding:
            dok_BoW = self.dbOZ[doc].toarray()[0].astype(np.float32, copy=False)
            Number_equal_words = int(np.count_nonzero(query_counts == dok_BoW))
            best_docs_surrounding_new.append([doc , Number_equal_words])

        # Sort the result again with the original indexes
        best_n_documents = sorted(best_docs_surrounding_new[::-1], key=lambda tup: tup[1], reverse=True)

        return best_n_documents
first commit 2021-09-24 15:13:03 +02:00
			`# The new class FASTsearch. Every DB can be represented in Lists. The Brain actually is constituted from lists. Access to all Documents almost the same moment.`

			`# TODO GPU Multithreading has to be implemented.`



			`# USAGE: Learn scikit-learn count vectorizer on a database of lines or docs.`

			`import joblib`
			`from sklearn.feature_extraction.text import CountVectorizer`

			`import numpy as np`
			`import scipy as sc`

docker+rust 2021-10-18 18:22:03 +02:00			`import tensorflow.compat.v1 as tf`
first commit 2021-09-24 15:13:03 +02:00
docker+rust 2021-10-18 18:22:03 +02:00			`tf.compat.v1.disable_eager_execution()`
first commit 2021-09-24 15:13:03 +02:00
			`import _pickle as cPickle`

			`import hickle as hkl`

			`import os`


			`# Define function to convert scipy csr matrix to tf tensor for working on gpu`
			`def convert_sparse_matrix_to_sparse_tensor(X):`
			`coo = sc.sparse.coo_matrix(X)`
			`indices = np.mat([coo.row, coo.col]).transpose()`
			`return tf.SparseTensorValue(indices, coo.data, coo.shape)`




			`# The whole class is initialized with input of the database in [['word','word2'],[],[],[]] List format, 2 dimensional, the index of the list in the matrix defines its id`
			`## in every list element of the input, each document is represented by one string`
			`# This list must be saved as a hkl dump and then loaded into the database.`


			`def my_tokenizer(s):`
			`return s.split('\+')`

			`class FASTsearch(object):`

			`def __init__(self, DatabaseDir):`

			`self.DatabaseDir = DatabaseDir[:-4]`

			`database = []`
			`hkl_load = hkl.load(DatabaseDir)`

			`for element in hkl_load:`
			`#print('element',element)`
			`#print('joined element', ' '.join(element))`
			`database.append(' '.join(element))`


			`# input has to be hkl format`
			`self.database = database`




			`def Gen_BoW_Model(self, max_features, analyzer, punctuation = False):`

			`print("Creating the bag of words...\n")`
			`from sklearn.feature_extraction.text import CountVectorizer`

			`# Initialize the "CountVectorizer" object, which is scikit-learn's`
			`# bag of words tool.`
			`if punctuation == False:`
			`vectorizer = CountVectorizer(analyzer = analyzer, \`
			`tokenizer = None, \`
			`preprocessor = None, \`
			`stop_words = None, \`
			`max_features = max_features)`

			`if punctuation == True:`
			`vectorizer = CountVectorizer(analyzer = analyzer, \`
			`tokenizer = my_tokenizer, \`
			`preprocessor = None, \`
			`stop_words = None, \`
			`max_features = max_features)`


			`# token_pattern = r'(?u)\w')`
			`# fit_transform() does two functions: First, it fits the model`
			`# and learns the vocabulary; second, it transforms our training data`
			`# into feature vectors. The input to fit_transform should be a list of`
			`# strings.`
			`train_data_features = vectorizer.fit_transform(self.database)`

			`joblib.dump(vectorizer, 'bagofwords' + self.DatabaseDir + '.pkl')`

			`print('dumping the data to hkl format..')`
			`hkl.dump(train_data_features, 'DataBaseOneZeros' + self.DatabaseDir + '.hkl', mode='w', compression='gzip')`
			`print('done')`


			`return vectorizer`

			`def Load_BoW_Model(self, BoWModelDir, DatabaseOneZerosDir):`

			`# input has to be pkl format`
			`self.vectorizer = joblib.load(BoWModelDir)`

			`self.dbOZ = hkl.load(DatabaseOneZerosDir).astype('float32')`


			`return self.vectorizer`


			`# input: string to search for in the documents, the numberofmatches to get the best n documents`
			`# output the numberofmatches documents with their indexes on the database which is searched, the highest accordance number plus index [index, number]`

			`def search(self, string , numberofmatches):`


			`numberofmatches = numberofmatches`


			`# Convert user input to Zeros and Ones`
			`user_array = []`
			`user_array.append(string)`

			`user_input_OnesZeros = self.vectorizer.transform(user_array)`

			`uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)`


			`uiOZ = uOZ[np.newaxis, :]`

			`uiOZ = uiOZ.transpose()`

			`sess = tf.Session()`
			`with tf.device('/gpu:0'):`
			`with sess.as_default():`

			`uiOZ_tensor = tf.constant(uiOZ)`

			`dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)`

			`#uiOZ_tensor_sparse =tf.contrib.layers.dense_to_sparse(uiOZ_tensor, eos_token=0, outputs_collections=None, scope=None )`
			`#dbOZ_tensor_sparse =tf.contrib.layers.dense_to_sparse(dbOZ_tensor, eos_token=0, outputs_collections=None, scope=None )`


			`#wordCountDoku = tf.matmul(uiOZ_tensor, dbOZ_tensor)`
			`wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)`

			`wCD = np.array(wordCountDoku.eval())`

			`indexedwCD = []`
			`for n in range(len(wCD)):`
			`indexedwCD.append([n,wCD[n][0]])`


			`indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)`

			`best_n_documents = []`

			`eq_number = 0`
			`for number in uiOZ:`
			`#print(number)`
			`eq_number += number ** 2`

			`#print(eq_number)`

			`n = 0`
			`done = False`
			`while n < len(indexedwCD) and done == False:`
			`n += 1`
			`if indexedwCD[n][1] == eq_number:`
			`best_n_documents = indexedwCD[n][0]`
			`done = True`

			`if indexedwCD[n][1] < eq_number:`
			`best_n_documents = indexedwCD[n - 1][0]`
			`done = True`


			`#for n in range(numberofmatches):`

			`#best_n_documents.append([indexedwCD[n][0], indexedwCD[n][1]])`


			`return best_n_documents, indexedwCD[0]`

			`def search_with_highest_multiplikation_Output(self, string , numberofmatches):`




			`numberofmatches = numberofmatches`


			`# Convert user input to Zeros and Ones`
			`user_array = []`
			`user_array.append(string)`

			`user_input_OnesZeros = self.vectorizer.transform(user_array)`

			`uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)`


			`uiOZ = uOZ[np.newaxis, :]`

			`uiOZ = uiOZ.transpose()`

			`sess = tf.Session()`
			`with tf.device('/gpu:0'):`
			`with sess.as_default():`

			`uiOZ_tensor = tf.constant(uiOZ)`

			`dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)`

			`#uiOZ_tensor_sparse =tf.contrib.layers.dense_to_sparse(uiOZ_tensor, eos_token=0, outputs_collections=None, scope=None )`
			`#dbOZ_tensor_sparse =tf.contrib.layers.dense_to_sparse(dbOZ_tensor, eos_token=0, outputs_collections=None, scope=None )`


			`#wordCountDoku = tf.matmul(uiOZ_tensor, dbOZ_tensor)`
			`wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)`

			`wCD = np.array(wordCountDoku.eval())`

			`indexedwCD = []`
			`for n in range(len(wCD)):`
			`indexedwCD.append([n,wCD[n][0]])`


			`indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)`

			`best_n_documents = []`

			`for n in range(numberofmatches):`
			`best_n_documents.append(indexedwCD[n][0])`

			`return best_n_documents, indexedwCD[0]`


			`def searchPatternMatch(self, string , numberofmatches):`


			`numberofmatches = numberofmatches`


			`# Convert user input to Zeros and Ones`
			`user_array = []`
			`user_array.append(string)`

			`user_input_OnesZeros = self.vectorizer.transform(user_array)`

			`uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)`


			`uiOZ = uOZ[np.newaxis, :]`

			`uiOZ = uiOZ.transpose()`

			`sess = tf.Session()`
			`with tf.device('/gpu:0'):`
			`with sess.as_default():`

			`uiOZ_tensor = tf.constant(uiOZ)`

			`dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)`

			`#uiOZ_tensor_sparse =tf.contrib.layers.dense_to_sparse(uiOZ_tensor, eos_token=0, outputs_collections=None, scope=None )`
			`#dbOZ_tensor_sparse =tf.contrib.layers.dense_to_sparse(dbOZ_tensor, eos_token=0, outputs_collections=None, scope=None )`


			`#wordCountDoku = tf.matmul(uiOZ_tensor, dbOZ_tensor)`
			`wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)`

			`wCD = np.array(wordCountDoku.eval())`

			`indexedwCD = []`
			`for n in range(len(wCD)):`
			`indexedwCD.append([n,wCD[n][0]])`

			`# Sort the biggest matches`
			`indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)`

			`best_n_documents = []`

			`best_docs_surrounding = []`


			`# Get the number which is result when same words would be in the document as in one grammar scheme`
			`eq_number = 0`
			`for number in uiOZ:`
			`#print(number)`
			`eq_number += number ** 2`

			`print(eq_number)`

			`# Create new array of closest grammar schemes, I have chosen around 3 (in the matchnumber, not regarding words or so)`
			`n = 0`
			`done = False`
			`while n < len(indexedwCD) and done == False:`
			`n += 1`
			`#print('a',indexedwCD)`
			`#print('oo', indexedwCD[n])`
			`if indexedwCD[n][1] == eq_number:`
			`best_docs_surrounding.append(indexedwCD[n][0])`

			`#if indexedwCD[n][1] < eq_number:`
			`#best_docs_surrounding.append(indexedwCD[n][0])`

			`if indexedwCD[n][1] < eq_number :`
			`done = True`


			`# Count for these docs in surrounding the matches of wordnumbers per word`
			`# would be much faster when using the sparse class`

			`best_docs_surrounding_new = []`
			`for doc in best_docs_surrounding:`
			`dok_BoW = self.dbOZ[doc].toarray()[0].astype(np.float32, copy=False)`
			`Number_equal_words = 0`
			`for n in range(len(uiOZ)):`
			`#print(uiOZ[n])`
			`#print(dok_BoW[n])`
			`#print('dok_BoW',dok_BoW)`
			`if uiOZ[n] == dok_BoW[n]:`
			`Number_equal_words += 1`
			`best_docs_surrounding_new.append([doc , Number_equal_words])`

			`# Sort the result again with the original indexes`
			`best_n_documents = sorted(best_docs_surrounding_new[::-1], key=lambda tup: tup[1], reverse=True)`



			`#for n in range(numberofmatches):`

			`#best_n_documents.append([indexedwCD[n][0], indexedwCD[n][1]])`


			`return best_n_documents`