First commit of FASTsearch
commit da193f8889
1 changed file with 97 additions and 0 deletions

FASTsearch.py (new file)
@@ -0,0 +1,97 @@
# FASTsearch: every DB can be represented as lists. The brain itself is
# constituted from lists, giving access to all documents at the same moment.

# TODO: GPU multithreading has to be implemented.

# USAGE: learn a scikit-learn CountVectorizer on a database of lines or docs
# (a sketch of building these inputs follows at the bottom of this file).

from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer

import numpy as np
import scipy as sc
import tensorflow as tf
import _pickle as cPickle
import hickle as hkl
import os


# Convert a scipy CSR matrix to a tf.SparseTensor so the matmul can run on the GPU
def convert_sparse_matrix_to_sparse_tensor(X):
    coo = sc.sparse.coo_matrix(X)
    # Stack row/column indices into the (nnz, 2) layout that tf.SparseTensor expects
    indices = np.vstack([coo.row, coo.col]).transpose()
    return tf.SparseTensor(indices, coo.data, coo.shape)
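
# For example (illustrative): the dense matrix [[0, 1], [2, 0]] becomes a
# sparse tensor with indices [[0, 1], [1, 0]], values [1, 2] and
# dense_shape (2, 2).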


class FASTsearch(object):

    def __init__(self, DatabaseDir, BoWModelDir):
        # input has to be hkl format
        self.database = hkl.load(DatabaseDir).astype('float32')

        # input has to be pkl format
        self.vectorizer = joblib.load(BoWModelDir)

    def search(self, string, numberofmatches):
        # Convert the user input to its bag-of-words zeros-and-ones representation
        user_array = []
        user_array.append(string)

        user_input_OnesZeros = self.vectorizer.transform(user_array)
        uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)

        # Shape the query as a column vector for the matrix multiplication
        uiOZ = uOZ[np.newaxis, :]
        uiOZ = uiOZ.transpose()

        sess = tf.Session()

        with sess.as_default():
            uiOZ_tensor = tf.constant(uiOZ)
            dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.database)

            #uiOZ_tensor_sparse = tf.contrib.layers.dense_to_sparse(uiOZ_tensor, eos_token=0, outputs_collections=None, scope=None)
            #dbOZ_tensor_sparse = tf.contrib.layers.dense_to_sparse(dbOZ_tensor, eos_token=0, outputs_collections=None, scope=None)

            # Per-document word counts: sparse database matrix times dense query vector
            #wordCountDoku = tf.matmul(uiOZ_tensor, dbOZ_tensor)
            wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)

            wCD = np.array(wordCountDoku.eval())

        # Pair each document index with its match count
        indexedwCD = []
        for n in range(len(wCD)):
            indexedwCD.append([n, wCD[n][0]])

        # Reverse first so that, on equal counts, later documents rank first,
        # then sort by match count in descending order
        indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)

        # Collect the indices of the n best-matching documents
        best_n_documents = []
        for n in range(numberofmatches):
            best_n_documents.append(indexedwCD[n][0])

        return best_n_documents
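

# A minimal usage sketch, assuming the two inputs FASTsearch expects: a
# CountVectorizer pickled with joblib and a dense bag-of-words matrix dumped
# with hickle. The file names ('bow_model.pkl', 'database.hkl') and the toy
# corpus below are illustrative assumptions, not part of the original commit.
if __name__ == '__main__':
    corpus = ['the cat sat on the mat',
              'dogs chase cats',
              'matrix multiplication runs fast on a gpu']

    # Learn the bag-of-words vocabulary and persist it as pkl
    vectorizer = CountVectorizer()
    bow_matrix = vectorizer.fit_transform(corpus)
    joblib.dump(vectorizer, 'bow_model.pkl')

    # Persist the dense document-term matrix as hkl
    hkl.dump(bow_matrix.toarray(), 'database.hkl')

    # Search returns the indices of the best-matching documents
    fastsearch = FASTsearch('database.hkl', 'bow_model.pkl')
    print(fastsearch.search('cat on the mat', 2))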