# alpcentaur / fastsearch


# The FASTsearch class. Every database can be represented as lists; the "brain" itself is made up of lists. All documents are accessed at the same moment.
# TODO: GPU multithreading still has to be implemented.


# USAGE: Fit a scikit-learn CountVectorizer on a database of lines or documents.
import _pickle as cPickle
import os

import hickle as hkl
import numpy as np
import scipy as sc
import scipy.sparse
import tensorflow as tf
# NOTE: sklearn.externals.joblib was removed in scikit-learn 0.23; newer
# installations need the standalone "joblib" package instead.
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer

# Define function to convert scipy csr matrix to tf tensor for working on gpu
def convert_sparse_matrix_to_sparse_tensor(X):
    """Convert a SciPy sparse matrix into a ``tf.SparseTensorValue``.

    Args:
        X: Anything ``scipy.sparse.coo_matrix`` accepts (e.g. a CSR matrix).

    Returns:
        A ``tf.SparseTensorValue`` built from an ``(nnz, 2)`` array of
        ``[row, col]`` indices, the matching non-zero values, and the shape
        of ``X``.
    """
    coo = sc.sparse.coo_matrix(X)
    # One [row, col] pair per stored entry. np.column_stack replaces the
    # former np.mat(...).transpose(), since np.mat was removed in NumPy 2.0.
    indices = np.column_stack((coo.row, coo.col))
    return tf.SparseTensorValue(indices, coo.data, coo.shape)


class FASTsearch(object):
    """Bag-of-words search over a pre-vectorized, sparse document database.

    The database is multiplied with the vectorized query using TensorFlow
    (1.x API), which yields one score per database row; the rows with the
    highest scores are returned.
    """

    def __init__(self, DatabaseDir, BoWModelDir):
        """Load the database and the fitted vectorizer from disk.

        Args:
            DatabaseDir: Path to the database file; has to be in hkl format.
                The loaded object is cast to float32.
            BoWModelDir: Path to the fitted vectorizer; has to be in pkl
                format.
        """
        # input has to be hkl format
        self.database = hkl.load(DatabaseDir).astype('float32')

        # input has to be pkl format
        # NOTE(security): joblib.load unpickles the file, so only load model
        # files from a trusted source.
        self.vectorizer = joblib.load(BoWModelDir)

    def search(self, string, numberofmatches):
        """Return the indices of the database rows that best match ``string``.

        Args:
            string: The query text; it is vectorized with ``self.vectorizer``.
            numberofmatches: Maximum number of row indices to return.

        Returns:
            A list of at most ``numberofmatches`` row indices, ordered by
            descending score. Among rows with equal scores the higher index
            comes first. If fewer rows exist than requested, all of them are
            returned.
        """
        # Vectorize the query into a dense float32 column vector so it can be
        # used as the right-hand side of the sparse x dense product below.
        query_counts = self.vectorizer.transform([string])
        query_vector = query_counts.toarray()[0].astype(np.float32, copy=False)
        query_column = query_vector[:, np.newaxis]

        # Build the ops in a private graph so repeated searches do not keep
        # growing the global default graph, and let the context manager
        # close the session when done.
        graph = tf.Graph()
        with graph.as_default():
            query_tensor = tf.constant(query_column)
            # The database comes from self.database (the former code used an
            # undefined name here and raised NameError).
            database_tensor = convert_sparse_matrix_to_sparse_tensor(
                self.database)
            scores_tensor = tf.sparse_tensor_dense_matmul(
                database_tensor, query_tensor)
            with tf.Session(graph=graph) as sess:
                score_matrix = np.array(sess.run(scores_tensor))

        # One score per database row (the product has a single column).
        scores = score_matrix[:, 0].tolist()

        # Walk the indices from last to first and sort stably by descending
        # score: this keeps the established tie order (higher index first).
        ranked = sorted(range(len(scores) - 1, -1, -1),
                        key=scores.__getitem__, reverse=True)

        # Slicing (instead of indexing each position) tolerates requests for
        # more matches than there are rows; a negative request yields [].
        return ranked[:max(numberofmatches, 0)]