alpcentaur
/
fastsearch



								# The FASTsearch class. Every database can be represented as lists (the "brain" itself is built from lists), which gives access to all documents at the same moment.


								# TODO GPU Multithreading has to be implemented.


								# USAGE: Fit a scikit-learn CountVectorizer on a database of lines or documents.


								from sklearn.externals import joblib

								from sklearn.feature_extraction.text import CountVectorizer


								import numpy as np

								import scipy as sc


								import tensorflow as tf


								import _pickle as cPickle


								import hickle as hkl


								import os


								# Define function to convert scipy csr matrix to tf tensor for working on gpu

								def convert_sparse_matrix_to_sparse_tensor(X):
								    """Convert a matrix into a ``tf.SparseTensorValue``.

								    Args:
								        X: Anything accepted by ``scipy.sparse.coo_matrix`` (for example
								            a SciPy CSR matrix or a dense 2-D array).

								    Returns:
								        A ``tf.SparseTensorValue`` whose ``indices`` is an ``(nnz, 2)``
								        int64 array of ``[row, col]`` pairs, whose ``values`` is the COO
								        data array and whose ``dense_shape`` is the matrix shape.
								    """
								    coo = sc.sparse.coo_matrix(X)

								    # Build the index pairs as a plain ndarray: ``np.mat`` is gone in
								    # NumPy 2.0 and ``np.matrix`` is discouraged.  TensorFlow sparse
								    # indices are int64, so cast explicitly instead of relying on an
								    # implicit conversion from SciPy's (usually int32) index arrays.
								    indices = np.column_stack((coo.row, coo.col)).astype(np.int64)

								    return tf.SparseTensorValue(indices, coo.data, coo.shape)


								class FASTsearch(object):
								    """Bag-of-words search over a pre-vectorized database.

								    The database matrix is multiplied with the vectorized query, giving
								    one score per database row; rows are then ranked by that score.
								    Presumably each row is one line/document and each column one
								    vocabulary entry of the vectorizer -- confirm against the code that
								    builds the database file.
								    """

								    def __init__(self, DatabaseDir, BoWModelDir):
								        """Load the database matrix and the fitted vectorizer.

								        Args:
								            DatabaseDir: Path to the database in hkl (hickle) format.
								                The loaded object must support ``.astype``.
								            BoWModelDir: Path to the vectorizer in pkl format, loaded
								                with joblib.
								        """
								        # input has to be hkl format
								        self.database = hkl.load(DatabaseDir).astype('float32')

								        # input has to be pkl format
								        # NOTE(security): joblib.load unpickles the file; only load
								        # model files from a trusted source.
								        self.vectorizer = joblib.load(BoWModelDir)

								    def search(self, string, numberofmatches):
								        """Return the indices of the best-matching database rows.

								        Args:
								            string: The query text, vectorized with ``self.vectorizer``.
								            numberofmatches: Maximum number of row indices to return.

								        Returns:
								            A list of at most ``numberofmatches`` row indices of the
								            database, ordered by descending score.  Rows with equal
								            scores are ordered by descending index.  If fewer rows exist
								            than requested, all rows are returned.
								        """
								        # Vectorize the query and shape it as a (vocabulary, 1) column
								        # so it can be right-multiplied with the database matrix.
								        query_vector = self.vectorizer.transform([string])
								        query_column = query_vector.toarray()[0].astype(
								            np.float32, copy=False)[:, np.newaxis]

								        # Use a private graph and a context-managed session so repeated
								        # searches neither grow the global default graph nor leak
								        # sessions.
								        with tf.Graph().as_default(), tf.Session() as sess:
								            query_tensor = tf.constant(query_column)
								            database_sparse = convert_sparse_matrix_to_sparse_tensor(
								                self.database)
								            scores_tensor = tf.sparse_tensor_dense_matmul(
								                database_sparse, query_tensor)
								            scores = np.asarray(sess.run(scores_tensor))[:, 0]

								        # Stable descending sort over the indices in reverse order:
								        # among equal scores the higher row index comes first.
								        ranked = sorted(
								            reversed(range(len(scores))),
								            key=scores.__getitem__,
								            reverse=True,
								        )

								        # Clamp the request so asking for more matches than there are
								        # rows returns everything instead of raising IndexError.
								        limit = max(0, min(numberofmatches, len(ranked)))
								        return ranked[:limit]