|
|
-
- # FASTsearch: any database can be represented as in-memory lists/matrices,
- # so every document is accessible (and scorable) at the same moment.
-
- # TODO: GPU multithreading has to be implemented.
-
-
- # USAGE: fit a scikit-learn CountVectorizer on a database of lines or documents.
-
- from sklearn.externals import joblib
- from sklearn.feature_extraction.text import CountVectorizer
-
- import numpy as np
- import scipy as sc
-
- import tensorflow as tf
-
- import _pickle as cPickle
-
- import hickle as hkl
-
- import os
-
-
# Convert a scipy sparse matrix to a TF1 sparse tensor so the matmul can run on GPU.
def convert_sparse_matrix_to_sparse_tensor(X):
    """Convert a scipy sparse matrix to a ``tf.SparseTensorValue``.

    Parameters
    ----------
    X : array-like or scipy sparse matrix
        Matrix to convert; anything accepted by ``sc.sparse.coo_matrix``.

    Returns
    -------
    tf.SparseTensorValue
        TF1-style sparse value with (nnz, 2) indices, the nonzero data,
        and the dense shape.
    """
    coo = sc.sparse.coo_matrix(X)
    # np.mat / numpy.matrix is deprecated; stack row/col into an (nnz, 2)
    # index array with plain ndarrays instead.
    indices = np.vstack((coo.row, coo.col)).transpose()
    return tf.SparseTensorValue(indices, coo.data, coo.shape)
-
-
-
class FASTsearch(object):
    """Bag-of-words similarity search over a preloaded document database.

    The database is a dense 0/1 term matrix (documents x vocabulary) loaded
    from hickle; queries are vectorized with a previously fitted scikit-learn
    CountVectorizer and scored against every document in one sparse matmul.
    """

    def __init__(self, DatabaseDir, BoWModelDir):
        """Load the document matrix and the fitted vectorizer.

        Parameters
        ----------
        DatabaseDir : str
            Path to the database matrix; must be in hickle (.hkl) format.
        BoWModelDir : str
            Path to the fitted CountVectorizer; must be a joblib .pkl file.
        """
        # Cast to float32 so it matches the query vector dtype for TF.
        self.database = hkl.load(DatabaseDir).astype('float32')
        self.vectorizer = joblib.load(BoWModelDir)

    def search(self, string, numberofmatches):
        """Return the indices of the best-matching documents for a query.

        Parameters
        ----------
        string : str
            The query text.
        numberofmatches : int
            How many document indices to return (clamped to the database size).

        Returns
        -------
        list of int
            Document row indices, best match first.
        """
        # Vectorize the query into the same BoW space as the database.
        user_input_OnesZeros = self.vectorizer.transform([string])
        uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)

        # Column vector of shape (vocab, 1) for the matmul below.
        uiOZ = uOZ[np.newaxis, :].transpose()

        # Context manager both installs the session as default (needed for
        # .eval()) and closes it afterwards — the original leaked the session.
        with tf.Session():
            uiOZ_tensor = tf.constant(uiOZ)
            # BUG FIX: the original referenced an undefined name `dbOZ`;
            # the intended operand is the loaded database matrix.
            dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.database)

            # Shared-word count per document: (docs, vocab) x (vocab, 1) -> (docs, 1)
            wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)
            wCD = np.array(wordCountDoku.eval())

        # Pair each document index with its score.
        indexedwCD = [[n, wCD[n][0]] for n in range(len(wCD))]

        # Reverse before the stable sort so that, among equal scores, the
        # higher document index comes first (preserves original tie order).
        indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)

        # Clamp so asking for more matches than documents cannot raise.
        n_best = min(numberofmatches, len(indexedwCD))
        best_n_documents = [indexedwCD[n][0] for n in range(n_best)]

        return best_n_documents
-
-
-
|