# FASTsearch: every database can be represented as lists of documents (the
# brain itself is essentially built from lists). The documents are encoded as
# a bag-of-words matrix, so a single matrix multiplication scores a query
# against all documents at the same moment.

# TODO: GPU multithreading still has to be implemented.

# USAGE: train a scikit-learn CountVectorizer on a database of lines or
# documents, dump the vectorizer with joblib (.pkl) and dump the vectorized
# database with hickle (.hkl); see the build_bow_database() sketch after the
# imports below.


from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import scipy as sc
import tensorflow as tf
import _pickle as cPickle
import hickle as hkl
import os
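# A minimal preprocessing sketch for the USAGE note above. It is an
# illustration, not part of the original module: the function name
# build_bow_database() and the default file names are placeholders. It fits a
# CountVectorizer on the documents, dumps it with joblib and dumps the dense
# bag-of-words matrix with hickle, which matches what FASTsearch.__init__
# expects to load.
def build_bow_database(documents, bow_model_path='bow_model.pkl',
                       database_path='database.hkl'):
    vectorizer = CountVectorizer()
    # rows = documents, columns = vocabulary terms
    bow_matrix = vectorizer.fit_transform(documents)
    joblib.dump(vectorizer, bow_model_path)
    # FASTsearch loads the database with hkl.load(...).astype('float32'),
    # so store the dense count matrix
    hkl.dump(bow_matrix.toarray().astype('float32'), database_path)
    return bow_model_path, database_path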
# Define a function to convert a scipy sparse matrix to a TF sparse tensor so
# the matrix multiplication can run on the GPU
def convert_sparse_matrix_to_sparse_tensor(X):
    coo = sc.sparse.coo_matrix(X)
    indices = np.mat([coo.row, coo.col]).transpose()
    return tf.SparseTensorValue(indices, coo.data, coo.shape)
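# A small, self-contained check of the converter above (an illustration, not
# part of the original module; the function name is a placeholder). It builds
# a tiny scipy CSR matrix, converts it, and multiplies it with a dense query
# vector in a throwaway TF 1.x session; it only runs when called explicitly.
def _demo_convert_sparse_matrix():
    docs_bow = sc.sparse.csr_matrix(np.array([[1., 0., 1.],
                                              [0., 1., 0.]], dtype=np.float32))
    docs_sparse_tensor = convert_sparse_matrix_to_sparse_tensor(docs_bow)
    query = np.array([[1.], [0.], [1.]], dtype=np.float32)
    with tf.Session() as sess:
        # one score per document: how many of the query terms it contains
        scores = sess.run(tf.sparse_tensor_dense_matmul(docs_sparse_tensor, query))
    return scores  # expected: [[2.], [0.]]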
class FASTsearch(object):

    def __init__(self, DatabaseDir, BoWModelDir):

        # input has to be in hkl format: the dense bag-of-words matrix of the database
        self.database = hkl.load(DatabaseDir).astype('float32')

        # input has to be in pkl format: the fitted CountVectorizer
        self.vectorizer = joblib.load(BoWModelDir)

    def search(self, string, numberofmatches):

        # the bag-of-words matrix of the whole database
        dbOZ = self.database

        # Convert the user input to zeros and ones
        user_array = []
        user_array.append(string)

        user_input_OnesZeros = self.vectorizer.transform(user_array)
        uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)

        # shape the query as a column vector for the matrix multiplication
        uiOZ = uOZ[np.newaxis, :]
        uiOZ = uiOZ.transpose()

        sess = tf.Session()

        with sess.as_default():

            uiOZ_tensor = tf.constant(uiOZ)
            dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(dbOZ)

            #uiOZ_tensor_sparse = tf.contrib.layers.dense_to_sparse(uiOZ_tensor, eos_token=0, outputs_collections=None, scope=None)
            #dbOZ_tensor_sparse = tf.contrib.layers.dense_to_sparse(dbOZ_tensor, eos_token=0, outputs_collections=None, scope=None)

            # one word count per document: how many of the query words it contains
            #wordCountDoku = tf.matmul(uiOZ_tensor, dbOZ_tensor)
            wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)

            wCD = np.array(wordCountDoku.eval())

        sess.close()

        # attach the document index to every count
        indexedwCD = []
        for n in range(len(wCD)):
            indexedwCD.append([n, wCD[n][0]])

        # sort by word count, highest first (the initial reversal makes ties
        # favour later documents, since the sort is stable)
        indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)

        best_n_documents = []
        for n in range(min(numberofmatches, len(indexedwCD))):
            best_n_documents.append(indexedwCD[n][0])

        return best_n_documents
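# A minimal usage sketch (an illustration, not part of the original module):
# the file names are placeholders and are assumed to have been produced by a
# preprocessing step like build_bow_database() above.
if __name__ == '__main__':
    fastsearch = FASTsearch('database.hkl', 'bow_model.pkl')
    # indices of the 3 documents sharing the most words with the query
    best_matches = fastsearch.search('how to load a hickle database', 3)
    print(best_matches)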