First commit of FASTsearch
commit da193f8889
1 changed file with 97 additions and 0 deletions
FASTsearch.py (new file, 97 lines)
@@ -0,0 +1,97 @@
# The new class FASTsearch. Every DB can be represented as lists; the brain
# itself is actually constituted of lists. This gives access to all documents
# at the same moment.

# TODO: GPU multithreading has to be implemented.

# USAGE: Fit a scikit-learn CountVectorizer on a database of lines or docs;
# a minimal fitting sketch follows.
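
# A minimal fitting sketch (hypothetical file names, not part of this file):
#
#   from sklearn.feature_extraction.text import CountVectorizer
#   from sklearn.externals import joblib
#   import hickle as hkl
#
#   lines = open('database.txt').read().splitlines()
#   vectorizer = CountVectorizer()
#   bow = vectorizer.fit_transform(lines)       # document-term count matrix
#   joblib.dump(vectorizer, 'bow_model.pkl')    # -> BoWModelDir argument
#   hkl.dump(bow.toarray(), 'database.hkl')     # -> DatabaseDir argument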

from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer

import numpy as np
import scipy as sc
import scipy.sparse  # make sure the sparse submodule is actually loaded

import tensorflow as tf

import _pickle as cPickle
import hickle as hkl

import os


# Define a function to convert a scipy CSR matrix to a TF sparse tensor
# for working on the GPU.
def convert_sparse_matrix_to_sparse_tensor(X):
    coo = sc.sparse.coo_matrix(X)
    indices = np.mat([coo.row, coo.col]).transpose()
    return tf.SparseTensorValue(indices, coo.data, coo.shape)
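
# For example (an illustrative sketch, not from the original file): the 2x2
# matrix [[0, 3], [0, 0]] converts to indices [[0, 1]], values [3.] and
# dense_shape (2, 2) -- the triplet that TF 1.x sparse ops expect.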


class FASTsearch(object):

    def __init__(self, DatabaseDir, BoWModelDir):
        # input has to be in hkl format
        self.database = hkl.load(DatabaseDir).astype('float32')

        # input has to be in pkl format
        self.vectorizer = joblib.load(BoWModelDir)

    def search(self, string, numberofmatches):
        # Convert the user input to zeros and ones over the learned
        # vocabulary (bag-of-words counts).
        user_array = []
        user_array.append(string)

        user_input_OnesZeros = self.vectorizer.transform(user_array)
        uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)

        # Shape the query as a column vector so it can right-multiply the
        # document-term matrix.
        uiOZ = uOZ[np.newaxis, :]
        uiOZ = uiOZ.transpose()

        sess = tf.Session()

        with sess.as_default():
            uiOZ_tensor = tf.constant(uiOZ)
            dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.database)

            # Sparse x dense matmul: entry n of the result is the number of
            # query-vocabulary hits in document n.
            wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)

            wCD = np.array(wordCountDoku.eval())

        # Pair each document index with its match count and sort by count,
        # highest first (the initial reversal makes later documents win ties,
        # since sorted() is stable).
        indexedwCD = []
        for n in range(len(wCD)):
            indexedwCD.append([n, wCD[n][0]])

        indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)

        # Return the indices of the numberofmatches best-matching documents.
        best_n_documents = []
        for n in range(numberofmatches):
            best_n_documents.append(indexedwCD[n][0])

        return best_n_documents
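

# A minimal usage sketch (assumed file names matching the fitting sketch
# above; not part of the original file):
#
#   fs = FASTsearch('database.hkl', 'bow_model.pkl')
#   best = fs.search('some query words', 5)   # indices of the 5 best docs
#   print(best)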