|
|
@ -0,0 +1,341 @@ |
|
|
|
|
|
|
|
# The FASTsearch class. Every database can be represented as lists (the brain itself is, in effect, made up of lists), which gives access to all documents at almost the same moment. |
|
|
|
|
|
|
|
# TODO GPU Multithreading has to be implemented. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# USAGE: Fit a scikit-learn CountVectorizer on a database of lines or documents. |
|
|
|
|
|
|
|
import joblib |
|
|
|
from sklearn.feature_extraction.text import CountVectorizer |
|
|
|
|
|
|
|
import numpy as np |
|
|
|
import scipy as sc |
|
|
|
|
|
|
|
import tensorflow as tf |
|
|
|
|
|
|
|
|
|
|
|
import _pickle as cPickle |
|
|
|
|
|
|
|
import hickle as hkl |
|
|
|
|
|
|
|
import os |
|
|
|
|
|
|
|
|
|
|
|
# Define function to convert scipy csr matrix to tf tensor for working on gpu |
|
|
|
def convert_sparse_matrix_to_sparse_tensor(X):
    """Convert a SciPy sparse matrix into a ``tf.SparseTensorValue``.

    Args:
        X: Any input accepted by ``scipy.sparse.coo_matrix`` (for example a
            CSR matrix).

    Returns:
        A ``tf.SparseTensorValue`` built from the (row, col) index pairs of
        the stored entries, the stored values, and the matrix shape.
    """
    # Imported locally so the submodule is guaranteed to be loaded; a bare
    # ``import scipy`` does not expose ``scipy.sparse`` on every SciPy version.
    from scipy.sparse import coo_matrix

    coo = coo_matrix(X)

    # One (row, col) pair per stored entry, shape (nnz, 2). A plain ndarray is
    # used because the ``np.mat`` alias was removed in NumPy 2.0.
    indices = np.column_stack((coo.row, coo.col))

    return tf.SparseTensorValue(indices, coo.data, coo.shape)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# The class is initialized with the database as a two-dimensional list, [['word', 'word2'], [], [], []]; the index of each inner list within the outer list defines that document's id. |
|
|
|
## Each element of the input list is one document; its entries are joined into a single string. |
|
|
|
# This list must be saved as an hkl (hickle) dump, which is then loaded as the database. |
|
|
|
|
|
|
|
|
|
|
|
def my_tokenizer(s):
    """Split ``s`` on the literal two-character separator backslash-plus.

    Used as the ``tokenizer`` of the CountVectorizer when punctuation is to
    be kept.

    Args:
        s: The string to tokenize.

    Returns:
        The list of substrings produced by ``str.split``.
    """
    # A raw string keeps exactly the value the old '\+' literal had, without
    # the invalid escape sequence (a SyntaxWarning on Python 3.12+).
    # NOTE(review): str.split is not regex-aware, so this splits on a
    # backslash followed by "+", not on a bare "+" -- confirm that is intended.
    return s.split(r'\+')
|
|
|
|
|
|
|
class FASTsearch(object):
    """Bag-of-words document search over a hickle-stored database.

    Typical flow: construct with the path of the hickle database, call
    ``Gen_BoW_Model`` once to fit and persist the vectorizer and the
    document/term matrix, call ``Load_BoW_Model`` to load both, then use one
    of the ``search*`` methods.

    Attributes set by this class:
        DatabaseDir: The constructor argument without its last four characters.
        database: One space-joined string per database entry.
        vectorizer: The loaded vectorizer (set by ``Load_BoW_Model``).
        dbOZ: The loaded document/term matrix cast to float32 (set by
            ``Load_BoW_Model``).
    """

    def __init__(self, DatabaseDir):
        """Load the database stored at ``DatabaseDir``.

        Args:
            DatabaseDir: Path of a hickle file holding an iterable of string
                sequences (one sequence per document).
        """
        # The last four characters (e.g. ".hkl") are dropped; the remainder is
        # embedded in the file names written by Gen_BoW_Model.
        self.DatabaseDir = DatabaseDir[:-4]

        # Every entry becomes one space-separated string.
        self.database = [' '.join(element) for element in hkl.load(DatabaseDir)]

    def Gen_BoW_Model(self, max_features, analyzer, punctuation=False):
        """Fit a CountVectorizer on the database and persist the results.

        Writes the fitted vectorizer to ``'bagofwords' + DatabaseDir + '.pkl'``
        and the transformed database to
        ``'DataBaseOneZeros' + DatabaseDir + '.hkl'``.

        Args:
            max_features: Passed through to ``CountVectorizer``.
            analyzer: Passed through to ``CountVectorizer``.
            punctuation: When truthy, ``my_tokenizer`` is used as tokenizer;
                otherwise the vectorizer's default tokenization is used.

        Returns:
            The fitted vectorizer.
        """
        print("Creating the bag of words...\n")

        # A single construction covers both cases, so the vectorizer is always
        # bound (the former pair of ``== False`` / ``== True`` checks left it
        # undefined for values such as None).
        vectorizer = CountVectorizer(
            analyzer=analyzer,
            tokenizer=my_tokenizer if punctuation else None,
            preprocessor=None,
            stop_words=None,
            max_features=max_features,
        )

        # fit_transform learns the vocabulary and converts the database
        # strings into feature vectors in one step.
        train_data_features = vectorizer.fit_transform(self.database)

        joblib.dump(vectorizer, 'bagofwords' + self.DatabaseDir + '.pkl')

        print('dumping the data to hkl format..')
        hkl.dump(train_data_features, 'DataBaseOneZeros' + self.DatabaseDir + '.hkl', mode='w', compression='gzip')
        print('done')

        return vectorizer

    def Load_BoW_Model(self, BoWModelDir, DatabaseOneZerosDir):
        """Load a persisted vectorizer and document/term matrix.

        Args:
            BoWModelDir: Path of the vectorizer file written with joblib.
            DatabaseOneZerosDir: Path of the hickle file holding the
                document/term matrix.

        Returns:
            The loaded vectorizer.
        """
        # NOTE(security): joblib.load unpickles arbitrary objects; only load
        # model files from a trusted source.
        self.vectorizer = joblib.load(BoWModelDir)

        self.dbOZ = hkl.load(DatabaseOneZerosDir).astype('float32')

        return self.vectorizer

    def _vectorize_query(self, string):
        """Return the vectorizer's counts for ``string`` as a 1-D float32 array."""
        counts = self.vectorizer.transform([string])
        return counts.toarray()[0].astype(np.float32, copy=False)

    def _score_documents(self, query_vector):
        """Multiply the document/term matrix with the query on ``/gpu:0``.

        Returns an array with one row per document and a single column.
        """
        query_column = query_vector[:, np.newaxis]

        # A private graph per call keeps the process-wide default graph from
        # growing with every search.
        graph = tf.Graph()
        with graph.as_default(), tf.device('/gpu:0'):
            query_tensor = tf.constant(query_column)
            database_tensor = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)
            product = tf.sparse_tensor_dense_matmul(database_tensor, query_tensor)

        # The context manager closes the session, releasing its resources.
        with tf.Session(graph=graph) as sess:
            return np.array(sess.run(product))

    def _rank_documents(self, string):
        """Score every document against ``string``.

        Returns:
            ``(query_vector, ranked)`` where ``ranked`` is a list of
            ``[document_index, score]`` pairs sorted by descending score.
        """
        query_vector = self._vectorize_query(string)
        scores = self._score_documents(query_vector)

        ranked = [[index, row[0]] for index, row in enumerate(scores)]

        # Reversing before the stable sort places, among equal scores, the
        # higher document index first.
        ranked = sorted(ranked[::-1], key=lambda pair: pair[1], reverse=True)
        return query_vector, ranked

    @staticmethod
    def _self_match_score(query_vector):
        """Return the score a document with exactly the query's counts would get."""
        return float(np.dot(query_vector, query_vector))

    @staticmethod
    def _closest_match_position(ranked, eq_number):
        """Return the position in ``ranked`` chosen by ``search``.

        Scanning starts at position 1: the first entry scoring exactly
        ``eq_number`` is chosen; if an entry scoring below ``eq_number`` comes
        first, the entry just before it is chosen.
        """
        for position in range(1, len(ranked)):
            score = ranked[position][1]
            if score == eq_number:
                return position
            if score < eq_number:
                return position - 1

        # Every entry after the first scores above eq_number (or there is only
        # one entry): the last entry is the closest one. This case previously
        # ran past the end of the list and raised IndexError.
        return len(ranked) - 1

    # input: string to search for in the documents, the numberofmatches to get the best n documents
    # output: the chosen document index plus the highest-scoring entry [index, number]
    def search(self, string, numberofmatches):
        """Find the document whose score is closest to an exact match.

        Args:
            string: The query text.
            numberofmatches: Unused; kept for interface compatibility.

        Returns:
            ``(document_index, [top_index, top_score])``: the selected
            document and the highest-scoring entry of the ranking.

        Raises:
            IndexError: If the database contains no documents.
        """
        query_vector, ranked = self._rank_documents(string)
        if not ranked:
            raise IndexError('cannot search an empty database')

        eq_number = self._self_match_score(query_vector)
        position = self._closest_match_position(ranked, eq_number)

        return ranked[position][0], ranked[0]

    def search_with_highest_multiplikation_Output(self, string, numberofmatches):
        """Return the indexes of the highest-scoring documents.

        Args:
            string: The query text.
            numberofmatches: Maximum number of document indexes to return.

        Returns:
            ``(document_indexes, [top_index, top_score])``. Fewer than
            ``numberofmatches`` indexes are returned when the database is
            smaller than that.

        Raises:
            IndexError: If the database contains no documents.
        """
        _, ranked = self._rank_documents(string)
        if not ranked:
            raise IndexError('cannot search an empty database')

        # Slicing clamps to the number of documents; max() keeps a negative
        # request yielding an empty list, as the former range() loop did.
        best_n_documents = [entry[0] for entry in ranked[:max(numberofmatches, 0)]]

        return best_n_documents, ranked[0]

    def searchPatternMatch(self, string, numberofmatches):
        """Rank the exact-score matches by positional agreement with the query.

        Documents whose score equals the query's self-match score are
        collected; for each, the number of vocabulary positions whose count
        equals the query's count is computed.

        Args:
            string: The query text.
            numberofmatches: Unused; kept for interface compatibility.

        Returns:
            A list of ``[document_index, equal_position_count]`` pairs sorted
            by descending count (empty when no document qualifies).
        """
        query_vector, ranked = self._rank_documents(string)

        # Score a document gets when it holds the same counts as the query.
        eq_number = self._self_match_score(query_vector)
        print(eq_number)

        # Walk the ranking from the top (the best-ranked entry included) and
        # stop at the first score below the target; the scan can no longer run
        # past the end of the list.
        best_docs_surrounding = []
        for doc, score in ranked:
            if score < eq_number:
                break
            if score == eq_number:
                best_docs_surrounding.append(doc)

        # Count, per candidate, the vocabulary positions (zeros included)
        # where document and query hold the same value.
        best_docs_surrounding_new = []
        for doc in best_docs_surrounding:
            dok_BoW = self.dbOZ[doc].toarray()[0].astype(np.float32, copy=False)
            Number_equal_words = int(np.count_nonzero(dok_BoW == query_vector))
            best_docs_surrounding_new.append([doc, Number_equal_words])

        # Sort the result again, keeping the original document indexes.
        return sorted(best_docs_surrounding_new[::-1], key=lambda pair: pair[1], reverse=True)
|
|
|
|
|
|
|
|