|
|
-
- # FASTsearch: a bag-of-words search class. Every database can be represented as lists (the brain, too, is arguably made up of lists), which gives access to all documents at almost the same moment.
-
- # TODO: implement GPU multithreading.
-
-
-
- # USAGE: fit a scikit-learn CountVectorizer on a database of lines or documents.
-
- import joblib
- from sklearn.feature_extraction.text import CountVectorizer
-
- import numpy as np
- import scipy as sc
-
- import tensorflow as tf
-
-
- import _pickle as cPickle
-
- import hickle as hkl
-
- import os
-
-
- # Define function to convert scipy csr matrix to tf tensor for working on gpu
def convert_sparse_matrix_to_sparse_tensor(X):
    """Convert a SciPy sparse matrix into a ``tf.SparseTensorValue``.

    Args:
        X: Anything ``scipy.sparse.coo_matrix`` accepts (e.g. a CSR matrix).

    Returns:
        ``tf.SparseTensorValue(indices, values, dense_shape)`` where
        ``indices`` is an ``(nnz, 2)`` int64 array of ``[row, col]`` pairs,
        ``values`` is the COO data array and ``dense_shape`` is ``X``'s shape.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.sparse`` is loaded on every SciPy version.
    from scipy import sparse

    coo = sparse.coo_matrix(X)
    # ``np.mat`` is deprecated (and gone from the NumPy 2.0 namespace) and
    # produced an ``np.matrix``; build a plain 2-D array instead.  The cast
    # to int64 follows TensorFlow's documented dtype for sparse indices.
    indices = np.column_stack((coo.row, coo.col)).astype(np.int64)
    return tf.SparseTensorValue(indices, coo.data, coo.shape)
-
-
-
-
- # The class is initialized with a database in 2-dimensional list format, e.g. [['word', 'word2'], [...], [...]]; the index of each inner list within the outer list defines its document id.
- ## Each element of the input list represents one document, which is handled as a single string.
- # This list must be saved as an hkl (hickle) dump, which is then loaded as the database.
-
-
def my_tokenizer(s):
    """Split *s* on the literal two-character separator backslash + plus.

    ``str.split`` does not interpret regular expressions, so the separator
    is exactly a backslash followed by ``+``; a lone ``+`` does not split.
    The separator is written with an escaped backslash because the original
    spelling was an invalid escape sequence, which newer Python versions
    warn about.

    NOTE(review): if the intent was to split on a plain ``+``, the separator
    should be ``'+'`` -- confirm against how the database strings are built.

    Args:
        s: The string to tokenize.

    Returns:
        The list of substrings between occurrences of the separator.
    """
    return s.split('\\+')
-
class FASTsearch(object):
    """Bag-of-words document search backed by a sparse matrix product.

    Typical flow: construct with the path to a hickle dump of the database,
    call ``Gen_BoW_Model`` once to fit and persist the vectorizer and the
    document/term matrix, call ``Load_BoW_Model`` to load both, then use one
    of the ``search*`` methods.  The ``search*`` methods read
    ``self.vectorizer`` and ``self.dbOZ``, which only ``Load_BoW_Model`` sets.
    """

    def __init__(self, DatabaseDir):
        """Load the database and join each document's tokens with spaces.

        Args:
            DatabaseDir: Path to a hickle file holding an iterable of
                documents, each an iterable of strings.
        """
        # The last four characters are dropped to form the stem used in the
        # output file names of Gen_BoW_Model.
        # NOTE(review): this presumably strips a '.hkl' suffix; a path with a
        # different suffix length yields a different stem -- confirm callers.
        self.DatabaseDir = DatabaseDir[:-4]

        # input has to be hkl format
        self.database = [' '.join(element) for element in hkl.load(DatabaseDir)]

    def Gen_BoW_Model(self, max_features, analyzer, punctuation=False):
        """Fit a ``CountVectorizer`` on the database and persist the results.

        Writes the fitted vectorizer to ``'bagofwords' + stem + '.pkl'`` and
        the document/term count matrix to
        ``'DataBaseOneZeros' + stem + '.hkl'`` (gzip-compressed), where
        ``stem`` is ``self.DatabaseDir``.

        Args:
            max_features: Forwarded to ``CountVectorizer(max_features=...)``.
            analyzer: Forwarded to ``CountVectorizer(analyzer=...)``.
            punctuation: When truthy, ``my_tokenizer`` is passed as the
                tokenizer; otherwise no custom tokenizer is used.

        Returns:
            The fitted ``CountVectorizer``.
        """
        print("Creating the bag of words...\n")
        from sklearn.feature_extraction.text import CountVectorizer

        # A single construction keyed on truthiness: the former pair of
        # ``== False`` / ``== True`` tests left ``vectorizer`` unbound for
        # any other value (e.g. ``None``).
        vectorizer = CountVectorizer(analyzer=analyzer,
                                     tokenizer=my_tokenizer if punctuation else None,
                                     preprocessor=None,
                                     stop_words=None,
                                     max_features=max_features)

        # fit_transform() learns the vocabulary and turns the database
        # (a list of strings) into the document/term count matrix.
        train_data_features = vectorizer.fit_transform(self.database)

        joblib.dump(vectorizer, 'bagofwords' + self.DatabaseDir + '.pkl')

        print('dumping the data to hkl format..')
        hkl.dump(train_data_features, 'DataBaseOneZeros' + self.DatabaseDir + '.hkl', mode='w', compression='gzip')
        print('done')

        return vectorizer

    def Load_BoW_Model(self, BoWModelDir, DatabaseOneZerosDir):
        """Load a persisted vectorizer and document/term matrix.

        Args:
            BoWModelDir: Path of the vectorizer dump (pkl format).
            DatabaseOneZerosDir: Path of the hickle dump of the matrix.

        Returns:
            The loaded vectorizer (also stored as ``self.vectorizer``).
        """
        # SECURITY: joblib.load unpickles the file; only load model files
        # from a trusted source.
        self.vectorizer = joblib.load(BoWModelDir)

        self.dbOZ = hkl.load(DatabaseOneZerosDir).astype('float32')

        return self.vectorizer

    @staticmethod
    def _rank_scores(wCD):
        """Turn a (documents, 1) score array into ``[index, score]`` pairs, best first."""
        indexed = [[doc_id, row[0]] for doc_id, row in enumerate(wCD)]
        # Reversing before the stable descending sort keeps the original
        # tie order (higher document index first among equal scores).
        return sorted(indexed[::-1], key=lambda pair: pair[1], reverse=True)

    @staticmethod
    def _self_match_score(uiOZ):
        """Return the query's dot product with itself as a 1-element array."""
        return np.sum(uiOZ ** 2, axis=0)

    @staticmethod
    def _closest_match(ranked, eq_score):
        """Pick the document whose score is ``eq_score`` or the closest one above it.

        Scans ``ranked`` (best first).  An entry scoring exactly ``eq_score``
        wins; otherwise the entry just before the first score below
        ``eq_score`` is taken.  If every score is above ``eq_score`` the last
        entry is the closest; ``[]`` is returned only for an empty ranking.
        """
        for position, (doc_id, score) in enumerate(ranked):
            if score == eq_score:
                return doc_id
            if score < eq_score:
                # max() guards position 0: there is no better entry to fall
                # back to, so the top entry itself is returned.
                return ranked[max(position - 1, 0)][0]
        return ranked[-1][0] if ranked else []

    @staticmethod
    def _exact_matches(ranked, eq_score):
        """Return the indexes of all ranked documents scoring exactly ``eq_score``."""
        return [doc_id for doc_id, score in ranked if score == eq_score]

    @staticmethod
    def _equal_entry_count(query_vector, doc_vector):
        """Count the positions at which the two count vectors hold equal values."""
        return int(np.count_nonzero(query_vector == doc_vector))

    def _rank_documents(self, string):
        """Score every document against *string* and rank the result.

        Returns:
            ``(uiOZ, ranked)``: the query as a (vocabulary, 1) float32 column
            vector and the ``[index, score]`` pairs sorted best first.
        """
        # Convert user input to Zeros and Ones
        user_input_OnesZeros = self.vectorizer.transform([string])
        uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)
        uiOZ = uOZ[np.newaxis, :].transpose()

        # A private graph plus a context-managed session: nothing is added
        # to the global default graph and the session is closed on exit.
        with tf.Graph().as_default(), tf.Session() as sess:
            with tf.device('/gpu:0'):
                uiOZ_tensor = tf.constant(uiOZ)
                dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)
                wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)
                wCD = np.array(sess.run(wordCountDoku))

        return uiOZ, self._rank_scores(wCD)

    def search(self, string, numberofmatches):
        """Find the document matching *string* exactly, or the closest above.

        Args:
            string: The query to search for in the documents.
            numberofmatches: Accepted for interface compatibility; unused.

        Returns:
            ``(best_document, top_entry)``: the chosen document index and the
            highest-scoring ``[index, score]`` pair.
        """
        uiOZ, indexedwCD = self._rank_documents(string)
        eq_number = self._self_match_score(uiOZ)

        # The scan starts at rank 0 and cannot run past the end of the list.
        best_n_documents = self._closest_match(indexedwCD, eq_number[0])

        return best_n_documents, indexedwCD[0]

    def search_with_highest_multiplikation_Output(self, string, numberofmatches):
        """Return the indexes of the highest-scoring documents for *string*.

        Args:
            string: The query to search for in the documents.
            numberofmatches: How many document indexes to return; fewer are
                returned when the database holds fewer documents.

        Returns:
            ``(best_n_documents, top_entry)``: the list of document indexes,
            best first, and the highest-scoring ``[index, score]`` pair.
        """
        _, indexedwCD = self._rank_documents(string)

        # Slicing (instead of indexing up to numberofmatches) cannot raise
        # when fewer documents exist; max() keeps a negative count empty.
        best_n_documents = [entry[0] for entry in indexedwCD[:max(numberofmatches, 0)]]

        return best_n_documents, indexedwCD[0]

    def searchPatternMatch(self, string, numberofmatches):
        """Rank exact-score candidates by how many vocabulary counts agree.

        Documents whose score equals the query's self-match score are
        collected; each is then rated by the number of vocabulary positions
        whose count equals the query's count.

        Args:
            string: The query to search for in the documents.
            numberofmatches: Accepted for interface compatibility; unused.

        Returns:
            A list of ``[index, equal_count]`` pairs sorted by
            ``equal_count``, highest first.
        """
        uiOZ, indexedwCD = self._rank_documents(string)

        # Score a document gets when it holds the same words as the query.
        eq_number = self._self_match_score(uiOZ)
        print(eq_number)

        # All candidates are kept, including the one at rank 0.
        best_docs_surrounding = self._exact_matches(indexedwCD, eq_number[0])

        # Count, per candidate, the vocabulary entries whose counts agree.
        query_vector = uiOZ[:, 0]
        best_docs_surrounding_new = []
        for doc in best_docs_surrounding:
            dok_BoW = self.dbOZ[doc].toarray()[0].astype(np.float32, copy=False)
            best_docs_surrounding_new.append(
                [doc, self._equal_entry_count(query_vector, dok_BoW)])

        # Sort the result again with the original indexes
        best_n_documents = sorted(best_docs_surrounding_new[::-1], key=lambda tup: tup[1], reverse=True)

        return best_n_documents
-
-
|