From be4279eb645433847943261ecbdc3d13d9388f18 Mon Sep 17 00:00:00 2001
From: alpcentaur
Date: Fri, 24 Sep 2021 15:13:03 +0200
Subject: [PATCH] first commit

---
 build/tf-gpu-FASTsearch/Dockerfile    |  46 ++++
 build/tf-gpu-FASTsearch/FASTsearch.py | 341 ++++++++++++++++++++++++++
 compose/docker-compose.yml            |  12 +
 oi                                    |   1 -
 4 files changed, 399 insertions(+), 1 deletion(-)
 create mode 100644 build/tf-gpu-FASTsearch/Dockerfile
 create mode 100644 build/tf-gpu-FASTsearch/FASTsearch.py
 create mode 100644 compose/docker-compose.yml
 delete mode 100644 oi

diff --git a/build/tf-gpu-FASTsearch/Dockerfile b/build/tf-gpu-FASTsearch/Dockerfile
new file mode 100644
index 0000000..ab5790d
--- /dev/null
+++ b/build/tf-gpu-FASTsearch/Dockerfile
@@ -0,0 +1,46 @@
+FROM tensorflow/tensorflow:1.12.0-gpu
+
+COPY Prototyp /home/Prototyp
+
+COPY requis.txt /home/requis.txt
+
+RUN apt-get update && apt-get install -y wget libssl-dev openssl
+#RUN wget https://www.python.org/ftp/python/3.5.3/Python-3.5.3.tgz
+#RUN tar -xzvf Python-3.5.3.tgz
+#RUN cd Python-3.5.3 && ./configure && make && make install
+
+RUN python --version
+
+RUN apt-get update && apt-get install -y virtualenv python-dev python-pip build-essential
+
+#RUN python3.5 -m venv /home/venv
+
+#ENV PATH="home/venv/bin:$PATH"
+
+RUN python --version
+
+#RUN pip3 install --upgrade pip
+
+RUN pip install -r /home/requis.txt && python -m spacy download de
+
+RUN pip install hickle==3.4.9 Twisted joblib
+#nodejs npm
+
+#RUN python -m pip install incremental
+
+#RUN python -m pip install cffi
+
+#RUN python -m pip install -r /home/requis.txt
+
+#RUN python3 -m spacy download de
+
+#RUN pip3 install pandas bs4
+
+
+
+RUN apt-get update && apt-get install -y nodejs
+
+#ENTRYPOINT ["tail"]
+#CMD ["-f","/dev/null"]
+
+CMD /bin/sh -c "cd /home/Prototyp && nodejs server.js"
diff --git a/build/tf-gpu-FASTsearch/FASTsearch.py b/build/tf-gpu-FASTsearch/FASTsearch.py
new file mode 100644
index 0000000..960a52f
--- /dev/null
+++ b/build/tf-gpu-FASTsearch/FASTsearch.py
@@ -0,0 +1,341 @@
+
+# The FASTsearch class. Every database can be represented as lists; the "brain" here is
+# literally built from lists, which gives access to all documents at almost the same moment.
+
+# TODO: GPU multithreading still has to be implemented.
+
+
+
+# USAGE: fit a scikit-learn CountVectorizer on a database of lines or documents.
+
+import joblib
+from sklearn.feature_extraction.text import CountVectorizer
+
+import numpy as np
+import scipy as sc
+
+import tensorflow as tf
+
+
+import _pickle as cPickle
+
+import hickle as hkl
+
+import os
+
+
+# Convert a scipy CSR matrix to a TensorFlow sparse tensor so the matmul can run on the GPU.
+def convert_sparse_matrix_to_sparse_tensor(X):
+    coo = sc.sparse.coo_matrix(X)
+    indices = np.mat([coo.row, coo.col]).transpose()
+    return tf.SparseTensorValue(indices, coo.data, coo.shape)
+
+
+
+# The class is initialized with the database in two-dimensional list format,
+# [['word', 'word2'], [], [], []]; the index of an inner list defines its document id.
+# Every list element of the input is one document; its strings are joined into a single string on load.
+# This list must be saved as an hkl dump and is then loaded into the database.
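+#
+# For illustration only (the tokens below are made up, not part of this commit): a database of
+# two documents could be
+#     database = [['word', 'word2'], ['another', 'doc']]
+# so that, after the join on load, document 0 is the string 'word word2' and document 1 is
+# 'another doc'.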
+
+
+def my_tokenizer(s):
+    return s.split('\+')
+
+
+class FASTsearch(object):
+
+    def __init__(self, DatabaseDir):
+
+        self.DatabaseDir = DatabaseDir[:-4]
+
+        database = []
+        hkl_load = hkl.load(DatabaseDir)
+
+        for element in hkl_load:
+            #print('element',element)
+            #print('joined element', ' '.join(element))
+            database.append(' '.join(element))
+
+        # The input has to be in hkl format.
+        self.database = database
+
+
+    def Gen_BoW_Model(self, max_features, analyzer, punctuation = False):
+
+        print("Creating the bag of words...\n")
+        from sklearn.feature_extraction.text import CountVectorizer
+
+        # Initialize the "CountVectorizer" object, which is scikit-learn's
+        # bag of words tool.
+        if punctuation == False:
+            vectorizer = CountVectorizer(analyzer = analyzer, \
+                                         tokenizer = None, \
+                                         preprocessor = None, \
+                                         stop_words = None, \
+                                         max_features = max_features)
+
+        if punctuation == True:
+            vectorizer = CountVectorizer(analyzer = analyzer, \
+                                         tokenizer = my_tokenizer, \
+                                         preprocessor = None, \
+                                         stop_words = None, \
+                                         max_features = max_features)
+
+        # token_pattern = r'(?u)\w')
+        # fit_transform() does two things: first, it fits the model and learns the
+        # vocabulary; second, it transforms our training data into feature vectors.
+        # The input to fit_transform should be a list of strings.
+        train_data_features = vectorizer.fit_transform(self.database)
+
+        joblib.dump(vectorizer, 'bagofwords' + self.DatabaseDir + '.pkl')
+
+        print('dumping the data to hkl format..')
+        hkl.dump(train_data_features, 'DataBaseOneZeros' + self.DatabaseDir + '.hkl', mode='w', compression='gzip')
+        print('done')
+
+        return vectorizer
+
+    def Load_BoW_Model(self, BoWModelDir, DatabaseOneZerosDir):
+
+        # The input has to be in pkl format.
+        self.vectorizer = joblib.load(BoWModelDir)
+
+        self.dbOZ = hkl.load(DatabaseOneZerosDir).astype('float32')
+
+        return self.vectorizer
+
+
+    # Input: the string to search for in the documents, and numberofmatches, the number of best documents to get.
+    # Output: the best matching documents with their indexes in the searched database, plus the highest
+    # accordance entry as [index, number].
+    def search(self, string , numberofmatches):
+
+        numberofmatches = numberofmatches
+
+        # Convert the user input to ones and zeros.
+        user_array = []
+        user_array.append(string)
+
+        user_input_OnesZeros = self.vectorizer.transform(user_array)
+
+        uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)
+
+        uiOZ = uOZ[np.newaxis, :]
+
+        uiOZ = uiOZ.transpose()
+
+        sess = tf.Session()
+        with tf.device('/gpu:0'):
+            with sess.as_default():
+
+                uiOZ_tensor = tf.constant(uiOZ)
+
+                dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)
+
+                #uiOZ_tensor_sparse =tf.contrib.layers.dense_to_sparse(uiOZ_tensor, eos_token=0, outputs_collections=None, scope=None )
+                #dbOZ_tensor_sparse =tf.contrib.layers.dense_to_sparse(dbOZ_tensor, eos_token=0, outputs_collections=None, scope=None )
+
+                #wordCountDoku = tf.matmul(uiOZ_tensor, dbOZ_tensor)
+                wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)
+
+                wCD = np.array(wordCountDoku.eval())
+
+                indexedwCD = []
+                for n in range(len(wCD)):
+                    indexedwCD.append([n,wCD[n][0]])
+
+                indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)
+
+                best_n_documents = []
+
+                eq_number = 0
+                for number in uiOZ:
+                    #print(number)
+                    eq_number += number ** 2
+
+                #print(eq_number)
+
+                n = 0
+                done = False
+                while n < len(indexedwCD) and done == False:
+                    n += 1
+                    if indexedwCD[n][1] == eq_number:
+                        best_n_documents = indexedwCD[n][0]
+                        done = True
+
+                    if indexedwCD[n][1] < eq_number:
+                        best_n_documents = indexedwCD[n - 1][0]
+                        done = True
+
+        #for n in range(numberofmatches):
+            #best_n_documents.append([indexedwCD[n][0], indexedwCD[n][1]])
+
+        return best_n_documents, indexedwCD[0]
+
+    def search_with_highest_multiplikation_Output(self, string , numberofmatches):
+
+        numberofmatches = numberofmatches
+
+        # Convert the user input to ones and zeros.
+        user_array = []
+        user_array.append(string)
+
+        user_input_OnesZeros = self.vectorizer.transform(user_array)
+
+        uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)
+
+        uiOZ = uOZ[np.newaxis, :]
+
+        uiOZ = uiOZ.transpose()
+
+        sess = tf.Session()
+        with tf.device('/gpu:0'):
+            with sess.as_default():
+
+                uiOZ_tensor = tf.constant(uiOZ)
+
+                dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)
+
+                #uiOZ_tensor_sparse =tf.contrib.layers.dense_to_sparse(uiOZ_tensor, eos_token=0, outputs_collections=None, scope=None )
+                #dbOZ_tensor_sparse =tf.contrib.layers.dense_to_sparse(dbOZ_tensor, eos_token=0, outputs_collections=None, scope=None )
+
+                #wordCountDoku = tf.matmul(uiOZ_tensor, dbOZ_tensor)
+                wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)
+
+                wCD = np.array(wordCountDoku.eval())
+
+                indexedwCD = []
+                for n in range(len(wCD)):
+                    indexedwCD.append([n,wCD[n][0]])
+
+                indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)
+
+                best_n_documents = []
+
+                for n in range(numberofmatches):
+                    best_n_documents.append(indexedwCD[n][0])
+
+        return best_n_documents, indexedwCD[0]
+
+
+    def searchPatternMatch(self, string , numberofmatches):
+
+        numberofmatches = numberofmatches
+
+        # Convert the user input to ones and zeros.
+        user_array = []
+        user_array.append(string)
+
+        user_input_OnesZeros = self.vectorizer.transform(user_array)
+
+        uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)
+
+        uiOZ = uOZ[np.newaxis, :]
+
+        uiOZ = uiOZ.transpose()
+
+        sess = tf.Session()
+        with tf.device('/gpu:0'):
+            with sess.as_default():
+
+                uiOZ_tensor = tf.constant(uiOZ)
+
+                dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)
+
+                #uiOZ_tensor_sparse =tf.contrib.layers.dense_to_sparse(uiOZ_tensor, eos_token=0, outputs_collections=None, scope=None )
+                #dbOZ_tensor_sparse =tf.contrib.layers.dense_to_sparse(dbOZ_tensor, eos_token=0, outputs_collections=None, scope=None )
+
+                #wordCountDoku = tf.matmul(uiOZ_tensor, dbOZ_tensor)
+                wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)
+
+                wCD = np.array(wordCountDoku.eval())
+
+                indexedwCD = []
+                for n in range(len(wCD)):
+                    indexedwCD.append([n,wCD[n][0]])
+
+                # Sort so that the biggest matches come first.
+                indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)
+
+                best_n_documents = []
+
+                best_docs_surrounding = []
+
+                # Compute the score a document would reach if it contained exactly the same words
+                # as one grammar scheme.
+                eq_number = 0
+                for number in uiOZ:
+                    #print(number)
+                    eq_number += number ** 2
+
+                print(eq_number)
+
+                # Create a new array of the closest grammar schemes; the surrounding is measured on
+                # the match number (around 3), not on the individual words.
+                n = 0
+                done = False
+                while n < len(indexedwCD) and done == False:
+                    n += 1
+                    #print('a',indexedwCD)
+                    #print('oo', indexedwCD[n])
+                    if indexedwCD[n][1] == eq_number:
+                        best_docs_surrounding.append(indexedwCD[n][0])
+
+                    #if indexedwCD[n][1] < eq_number:
+                        #best_docs_surrounding.append(indexedwCD[n][0])
+
+                    if indexedwCD[n][1] < eq_number:
+                        done = True
+
+                # For the documents in this surrounding, count the per-word matches of the word counts.
+                # This would be much faster when using the sparse class.
+                best_docs_surrounding_new = []
+                for doc in best_docs_surrounding:
+                    dok_BoW = self.dbOZ[doc].toarray()[0].astype(np.float32, copy=False)
+                    Number_equal_words = 0
+                    for n in range(len(uiOZ)):
+                        #print(uiOZ[n])
+                        #print(dok_BoW[n])
+                        #print('dok_BoW',dok_BoW)
+                        if uiOZ[n] == dok_BoW[n]:
+                            Number_equal_words += 1
+                    best_docs_surrounding_new.append([doc , Number_equal_words])
+
+                # Sort the result again, keeping the original indexes.
+                best_n_documents = sorted(best_docs_surrounding_new[::-1], key=lambda tup: tup[1], reverse=True)
+
+        #for n in range(numberofmatches):
+            #best_n_documents.append([indexedwCD[n][0], indexedwCD[n][1]])
+
+        return best_n_documents
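+
+
+# A minimal usage sketch (illustrative only): 'MyDatabase.hkl', the example tokens and the query
+# string are made-up assumptions, not files shipped with this commit; the calls and the derived
+# pkl/hkl file names simply follow the class and the naming convention of Gen_BoW_Model above.
+if __name__ == '__main__':
+
+    # Build a tiny token-list database and dump it in the hkl format expected by __init__.
+    example_database = [['hello', 'world'], ['another', 'test', 'document']]
+    hkl.dump(example_database, 'MyDatabase.hkl')
+
+    fastsearch = FASTsearch('MyDatabase.hkl')
+
+    # Writes 'bagofwordsMyDatabase.pkl' and 'DataBaseOneZerosMyDatabase.hkl' to the working directory.
+    fastsearch.Gen_BoW_Model(5000, 'word')
+    fastsearch.Load_BoW_Model('bagofwordsMyDatabase.pkl', 'DataBaseOneZerosMyDatabase.hkl')
+
+    # Returns the indexes of the best matching documents and the top [index, score] pair.
+    best_docs, top_match = fastsearch.search_with_highest_multiplikation_Output('hello world', 2)
+    print(best_docs, top_match)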
diff --git a/compose/docker-compose.yml b/compose/docker-compose.yml
new file mode 100644
index 0000000..822756d
--- /dev/null
+++ b/compose/docker-compose.yml
@@ -0,0 +1,12 @@
+version: '2.3'
+
+
+services:
+
+  prototype:
+
+    build: ../build/tf-gpu-Prototyp
+    container_name: prototype
+    restart: always
+    ports:
+      - "127.0.0.1:7000:7000"
diff --git a/oi b/oi
deleted file mode 100644
index e131791..0000000
--- a/oi
+++ /dev/null
@@ -1 +0,0 @@
-ölaksfd