first commit
parent 51df51764c
commit be4279eb64

4 changed files with 399 additions and 1 deletion
build/tf-gpu-FASTsearch/Dockerfile (Normal file, 46 lines added)
@@ -0,0 +1,46 @@
FROM tensorflow/tensorflow:1.12.0-gpu

COPY Prototyp /home/Prototyp

COPY requis.txt /home/requis.txt

RUN apt-get update && apt-get install -y wget libssl-dev openssl
#RUN wget https://www.python.org/ftp/python/3.5.3/Python-3.5.3.tgz
#RUN tar -xzvf Python-3.5.3.tgz
#RUN cd Python-3.5.3 && ./configure && make && make install

RUN python --version

RUN apt-get update && apt-get install -y virtualenv python-dev python-pip build-essential

#RUN python3.5 -m venv /home/venv

#ENV PATH="home/venv/bin:$PATH"

RUN python --version

#RUN pip3 install --upgrade pip

RUN pip install -r /home/requis.txt && python -m spacy download de

RUN pip install hickle==3.4.9 Twisted joblib
#nodejs npm

#RUN python -m pip install incremental

#RUN python -m pip install cffi

#RUN python -m pip install -r /home/requis.txt

#RUN python3 -m spacy download de

#RUN pip3 install pandas bs4


RUN apt-get update && apt-get install -y nodejs

#ENTRYPOINT ["tail"]
#CMD ["-f","/dev/null"]

CMD /bin/sh -c "cd /home/Prototyp && nodejs server.js"
build/tf-gpu-FASTsearch/FASTsearch.py (Normal file, 341 lines added)
@@ -0,0 +1,341 @@
# The new class FASTsearch. Every database can be represented as lists; the "brain" itself is
# built from lists, so almost all documents are accessible at the same moment.

# TODO: GPU multithreading still has to be implemented.


# USAGE: Learn a scikit-learn count vectorizer on a database of lines or docs.

import joblib
from sklearn.feature_extraction.text import CountVectorizer

import numpy as np
import scipy as sc

import tensorflow as tf

import _pickle as cPickle

import hickle as hkl

import os


# Convert a scipy CSR matrix to a tf sparse tensor so that the matmul can run on the GPU
def convert_sparse_matrix_to_sparse_tensor(X):
    coo = sc.sparse.coo_matrix(X)
    indices = np.mat([coo.row, coo.col]).transpose()
    return tf.SparseTensorValue(indices, coo.data, coo.shape)


# The whole class is initialized with the database in [['word', 'word2'], [], [], []] list format
# (2-dimensional); the index of a list in the matrix defines its id.
## In every list element of the input, each document is represented by one string.
# This list must be saved as a hkl dump and is then loaded as the database.


def my_tokenizer(s):
    # split on the literal "\+" sequence
    return s.split('\\+')


class FASTsearch(object):

    def __init__(self, DatabaseDir):

        self.DatabaseDir = DatabaseDir[:-4]

        database = []
        hkl_load = hkl.load(DatabaseDir)

        for element in hkl_load:
            #print('element', element)
            #print('joined element', ' '.join(element))
            database.append(' '.join(element))

        # input has to be hkl format
        self.database = database

    def Gen_BoW_Model(self, max_features, analyzer, punctuation=False):

        print("Creating the bag of words...\n")
        from sklearn.feature_extraction.text import CountVectorizer

        # Initialize the "CountVectorizer" object, which is scikit-learn's
        # bag of words tool.
        if punctuation == False:
            vectorizer = CountVectorizer(analyzer=analyzer,
                                         tokenizer=None,
                                         preprocessor=None,
                                         stop_words=None,
                                         max_features=max_features)

        if punctuation == True:
            vectorizer = CountVectorizer(analyzer=analyzer,
                                         tokenizer=my_tokenizer,
                                         preprocessor=None,
                                         stop_words=None,
                                         max_features=max_features)

        # token_pattern = r'(?u)\w')
        # fit_transform() does two things: first, it fits the model
        # and learns the vocabulary; second, it transforms our training data
        # into feature vectors. The input to fit_transform should be a list of
        # strings.
        train_data_features = vectorizer.fit_transform(self.database)

        joblib.dump(vectorizer, 'bagofwords' + self.DatabaseDir + '.pkl')

        print('dumping the data to hkl format..')
        hkl.dump(train_data_features, 'DataBaseOneZeros' + self.DatabaseDir + '.hkl', mode='w', compression='gzip')
        print('done')

        return vectorizer

    def Load_BoW_Model(self, BoWModelDir, DatabaseOneZerosDir):

        # input has to be pkl format
        self.vectorizer = joblib.load(BoWModelDir)

        self.dbOZ = hkl.load(DatabaseOneZerosDir).astype('float32')

        return self.vectorizer

    # input: the string to search for in the documents and numberofmatches to get the best n documents
    # output: the index of the best matching document in the searched database plus the entry with the
    # highest accordance as [index, score]

    def search(self, string, numberofmatches):

        numberofmatches = numberofmatches

        # Convert user input to zeros and ones
        user_array = []
        user_array.append(string)

        user_input_OnesZeros = self.vectorizer.transform(user_array)

        uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)

        uiOZ = uOZ[np.newaxis, :]
        uiOZ = uiOZ.transpose()

        sess = tf.Session()
        with tf.device('/gpu:0'):
            with sess.as_default():

                uiOZ_tensor = tf.constant(uiOZ)

                dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)

                #uiOZ_tensor_sparse = tf.contrib.layers.dense_to_sparse(uiOZ_tensor, eos_token=0, outputs_collections=None, scope=None)
                #dbOZ_tensor_sparse = tf.contrib.layers.dense_to_sparse(dbOZ_tensor, eos_token=0, outputs_collections=None, scope=None)

                #wordCountDoku = tf.matmul(uiOZ_tensor, dbOZ_tensor)
                wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)

                wCD = np.array(wordCountDoku.eval())

        indexedwCD = []
        for n in range(len(wCD)):
            indexedwCD.append([n, wCD[n][0]])

        indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)

        best_n_documents = []

        eq_number = 0
        for number in uiOZ:
            #print(number)
            eq_number += number ** 2

        #print(eq_number)

        n = 0
        done = False
        # stop one element early so that indexedwCD[n] stays in range after the increment
        while n < len(indexedwCD) - 1 and done == False:
            n += 1
            if indexedwCD[n][1] == eq_number:
                best_n_documents = indexedwCD[n][0]
                done = True

            if indexedwCD[n][1] < eq_number:
                best_n_documents = indexedwCD[n - 1][0]
                done = True

        #for n in range(numberofmatches):
            #best_n_documents.append([indexedwCD[n][0], indexedwCD[n][1]])

        return best_n_documents, indexedwCD[0]

    def search_with_highest_multiplikation_Output(self, string, numberofmatches):

        numberofmatches = numberofmatches

        # Convert user input to zeros and ones
        user_array = []
        user_array.append(string)

        user_input_OnesZeros = self.vectorizer.transform(user_array)

        uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)

        uiOZ = uOZ[np.newaxis, :]
        uiOZ = uiOZ.transpose()

        sess = tf.Session()
        with tf.device('/gpu:0'):
            with sess.as_default():

                uiOZ_tensor = tf.constant(uiOZ)

                dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)

                #uiOZ_tensor_sparse = tf.contrib.layers.dense_to_sparse(uiOZ_tensor, eos_token=0, outputs_collections=None, scope=None)
                #dbOZ_tensor_sparse = tf.contrib.layers.dense_to_sparse(dbOZ_tensor, eos_token=0, outputs_collections=None, scope=None)

                #wordCountDoku = tf.matmul(uiOZ_tensor, dbOZ_tensor)
                wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)

                wCD = np.array(wordCountDoku.eval())

        indexedwCD = []
        for n in range(len(wCD)):
            indexedwCD.append([n, wCD[n][0]])

        indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)

        best_n_documents = []

        for n in range(numberofmatches):
            best_n_documents.append(indexedwCD[n][0])

        return best_n_documents, indexedwCD[0]

    def searchPatternMatch(self, string, numberofmatches):

        numberofmatches = numberofmatches

        # Convert user input to zeros and ones
        user_array = []
        user_array.append(string)

        user_input_OnesZeros = self.vectorizer.transform(user_array)

        uOZ = user_input_OnesZeros.toarray()[0].astype(np.float32, copy=False)

        uiOZ = uOZ[np.newaxis, :]
        uiOZ = uiOZ.transpose()

        sess = tf.Session()
        with tf.device('/gpu:0'):
            with sess.as_default():

                uiOZ_tensor = tf.constant(uiOZ)

                dbOZ_tensor_sparse = convert_sparse_matrix_to_sparse_tensor(self.dbOZ)

                #uiOZ_tensor_sparse = tf.contrib.layers.dense_to_sparse(uiOZ_tensor, eos_token=0, outputs_collections=None, scope=None)
                #dbOZ_tensor_sparse = tf.contrib.layers.dense_to_sparse(dbOZ_tensor, eos_token=0, outputs_collections=None, scope=None)

                #wordCountDoku = tf.matmul(uiOZ_tensor, dbOZ_tensor)
                wordCountDoku = tf.sparse_tensor_dense_matmul(dbOZ_tensor_sparse, uiOZ_tensor)

                wCD = np.array(wordCountDoku.eval())

        indexedwCD = []
        for n in range(len(wCD)):
            indexedwCD.append([n, wCD[n][0]])

        # Sort by the biggest matches
        indexedwCD = sorted(indexedwCD[::-1], key=lambda tup: tup[1], reverse=True)

        best_n_documents = []

        best_docs_surrounding = []

        # Get the score a document would reach if it contained exactly the same words as one grammar scheme
        eq_number = 0
        for number in uiOZ:
            #print(number)
            eq_number += number ** 2

        print(eq_number)

        # Create a new array of the closest grammar schemes; the surrounding is chosen by the match
        # number (around 3), not by the individual words
        n = 0
        done = False
        # stop one element early so that indexedwCD[n] stays in range after the increment
        while n < len(indexedwCD) - 1 and done == False:
            n += 1
            #print('a', indexedwCD)
            #print('oo', indexedwCD[n])
            if indexedwCD[n][1] == eq_number:
                best_docs_surrounding.append(indexedwCD[n][0])

            #if indexedwCD[n][1] < eq_number:
                #best_docs_surrounding.append(indexedwCD[n][0])

            if indexedwCD[n][1] < eq_number:
                done = True

        # For the docs in this surrounding, count per word how many word counts match
        # (would be much faster when using the sparse class)
        best_docs_surrounding_new = []
        for doc in best_docs_surrounding:
            dok_BoW = self.dbOZ[doc].toarray()[0].astype(np.float32, copy=False)
            Number_equal_words = 0
            for n in range(len(uiOZ)):
                #print(uiOZ[n])
                #print(dok_BoW[n])
                #print('dok_BoW', dok_BoW)
                if uiOZ[n] == dok_BoW[n]:
                    Number_equal_words += 1
            best_docs_surrounding_new.append([doc, Number_equal_words])

        # Sort the result again, keeping the original indexes
        best_n_documents = sorted(best_docs_surrounding_new[::-1], key=lambda tup: tup[1], reverse=True)

        #for n in range(numberofmatches):
            #best_n_documents.append([indexedwCD[n][0], indexedwCD[n][1]])

        return best_n_documents
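The comments at the top of FASTsearch.py describe the intended workflow: dump a 2-dimensional token-list database with hickle, learn a bag-of-words model on it once with Gen_BoW_Model, then reload the dumped artifacts with Load_BoW_Model and query with search. Below is a minimal usage sketch of that workflow; the file name db.hkl, the two example documents, and the max_features value are illustrative assumptions and not part of this commit, only the module, class, and method names come from the code above.

import hickle as hkl
from FASTsearch import FASTsearch

# Hypothetical database: one token list per document; the list index is the document id.
docs = [['das', 'ist', 'ein', 'Satz'], ['noch', 'ein', 'Satz']]
hkl.dump(docs, 'db.hkl', mode='w')

fast = FASTsearch('db.hkl')

# One-time step: learn the bag-of-words model and dump it next to the database.
fast.Gen_BoW_Model(5000, 'word')

# Later runs reuse the dumped artifacts; the 'db' infix is DatabaseDir without its '.hkl' suffix.
fast.Load_BoW_Model('bagofwordsdb.pkl', 'DataBaseOneZerosdb.hkl')

# Returns the index of the best matching document and the top [index, score] pair.
best_index, best_match = fast.search('ein Satz', 1)
print(best_index, best_match)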
compose/docker-compose.yml (Normal file, 12 lines added)
@@ -0,0 +1,12 @@
version: '2.3'

services:

  prototype:

    build: ../build/tf-gpu-Prototyp
    container_name: prototype
    restart: always
    ports:
      - "127.0.0.1:7000:7000"
oi (1 line removed)
@@ -1 +0,0 @@
ölaksfd