import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords  # import the stop word list
import joblib  # note: sklearn.externals.joblib was removed in scikit-learn 0.23


def comment_to_words(raw_comment):
    """Convert a raw comment to a single preprocessed string of words.

    The input is a single string (a raw comment) and the output is a
    single string (a preprocessed comment).
    """
    # 1. Remove HTML
    comment_text = BeautifulSoup(raw_comment, "html.parser").get_text()

    # 2. Remove non-letters (currently disabled)
    # letters_only = re.sub("[^a-zA-Z]", " ", comment_text)

    # 3. Split into individual words
    words = comment_text.split()
    # words = letters_only.split()

    # 4. In Python, searching a set is much faster than searching a list,
    #    so convert the stop words to a set
    stops = set(stopwords.words("german"))

    # 5. Remove stop words (currently disabled: every word is kept)
    # meaningful_words = [w for w in words if w not in stops]
    meaningful_words = [w for w in words]

    # 6. Join the words back into one string separated by spaces and
    #    return the result
    return " ".join(meaningful_words)
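
# A minimal doctest-style sketch of what comment_to_words does as configured
# above (an assumption of this example: the NLTK German stop-word corpus has
# been downloaded, e.g. via nltk.download('stopwords')). With steps 2 and 5
# disabled, the function only strips HTML and normalizes whitespace:
#
#     >>> comment_to_words("<p>Das   ist ein Test</p>")
#     'Das ist ein Test'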

class SentGlueMach(object):

    def __init__(self, SGDModel, bagofwords):
        self.sgdmodel = SGDModel
        self.bow = bagofwords

    def initialize(self):
        # Load the serialized bag-of-words vectorizer and the SGD model,
        # then load the German spaCy model for tagging and parsing
        self.vectorizer = joblib.load(self.bow)
        self.clf = joblib.load(self.sgdmodel)
        import spacy
        self.nlp = spacy.load('de_core_news_sm')

    def predictandevalOnCsv(self, csvdata):
        """Predict labels for a CSV evaluation set and score the predictions.

        Returns a 2 x n matrix (row 0: predictions, row 1: gold labels) and
        the accuracy in percent.
        """
        self.comment_in = pd.read_csv(csvdata, header=0, quotechar='"',
                                      delimiter=',')
        self.num_comments_in = self.comment_in["Kommentar"].size

        # Clean and parse the evaluation set comments
        clean_comments_in = []
        for i in range(0, self.num_comments_in):
            clean_comments_in.append(
                comment_to_words(self.comment_in["Kommentar"][i]))

        comments_in_vector = self.vectorizer.transform(clean_comments_in)
        # NumPy arrays are easy to work with, so convert the result to an
        # array
        self.comments_in_vector = comments_in_vector.toarray()

        X_val = self.clf.predict(self.comments_in_vector)

        # Map the gold labels from strings to 1 (' correct') or 0 (anything
        # else)
        Y_val = []
        for n in range(self.num_comments_in):
            Y_val.append(self.comment_in["correct or not"][n])
        for r in range(self.num_comments_in):
            if Y_val[r] == ' correct':
                Y_val[r] = 1
            else:
                Y_val[r] = 0

        XY_val = np.zeros((2, self.num_comments_in))
        for n in range(self.num_comments_in):
            XY_val[0][n] = X_val[n]
            XY_val[1][n] = Y_val[n]

        # Count the matches between predictions and gold labels and compute
        # the percentage guessed correctly
        count = 0
        for n in range(self.num_comments_in):
            if XY_val[0][n] == XY_val[1][n]:
                count = count + 1
        Proz = (count * 100) / self.num_comments_in
        return XY_val, Proz

    def predictprobsOnSentenceList(self, beforeprocessingSentenceList,
                                   orisentence):
        """Return the per-class probabilities for candidate sentences.

        Input: a list such as ['this is sentence a', 'and then comes the
        next sentence', ...] plus either one original sentence as a string
        or a list with one original sentence per candidate.
        """
        oridocs = 0
        if type(orisentence) == str:
            oridoc = self.nlp(orisentence)
        else:
            oridocs = 1
        SentenceList = []
        count = 0
        for sentence in beforeprocessingSentenceList:
            count += 1
            doc = self.nlp(sentence)
            if oridocs == 1:
                oridoc = self.nlp(orisentence[count - 1])
            # For every word of the candidate, collect its dependency label
            # in the original sentence and its POS tag in the candidate
            depssentence = []
            tagssentence = []
            for word in doc:
                for word2 in oridoc:
                    if word.text == word2.text:
                        depssentence.append(word2.dep_)
                        break
                tagssentence.append(word.tag_)
            deps = ' '
            tags = ' '
            for x in depssentence:
                deps += str(x) + ' '
            for x in tagssentence:
                tags += str(x) + ' '
            # Append the dependency labels and POS tags to the sentence as
            # extra features
            processedsentence = '"' + sentence + deps + tags + '"'
            SentenceList.append(processedsentence)

        # Clean and vectorize the feature-augmented sentences
        clean_comments_in = []
        numSentenceList = len(SentenceList)
        for i in range(0, numSentenceList):
            clean_comments_in.append(comment_to_words(SentenceList[i]))
        comments_in_vector = self.vectorizer.transform(clean_comments_in)
        self.comments_in_vector = comments_in_vector.toarray()

        # One row per sentence, one column per class
        Prob_perclass = self.clf.predict_proba(self.comments_in_vector)
        return Prob_perclass

    def GeneratePermutationsOfSentence(self, sentence):
        # Expects a sentence as a list of tokens; returns every ordering of
        # the tokens, each as a list
        import itertools
        permutations = list(itertools.permutations(sentence))
        output = []
        for perm in permutations:
            output.append(list(perm))
        return output

    def GetBestSentenceFromSentencesAccordingToGrammar(self, sentences,
                                                       orisentence):
        probsMatrix = self.predictprobsOnSentenceList(sentences, orisentence)
        # Overwrite the class-0 column with the row index so the winning
        # sentence can be recovered after sorting by the class-1 probability
        for i in range(len(probsMatrix)):
            probsMatrix[i][0] = i
        sortedprobsMatrix = sorted(probsMatrix, key=lambda row: row[1],
                                   reverse=True)
        bestindex = sortedprobsMatrix[0][0]
        bestsentence = sentences[int(bestindex)]
        return bestsentence
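
# A minimal end-to-end usage sketch. The pickle file names below are
# hypothetical placeholders, not files shipped with this module; a real call
# site would point them at the serialized SGD classifier and CountVectorizer
# that SentGlueMach expects.
if __name__ == "__main__":
    # hypothetical paths to the pickled model and bag-of-words vectorizer
    machine = SentGlueMach("sgd_model.pkl", "bag_of_words.pkl")
    machine.initialize()

    # generate every ordering of the tokens and join them into sentences
    tokens = ["heute", "scheint", "die", "Sonne"]
    candidates = [" ".join(perm)
                  for perm in machine.GeneratePermutationsOfSentence(tokens)]

    # pick the permutation the classifier scores as most grammatical, using
    # the original sentence for the dependency features
    best = machine.GetBestSentenceFromSentencesAccordingToGrammar(
        candidates, "die Sonne scheint heute")
    print(best)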