alpcentaur
/
basabuuka_prototyp


								"""Count occurrences of uint64-valued keys."""

								from __future__ import division

								cimport cython

								from libc.math cimport log, exp, sqrt


								cdef class PreshCounter:
								    """Count occurrences of uint64-valued keys.

								    Counts live in a MapStruct that is created with map_init and accessed
								    through map_get / map_set; each count is stored directly in the map's
								    value slot (cast to and from count_t). `total` tracks the sum of all
								    increments passed to inc(), and `smoother` is None until smooth() is
								    called.
								    """
								    def __init__(self, initial_size=8):
								        """Create an empty counter.

								        initial_size: size handed to map_init; the assertions below require
								            it to be a non-zero power of two.
								        """
								        # A power of two has exactly one bit set, so n & (n - 1) is zero.
								        assert initial_size != 0
								        assert initial_size & (initial_size - 1) == 0
								        self.mem = Pool()
								        self.c_map = <MapStruct*>self.mem.alloc(1, sizeof(MapStruct))
								        map_init(self.mem, self.c_map, initial_size)
								        self.total = 0
								        self.smoother = None

								    property length:
								        # Exposes the `length` field of the underlying map.
								        def __get__(self):
								            return self.c_map.length

								    def __len__(self):
								        """Return the `length` field of the underlying map.

								        This is the same bound __iter__ uses when scanning cell slots, so it
								        is not necessarily the number of distinct keys that were counted.
								        """
								        return self.c_map.length

								    def __iter__(self):
								        """Yield (key, count) for every cell whose key is non-zero."""
								        cdef int slot
								        for slot in range(self.c_map.length):
								            # Cells with key 0 are skipped.
								            if self.c_map.cells[slot].key == 0:
								                continue
								            yield (self.c_map.cells[slot].key, <count_t>self.c_map.cells[slot].value)

								    def __getitem__(self, key_t key):
								        """Return the value stored for `key`, reinterpreted as a count_t."""
								        cdef void* stored = map_get(self.c_map, key)
								        return <count_t>stored

								    cpdef int inc(self, key_t key, count_t inc) except -1:
								        """Add `inc` to the count of `key` and to the running total.

								        Returns the updated count for `key`.
								        """
								        cdef count_t updated = (<count_t>map_get(self.c_map, key)) + inc
								        # The count itself is stored in the pointer-sized value slot.
								        map_set(self.mem, self.c_map, key, <void*>updated)
								        self.total += inc
								        return updated

								    def prob(self, key_t key):
								        """Return the probability estimate for `key`.

								        Without a smoother this is count / total, or 0 when map_get returns
								        NULL for the key. With a smoother, the stored count is passed through
								        the smoother and divided by the smoother's own total.
								        """
								        cdef GaleSmoother smoother
								        cdef void* stored = map_get(self.c_map, key)
								        if self.smoother is None:
								            if stored == NULL:
								                return 0
								            return <count_t>stored / self.total
								        # Typed assignment: checks that the attribute is a GaleSmoother.
								        smoother = self.smoother
								        adjusted = self.smoother(<count_t>stored)
								        return adjusted / self.smoother.total

								    def smooth(self):
								        """Build a GaleSmoother from the current counts and keep it for prob()."""
								        self.smoother = GaleSmoother(self)


								cdef class GaleSmoother:
								    """Smoothed count estimates built from a PreshCounter's count-of-counts.

								    Calling the instance with a raw count r returns an adjusted count: a
								    Turing estimate below `cutoff`, and an estimate from the fitted
								    log-linear model from `cutoff` upwards.
								    """
								    # Pool owning the Nr array.
								    cdef Pool mem
								    # Nr[i] is the number of keys whose count equals the i-th smallest
								    # distinct count observed.
								    cdef count_t* Nr
								    # Slope and intercept written by _fit_loglinear_model.
								    cdef double gradient
								    cdef double intercept
								    # Value returned by _find_when_to_switch; __call__ uses the Turing
								    # estimate for 0 < r < cutoff.
								    cdef readonly count_t cutoff
								    # Extrapolated number of items with count 0.
								    cdef count_t Nr0
								    # Sum of adjusted_count(r) * Nr over r == 0 and all observed counts.
								    cdef readonly double total

								    def __init__(self, PreshCounter counts):
								        """Fit the smoother to `counts`.

								        Asserts that at least one key was seen exactly once and at least one
								        exactly twice.
								        """
								        freq_of_freq = PreshCounter()
								        # NOTE: this local is accumulated but not read afterwards; self.total
								        # is computed separately at the end of this method.
								        cdef double total = 0
								        for _, observed in counts:
								            freq_of_freq.inc(observed, 1)
								            total += observed
								        # If we have no items seen 1 or 2 times, this doesn't work. But, this
								        # won't be true in real data...
								        assert freq_of_freq[1] != 0 and freq_of_freq[2] != 0, "Cannot smooth your weird data"
								        # Extrapolate Nr0 from Nr1 and Nr2.
								        self.Nr0 = freq_of_freq[1] + (freq_of_freq[1] - freq_of_freq[2])
								        self.mem = Pool()

								        cdef double[2] mb

								        # Count entries by iterating: len() on a PreshCounter returns the
								        # map's length field rather than the number of yielded entries.
								        cdef int n_counts = 0
								        for _ in freq_of_freq:
								            n_counts += 1
								        # sorted_r is scratch space owned by the temporary counter's pool;
								        # Nr is kept on self.
								        sorted_r = <count_t*>freq_of_freq.mem.alloc(n_counts, sizeof(count_t))
								        self.Nr = <count_t*>self.mem.alloc(n_counts, sizeof(count_t))
								        for position, (r_value, n_r) in enumerate(sorted(freq_of_freq)):
								            sorted_r[position] = r_value
								            self.Nr[position] = n_r

								        _fit_loglinear_model(mb, sorted_r, self.Nr, n_counts)

								        self.cutoff = _find_when_to_switch(sorted_r, self.Nr, mb[0], mb[1],
								                                           n_counts)
								        self.gradient = mb[0]
								        self.intercept = mb[1]
								        # Normaliser: adjusted mass of the unseen items plus every seen count.
								        self.total = self(0) * self.Nr0
								        for r_value, n_r in freq_of_freq:
								            self.total += self(r_value) * n_r

								    def __call__(self, count_t r):
								        """Return the adjusted count for raw count `r`."""
								        if r == 0:
								            # NOTE(review): Nr[1] holds the entry for the second-smallest
								            # observed count, whereas count_count() and the Turing branch
								            # below read the r == 1 entry from Nr[0] -- confirm whether
								            # Nr[0] was intended here.
								            return self.Nr[1] / self.Nr0
								        if r < self.cutoff:
								            # Nr is indexed by r - 1 here, which presumes the observed
								            # counts below the cutoff are 1, 2, 3, ... without gaps.
								            return turing_estimate_of_r(<double>r, <double>self.Nr[r-1], <double>self.Nr[r])
								        return gale_estimate_of_r(<double>r, self.gradient, self.intercept)

								    def count_count(self, count_t r):
								        """Return Nr0 for r == 0, otherwise the Nr entry at index r - 1."""
								        if r == 0:
								            return self.Nr0
								        return self.Nr[r-1]


								@cython.cdivision(True)
								cdef double turing_estimate_of_r(double r, double Nr, double Nr1) except -1:
								    # Turing estimate of the adjusted count: (r + 1) * Nr1 / Nr, where the
								    # callers pass Nr as the count-of-counts at r and Nr1 as the next entry.
								    # C division is enabled, so Nr == 0 does not raise.
								    cdef double numerator = (r + 1) * Nr1
								    return numerator / Nr


								@cython.cdivision(True)
								cdef double gale_estimate_of_r(double r, double gradient, double intercept) except -1:
								    # Adjusted count from the log-linear model
								    # log(N) = gradient * log(r) + intercept: evaluate the model at r and
								    # r + 1 and return (r + 1) * N(r + 1) / N(r).
								    cdef double log_nr = gradient * log(r) + intercept
								    cdef double log_nr1 = gradient * log(r+1) + intercept
								    return (r + 1) * (exp(log_nr1) / exp(log_nr))


								@cython.cdivision(True)
								cdef void _fit_loglinear_model(double* output, count_t* sorted_r, count_t* Nr,
								                               int length) except *:
								    # Least-squares fit of log(Zr) against log(r) over `length` points.
								    # Writes the slope to output[0] and the intercept to output[1].
								    # Zr for point i comes from _get_zr(i, sorted_r, Nr[i], length).
								    cdef Pool mem = Pool()
								    log_r = <double*>mem.alloc(length, sizeof(double))
								    log_zr = <double*>mem.alloc(length, sizeof(double))

								    cdef double mean_log_r = 0.0
								    cdef double mean_log_zr = 0.0
								    cdef double ss_xy = 0.0
								    cdef double ss_xx = 0.0
								    cdef int i

								    # First pass: take logs and accumulate the sums for the means.
								    for i in range(length):
								        r = sorted_r[i]
								        log_r[i] = log(<double>r)
								        log_zr[i] = log(<double>_get_zr(i, sorted_r, Nr[i], length))
								        mean_log_r += log_r[i]
								        mean_log_zr += log_zr[i]

								    mean_log_r /= length
								    mean_log_zr /= length

								    # Second pass: sums of cross-products and squares of the deviations
								    # from the means.
								    for i in range(length):
								        dev_x = log_r[i] - mean_log_r
								        dev_y = log_zr[i] - mean_log_zr
								        ss_xy += dev_x * dev_y
								        ss_xx  += dev_x * dev_x

								    # Slope, then the intercept that makes the line pass through the means.
								    output[0]  = ss_xy / ss_xx
								    output[1] = mean_log_zr - output[0] * mean_log_r


								@cython.cdivision(True)
								cdef double _get_zr(int j, count_t* sorted_r, count_t Nr_j, int n_counts) except -1:
								    # Averaging transform used before the log-linear fit:
								    #     Zr = Nr_j / (0.5 * (r_k - r_i))
								    # where r_i and r_k are the observed counts immediately below and above
								    # sorted_r[j].
								    #
								    # j:        index of the current count within sorted_r.
								    # sorted_r: distinct observed counts in ascending order.
								    # Nr_j:     number of keys whose count is sorted_r[j].
								    # n_counts: number of entries in sorted_r.
								    #
								    # Boundaries: the first entry uses r_i = 0. The last entry has no upper
								    # neighbour, so r_k mirrors the lower gap: r_k = 2 * r_j - r_i, which
								    # gives Zr = Nr_j / (r_j - r_i). The previous expression (2 * r_i - 1)
								    # ignored r_j and made r_k - r_i zero for counts {1, 2}.
								    cdef double r_i = sorted_r[j-1] if j >= 1 else 0
								    cdef double r_j = sorted_r[j]
								    cdef double r_k = sorted_r[j+1] if (j+1) < n_counts else (2 * r_j - r_i)
								    return 2 * Nr_j / (r_k - r_i)


								@cython.cdivision(True)
								cdef double _variance(double r, double Nr, double Nr1) nogil:
								    # Returns 1.96 * sqrt((r + 1)^2 * (Nr1 / Nr^2) * (1 + Nr1 / Nr)).
								    # _find_when_to_switch uses this as the tolerance when comparing the
								    # Turing estimate with the log-linear estimate.
								    # NOTE(review): 1.96 is presumably the two-sided 95% normal quantile --
								    # confirm.
								    cdef double ratio = Nr1 / Nr
								    cdef double spread = (r+1)**2 * (Nr1 / Nr**2) * (1.0 + ratio)
								    return 1.96 * sqrt(spread)


								@cython.cdivision(True)
								cdef count_t _find_when_to_switch(count_t* sorted_r, count_t* Nr, double m, double b,
								                                  int length) except -1:
								    # Walk the sorted counts and return the first r at which either
								    #   * the next observed count is not r + 1 (a gap in the counts), or
								    #   * the Turing estimate and the log-linear estimate (slope m,
								    #     intercept b) differ by no more than _variance(...).
								    # If neither happens before the last entry, length - 1 is returned.
								    cdef int idx
								    cdef count_t r
								    for idx in range(length-1):
								        r = sorted_r[idx]
								        if sorted_r[idx+1] != r+1:
								            return r
								        smoothed = gale_estimate_of_r(r, m, b)
								        turing = turing_estimate_of_r(<double>r, <double>Nr[idx], <double>Nr[idx+1])
								        if abs(turing - smoothed) <= _variance(<double>r, <double>Nr[idx], <double>Nr[idx+1]):
								            return r
								    # The loop has no break, so reaching this point means it ran to the end.
								    return length - 1