175 lines
4.6 KiB
Python
175 lines
4.6 KiB
Python
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
#
|
|
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""This module calculates PageRank [1]_ based on a word graph.
|
|
|
|
|
|
.. [1] https://en.wikipedia.org/wiki/PageRank
|
|
|
|
Examples
|
|
--------
|
|
|
|
Calculate PageRank for words

>>> from gensim.summarization.keywords import get_graph
>>> from gensim.summarization.pagerank_weighted import pagerank_weighted
>>> graph = get_graph("The road to hell is paved with good intentions.")
>>> # the result will look like {'good': 0.70432858653171504, 'hell': 0.051128871128006126, ...}
>>> result = pagerank_weighted(graph)
|
|
|
|
Build matrix from graph
|
|
|
|
>>> from gensim.summarization.pagerank_weighted import build_adjacency_matrix
|
|
>>> build_adjacency_matrix(graph).todense()
|
|
matrix([[ 0., 0., 0., 0., 0.],
|
|
[ 0., 0., 1., 0., 0.],
|
|
[ 0., 1., 0., 0., 0.],
|
|
[ 0., 0., 0., 0., 0.],
|
|
[ 0., 0., 0., 0., 0.]])
|
|
|
|
"""
|
|
|
|
|
|
import numpy
|
|
from numpy import empty as empty_matrix
|
|
from scipy.linalg import eig
|
|
from scipy.sparse import csr_matrix
|
|
from scipy.sparse.linalg import eigs
|
|
from six.moves import xrange
|
|
|
|
|
|
def pagerank_weighted(graph, damping=0.85):
    """Compute the PageRank score of every node in `graph`.

    Parameters
    ----------
    graph : :class:`~gensim.summarization.graph.Graph`
        Graph whose nodes are ranked.
    damping : float, optional
        Weight applied to the normalized adjacency matrix; the remaining
        ``1 - damping`` is applied to the uniform probability matrix.

    Returns
    -------
    dict
        Nodes of `graph` as keys, their ranks as values.

    """
    transition_weights = build_adjacency_matrix(graph)
    uniform_weights = build_probability_matrix(graph)

    # Blend the link structure with the uniform matrix.
    dense_transitions = transition_weights.todense()
    pagerank_matrix = damping * dense_transitions + (1 - damping) * uniform_weights

    # The ranks are read off the principal eigenvector of the transposed matrix.
    principal = principal_eigenvector(pagerank_matrix.T)

    # pagerank_matrix is positive, so the eigenvector is real-valued; keep only
    # the real part of whatever dtype the solver returned.
    return process_results(graph, principal.real)
|
|
|
|
|
|
def build_adjacency_matrix(graph):
    """Get the row-normalized matrix representation of given `graph`.

    Entry ``(i, j)`` holds the weight of the edge from node ``i`` to node ``j``
    divided by the sum of the weights of all edges from node ``i`` to its
    neighbors. Self-loops produce no entry, although their weight still
    counts towards that sum.

    Only the edges reported by ``graph.neighbors`` are queried, so the number
    of ``graph.edge_weight`` calls is proportional to the number of edges
    rather than to the square of the number of nodes.

    Parameters
    ----------
    graph : :class:`~gensim.summarization.graph.Graph`
        Given graph.

    Returns
    -------
    :class:`scipy.sparse.csr_matrix`, shape = [n, n]
        Adjacency matrix of given `graph`, n is number of nodes.

    Notes
    -----
    Assumes that every edge with a non-zero weight leaving a node ends at one
    of ``graph.neighbors(node)``, and that each neighbor is listed once and is
    itself a member of ``graph.nodes()`` (a neighbor missing from the node
    list raises ``KeyError``).

    """
    nodes = graph.nodes()
    length = len(nodes)
    # Column lookup for neighbors; nodes are hashable (they are used as
    # dictionary keys for the final scores as well).
    node_index = {node: index for index, node in enumerate(nodes)}

    row = []
    col = []
    data = []

    for i, current_node in enumerate(nodes):
        # Query each outgoing edge exactly once and reuse the weights for
        # both the normalizing sum and the matrix entries.
        weighted_neighbors = [
            (neighbor, graph.edge_weight((current_node, neighbor)))
            for neighbor in graph.neighbors(current_node)
        ]
        neighbors_sum = sum(weight for _, weight in weighted_neighbors)

        for neighbor, weight in weighted_neighbors:
            j = node_index[neighbor]
            edge_weight = float(weight)
            # Skip the diagonal and explicit zero weights so the sparse
            # matrix only stores real transitions.
            if i != j and edge_weight != 0.0:
                row.append(i)
                col.append(j)
                data.append(edge_weight / neighbors_sum)

    return csr_matrix((data, (row, col)), shape=(length, length))
|
|
|
|
|
|
def build_probability_matrix(graph):
    """Get the uniform probability matrix for the nodes of given `graph`.

    Parameters
    ----------
    graph : :class:`~gensim.summarization.graph.Graph`
        Given graph.

    Returns
    -------
    numpy.ndarray, shape = [n, n]
        Matrix whose every element equals ``1 / n``, n is number of nodes
        of `graph`.

    """
    node_count = len(graph.nodes())

    # Allocate uninitialized storage first, then overwrite every cell with
    # the same value.
    uniform = empty_matrix((node_count, node_count))
    uniform.fill(1.0 / float(node_count))
    return uniform
|
|
|
|
|
|
def principal_eigenvector(a):
    """Get the eigenvector of square matrix `a` for its largest-magnitude eigenvalue.

    Parameters
    ----------
    a : numpy.ndarray, shape = [n, n]
        Given matrix.

    Returns
    -------
    numpy.ndarray, shape = [n, ]
        Eigenvector of matrix `a`.

    """
    # `eigs` is preferred even for dense input because only a single
    # eigenvector is needed. See #441, #438 for discussion.
    if len(a) >= 3:
        _, vectors = eigs(a, k=1)
        return vectors[:, 0]

    # `eigs` doesn't work for dim A < 3: run the full decomposition and pick
    # the column whose eigenvalue has the greatest absolute value.
    values, vectors = eig(a)
    dominant = numpy.abs(values).argmax()
    return vectors[:, dominant]
|
|
|
|
|
|
def process_results(graph, vec):
    """Pair every node of `graph` with the absolute value of its eigenvector entry.

    This function is helper for :func:`~gensim.summarization.pagerank_weighted.pagerank_weighted`

    Parameters
    ----------
    graph : :class:`~gensim.summarization.graph.Graph`
        Given graph.
    vec : numpy.ndarray, shape = [n, ]
        Given eigenvector, n is number of nodes of `graph`.

    Returns
    -------
    dict
        Graph nodes as keys, absolute values of the corresponding elements of
        the eigenvector as values.

    """
    # The i-th node returned by graph.nodes() is matched with vec[i].
    return {node: abs(vec[position]) for position, node in enumerate(graph.nodes())}
|