laywerrobot/lib/python3.6/site-packages/gensim/summarization/bm25.py

189 lines
5.4 KiB
Python
Raw Normal View History

2020-08-27 21:55:39 +02:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""This module contains function of computing rank scores for documents in
corpus and the helper class `BM25` used in the calculations. The original algorithm
is described in [1]_; see also the Wikipedia page [2]_.
.. [1] Robertson, Stephen; Zaragoza, Hugo (2009). The Probabilistic Relevance Framework: BM25 and Beyond,
http://www.staff.city.ac.uk/~sb317/papers/foundations_bm25_review.pdf
.. [2] Okapi BM25 on Wikipedia, https://en.wikipedia.org/wiki/Okapi_BM25
Examples
--------
>>> from gensim.summarization.bm25 import get_bm25_weights
>>> corpus = [
... ["black", "cat", "white", "cat"],
... ["cat", "outer", "space"],
... ["wag", "dog"]
... ]
>>> result = get_bm25_weights(corpus)
Data:
-----
.. data:: PARAM_K1 - Free smoothing parameter for BM25.
.. data:: PARAM_B - Free smoothing parameter for BM25.
.. data:: EPSILON - Constant used for negative idf of document in corpus.
"""
import math
from six import iteritems
from six.moves import xrange
# Free BM25 parameter k1: `BM25.get_score` uses it as `(PARAM_K1 + 1)` in the
# numerator and as the multiplier of the length-normalization term in the denominator.
PARAM_K1 = 1.5
# Free BM25 parameter b: weight given to the `doc_len / avgdl` ratio inside the
# length-normalization term of `BM25.get_score` (the remainder, `1 - PARAM_B`, is constant).
PARAM_B = 0.75
# Factor multiplied by the corpus average idf to obtain the replacement value
# that `BM25.get_score` uses whenever a word's idf is negative.
EPSILON = 0.25
class BM25(object):
    """Implementation of the Best Matching 25 (BM25) ranking function.

    Attributes
    ----------
    corpus_size : int
        Size of corpus (number of documents).
    avgdl : float
        Average length of document in `corpus` (0.0 when the corpus is empty).
    corpus : list of list of str
        Corpus of documents.
    f : list of dicts of int
        Term frequencies for each document in `corpus`. Words used as keys and
        number of occurrences in that document as values.
    df : dict
        Document frequencies for the whole `corpus`. Words used as keys and the
        number of documents containing the word as values.
    idf : dict
        Inverse document frequencies for the whole `corpus`. Words used as keys
        and idf scores as values.
    doc_len : list of int
        List of document lengths.

    """

    def __init__(self, corpus):
        """
        Parameters
        ----------
        corpus : list of list of str
            Given corpus.

        """
        self.corpus_size = len(corpus)
        # An empty corpus has no meaningful average length; use 0.0 rather than
        # failing with ZeroDivisionError so an empty model can still be built.
        if self.corpus_size:
            self.avgdl = sum(float(len(x)) for x in corpus) / self.corpus_size
        else:
            self.avgdl = 0.0
        self.corpus = corpus
        self.f = []
        self.df = {}
        self.idf = {}
        self.doc_len = []
        self.initialize()

    def initialize(self):
        """Calculates frequencies of terms in documents and in corpus. Also computes inverse document frequencies.

        The statistics are rebuilt from scratch on every call, so calling this
        method again (e.g. after `corpus` was modified in place) never double counts.

        """
        term_freqs = []
        doc_freqs = {}
        doc_lens = []

        for document in self.corpus:
            doc_lens.append(len(document))

            frequencies = {}
            for word in document:
                frequencies[word] = frequencies.get(word, 0) + 1
            term_freqs.append(frequencies)

            # Each distinct word of the document counts once towards its document frequency.
            for word in frequencies:
                doc_freqs[word] = doc_freqs.get(word, 0) + 1

        corpus_size = self.corpus_size
        self.f = term_freqs
        self.df = doc_freqs
        self.doc_len = doc_lens
        # idf = log((N - df + 0.5) / (df + 0.5)); negative when a word occurs in
        # more than half of the documents (handled in `get_score`).
        self.idf = {
            word: math.log(corpus_size - freq + 0.5) - math.log(freq + 0.5)
            for word, freq in doc_freqs.items()
        }

    def get_score(self, document, index, average_idf):
        """Computes BM25 score of given `document` in relation to item of corpus selected by `index`.

        Parameters
        ----------
        document : list of str
            Document to be scored.
        index : int
            Index of document in corpus selected to score with `document`.
        average_idf : float
            Average idf in corpus.

        Returns
        -------
        float
            BM25 score (the integer 0 when no word of `document` occurs in the selected item).

        """
        doc_freqs = self.f[index]
        if not doc_freqs:
            # Nothing can match an empty corpus item; returning early also keeps
            # the length normalization below from dividing by a zero `avgdl`.
            return 0

        # Depends only on the selected corpus item, so compute it once per call.
        length_norm = PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avgdl)

        score = 0
        for word in document:
            if word not in doc_freqs:
                continue
            term_freq = doc_freqs[word]
            idf = self.idf[word]
            if idf < 0:
                # Very common words get a small value derived from the average idf
                # instead of their negative idf.
                idf = EPSILON * average_idf
            score += idf * term_freq * (PARAM_K1 + 1) / (term_freq + length_norm)
        return score

    def get_scores(self, document, average_idf):
        """Computes and returns BM25 scores of given `document` in relation to
        every item in corpus.

        Parameters
        ----------
        document : list of str
            Document to be scored.
        average_idf : float
            Average idf in corpus.

        Returns
        -------
        list of float
            BM25 scores, one per corpus item, in corpus order.

        """
        return [self.get_score(document, index, average_idf) for index in range(self.corpus_size)]
def get_bm25_weights(corpus):
    """Returns BM25 scores (weights) of documents in corpus.
    Each document has to be weighted with every document in given corpus.

    Parameters
    ----------
    corpus : list of list of str
        Corpus of documents.

    Returns
    -------
    list of list of float
        BM25 scores: item ``[i][j]`` is the score of document ``i`` against
        corpus item ``j``. An empty corpus yields an empty list.

    Examples
    --------
    >>> from gensim.summarization.bm25 import get_bm25_weights
    >>> corpus = [
    ...     ["black", "cat", "white", "cat"],
    ...     ["cat", "outer", "space"],
    ...     ["wag", "dog"]
    ... ]
    >>> result = get_bm25_weights(corpus)

    """
    # No documents means no weights; bail out before any averaging divides by zero.
    if len(corpus) == 0:
        return []

    bm25 = BM25(corpus)

    # A corpus made only of empty documents has no idf entries; fall back to 0.0
    # instead of dividing by zero (no word can match, so the value is never used).
    if bm25.idf:
        average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)
    else:
        average_idf = 0.0

    return [bm25.get_scores(doc, average_idf) for doc in corpus]