#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Jayant Jain <jayant@rare-technologies.com>
# Copyright (C) 2016 RaRe Technologies

"""This script extracts plain text out of a raw Wikipedia dump. The input is an xml.bz2 file provided
by MediaWiki, named like <LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2 or <LANG>wiki-latest-pages-articles.xml.bz2
(e.g. 14 GB of https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2).

It streams through all the XML articles using multiple cores (#cores - 1, by default),
decompressing on the fly and extracting plain text from the articles and their sections.

For each extracted article, it prints its title, section names and plain text section contents, in json-line format.

How to use
----------
#. Process a Wikipedia dump with this script::

    python -m gensim.scripts.segment_wiki -i -f enwiki-latest-pages-articles.xml.bz2 -o enwiki-latest.json.gz

#. Read the output in a simple way:

    >>> from smart_open import smart_open
    >>> import json
    >>>
    >>> # iterate over the plain text data we just created
    >>> for line in smart_open('enwiki-latest.json.gz'):
    >>>     # decode each JSON line into a Python dictionary object
    >>>     article = json.loads(line)
    >>>
    >>>     # each article has a "title", a mapping of interlinks and a list of "section_titles" and "section_texts".
    >>>     print("Article title: %s" % article['title'])
    >>>     print("Interlinks: %s" % article['interlinks'])
    >>>     for section_title, section_text in zip(article['section_titles'], article['section_texts']):
    >>>         print("Section title: %s" % section_title)
    >>>         print("Section text: %s" % section_text)


Notes
-----
Processing the entire English Wikipedia dump takes 1.7 hours (about 3 million articles per hour,
or 10 MB of XML per second) on an 8 core Intel i7-7700 @3.60GHz.


Command line arguments
----------------------

.. program-output:: python -m gensim.scripts.segment_wiki --help
   :ellipsis: 0, -10

"""

import argparse
import json
import logging
import multiprocessing
import re
import sys
from xml.etree import cElementTree
from functools import partial

from gensim.corpora.wikicorpus import IGNORED_NAMESPACES, WikiCorpus, filter_wiki, find_interlinks, get_namespace, utils
from smart_open import smart_open

logger = logging.getLogger(__name__)


def segment_all_articles(file_path, min_article_character=200, workers=None, include_interlinks=False):
    """Extract article titles and sections from a MediaWiki bz2 database dump.

    Parameters
    ----------
    file_path : str
        Path to the MediaWiki dump, typically named <LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2
        or <LANG>wiki-latest-pages-articles.xml.bz2.
    min_article_character : int, optional
        Minimal number of characters in an article (excluding titles and leading gaps).
    workers : int or None
        Number of parallel workers, max(1, multiprocessing.cpu_count() - 1) if None.
    include_interlinks : bool
        Whether interlinks should be included in the output.

    Yields
    ------
    (str, list of (str, str), (Optionally) dict of (str: str))
        Structure contains (title, [(section_heading, section_content), ...], (Optionally) {interlinks}).

    """
    with smart_open(file_path, 'rb') as xml_fileobj:
        wiki_sections_corpus = _WikiSectionsCorpus(
            xml_fileobj, min_article_character=min_article_character, processes=workers,
            include_interlinks=include_interlinks)
        wiki_sections_corpus.metadata = True
        wiki_sections_text = wiki_sections_corpus.get_texts_with_sections()

        for article in wiki_sections_text:
            yield article


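# A minimal usage sketch for segment_all_articles (illustrative, not executed here; the dump
# path is a placeholder). Each yielded item is (title, sections) or, with include_interlinks=True,
# (title, sections, interlinks), matching the Yields section above:
#
#     for title, sections in segment_all_articles('enwiki-latest-pages-articles.xml.bz2', workers=2):
#         for section_heading, section_content in sections:
#             print(title, section_heading, len(section_content))

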
def segment_and_write_all_articles(file_path, output_file, min_article_character=200, workers=None,
                                   include_interlinks=False):
    """Write article titles and sections to `output_file` (or stdout, if `output_file` is None).

    The output format is one article per line, in json-line format with 4 fields::

        'title' - title of the article,
        'section_titles' - list of titles of sections,
        'section_texts' - list of content from sections,
        (Optional) 'interlinks' - mapping of interlinks in the article.

    Parameters
    ----------
    file_path : str
        Path to the MediaWiki dump, typically named <LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2
        or <LANG>wiki-latest-pages-articles.xml.bz2.
    output_file : str or None
        Path to the output file in json-lines format, or None for printing to stdout.
    min_article_character : int, optional
        Minimal number of characters in an article (excluding titles and leading gaps).
    workers : int or None
        Number of parallel workers, max(1, multiprocessing.cpu_count() - 1) if None.
    include_interlinks : bool
        Whether interlinks should be included in the output.

    """
    if output_file is None:
        outfile = getattr(sys.stdout, 'buffer', sys.stdout)  # we want to write bytes, so on py3 we use 'buffer'
    else:
        outfile = smart_open(output_file, 'wb')

    try:
        article_stream = segment_all_articles(file_path, min_article_character, workers=workers,
                                              include_interlinks=include_interlinks)
        for idx, article in enumerate(article_stream):
            article_title, article_sections = article[0], article[1]
            if include_interlinks:
                interlinks = article[2]

            output_data = {
                "title": article_title,
                "section_titles": [],
                "section_texts": [],
            }
            if include_interlinks:
                output_data["interlinks"] = interlinks

            for section_heading, section_content in article_sections:
                output_data["section_titles"].append(section_heading)
                output_data["section_texts"].append(section_content)

            if (idx + 1) % 100000 == 0:
                logger.info("processed #%d articles (at %r now)", idx + 1, article_title)
            outfile.write((json.dumps(output_data) + "\n").encode('utf-8'))

    finally:
        if output_file is not None:
            outfile.close()


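# A programmatic-call sketch, equivalent to the command-line example in the module docstring
# (file names are illustrative placeholders):
#
#     segment_and_write_all_articles(
#         'enwiki-latest-pages-articles.xml.bz2', 'enwiki-latest.json.gz',
#         min_article_character=200, workers=None, include_interlinks=True,
#     )

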
def extract_page_xmls(f):
    """Extract pages from a MediaWiki database dump.

    Parameters
    ----------
    f : file
        File descriptor of MediaWiki dump.

    Yields
    ------
    str
        XML strings for page tags.

    """
    elems = (elem for _, elem in cElementTree.iterparse(f, events=("end",)))

    elem = next(elems)
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    page_tag = "{%(ns)s}page" % ns_mapping

    for elem in elems:
        if elem.tag == page_tag:
            yield cElementTree.tostring(elem)
            # Prune the element tree, as per
            # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
            # except that we don't need to prune backlinks from the parent
            # because we don't use LXML.
            # We do this only for <page>s, since we need to inspect the
            # ./revision/text element. The pages comprise the bulk of the
            # file, so in practice we prune away enough.
            elem.clear()


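# A small sketch of feeding extract_page_xmls from a compressed dump (the path is illustrative);
# each yielded item is the raw XML of one <page> element, which segment() below can parse:
#
#     with smart_open('enwiki-latest-pages-articles.xml.bz2', 'rb') as xml_fileobj:
#         for page_xml in extract_page_xmls(xml_fileobj):
#             ...  # e.g. pass page_xml to segment()

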
def segment(page_xml, include_interlinks=False):
    """Parse the content inside a page tag.

    Parameters
    ----------
    page_xml : str
        Content from page tag.
    include_interlinks : bool
        Whether or not interlinks should be parsed.

    Returns
    -------
    (str, list of (str, str), (Optionally) dict of (str: str))
        Structure contains (title, [(section_heading, section_content), ...], (Optionally) {interlinks}).

    """
    elem = cElementTree.fromstring(page_xml)
    filter_namespaces = ('0',)
    namespace = get_namespace(elem.tag)
    ns_mapping = {"ns": namespace}
    text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
    title_path = "./{%(ns)s}title" % ns_mapping
    ns_path = "./{%(ns)s}ns" % ns_mapping
    lead_section_heading = "Introduction"
    top_level_heading_regex = r"\n==[^=].*[^=]==\n"
    top_level_heading_regex_capture = r"\n==([^=].*[^=])==\n"

    title = elem.find(title_path).text
    text = elem.find(text_path).text
    ns = elem.find(ns_path).text
    if ns not in filter_namespaces:
        text = None

    if text is not None:
        if include_interlinks:
            interlinks = find_interlinks(text)
        section_contents = re.split(top_level_heading_regex, text)
        section_headings = [lead_section_heading] + re.findall(top_level_heading_regex_capture, text)
        section_headings = [heading.strip() for heading in section_headings]
        assert len(section_contents) == len(section_headings)
    else:
        interlinks = []
        section_contents = []
        section_headings = []

    section_contents = [filter_wiki(section_content) for section_content in section_contents]
    sections = list(zip(section_headings, section_contents))

    if include_interlinks:
        return title, sections, interlinks
    else:
        return title, sections


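# Illustrative sketch of calling segment() on a single raw <page> XML string, e.g. one obtained
# from extract_page_xmls() as in the sketch above; with include_interlinks=True the result also
# carries the interlink mapping:
#
#     title, sections, interlinks = segment(page_xml, include_interlinks=True)
#     first_heading, first_text = sections[0]  # the lead section is labelled "Introduction"

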
class _WikiSectionsCorpus(WikiCorpus):
    """Treat a Wikipedia articles dump (<LANG>wiki-<YYYYMMDD>-pages-articles.xml.bz2
    or <LANG>wiki-latest-pages-articles.xml.bz2) as a (read-only) corpus.

    The documents are extracted on-the-fly, so that the whole (massive) dump can stay compressed on disk.

    """
    def __init__(self, fileobj, min_article_character=200, processes=None,
                 lemmatize=utils.has_pattern(), filter_namespaces=('0',), include_interlinks=False):
        """
        Parameters
        ----------
        fileobj : file
            File descriptor of MediaWiki dump.
        min_article_character : int, optional
            Minimal number of characters in an article (excluding titles and leading gaps).
        processes : int, optional
            Number of processes, max(1, multiprocessing.cpu_count() - 1) if None.
        lemmatize : bool, optional
            If the `pattern` package is installed, use fancier shallow parsing to get token lemmas.
            Otherwise, use simple regexp tokenization.
        filter_namespaces : tuple of str, optional
            Enumeration of namespaces that will be ignored.
        include_interlinks : bool
            Whether interlinks should be included in the output.

        """
        self.fileobj = fileobj
        self.filter_namespaces = filter_namespaces
        self.metadata = False
        if processes is None:
            processes = max(1, multiprocessing.cpu_count() - 1)
        self.processes = processes
        self.lemmatize = lemmatize
        self.min_article_character = min_article_character
        self.include_interlinks = include_interlinks

    def get_texts_with_sections(self):
        """Iterate over the dump, returning titles and text versions of all sections of articles.

        Notes
        -----
        Only articles of sufficient length are returned (short articles, redirects, etc. are ignored).

        Note that this iterates over the **texts**; if you want vectors, just use
        the standard corpus interface instead of this function::

            >>> for vec in wiki_corpus:
            >>>     print(vec)

        Yields
        ------
        (str, list of (str, str), (Optionally) dict of (str: str))
            Structure contains (title, [(section_heading, section_content), ...], (Optionally) {interlinks}).

        """
        skipped_namespace, skipped_length, skipped_redirect = 0, 0, 0
        total_articles, total_sections = 0, 0
        page_xmls = extract_page_xmls(self.fileobj)
        pool = multiprocessing.Pool(self.processes)
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # is dumb and would load the entire input into RAM at once...
        for group in utils.chunkize(page_xmls, chunksize=10 * self.processes, maxsize=1):
            for article in pool.imap(partial(segment, include_interlinks=self.include_interlinks), group):
                article_title, sections = article[0], article[1]

                # article redirects are pruned here
                if any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):  # filter non-articles
                    skipped_namespace += 1
                    continue
                if not sections or sections[0][1].lstrip().lower().startswith("#redirect"):  # filter redirects
                    skipped_redirect += 1
                    continue
                if sum(len(body.strip()) for (_, body) in sections) < self.min_article_character:
                    # filter stubs (incomplete, very short articles)
                    skipped_length += 1
                    continue
                total_articles += 1
                total_sections += len(sections)

                if self.include_interlinks:
                    interlinks = article[2]
                    yield (article_title, sections, interlinks)
                else:
                    yield (article_title, sections)

        logger.info(
            "finished processing %i articles with %i sections (skipped %i redirects, %i stubs, %i ignored namespaces)",
            total_articles, total_sections, skipped_redirect, skipped_length, skipped_namespace)
        pool.terminate()
        self.length = total_articles  # cache corpus length


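# For reference, a sketch of how segment_all_articles() above wires this private class together
# (the dump path is illustrative; the tuple layout matches get_texts_with_sections):
#
#     with smart_open('enwiki-latest-pages-articles.xml.bz2', 'rb') as xml_fileobj:
#         corpus = _WikiSectionsCorpus(xml_fileobj, include_interlinks=True)
#         for title, sections, interlinks in corpus.get_texts_with_sections():
#             ...

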
if __name__ == "__main__":
    logging.basicConfig(format='%(asctime)s - %(module)s - %(levelname)s - %(message)s', level=logging.INFO)
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, description=__doc__[:-136])
    default_workers = max(1, multiprocessing.cpu_count() - 1)
    parser.add_argument('-f', '--file', help='Path to MediaWiki database dump (read-only).', required=True)
    parser.add_argument(
        '-o', '--output',
        help='Path to output file (stdout if not specified). If it ends in .gz or .bz2, '
             'the output file will be automatically compressed (recommended!).')
    parser.add_argument(
        '-w', '--workers',
        help='Number of parallel workers for multi-core systems. Default: %(default)s.',
        type=int,
        default=default_workers
    )
    parser.add_argument(
        '-m', '--min-article-character',
        help="Ignore articles with fewer characters than this (article stubs). Default: %(default)s.",
        type=int,
        default=200
    )
    parser.add_argument(
        '-i', '--include-interlinks',
        help='Include a mapping for interlinks to other articles in the dump. The mappings format is: '
             '"interlinks": {"article_title_1": "interlink_text_1", "article_title_2": "interlink_text_2", ...}',
        action='store_true'
    )
    args = parser.parse_args()

    logger.info("running %s", " ".join(sys.argv))
    segment_and_write_all_articles(
        args.file, args.output,
        min_article_character=args.min_article_character,
        workers=args.workers,
        include_interlinks=args.include_interlinks
    )

    logger.info("finished running %s", sys.argv[0])