laywerrobot/gensim_w2v/wikiprep.pl

1721 lines
68 KiB
Perl
Raw Normal View History

2020-08-27 21:55:39 +02:00
###############################################################################
#
# wikiprep.pl - Preprocess Wikipedia XML dumps
# Copyright (C) 2007 Evgeniy Gabrilovich
# The author can be contacted by electronic mail at gabr@cs.technion.ac.il
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA,
# or see <http://www.gnu.org/licenses/> and
# <http://www.fsf.org/licensing/licenses/info/GPLv2.html>
#
###############################################################################
use strict;
use warnings;
use File::Basename;
use Getopt::Long;
use Time::localtime;
use XML::Parser;
my $licenseFile = "COPYING";
my $version = "2.02";
if (@ARGV < 1) {
&printUsage();
exit 0;
}
my $file;
my $showLicense = 0;
my $showVersion = 0;
GetOptions('f=s' => \$file,
'license' => \$showLicense,
'version' => \$showVersion);
if ($showLicense) {
if (-e $licenseFile) {
print "See file $licenseFile for more details.\n"
} else {
print "Please see <http://www.gnu.org/licenses/> and <http://www.fsf.org/licensing/licenses/info/GPLv2.html>\n";
}
exit 0;
}
if ($showVersion) {
print "Wikiprep version $version\n";
exit 0;
}
if (!defined($file)) {
&printUsage();
exit 0;
}
if (! -e $file) {
die "Input file '$file' cannot be opened for reading\n";
}
##### Global definitions #####
my %XmlEntities = ('&' => 'amp', '"' => 'quot', "'" => 'apos', '<' => 'lt', '>' => 'gt');
# The URL protocol (e.g., http) matched here may be in either case, hence we use the /i modifier.
my $urlProtocols = qr/http:\/\/|https:\/\/|telnet:\/\/|gopher:\/\/|file:\/\/|wais:\/\/|ftp:\/\/|mailto:|news:/i;
# A URL terminator may be either one of a list of characters OR end of string (that is, '$').
# This last part is necessary to handle URLs at the very end of a string when there is no "\n"
# or any other subsequent character.
my $urlTerminator = qr/[\[\]\{\}\s\n\|\"<>]|$/;
my $relatedWording_Standalone =
qr/Main(?:\s+)article(?:s?)|Further(?:\s+)information|Related(?:\s+)article(?:s?)|Related(?:\s+)topic(?:s?)|See(?:\s+)main(?:\s+)article(?:s?)|See(?:\s+)article(?:s?)|See(?:\s+)also|For(?:\s+)(?:more|further)/i;
## For(?:\s+)more(?:\s+)(?:background|details)(?:\s+)on(?:\s+)this(?:\s+)topic,(?:\s+)see
my $relatedWording_Inline = qr/See[\s:]|See(?:\s+)also|For(?:\s+)(?:more|further)/i;
my $relatedWording_Section = qr/Further(?:\s+)information|See(?:\s+)also|Related(?:\s+)article(?:s?)|Related(?:\s+)topic(?:s?)/i;
my %monthToNumDays = ('January' => 31, 'February' => 29, 'March' => 31, 'April' => 30,
'May' => 31, 'June' => 30, 'July' => 31, 'August' => 31,
'September' => 30, 'October' => 31, 'November' => 30, 'December' => 31);
my %numberToMonth = (1 => 'January', 2 => 'February', 3 => 'March', 4 => 'April',
5 => 'May', 6 => 'June', 7 => 'July', 8 => 'August',
9 => 'September', 10 => 'October', 11 => 'November', 12 => 'December');
my $maxTemplateRecursionLevels = 5;
my $maxParameterRecursionLevels = 5;
##### Global variables #####
my %namespaces;
# we only process pages in these namespaces + the main namespace (which has an empty name)
my %okNamespacesForPrescanning = ('Template' => 1, 'Category' => 1);
my %okNamespacesForTransforming = ('Category' => 1); # we don't use templates as concepts
my %id2title;
my %title2id;
my %redir;
my %templates; # template bodies for insertion
my %catHierarchy; # each category is associated with a list of its immediate descendants
my %statCategories; # number of pages classified under each category
my %statIncomingLinks; # number of links incoming to each page
my ($fileBasename, $filePath, $fileSuffix) = fileparse($file, ".xml");
my $outputFile = "$filePath/$fileBasename.hgw$fileSuffix";
my $logFile = "$filePath/$fileBasename.log";
my $anchorTextFile = "$filePath/$fileBasename.anchor_text";
my $relatedLinksFile = "$filePath/$fileBasename.related_links";
open(OUTF, "> $outputFile") or die "Cannot open $outputFile";
open(LOGF, "> $logFile") or die "Cannot open $logFile";
open(ANCHORF, "> $anchorTextFile") or die "Cannot open $anchorTextFile";
open(RELATEDF, "> $relatedLinksFile") or die "Cannot open $relatedLinksFile";
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
binmode(OUTF, ':utf8');
binmode(LOGF, ':utf8');
binmode(ANCHORF, ':utf8');
print ANCHORF "# Line format: <Target page id> <Source page id> <Anchor text (up to the end of the line)>\n\n\n";
print RELATEDF "# Line format: <Page id> <List of ids of related articles>\n\n\n";
&copyXmlFileHeader();
&loadNamespaces();
&prescan();
my $numTitles = scalar( keys(%id2title) );
print "Loaded $numTitles titles\n";
my $numRedirects = scalar( keys(%redir) );
print "Loaded $numRedirects redirects\n";
my $numTemplates = scalar( keys(%templates) );
print "Loaded $numTemplates templates\n";
&transform();
&closeXmlFile();
&writeStatistics();
&writeCategoryHierarchy();
close(LOGF);
close(ANCHORF);
close(RELATEDF);
# Hogwarts needs the anchor text file to be sorted in the increading order of target page id.
# The file is originally sorted by source page id (second field in each line).
# We now use stable (-s) numeric (-n) sort on the first field (-k 1,1).
# This way, the resultant file will be sorted on the target page id (first field) as primary key,
# and on the source page id (second field) as secondary key.
system("sort -s -n -k 1,1 $anchorTextFile > $anchorTextFile.sorted");
##### Subroutines #####
sub normalizeTitle(\$) {
my ($refToStr) = @_;
# remove leading whitespace and underscores
$$refToStr =~ s/^[\s_]+//;
# remove trailing whitespace and underscores
$$refToStr =~ s/[\s_]+$//;
# replace sequences of whitespace and underscore chars with a single space
$$refToStr =~ s/[\s_]+/ /g;
if ($$refToStr =~ /^([^:]*):(\s*)(\S(?:.*))/) {
my $prefix = $1;
my $optionalWhitespace = $2;
my $rest = $3;
my $namespaceCandidate = $prefix;
&normalizeNamespace(\$namespaceCandidate); # this must be done before the call to 'isKnownNamespace'
if ( &isKnownNamespace(\$namespaceCandidate) ) {
# If the prefix designates a known namespace, then it might follow by optional
# whitespace that should be removed to get the canonical page name
# (e.g., "Category: Births" should become "Category:Births").
$$refToStr = $namespaceCandidate . ":" . ucfirst($rest);
} else {
# No namespace, just capitalize first letter.
# If the part before the colon is not a known namespace, then we must not remove the space
# after the colon (if any), e.g., "3001: The_Final_Odyssey" != "3001:The_Final_Odyssey".
# However, to get the canonical page name we must contract multiple spaces into one,
# because "3001: The_Final_Odyssey" != "3001: The_Final_Odyssey".
$$refToStr = ucfirst($prefix) . ":" .
(length($optionalWhitespace) > 0 ? " " : "") . $rest;
}
} else {
# no namespace, just capitalize first letter
$$refToStr = ucfirst($$refToStr);
}
}
sub normalizeNamespace(\$) {
my ($refToStr) = @_;
$$refToStr = ucfirst( lc($$refToStr) );
}
# Checks if the prefix of the page name before the colon is actually one of the
# 16+2+2 namespaces defined in the XML file.
# Assumption: the argument was already normalized using 'normalizeNamespace'
sub isKnownNamespace(\$) {
my ($refToStr) = @_;
defined( $namespaces{$$refToStr} ); # return value
}
# The correct form to create a redirect is #REDIRECT [[ link ]],
# and function 'Parse::MediaWikiDump::page->redirect' only supports this form.
# However, it seems that Wikipedia can also tolerate a variety of other forms, such as
# REDIRECT|REDIRECTS|REDIRECTED|REDIRECTION, then an optional ":", optional "to" or optional "=".
# Therefore, we use our own function to handle these cases as well.
# If the page is a redirect, the function returns the title of the target page;
# otherwise, it returns 'undef'.
sub isRedirect($) {
my ($page) = @_;
# quick check
return undef if ( ${$page->text} !~ /^#REDIRECT/i );
if ( ${$page->text} =~ m{^\#REDIRECT # Redirect must start with "#REDIRECT"
# (the backslash is needed before "#" here, because
# "#" has special meaning with /x modifier)
(?:S|ED|ION)? # The word may be in any of these forms,
# i.e., REDIRECT|REDIRECTS|REDIRECTED|REDIRECTION
(?:\s*) # optional whitespace
(?: :|\sTO|=)? # optional colon, "TO" or "="
# (in case of "TO", we expect a whitespace before it,
# so that it's not glued to the preceding word)
(?:\s*) # optional whitespace
\[\[([^\]]*)\]\] # the link itself
}ix ) { # matching is case-insensitive, hence /i
my $target = $1;
if ($target =~ /^(.*)\#(?:.*)$/) {
# The link contains an anchor. Anchors are not allowed in REDIRECT pages, and therefore
# we adjust the link to point to the page as a whole (that's how Wikipedia works).
$target = $1;
}
return $target;
}
# OK, it's probably either a malformed redirect link, or something else
return undef;
}
sub isNamespaceOkForPrescanning($) {
my ($page) = @_;
&isNamespaceOk($page, \%okNamespacesForPrescanning);
}
sub isNamespaceOkForTransforming($) {
my ($page) = @_;
&isNamespaceOk($page, \%okNamespacesForTransforming);
}
sub isNamespaceOk($\%) {
my ($page, $refToNamespaceHash) = @_;
my $result = 1;
# main namespace is OK, so we only check pages that belong to other namespaces
if ($page->namespace ne '') {
my $namespace = $page->namespace;
&normalizeNamespace(\$namespace);
if ( &isKnownNamespace(\$namespace) ) {
$result = defined( $$refToNamespaceHash{$namespace} );
} else {
# the prefix before ":" in the page title is not a known namespace,
# therefore, the page belongs to the main namespace and is OK
}
}
$result; # return value
}
sub encodeXmlChars(\$) {
my ($refToStr) = @_;
$$refToStr =~ s/([&"'<>])/&$XmlEntities{$1};/g;
}
sub copyXmlFileHeader() {
open(INF, "< $file") or die "Cannot open $file";
while (<INF>) { # copy lines up to "</siteinfo>"
if (/^<mediawiki /) {
# The top level element - mediawiki - contains a lot of attributes (e.g., schema)
# that are no longer applicable to the XML file after our transformation.
# Therefore, we simply write an opening tag <mediawiki> without any attributes.
print OUTF "<mediawiki>\n";
} else {
# All other lines (up to </siteinfo>) are copied as-is
print OUTF;
}
last if (/<\/siteinfo>/);
}
close(INF); # this file will later be reopened by "Parse::MediaWikiDump"
}
sub closeXmlFile() {
print OUTF "</mediawiki>\n";
close(OUTF);
}
sub writeStatistics() {
my $statCategoriesFile = "$filePath/$fileBasename.stat.categories";
my $statIncomingLinksFile = "$filePath/$fileBasename.stat.inlinks";
open(STAT_CATS, "> $statCategoriesFile") or die "Cannot open $statCategoriesFile";
print STAT_CATS "# Line format: <CategoryId (= page id)> <Number of pages in this category>\n",
"# Here we count the *pages* that belong to this category, i.e., articles AND\n",
"# sub-categories of this category (but not the articles in the sub-categories).\n",
"\n\n";
my $cat;
foreach $cat ( sort { $statCategories{$b} <=> $statCategories{$a} }
keys(%statCategories) ) {
print STAT_CATS "$cat\t$statCategories{$cat}\n";
}
close(STAT_CATS);
open(STAT_INLINKS, "> $statIncomingLinksFile") or die "Cannot open $statIncomingLinksFile";
print STAT_INLINKS "# Line format: <Target page id> <Number of links to it from other pages>\n\n\n";
my $destination;
foreach $destination ( sort { $statIncomingLinks{$b} <=> $statIncomingLinks{$a} }
keys(%statIncomingLinks) ) {
print STAT_INLINKS "$destination\t$statIncomingLinks{$destination}\n";
}
close(STAT_INLINKS);
}
sub writeCategoryHierarchy() {
my $catHierarchyFile = "$filePath/$fileBasename.cat_hier";
open(CAT_HIER, "> $catHierarchyFile") or die "Cannot open $catHierarchyFile";
print CAT_HIER "# Line format: <Category id> <List of ids of immediate descendants>\n\n\n";
my $cat;
foreach $cat ( sort { $catHierarchy{$a} <=> $catHierarchy{$b} }
keys(%catHierarchy) ) {
print CAT_HIER "$cat\t", join(" ", @{$catHierarchy{$cat}}), "\n";
}
close(CAT_HIER);
}
sub loadNamespaces() {
# re-open the input XML file
my $pages = Parse::MediaWikiDump::Pages->new($file);
# load namespaces
my $refNamespaces = $pages->namespaces;
# namespace names are case-insensitive, so we force them
# to canonical form to facilitate future comparisons
my $ns;
foreach $ns (@$refNamespaces) {
my @namespaceData = @$ns;
my $namespaceId = $namespaceData[0];
my $namespaceName = $namespaceData[1];
&normalizeNamespace(\$namespaceName);
$namespaces{$namespaceName} = $namespaceId;
}
}
# build id <-> title mappings and redirection table,
# as well as load templates
sub prescan() {
# re-open the input XML file
my $pages = Parse::MediaWikiDump::Pages->new($file);
my $counter = 0;
my $page;
while (defined($page = $pages->page)) {
my $id = $page->id;
$counter++;
if ($counter % 1000 == 0) {
my $timeStr = &getTimeAsString();
print LOGF "[$timeStr] Prescanning page id=$id\n";
}
my $title = $page->title;
&normalizeTitle(\$title);
if (length($title) == 0) {
# This is a defense against pages whose title only contains UTF-8 chars that
# are reduced to an empty string. Right now I can think of one such case -
# <C2><A0> which represents the non-breaking space. In this particular case,
# this page is a redirect to [[Non-nreaking space]], but having in the system
# a redirect page with an empty title causes numerous problems, so we'll live
# happier without it.
print LOGF "Skipping page with empty title id=$id\n";
next;
}
my $redirect = &isRedirect($page);
if (defined($redirect)) {
&normalizeTitle(\$redirect);
next if (length($redirect) == 0); # again, same precaution here - see comments above
$redir{$title} = $redirect;
# nothing more to do for redirect pages
next;
}
if ( ! &isNamespaceOkForPrescanning($page) ) {
next; # we're only interested in certain namespaces
}
# if we get here, then either the page belongs to the main namespace OR
# it belongs to one of the namespaces we're interested in
if ( exists($id2title{$id}) ) {
print LOGF "Warning: Page id=$id already encountered before!\n";
next;
}
if ( exists($title2id{$title}) ) {
# A page could have been encountered before with a different spelling.
# Examples: &nbsp; = <C2><A0> (nonbreakable space), &szlig; = <C3><9F> (German Eszett ligature)
print LOGF "Warning: Page title='$title' already encountered before!\n";
next;
}
$id2title{$id} = $title;
$title2id{$title} = $id;
if ($title =~ /^Template:/) {
my $text = ${$page->text};
# We're storing template text for future inclusion, therefore,
# remove all <noinclude> text and keep all <includeonly> text
# (but eliminate <includeonly> tags per se).
# However, if <onlyinclude> ... </onlyinclude> parts are present,
# then only keep them and discard the rest of the template body.
# This is because using <onlyinclude> on a text fragment is
# equivalent to enclosing it in <includeonly> tags **AND**
# enclosing all the rest of the template body in <noinclude> tags.
# These definitions can easily span several lines, hence the "/s" modifiers.
my $onlyincludeAccumulator;
while ($text =~ /<onlyinclude>(.*?)<\/onlyinclude>/sg) {
my $onlyincludeFragment = $1;
$onlyincludeAccumulator .= "$onlyincludeFragment\n";
}
if ( defined($onlyincludeAccumulator)) {
$text = $onlyincludeAccumulator;
} else {
# If there are no <onlyinclude> fragments, simply eliminate
# <noinclude> fragments and keep <includeonly> ones.
$text =~ s/<noinclude>(?:.*?)<\/noinclude>/\n/sg;
$text =~ s/<includeonly>(.*?)<\/includeonly>/$1/sg;
}
$templates{$id} = $text;
}
}
my $timeStr = &getTimeAsString();
print LOGF "[$timeStr] Prescanning complete - prescanned $counter pages\n";
}
sub transform() {
# re-open the input XML file
my $pages = Parse::MediaWikiDump::Pages->new($file);
my $page;
while (defined($page = $pages->page)) {
my $id = $page->id;
my $timeStr = &getTimeAsString();
print LOGF "[$timeStr] Transforming page id=$id\n";
if ( defined( &isRedirect($page) ) ) {
next; # we've already loaded all redirects in the prescanning phase
}
if ( ! &isNamespaceOkForTransforming($page) ) {
next; # we're only interested in pages from certain namespaces
}
my $title = $page->title;
&normalizeTitle(\$title);
# see the comment about empty titles in function 'prescan'
if (length($title) == 0) {
print LOGF "Skipping page with empty title id=$id\n";
next;
}
my $text = ${$page->text};
my $orgLength = length($text); # text length BEFORE any transformations
# The check for stub must be done BEFORE any further processing,
# because stubs indicators are templates, and templates are substituted.
my $isStub = 0;
if ( $text =~ m/stub}}/i ) {
$isStub = 1;
}
my @categories;
my @internalLinks;
my @urls;
&includeTemplates(\$text);
my @relatedArticles;
# This function only examines the contents of '$text', but doesn't change it.
&identifyRelatedArticles(\$text, \@relatedArticles, $id);
# We process categories directly, because '$page->categories' ignores
# categories inherited from included templates
&extractCategories(\$text, \@categories, $id);
# Categories are listed at the end of articles, and therefore may mistakenly
# be added to the list of related articles (which often appear in the last
# section such as "See also"). To avoid this, we explicitly remove all categories
# from the list of related links, and only then record the list of related links
# to the file.
&removeElements(\@relatedArticles, \@categories);
&recordRelatedArticles($id, \@relatedArticles);
&extractInternalLinks(\$text, \@internalLinks, $id, 1, 1);
&extractUrls(\$text, \@urls);
&postprocessText(\$text, 1);
my $newLength = length($text); # text length AFTER all transformations
&writePage($id, \$title, \$text, $orgLength, $newLength, $isStub, \@categories, \@internalLinks, \@urls);
&updateStatistics(\@categories, \@internalLinks);
if ($title =~ /^Category:/) {
&updateCategoryHierarchy($id, \@categories);
}
}
}
sub updateStatistics(\@\@) {
my ($refToCategories, $refToInternalLinks) = @_;
my $cat;
foreach $cat (@$refToCategories) {
$statCategories{$cat}++;
}
my $link;
foreach $link (@$refToInternalLinks) {
$statIncomingLinks{$link}++;
}
}
sub updateCategoryHierarchy($\@) {
# The list of categories passed as a parameter is actually the list of parent categories
# for the current category
my ($childId, $refToParentCategories) = @_;
my $parentCat;
foreach $parentCat (@$refToParentCategories) {
if ( exists($catHierarchy{$parentCat}) ) {
push(@{$catHierarchy{$parentCat}}, $childId);
} else {
# create a new array with '$childId' as the only child (for now) of '$parentCat'
my @arr;
push(@arr, $childId);
$catHierarchy{$parentCat} = [ @arr ];
}
}
}
sub writePage($\$\$$$$\@\@\@) {
my ($id, $refToTitle, $refToText, $orgLength, $newLength, $isStub,
$refToCategories, $refToInternalLinks, $refToUrls) = @_;
my $numCategories = scalar(@$refToCategories);
my $numLinks = scalar(@$refToInternalLinks);
my $numUrls = scalar(@$refToUrls);
print OUTF "<page id=\"$id\" orglength=\"$orgLength\" newlength=\"$newLength\" stub=\"$isStub\" " .
"categories=\"$numCategories\" outlinks=\"$numLinks\" urls=\"$numUrls\">\n";
my $encodedTitle = $$refToTitle;
&encodeXmlChars(\$encodedTitle);
print OUTF "<title>$encodedTitle</title>\n";
print OUTF "<categories>";
print OUTF join(" ", @$refToCategories);
print OUTF "</categories>\n";
print OUTF "<links>";
print OUTF join(" ", @$refToInternalLinks);
print OUTF "</links>\n";
print OUTF "<urls>\n";
my $url;
foreach $url (@$refToUrls) {
&encodeXmlChars(\$url);
print OUTF "$url\n";
}
print OUTF "</urls>\n";
# text has already undergone 'encodeXmlChars' in function 'postprocessText'
print OUTF "<text>\n$$refToText\n</text>\n";
print OUTF "</page>\n";
}
# Maps a title into the id, and performs redirection if necessary.
# Assumption: the argument was already normalized using 'normalizeTitle'
sub resolveLink(\$) {
my ($refToTitle) = @_;
# safety precaution
return undef if (length($$refToTitle) == 0);
my $targetId; # result
my $targetTitle = $$refToTitle;
if ( exists($redir{$$refToTitle}) ) { # this link is a redirect
$targetTitle = $redir{$$refToTitle};
# check if this is a double redirect
if ( exists($redir{$targetTitle}) ) {
$targetTitle = undef; # double redirects are not allowed and are ignored
print LOGF "Warning: link '$$refToTitle' caused double redirection and was ignored\n";
} else {
print LOGF "Link '$$refToTitle' was redirected to '$targetTitle'\n";
}
}
if ( defined($targetTitle) ) {
if ( exists($title2id{$targetTitle}) ) {
$targetId = $title2id{$targetTitle};
} else {
# target not found
print LOGF "Warning: link '$$refToTitle' cannot be matched to an id\n";
$targetId = undef;
}
} else {
$targetId = undef;
}
$targetId; # return value
}
sub includeTemplates(\$) {
my ($refToText) = @_;
# Using the while loop forces templates to be included recursively
# (i.e., includes the body of templates that themselves were included
# on the previous iteration ).
# Template definitions can easily span several lines, hence the "/s" modifier.
# Templates are frequently nested. Occasionally, parsing mistakes may cause template insertion
# to enter an infinite loop, for instance when trying to instantiate Template:Country
# {{country_{{{1}}}|{{{2}}}|{{{2}}}|size={{{size|}}}|name={{{name|}}}}}
# which is repeatedly trying to insert template "country_", which is again resolved to
# Template:Country. The straightforward solution of keeping track of templates that were
# already inserted for the current article would not work, because the same template
# may legally be used more than once, with different parameters in different parts of
# the article. Therefore, we simply limit the number of iterations of nested template
# inclusion.
my $templateRecursionLevels = 0;
# We also require that the body of a template does not contain the template opening sequence
# (two successive opening braces - "\{\{"). We use negative lookahead to achieve this.
while ( ($templateRecursionLevels < $maxTemplateRecursionLevels) &&
$$refToText =~ s/\{\{
(?:\s*) # optional whitespace before the template name is ignored
(
(?:
(?!
\{\{
)
.
)*?
)
# OLD code and comments
# (?:\s*) # optional whitespace before the template name is ignored
# ([^\{]*?) # Occasionally, templates are nested,
# # e.g., {{localurl:{{NAMESPACE}}:{{PAGENAME}}}}
# # In order to prevent incorrect parsing, e.g.,
# # "{{localurl:{{NAMESPACE}}", we require that the
# # template name does not include opening braces,
# # hence "[^\{]" (any char except opening brace).
# END OF OLD code and comments
\}\}
/&instantiateTemplate($1)/segx
) {
$templateRecursionLevels++;
}
# Since we limit the number of levels of template recursion, we might end up with several
# un-instantiated templates. In this case we simply eliminate them - however, we do so
# later, in function 'postprocessText()', after extracting categories, links and URLs.
}
BEGIN {
# Making variables static for the function to avoid recompilation of regular expressions
# every time the function is called.
my $specialSeparator = "\.pAr\.";
my $specialSeparatorRegex = qr/$specialSeparator/;
sub parseTemplateInvocation(\$\$\%) {
my ($refToTemplateInvocation, $refToTemplateTitle, $refToParameterHash) = @_;
# Template definitions (especially those with parameters) can easily span several lines,
# hence the "/s" modifier. The template name extends up to the first pipeline symbol (if any).
# Template parameters go after the "|" symbol.
if ($$refToTemplateInvocation =~ /^([^|]*)\|(.*)$/sx) {
$$refToTemplateTitle = $1; # single out the template name itself
my $paramsList = $2;
# Template parameters often contain URLs, internal links, or just other useful text,
# whereas the template serves for presenting it in some nice way.
# Parameters are separated by "|" symbols. However, we cannot simply split the string
# on "|" symbols, since these frequently appear inside internal links. Therefore, we split
# on those "|" symbols that are not inside [[...]]. It's obviously sufficient to check that
# brackets are not improperly nested on one side of "|", so we use lookahead.
# We first replace all "|" symbols that are not inside [[...]] with a special separator that
# we invented, which will hopefully not normally appear in the text (.pAr.).
# Next, we use 'split' to break the string on this new separator.
$paramsList =~ s/\| # split on pipeline symbol, such that
(?: # non-capturing grouper that encloses 2 options
(?= # zero-width lookahead - option #1
[^\]]*$ # there are no closing brackets up to the end
# of the string (i.e., all the characters up to
# the end of the string are not closing brackets)
) # end of first lookahead (= end of option #1)
| # or
(?= # another zero-width lookahead - option #2
[^\]]* \[ # the nearest opening bracket on the right is not preceded
# by a closing bracket (i.e., all the characters that
# precede it are not closing brackets
) # end of second lookahead (= end of option #2)
) # end of the outer grouper
/$specialSeparator/sxg; # replace matching symbols with a special separator
# /s means string can contain newline chars
my @parameters = split(/$specialSeparatorRegex/, $paramsList);
# Parameters can be either named or unnamed. In the latter case, their name is defined by their
# ordinal position (1, 2, 3, ...).
my $unnamedParameterCounter = 0;
# It's legal for unnamed parameters to be skipped, in which case they will get default
# values (if available) during actual instantiation. That is {{template_name|a||c}} means
# parameter 1 gets the value 'a', parameter 2 value is not defined, and parameter 3 gets the value 'c'.
# This case is correctly handled by function 'split', and does not require any special handling.
my $param;
foreach $param (@parameters) {
# Spaces before or after a parameter value are normally ignored, UNLESS the parameter contains
# a link (to prevent possible gluing the link to the following text after template substitution)
# Parameter values may contain "=" symbols, hence the parameter name extends up to
# the first such symbol.
# It is legal for a parameter to be specified several times, in which case the last assignment
# takes precedence. Example: "{{t|a|b|c|2=B}}" is equivalent to "{{t|a|B|c}}".
# Therefore, we don't check if the parameter has been assigned a value before, because
# anyway the last assignment should override any previous ones.
if ($param =~ /^([^=]*)=(.*)$/s) {
# This is a named parameter.
# This case also handles parameter assignments like "2=xxx", where the number of an unnamed
# parameter ("2") is specified explicitly - this is handled transparently.
my $parameterName = $1;
my $parameterValue = $2;
&trimWhitespaceBothSides(\$parameterName);
if ($parameterValue !~ /\]\]/) { # if the value does not contain a link, trim whitespace
&trimWhitespaceBothSides(\$parameterValue);
}
$$refToParameterHash{$parameterName} = $parameterValue;
} else {
# this is an unnamed parameter
$unnamedParameterCounter++;
if ($param !~ /\]\]/) { # if the value does not contain a link, trim whitespace
&trimWhitespaceBothSides(\$param);
}
$$refToParameterHash{$unnamedParameterCounter} = $param;
}
}
} else {
# Template invocation does not contain a pipeline symbol, hence take the entire
# invocation text as the template title.
$$refToTemplateTitle = $$refToTemplateInvocation;
}
}
} # end of BEGIN block
sub instantiateTemplate($) {
my ($templateInvocation) = @_;
my $result = "";
print LOGF "Instantiating template=$templateInvocation\n";
my $templateTitle;
my %templateParams;
&parseTemplateInvocation(\$templateInvocation, \$templateTitle, \%templateParams);
&computeFullyQualifiedTemplateTitle(\$templateTitle);
&includeTemplateText(\$templateTitle, \%templateParams, \$result);
$result; # return value
}
sub includeTemplateText(\$\%\$) {
my ($refToTemplateTitle, $refToParameterHash, $refToResult) = @_;
&normalizeTitle($refToTemplateTitle);
my $includedPageId = &resolveLink($refToTemplateTitle);
if ( defined($includedPageId) && exists($templates{$includedPageId}) ) {
# OK, perform the actual inclusion with parameter substitution
$$refToResult = $templates{$includedPageId};
# Perform parameter substitution
# A parameter call ( {{{...}}} ) may span over a newline, hence the /s modifier
# Parameters may be nested (see comments below), hence we do the substitution iteratively
# in a while loop. We also limit the maximum number of iterations to avoid too long or
# even endless loops (in case of malformed input).
my $parameterRecursionLevels = 0;
# We also require that the body of a parameter does not contain the parameter opening sequence
# (three successive opening braces - "\{\{\{"). We use negative lookahead to achieve this.
while ( ($parameterRecursionLevels < $maxParameterRecursionLevels) &&
$$refToResult =~ s/\{\{\{
(
(?:
(?!
\{\{\{
)
.
)*?
)
# OLD code and comments
# ([^\{]*?) # Occasionally, parameters are nested because
# # they are dependent on other parameters,
# # e.g., {{{Author|{{{PublishYear|}}}}}}
# # (here, the default value for 'Author' is
# # dependent on 'PublishYear').
# # In order to prevent incorrect parsing, e.g.,
# # "{{{Author|{{{PublishYear|}}}", we require that the
# # parameter name does not include opening braces,
# # hence "[^\{]" (any char except opening brace).
# END OF OLD code and comments
\}\}\}
/&substituteParameter($1, $refToParameterHash)/segx
) {
$parameterRecursionLevels++;
}
} else {
# The page being included cannot be identified - perhaps we skipped it (because currently
# we only allow for inclusion of pages in the Template namespace), or perhaps it's
# a variable name like {{NUMBEROFARTICLES}}. Just remove this inclusion directive and
# replace it with a space
print LOGF "Template '$$refToTemplateTitle' is not available for inclusion\n";
$$refToResult = " ";
}
}
sub substituteParameter($\%) {
my ($parameter, $refToParameterHash) = @_;
my $result;
if ($parameter =~ /^([^|]*)\|(.*)$/) {
# This parameter has a default value
my $paramName = $1;
my $defaultValue = $2;
if ( defined($$refToParameterHash{$paramName}) ) {
$result = $$refToParameterHash{$paramName}; # use parameter value specified in template invocation
} else { # use the default value
$result = $defaultValue;
}
} else {
# parameter without a default value
if ( defined($$refToParameterHash{$parameter}) ) {
$result = $$refToParameterHash{$parameter}; # use parameter value specified in template invocation
} else {
# Parameter not specified in template invocation and does not have a default value -
# do not perform substitution and keep the parameter in 3 braces
# (these are Wiki rules for templates, see http://meta.wikimedia.org/wiki/Help:Template ).
$result = "{{{$parameter}}}";
}
}
# Surplus parameters - i.e., those assigned values in template invocation but not used
# in the template body - are simply ignored.
$result; # return value
}
sub computeFullyQualifiedTemplateTitle(\$) {
my ($refToTemplateTitle) = @_;
# Determine the namespace of the page being included through the template mechanism
my $namespaceSpecified = 0;
if ($$refToTemplateTitle =~ /^:(.*)$/) {
# Leading colon by itself implies main namespace, so strip this colon
$$refToTemplateTitle = $1;
$namespaceSpecified = 1;
} elsif ($$refToTemplateTitle =~ /^([^:]*):/) {
# colon found but not in the first position - check if it designates a known namespace
my $prefix = $1;
&normalizeNamespace(\$prefix);
$namespaceSpecified = &isKnownNamespace(\$prefix);
}
# The case when the page title does not contain a colon at all also falls here.
if ($namespaceSpecified) {
# OK, the title of the page being included is fully qualified with a namespace
} else {
# The title of the page being included is NOT in the main namespace and lacks
# any other explicit designation of the namespace - therefore, it is resolved
# to the Template namespace (that's the default for the template inclusion mechanism).
$$refToTemplateTitle = "Template:$$refToTemplateTitle";
}
}
sub extractCategories(\$\@$) {
my ($refToText, $refToCategoriesArray, $id) = @_;
# Remember that namespace names are case-insensitive, hence we're matching with "/i".
# The first parameter to 'collectCategory' is passed by value rather than by reference,
# because it might be dangerous to pass a reference to $1 in case it might get modified
# (with unclear consequences).
$$refToText =~ s/\[\[(?:\s*)(Category:.*?)\]\]/&collectCategory($1, $refToCategoriesArray)/ieg;
# We don't accumulate categories directly in a hash table, since this would not preserve
# their original order of appearance.
&removeDuplicatesAndSelf($refToCategoriesArray, $id);
}
sub collectCategory($\@) {
my ($catName, $refToCategoriesArray) = @_;
if ($catName =~ /^(.*)\|/) {
# Some categories contain a sort key, e.g., [[Category:Whatever|*]] or [[Category:Whatever| ]]
# In such a case, take only the category name itself.
$catName = $1;
}
&normalizeTitle(\$catName);
my $catId = &resolveLink(\$catName);
if ( defined($catId) ) {
push(@$refToCategoriesArray, $catId);
} else {
print LOGF "Warning: unknown category '$catName'\n";
}
# The return value is just a space, because we remove categories from the text
# after we collected them
" ";
}
sub extractInternalLinks(\$\@$$$) {
my ($refToText, $refToInternalLinksArray, $id,
$whetherToLogAnchorText, $whetherToRemoveDuplicates) = @_;
# For each internal link outgoing form the current article, this hash table maps
# the target id into the anchor text associated with it. Naturally, we only
# collect anchor text for links that can be resolved to a page id.
my %anchorTexts;
# Link definitions may span over adjacent lines and therefore contain line breaks,
# hence we use the /s modifier.
# Occasionally, links are nested, e.g.,
# [[Image:kanner_kl2.jpg|frame|right|Dr. [[Leo Kanner]] introduced the label ''early infantile autism'' in [[1943]].]]
# In order to prevent incorrect parsing, e.g., "[[Image:kanner_kl2.jpg|frame|right|Dr. [[Leo Kanner]]",
# we extract links in several iterations of the while loop, while the link definition requires that
# each pair [[...]] does not contain any opening braces.
1 while ( $$refToText =~ s/
(\w*) # words may be glued to the beginning of the link,
# in which case they become part of the link
# e.g., "ex-[[Giuseppe Mazzini|Mazzinian]] "
\[\[
([^\[]*?) # the link text can be any chars except an opening bracket,
# this ensures we correctly parse nested links (see comments above)
\]\]
(\w*) # words may be glued to the end of the link,
# in which case they become part of the link
# e.g., "[[public transport]]ation"
/&collectInternalLink($1, $2, $3, $refToInternalLinksArray, \%anchorTexts)/segx
);
if ($whetherToRemoveDuplicates) {
&removeDuplicatesAndSelf($refToInternalLinksArray, $id);
}
if ($whetherToLogAnchorText) {
&logAnchorText(\%anchorTexts, $id);
}
}
sub logAnchorText(\%$) {
my ($refToAnchorTextsHash, $curPageId) = @_;
# Remember that we use a hash table to associate anchor text with target page ids.
# Therefore, if the current page has several links to another page (it happens), then we only
# keep the anchor text of the last one (and override the previous ones) - we can live with it.
# Consequently, we do not need to remove duplicates as there are none.
# However, we still remove the links that point from the page to itself.
my $targetId;
my $anchorText;
while ( ($targetId, $anchorText) = each(%$refToAnchorTextsHash) ) {
if ($targetId != $curPageId) {
&postprocessText(\$anchorText, 0); # anchor text doesn't need escaping of XML characters,
# hence the second function parameter is 0
$anchorText =~ s/\n/ /g; # replace all newlines with spaces
# make sure that something is left of anchor text after postprocessing
if (length($anchorText) > 0) {
print ANCHORF "$targetId\t$curPageId\t$anchorText\n";
}
}
}
}
sub collectInternalLink($$$\@\%) {
my ($prefix, $link, $suffix, $refToInternalLinksArray, $refToAnchorTextHash) = @_;
my $originalLink = $link;
my $result = "";
# strip leading whitespace, if any
$link =~ s/^\s*//;
# Link definitions may span over adjacent lines and therefore contain line breaks,
# hence we use the /s modifier on most matchings.
# There are some special cases when the link may be preceded with a colon.
# Known cases:
# - Linking to a category (as opposed to actually assigning the current article
# to a category) is performed using special syntax [[:Category:...]]
# - Linking to other languages, e.g., [[:fr:Wikipedia:Aide]]
# (without the leading colon, the link will go to the side menu
# - Linking directly to the description page of an image, e.g., [[:Image:wiki.png]]
# In all such cases, we strip the leading colon.
if ($link =~ /^
: # colon at the beginnning of the link name
(.*) # the rest of the link text
$
/sx) {
# just strip this initial colon (as well as any whitespace preceding it)
$link = $1;
}
# Alternative text may be available after the pipeline symbol.
# If the pipeline symbol is only used for masking parts of
# the link name for presentation, we still consider that the author of the page
# deemed the resulting text important, hence we always set this variable when
# the pipeline symbol is present.
my $alternativeTextAvailable = 0;
# Some links contain several pipeline symbols, e.g.,
# [[Image:Zerzan.jpeg|thumb|right|[[John Zerzan]]]]
# It seems that the extra pipeline symbols are parameters, so we just eliminate them.
if ($link =~ /^(.*)\|([^|]*)$/s) { # first, extract the link up to the last pipeline symbol
$link = $1; # the part before the last pipeline
$result = $2; # the part after the last pipeline, this is usually an alternative text for this link
$alternativeTextAvailable = 1; # pipeline found, see comment above
# Now check if there are pipeline symbols remaining.
# Note that this time we're looking for the shortest match,
# to take the part of the text up to the first pipeline symbol.
if ($link =~ /^([^|]*)\|(.*)$/s) {
$link = $1;
# $2 contains the parameters, which we don't really need
}
if (length($result) == 0) {
if ($link !~ /\#/) {
# If the "|" symbol is not followed by some text, then it masks the namespace
# as well as any text in parentheses at the end of the link title.
# However, pipeline masking is only invoked if the link does not contain an anchor,
# hence the additional condition in the 'if' statement.
&performPipelineMasking(\$link, \$result);
} else {
# If the link contains an anchor, then masking is not invoked, and we take the entire link
$result = $link;
}
}
} else {
# the link text does not contain the pipeline, so take it as-is
$result = $link;
}
if ($link =~ /^(.*)\#(.*)$/s) {
# The link contains an anchor, so adjust the link to point to the page as a whole.
$link = $1;
my $anchor = $2;
# Check if the link points to an anchor on the current page, and if so - ignore it.
if (length($link) == 0 && ! $alternativeTextAvailable) {
# This is indeed a link pointing to an anchor on the current page.
# The link is thus cleared, so that it will not be resolved and collected later.
# For anchors to the same page, discard the leading '#' symbol, and take
# the rest as the text - but only if no alternative text was provided for this link.
$result = $anchor;
}
}
# Now collect the link, or links if the original link is in the date format
# and specifies both day and year. In the latter case, the function for date
# normalization may also modify the link text ($result), and may collect more
# than one link (one for the day, another one for the year).
my $dateRecognized = 0;
# Alternative text (specified after pipeline) blocks normalization of dates.
# We also perform a quick check - if the link does not start with a digit,
# then it surely does not contain a date
if ( ($link =~ /^\d/) && (! $alternativeTextAvailable)) {
$dateRecognized = &normalizeDates(\$link, \$result, $refToInternalLinksArray, $refToAnchorTextHash);
}
# If a date (either day or day + year) was recognized, then no further processing is necessary
if (! $dateRecognized) {
&normalizeTitle(\$link);
my $targetId = &resolveAndCollectInternalLink(\$link, $refToInternalLinksArray);
# Wikipedia pages contain many links to other Wiki projects (especially Wikipedia in
# other languages). While these links are not resolved to valid pages, we also want
# to ignore their text. However, simply discarding the text of all links that cannot
# be resolved would be overly aggressive, as authors frequently define phrases as links
# to articles that don't yet exist, in the hope that they will be added later.
# Therefore, we formulate the following conditions that must hold simultaneously
# for discarding the text of a link:
# 1) the link was not resolved to a valid id
# 2) the link does not contain alternative text (if it did, then the text is probably
# important enough to be retained)
# 3) the link contains a colon - this is a very simple heuristics for identifying links to
# other Wiki projects, other languages, or simply other namespaces within current Wikipedia.
# While this method is not fool-proof (there are regular pages in the main namespace
# that contain a colon in their title), we believe this is a reasonable tradeoff.
if ( !defined($targetId) && ! $alternativeTextAvailable && $link =~ /:/ ) {
$result = "";
print LOGF "Discarding text for link '$originalLink'\n";
} else {
# finally, add the text originally attached to the left and/or to the right of the link
# (if the link represents a date, then it has not text glued to it, so it's OK to only
# use the prefix and suffix here)
$result = $prefix . $result . $suffix;
}
if ( defined($targetId) ) {
# If the current page has several links to another page, then we only take the anchor
# of the last one (and override the previous ones) - we can live with it.
$$refToAnchorTextHash{$targetId} = $result;
}
}
$result; #return value
}
sub performPipelineMasking(\$\$) {
my ($refToLink, $refToResult) = @_;
# First check for presence of a namespace
if ($$refToLink =~ /^([^:]*):(.*)$/) {
my $namespaceCandidate = $1;
my $rest = $2;
&normalizeNamespace(\$namespaceCandidate);
if ( &isKnownNamespace(\$namespaceCandidate) ) {
$$refToResult = $rest; # take the link text without the namespace
} else {
$$refToResult = $$refToLink; # otherwise, take the entire link text (for now)
}
} else {
$$refToResult = $$refToLink; # otherwise, take the entire link text (for now)
}
# Now check if there are parentheses at the end of the link text
# (we now operate on $$refToResult, because we might have stripped the leading
# namespace in the previous test).
if ($$refToResult =~ /^ # the beginning of the string
(.*) # the text up to the last pair of parentheses
\( # opening parenthesis
(?:[^()]*) # the text in the parentheses
\) # closing parenthesis
(?:\s*) # optional trailing whitespace, just in case
$ # end of string
/x) {
$$refToResult = $1; # discard the text in parentheses at the end of the string
}
}
sub resolveAndCollectInternalLink(\$\@) {
my ($refToLink, $refToInternalLinksArray) = @_;
my $targetId = &resolveLink($refToLink);
if ( defined($targetId) ) {
push(@$refToInternalLinksArray, $targetId);
} else {
# Some cases in this category that obviously won't be resolved to legal ids:
# - Links to namespaces that we don't currently handle
# (other than those for which 'isNamespaceOK' returns true);
# media and sound files fall in this category
# - Links to other languages, e.g., [[de:...]]
# - Links to other Wiki projects, e.g., [[Wiktionary:...]]
print LOGF "Warning: unknown link '$$refToLink'\n";
}
$targetId; # return value
}
# Dates can appear in several formats
# 1) [[July 20]], [[1969]]
# 2) [[20 July]] [[1969]]
# 3) [[1969]]-[[07-20]]
# 4) [[1969-07-20]]
# The first one is handled correctly without any special treatment,
# so we don't even check for it here.
# In (2) and (3), we only normalize the day, because it will be parsed separately from the year.
# This function is only invoked if the link has no alternative text available, therefore,
# we're free to override the result text.
sub normalizeDates(\$\$\@\%) {
my ($refToLink, $refToResultText, $refToInternalLinksArray, $refToAnchorTextHash) = @_;
my $dateRecognized = 0;
if ($$refToLink =~ /^(\d\d)\s*([A-Za-z]+)$/) {
my $day = $1;
my $month = ucfirst(lc($2));
if ( defined($monthToNumDays{$month}) &&
1 <= $day && $day <= $monthToNumDays{$month} ) {
$dateRecognized = 1;
$$refToLink = "$month $day";
$$refToResultText = "$month $day";
my $targetId = &resolveAndCollectInternalLink($refToLink, $refToInternalLinksArray);
if ( defined($targetId) ) {
$$refToAnchorTextHash{$targetId} = $$refToResultText;
}
} else {
# this doesn't look like a valid date, leave as-is
}
} elsif ($$refToLink =~ /^(\d\d)\-(\d\d)$/) {
my $monthNum = int($1);
my $day = $2;
if ( defined($numberToMonth{$monthNum}) ) {
my $month = $numberToMonth{$monthNum};
if (1 <= $day && $day <= $monthToNumDays{$month}) {
$dateRecognized = 1;
$$refToLink = "$month $day";
# we add a leading space, to separate the preceding year ("[[1969]]-" in the example")
# from the day that we're creating
$$refToResultText = " $month $day";
my $targetId = &resolveAndCollectInternalLink($refToLink, $refToInternalLinksArray);
if ( defined($targetId) ) {
$$refToAnchorTextHash{$targetId} = $$refToResultText;
}
} else {
# this doesn't look like a valid date, leave as-is
}
} else {
# this doesn't look like a valid date, leave as-is
}
} elsif ($$refToLink =~ /^(\d\d\d\d)\-(\d\d)\-(\d\d)$/) {
my $year = $1;
my $monthNum = int($2);
my $day = $3;
if ( defined($numberToMonth{$monthNum}) ) {
my $month = $numberToMonth{$monthNum};
if (1 <= $day && $day <= $monthToNumDays{$month}) {
$dateRecognized = 1;
$$refToLink = "$month $day";
# the link text is combined from the day and the year
$$refToResultText = "$month $day, $year";
my $targetId;
# collect the link for the day
$targetId = &resolveAndCollectInternalLink($refToLink, $refToInternalLinksArray);
if ( defined($targetId) ) {
$$refToAnchorTextHash{$targetId} = $$refToLink;
}
# collect the link for the year
$targetId = &resolveAndCollectInternalLink(\$year, $refToInternalLinksArray);
if ( defined($targetId) ) {
$$refToAnchorTextHash{$targetId} = $year;
}
} else {
# this doesn't look like a valid date, leave as-is
}
} else {
# this doesn't look like a valid date, leave as-is
}
}
$dateRecognized; # return value
}
sub extractUrls(\$\@) {
my ($refToText, $refToUrlsArray) = @_;
# First we handle the case of URLs enclosed in single brackets, with or without the description,
# and with optional leading and/or trailing whitespace
# Examples: [http://www.cnn.com], [ http://www.cnn.com ], [http://www.cnn.com CNN Web site]
$$refToText =~ s/\[(?:\s*)($urlProtocols(?:[^\[\]]*))\]/&collectUrlFromBrackets($1, $refToUrlsArray)/eg;
# Now we handle standalone URLs (those not enclosed in brackets)
# The $urlTemrinator is matched via positive lookahead (?=...) in order not to remove
# the terminator symbol itself, but rather only the URL.
$$refToText =~ s/($urlProtocols(?:.*?))$urlTerminator/&collectStandaloneUrl($1, $refToUrlsArray)/eg;
&removeDuplicatesAndSelf($refToUrlsArray, undef);
}
sub collectUrlFromBrackets($\@) {
my ($url, $refToUrlsArray) = @_;
my $text;
# Assumption: leading whitespace has already been stripped
if ( $url =~ /^($urlProtocols(?:.*?))($urlTerminator(?:.*))$/ ) { # description available
push(@$refToUrlsArray, $1);
$text = $2;
} else { # no description
push(@$refToUrlsArray, $url);
$text = " ";
}
$text; # return value
}
sub collectStandaloneUrl($\@) {
my ($url, $refToUrlsArray) = @_;
push(@$refToUrlsArray, $url); # collect the URL as-is
" "; # return value - replace the URL with a space
}
sub postprocessText(\$$) {
my ($refToText, $whetherToEncodeXmlChars) = @_;
# Eliminate all <includeonly> and <onlyinclude> fragments, because this text
# will not be included anywhere, as we already handled all inclusion directives
# in function 'includeTemplates'.
# This block can easily span several lines, hence the "/s" modifier.
$$refToText =~ s/<includeonly>(.*?)<\/includeonly>/ /sg;
$$refToText =~ s/<onlyinclude>(.*?)<\/onlyinclude>/ /sg;
# <noinclude> fragments remain, but remove the tags per se
# We block the code below, as <noinclude> tags will anyway be thrown away later,
# when we eliminate all remaining tags.
### This block can easily span several lines, hence the "/s" modifier
### $$refToText =~ s/<noinclude>(.*?)<\/noinclude>/$1/sg;
# replace <br> and <br /> directives with new paragraph
$$refToText =~ s/<br(?:\s*)(?:[\/]?)>/\n\n/g;
# Remove tables, as they often carry a lot of noise
&eliminateTables($refToText);
# Since we limit the number of levels of template recursion, we might end up with several
# un-instantiated templates. In this case we simply eliminate them now.
# Because templates may be nested, we eliminate them iteratively by starting from the most
# nested one (hence the 'while' loop).
# OLD comments and code:
# For the same reason, we also require that the body of a template does not contain
# opening braces (hence "[^\{]", any char except opening brace).
# 1 while ($$refToText =~ s/\{\{(?:[^\{]*?)\}\}/ /sg);
# END OF old comments and code
# We also require that the body of a template does not contain the template opening sequence
# (two successive opening braces - "\{\{"). We use negative lookahead to achieve this.
1 while ($$refToText =~ s/\{\{
(?:
(?:
(?!
\{\{
)
.
)*?
)
\}\}
/ /sgx);
# Remove any other <...> tags - but keep the text they enclose
# (the tags are replaced with spaces to prevent adjacent pieces of text
# from being glued together).
# Comments (<!-- ... -->) also fall into this category, and since they can easily span several lines,
# we use the "/s" modifier.
$$refToText =~ s/<(?:.*?)>/ /sg;
# Change markup on bold/italics emphasis. We probably don't need to distinguish
# these 3 types of emphasis, so we just replace all of them with a generic <em> tag.
# IMPORTANT: If 'encodeXmlChars' has beeen called before this line, then remember that
# the apostrophes were already quoted to "&apos;"
$$refToText =~ s/'''''(.*?)'''''/$1/g;
$$refToText =~ s/'''(.*?)'''/$1/g;
$$refToText =~ s/''(.*?)''/$1/g;
# Eliminate long sequences of newlines and whitespace.
# Note that we don't want to replace sequences of spaces only, as this might make the text
# less readable. Instead, we only eliminate sequences of whitespace that contain at least
# two newlines.
$$refToText =~ s/(?:\s*)\n(?:\s*)\n(?:\s*)/\n\n/g;
# Eliminate XML entities such as "&nbsp;" , "&times;" etc. - otherwise,
# in C++ code they will give rise to spurious words "nbsp", "times" etc.
# Note that the standard entities - &amp; , &quot; , &apos; , &lt; and &gt;
# are handled by the XML parser. All other entities, such as &nbsp; are passed
# by the XML parser to the upper level (in case of Wikipedia pages,
# to the rendering engine).
# Note that in the raw XML text, these entities look like "&amp;nbsp;"
# (i.e., with leading "&amp;"). XML parser replaces "&amp;" with "&",
# so here in the code we see the entities as "&nbsp;".
$$refToText =~ s{& # the entity starts with "&"
((?:\#?)(?:\w+)) # optional '#' sign (as in &#945;), followed by
# an uninterrupted sequence of letters and/or digits
; # the entity ends with a semicolon
}{&logReplacedXmlEntity($1)}egx; # entities are replaced with a space
if ($whetherToEncodeXmlChars) {
# encode text for XML
&encodeXmlChars($refToText);
}
# NOTE that the following operations introduce XML tags, so they must appear
# after the original text underwent character encoding with 'encodeXmlChars' !!
# Change markup for section headers.
# Note that section headers may only begin at the very first position in the line
# (not even after a space). Therefore, each header markup in the following commands
# is prefixed with "^" to make sure it begins at the beginning of the line.
# Since the text (e.g., article body) may contains multiple lines, we use
# the "/m" modifier to allow matching "^" at embedded "\n" positions.
$$refToText =~ s/^=====(.*?)=====/<h4>$1<\/h4>/mg;
$$refToText =~ s/^====(.*?)====/<h3>$1<\/h3>/mg;
$$refToText =~ s/^===(.*?)===/<h2>$1<\/h2>/mg;
$$refToText =~ s/^==(.*?)==/<h1>$1<\/h1>/mg;
}
sub logReplacedXmlEntity($) {
my ($xmlEntity) = @_;
print LOGF "ENTITY: &$xmlEntity;\n";
" "; # return value - entities are replaced with a space
}
BEGIN {
# Making variables static for the function to avoid recompilation of regular expressions
# every time the function is called.
# Table definitions can easily span several lines, hence the "/s" modifier
my $tableOpeningSequence1 = qr{<table> # either just <table>
| # or
<table(?:\s+)(?:[^<>]*)>}ix; # "<table" followed by at least one space
# (to prevent "<tablexxx"), followed by
# some optional text, e.g., table parameters
# as in "<table border=0>"
# In the above definition, prohibiting '<' and '>' chars ([^<>]) ensures
# that we do not consume more than necessary, so that in the example
# "<table border=0> aaa <table> bbb </table> ccc </table>"
# $1 is NOT extended to be "> aaa <table"
my $tableClosingSequence1 = qr/<\/table>/i;
# my $nonNestedTableRegex1 =
# qr{$tableOpeningSequence1 # opening sequence
# (
# (?: # non-capturing grouper
# (?! # lookahead negation
# $tableOpeningSequence1 # that's what we don't want to find inside a table definition
# )
# . # any character (such that there is no table opening sequence
# # after it because of the lookahead condition)
# )*? # shortest match of such characters, up to the closing of a table
# )
# $tableClosingSequence1}sx; # closing sequence
my $tableOpeningSequence2 = qr/\{\|/;
my $tableClosingSequence2 = qr/\|\}/;
# my $nonNestedTableRegex2 =
# qr{$tableOpeningSequence2 # opening sequence
# (
# (?: # non-capturing grouper
# (?! # lookahead negation
# $tableOpeningSequence2 # that's what we don't want to find inside a table definition
# )
# . # any character (such that there is no table opening sequence
# # after it because of the lookahead condition)
# )*? # shortest match of such characters, up to the closing of a table
# )
# $tableClosingSequence2}sx; # closing sequence
sub eliminateTables(\$) {
my ($refToText) = @_;
# Sadly, these patterns became too complex and cause segmentation fault,
# hence we fall back to only handling non-nested tables :(
# # Sometimes, tables are nested, therefore we use a while loop to eliminate them
# # recursively, while requiring that any table we eliminate does not contain nested tables.
# # For simplicity, we assume that tables of the two kinds (e.g., <table> ... </table> and {| ... |})
# # are not nested in one another.
$$refToText =~ s/$tableOpeningSequence1(.*?)$tableClosingSequence1/\n/sg;
$$refToText =~ s/$tableOpeningSequence2(.*?)$tableClosingSequence2/\n/sg;
}
} # end of BEGIN block
# If specified, 'elementToRemove' contains an element that needs to be removed as well.
# For links, this ensures that a page does not link to itself. For categories, this
# ensures that a page is not categorized to itself. This parameter is obviously
# irrelevant for filtering URLs.
# 'elementToRemove' must be a numeric value (not string), since we're testing it with '==' (not 'eq')
sub removeDuplicatesAndSelf(\@$) {
my ($refToArray, $elementToRemove) = @_;
my %seen = ();
my @uniq;
my $item;
foreach $item (@$refToArray) {
if ( defined($elementToRemove) && ($item == $elementToRemove) ) {
printf LOGF "Warning: current page links or categorizes to itself - " .
"link discarded ($elementToRemove)\n";
next;
}
push(@uniq, $item) unless $seen{$item}++;
}
# overwrite the original array with the new one that does not contain duplicates
@$refToArray = @uniq;
}
# Removes elements of the second list from the first list.
# For efficiency purposes, the second list is converted into a hash.
sub removeElements(\@\@) {
my ($refToArray, $refToElementsToRemove) = @_;
my %elementsToRemove = ();
my @result;
# Construct the hash table for fast lookups
my $item;
foreach $item (@$refToElementsToRemove) {
$elementsToRemove{$item} = 1;
}
foreach $item (@$refToArray) {
if ( ! defined($elementsToRemove{$item}) ) {
push(@result, $item);
}
}
# overwrite the original array with the new one
@$refToArray = @result;
}
sub getTimeAsString() {
my $tm = localtime();
my $result = sprintf("%02d:%02d:%02d", $tm->hour, $tm->min, $tm->sec);
}
sub trimWhitespaceBothSides(\$) {
my ($stringRef) = @_;
# remove leading whitespace
$$stringRef =~ s/^\s*//;
# remove trailing whitespace
$$stringRef =~ s/\s*$//;
}
# There are 3 kinds of related links that we look for:
# 1) Standalone (usually, at the beginning of the article or a section of it)
# Ex: Main articles: ...
# 2) Inlined - text in parentheses inside the body of the article
# Ex: medicine (see also: [[Health]])
# 3) Dedicated section
# Ex: == See also ==
#
# In all calls to 'extractInternalLinks':
# - The penultimate argument is 0, since we don't need to log anchor text here.
# Anchor text will be handled when we analyze all the internal links in
# the entire article (and not just look for related links).
# - The last argument is 0 in order not to remove duplicates on every invocation
# of 'extractInternalLinks'. This is because duplicates in related links are
# not very common, but performing duplicate removal each time is expensive.
# Instead, we remove duplicates once at the very end.
sub identifyRelatedArticles(\$\@$) {
my ($refToText, $refToRelatedArticles, $id) = @_;
# We split the text into a set of lines. This also creates a copy of the original text -
# this is important, since the function 'extractInternalLinks' modifies its argument,
# so we'd better use it on a copy of the real article body.
my @text = split("\n", $$refToText);
my $line;
# Standalone
foreach $line (@text) {
# We require that stanalone designators occur at the beginning of the line
# (after at most a few characters, such as a whitespace or a colon),
# and not just anywhere in the line. Otherwise, we would collect as related
# those links that just happen to occur in the same line with an unrelated
# string that represents a standalone designator.
if ($line =~ /^(?:.{0,5})(${relatedWording_Standalone}.*)$/) {
my $str = $1; # We extract links from the rest of the line
print LOGF "Related(S): $id => $str\n";
&extractInternalLinks(\$str, $refToRelatedArticles, $id, 0, 0);
print LOGF "Related(S): $id ==> @$refToRelatedArticles\n";
}
}
# Inlined (in parentheses)
foreach $line (@text) {
while ($line =~ /\((?:\s*)(${relatedWording_Inline}.*?)\)/g) {
my $str = $1;
print LOGF "Related(I): $id => $str\n";
&extractInternalLinks(\$str, $refToRelatedArticles, $id, 0, 0);
print LOGF "Related(I): $id ==> @$refToRelatedArticles\n";
}
}
# Section
# Sections can be at any level - "==", "===", "====" - it doesn't matter,
# so it suffices to look for two consecutive "=" signs
my $relatedSectionFound = 0;
foreach $line (@text) {
if ($relatedSectionFound) { # we're in the related section now
if ($line =~ /==(?:.*?)==/) { # we just encountered the next section - exit the loop
last;
} else { # collect the links from the current line
print LOGF "Related(N): $id => $line\n";
# 'extractInternalLinks' may mofidy its argument ('$line'), but it's OK
# as we do not do any further processing to '$line' or '@text'
&extractInternalLinks(\$line, $refToRelatedArticles, $id, 0, 0);
print LOGF "Related(N): $id ==> @$refToRelatedArticles\n";
}
} else { # we haven't yet found the related section
if ($line =~ /==(.*?)==/) { # found some section header - let's check it
my $sectionHeader = $1;
if ($sectionHeader =~ /$relatedWording_Section/) {
$relatedSectionFound = 1;
next; # proceed to the next line
} else {
next; # unrelated section - just proceed to the next line
}
} else {
next; # just proceed to the next line - nothing to do
}
}
}
&removeDuplicatesAndSelf($refToRelatedArticles, $id);
}
sub recordRelatedArticles($\@) {
my ($id, $refToRelatedArticles) = @_;
my $size = scalar(@$refToRelatedArticles);
return if ($size == 0);
print RELATEDF "$id\t", join(" ", @$refToRelatedArticles), "\n";
}
########################################################################
sub printUsage()
{
print "Wikiprep version $version, Copyright (C) 2007 Evgeniy Gabrilovich\n" .
"Wikiprep comes with ABSOLUTELY NO WARRANTY; for details type '$0 -license'.\n" .
"This is free software, and you are welcome to redistribute it\n" .
"under certain conditions; type '$0 -license' for details.\n" .
"Type '$0 -version' for version information.\n\n" .
"Usage: $0 -f <XML file with page dump>\n" .
" e.g., $0 -f pages_articles.xml\n\n";
}