###############################################################################
#
# wikiprep.pl - Preprocess Wikipedia XML dumps
# Copyright (C) 2007 Evgeniy Gabrilovich
# The author can be contacted by electronic mail at gabr@cs.technion.ac.il
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA,
# or see <http://www.gnu.org/licenses/> and
# <http://www.fsf.org/licensing/licenses/info/GPLv2.html>
#
###############################################################################

use strict;
use warnings;

use File::Basename;
use Getopt::Long;
use Time::localtime;
use XML::Parser;
use Parse::MediaWikiDump;

my $licenseFile = "COPYING";
my $version = "2.02";

if (@ARGV < 1) {
  &printUsage();
  exit 0;
}

my $file;
my $showLicense = 0;
my $showVersion = 0;

GetOptions('f=s' => \$file,
           'license' => \$showLicense,
           'version' => \$showVersion);

if ($showLicense) {
  if (-e $licenseFile) {
    print "See file $licenseFile for more details.\n";
  } else {
    print "Please see <http://www.gnu.org/licenses/> and <http://www.fsf.org/licensing/licenses/info/GPLv2.html>\n";
  }
  exit 0;
}
if ($showVersion) {
  print "Wikiprep version $version\n";
  exit 0;
}
if (!defined($file)) {
  &printUsage();
  exit 0;
}
if (! -e $file) {
  die "Input file '$file' cannot be opened for reading\n";
}


##### Global definitions #####

my %XmlEntities = ('&' => 'amp', '"' => 'quot', "'" => 'apos', '<' => 'lt', '>' => 'gt');

# The URL protocol (e.g., http) matched here may be in either case, hence we use the /i modifier.
my $urlProtocols = qr/http:\/\/|https:\/\/|telnet:\/\/|gopher:\/\/|file:\/\/|wais:\/\/|ftp:\/\/|mailto:|news:/i;
# A URL terminator may be either one of a list of characters OR end of string (that is, '$').
# This last part is necessary to handle URLs at the very end of a string when there is no "\n"
# or any other subsequent character.
my $urlTerminator = qr/[\[\]\{\}\s\n\|\"<>]|$/;

my $relatedWording_Standalone =
  qr/Main(?:\s+)article(?:s?)|Further(?:\s+)information|Related(?:\s+)article(?:s?)|Related(?:\s+)topic(?:s?)|See(?:\s+)main(?:\s+)article(?:s?)|See(?:\s+)article(?:s?)|See(?:\s+)also|For(?:\s+)(?:more|further)/i;
## For(?:\s+)more(?:\s+)(?:background|details)(?:\s+)on(?:\s+)this(?:\s+)topic,(?:\s+)see
my $relatedWording_Inline = qr/See[\s:]|See(?:\s+)also|For(?:\s+)(?:more|further)/i;
my $relatedWording_Section = qr/Further(?:\s+)information|See(?:\s+)also|Related(?:\s+)article(?:s?)|Related(?:\s+)topic(?:s?)/i;

my %monthToNumDays = ('January' => 31, 'February' => 29, 'March' => 31, 'April' => 30,
                      'May' => 31, 'June' => 30, 'July' => 31, 'August' => 31,
                      'September' => 30, 'October' => 31, 'November' => 30, 'December' => 31);
my %numberToMonth = (1 => 'January', 2 => 'February', 3 => 'March', 4 => 'April',
                     5 => 'May', 6 => 'June', 7 => 'July', 8 => 'August',
                     9 => 'September', 10 => 'October', 11 => 'November', 12 => 'December');

my $maxTemplateRecursionLevels = 5;
my $maxParameterRecursionLevels = 5;

##### Global variables #####

my %namespaces;
# we only process pages in these namespaces + the main namespace (which has an empty name)
my %okNamespacesForPrescanning = ('Template' => 1, 'Category' => 1);
my %okNamespacesForTransforming = ('Category' => 1); # we don't use templates as concepts

my %id2title;
my %title2id;
my %redir;
my %templates;         # template bodies for insertion
my %catHierarchy;      # each category is associated with a list of its immediate descendants
my %statCategories;    # number of pages classified under each category
my %statIncomingLinks; # number of links incoming to each page


my ($fileBasename, $filePath, $fileSuffix) = fileparse($file, ".xml");
my $outputFile = "$filePath/$fileBasename.hgw$fileSuffix";
my $logFile = "$filePath/$fileBasename.log";
my $anchorTextFile = "$filePath/$fileBasename.anchor_text";
my $relatedLinksFile = "$filePath/$fileBasename.related_links";

open(OUTF, "> $outputFile") or die "Cannot open $outputFile";
open(LOGF, "> $logFile") or die "Cannot open $logFile";
open(ANCHORF, "> $anchorTextFile") or die "Cannot open $anchorTextFile";
open(RELATEDF, "> $relatedLinksFile") or die "Cannot open $relatedLinksFile";

binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
binmode(OUTF, ':utf8');
binmode(LOGF, ':utf8');
binmode(ANCHORF, ':utf8');

print ANCHORF "# Line format: <Target page id> <Source page id> <Anchor text (up to the end of the line)>\n\n\n";
print RELATEDF "# Line format: <Page id> <List of ids of related articles>\n\n\n";

&copyXmlFileHeader();
&loadNamespaces();
&prescan();

my $numTitles = scalar( keys(%id2title) );
print "Loaded $numTitles titles\n";
my $numRedirects = scalar( keys(%redir) );
print "Loaded $numRedirects redirects\n";
my $numTemplates = scalar( keys(%templates) );
print "Loaded $numTemplates templates\n";

&transform();
&closeXmlFile();

&writeStatistics();
&writeCategoryHierarchy();

close(LOGF);
close(ANCHORF);
close(RELATEDF);

# Hogwarts needs the anchor text file to be sorted in the increasing order of target page id.
# The file is originally sorted by source page id (second field in each line).
# We now use stable (-s) numeric (-n) sort on the first field (-k 1,1).
# This way, the resultant file will be sorted on the target page id (first field) as primary key,
# and on the source page id (second field) as secondary key.
system("sort -s -n -k 1,1 $anchorTextFile > $anchorTextFile.sorted");
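
# Illustrative sketch (hypothetical data, not produced by the pipeline): given the input
#   12  3  cherry
#   5   7  banana
#   12  7  apple
# which arrives sorted by source id (field 2), the stable numeric sort on field 1 yields
#   5   7  banana
#   12  3  cherry
#   12  7  apple
# i.e., primary order by target id, preserving the source-id order within each target.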


##### Subroutines #####

sub normalizeTitle(\$) {
  my ($refToStr) = @_;

  # remove leading whitespace and underscores
  $$refToStr =~ s/^[\s_]+//;
  # remove trailing whitespace and underscores
  $$refToStr =~ s/[\s_]+$//;
  # replace sequences of whitespace and underscore chars with a single space
  $$refToStr =~ s/[\s_]+/ /g;

  if ($$refToStr =~ /^([^:]*):(\s*)(\S(?:.*))/) {
    my $prefix = $1;
    my $optionalWhitespace = $2;
    my $rest = $3;

    my $namespaceCandidate = $prefix;
    &normalizeNamespace(\$namespaceCandidate); # this must be done before the call to 'isKnownNamespace'
    if ( &isKnownNamespace(\$namespaceCandidate) ) {
      # If the prefix designates a known namespace, then it might be followed by optional
      # whitespace that should be removed to get the canonical page name
      # (e.g., "Category: Births" should become "Category:Births").
      $$refToStr = $namespaceCandidate . ":" . ucfirst($rest);
    } else {
      # No namespace, just capitalize first letter.
      # If the part before the colon is not a known namespace, then we must not remove the space
      # after the colon (if any), e.g., "3001: The_Final_Odyssey" != "3001:The_Final_Odyssey".
      # However, to get the canonical page name we must contract multiple spaces into one,
      # because "3001:   The_Final_Odyssey" != "3001: The_Final_Odyssey".
      $$refToStr = ucfirst($prefix) . ":" .
                   (length($optionalWhitespace) > 0 ? " " : "") . $rest;
    }
  } else {
    # no namespace, just capitalize first letter
    $$refToStr = ucfirst($$refToStr);
  }
}
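
# Illustrative canonicalizations (hypothetical titles; "Category" assumed to be a known namespace):
#   "  category:  births "     => "Category:Births"
#   "united_states"            => "United states"
#   "3001: the_final_odyssey"  => "3001: the final odyssey"  (unknown prefix, the space is kept)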

sub normalizeNamespace(\$) {
  my ($refToStr) = @_;

  $$refToStr = ucfirst( lc($$refToStr) );
}

# Checks if the prefix of the page name before the colon is actually one of the
# 16+2+2 namespaces defined in the XML file.
# Assumption: the argument was already normalized using 'normalizeNamespace'
sub isKnownNamespace(\$) {
  my ($refToStr) = @_;

  defined( $namespaces{$$refToStr} ); # return value
}

# The correct form to create a redirect is #REDIRECT [[ link ]],
# and function 'Parse::MediaWikiDump::page->redirect' only supports this form.
# However, it seems that Wikipedia can also tolerate a variety of other forms, such as
# REDIRECT|REDIRECTS|REDIRECTED|REDIRECTION, then an optional ":", optional "to" or optional "=".
# Therefore, we use our own function to handle these cases as well.
# If the page is a redirect, the function returns the title of the target page;
# otherwise, it returns 'undef'.
sub isRedirect($) {
  my ($page) = @_;

  # quick check
  return undef if ( ${$page->text} !~ /^#REDIRECT/i );

  if ( ${$page->text} =~ m{^\#REDIRECT        # Redirect must start with "#REDIRECT"
                                              # (the backslash is needed before "#" here, because
                                              # "#" has special meaning with /x modifier)
                           (?:S|ED|ION)?      # The word may be in any of these forms,
                                              # i.e., REDIRECT|REDIRECTS|REDIRECTED|REDIRECTION
                           (?:\s*)            # optional whitespace
                           (?: :|\sTO|=)?     # optional colon, "TO" or "="
                                              # (in case of "TO", we expect a whitespace before it,
                                              # so that it's not glued to the preceding word)
                           (?:\s*)            # optional whitespace
                           \[\[([^\]]*)\]\]   # the link itself
                          }ix ) {             # matching is case-insensitive, hence /i
    my $target = $1;

    if ($target =~ /^(.*)\#(?:.*)$/) {
      # The link contains an anchor. Anchors are not allowed in REDIRECT pages, and therefore
      # we adjust the link to point to the page as a whole (that's how Wikipedia works).
      $target = $1;
    }

    return $target;
  }

  # OK, it's probably either a malformed redirect link, or something else
  return undef;
}
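
# Illustrative page texts (hypothetical) that 'isRedirect' accepts, all yielding "Foo":
#   "#REDIRECT [[Foo]]"
#   "#redirects: [[Foo]]"
#   "#REDIRECTED TO [[Foo]]"
#   "#REDIRECTION = [[Foo#Section]]"   (the "#Section" anchor is stripped)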

sub isNamespaceOkForPrescanning($) {
  my ($page) = @_;

  &isNamespaceOk($page, \%okNamespacesForPrescanning);
}

sub isNamespaceOkForTransforming($) {
  my ($page) = @_;

  &isNamespaceOk($page, \%okNamespacesForTransforming);
}

sub isNamespaceOk($\%) {
  my ($page, $refToNamespaceHash) = @_;

  my $result = 1;

  # main namespace is OK, so we only check pages that belong to other namespaces

  if ($page->namespace ne '') {
    my $namespace = $page->namespace;
    &normalizeNamespace(\$namespace);
    if ( &isKnownNamespace(\$namespace) ) {
      $result = defined( $$refToNamespaceHash{$namespace} );
    } else {
      # the prefix before ":" in the page title is not a known namespace,
      # therefore, the page belongs to the main namespace and is OK
    }
  }

  $result; # return value
}

sub encodeXmlChars(\$) {
  my ($refToStr) = @_;

  $$refToStr =~ s/([&"'<>])/&$XmlEntities{$1};/g;
}

sub copyXmlFileHeader() {
  open(INF, "< $file") or die "Cannot open $file";
  while (<INF>) { # copy lines up to "</siteinfo>"
    if (/^<mediawiki /) {
      # The top level element - mediawiki - contains a lot of attributes (e.g., schema)
      # that are no longer applicable to the XML file after our transformation.
      # Therefore, we simply write an opening tag <mediawiki> without any attributes.
      print OUTF "<mediawiki>\n";
    } else {
      # All other lines (up to </siteinfo>) are copied as-is
      print OUTF;
    }
    last if (/<\/siteinfo>/);
  }
  close(INF); # this file will later be reopened by "Parse::MediaWikiDump"
}

sub closeXmlFile() {
  print OUTF "</mediawiki>\n";
  close(OUTF);
}

sub writeStatistics() {
  my $statCategoriesFile = "$filePath/$fileBasename.stat.categories";
  my $statIncomingLinksFile = "$filePath/$fileBasename.stat.inlinks";

  open(STAT_CATS, "> $statCategoriesFile") or die "Cannot open $statCategoriesFile";
  print STAT_CATS "# Line format: <CategoryId (= page id)> <Number of pages in this category>\n",
                  "# Here we count the *pages* that belong to this category, i.e., articles AND\n",
                  "# sub-categories of this category (but not the articles in the sub-categories).\n",
                  "\n\n";

  my $cat;
  foreach $cat ( sort { $statCategories{$b} <=> $statCategories{$a} }
                      keys(%statCategories) ) {
    print STAT_CATS "$cat\t$statCategories{$cat}\n";
  }
  close(STAT_CATS);

  open(STAT_INLINKS, "> $statIncomingLinksFile") or die "Cannot open $statIncomingLinksFile";
  print STAT_INLINKS "# Line format: <Target page id> <Number of links to it from other pages>\n\n\n";

  my $destination;
  foreach $destination ( sort { $statIncomingLinks{$b} <=> $statIncomingLinks{$a} }
                              keys(%statIncomingLinks) ) {
    print STAT_INLINKS "$destination\t$statIncomingLinks{$destination}\n";
  }

  close(STAT_INLINKS);
}

sub writeCategoryHierarchy() {
  my $catHierarchyFile = "$filePath/$fileBasename.cat_hier";

  open(CAT_HIER, "> $catHierarchyFile") or die "Cannot open $catHierarchyFile";
  print CAT_HIER "# Line format: <Category id> <List of ids of immediate descendants>\n\n\n";

  my $cat;
  # Sort by category id. The hash values are array references, so comparing them
  # with <=> would only compare memory addresses.
  foreach $cat ( sort { $a <=> $b } keys(%catHierarchy) ) {
    print CAT_HIER "$cat\t", join(" ", @{$catHierarchy{$cat}}), "\n";
  }

  close(CAT_HIER);
}

sub loadNamespaces() {
  # re-open the input XML file
  my $pages = Parse::MediaWikiDump::Pages->new($file);

  # load namespaces
  my $refNamespaces = $pages->namespaces;

  # namespace names are case-insensitive, so we force them
  # to canonical form to facilitate future comparisons
  my $ns;
  foreach $ns (@$refNamespaces) {
    my @namespaceData = @$ns;
    my $namespaceId = $namespaceData[0];
    my $namespaceName = $namespaceData[1];
    &normalizeNamespace(\$namespaceName);
    $namespaces{$namespaceName} = $namespaceId;
  }
}

# build id <-> title mappings and redirection table,
# as well as load templates
sub prescan() {
  # re-open the input XML file
  my $pages = Parse::MediaWikiDump::Pages->new($file);

  my $counter = 0;

  my $page;
  while (defined($page = $pages->page)) {
    my $id = $page->id;

    $counter++;

    if ($counter % 1000 == 0) {
      my $timeStr = &getTimeAsString();
      print LOGF "[$timeStr] Prescanning page id=$id\n";
    }

    my $title = $page->title;
    &normalizeTitle(\$title);

    if (length($title) == 0) {
      # This is a defense against pages whose title only contains UTF-8 chars that
      # are reduced to an empty string. Right now I can think of one such case -
      # <C2><A0> which represents the non-breaking space. In this particular case,
      # this page is a redirect to [[Non-breaking space]], but having in the system
      # a redirect page with an empty title causes numerous problems, so we'll live
      # happier without it.
      print LOGF "Skipping page with empty title id=$id\n";
      next;
    }

    my $redirect = &isRedirect($page);
    if (defined($redirect)) {
      &normalizeTitle(\$redirect);
      next if (length($redirect) == 0); # again, same precaution here - see comments above
      $redir{$title} = $redirect;

      # nothing more to do for redirect pages
      next;
    }

    if ( ! &isNamespaceOkForPrescanning($page) ) {
      next; # we're only interested in certain namespaces
    }
    # if we get here, then either the page belongs to the main namespace OR
    # it belongs to one of the namespaces we're interested in

    if ( exists($id2title{$id}) ) {
      print LOGF "Warning: Page id=$id already encountered before!\n";
      next;
    }
    if ( exists($title2id{$title}) ) {
      # A page could have been encountered before with a different spelling.
      # Examples: non-breaking space = <C2><A0>, German Eszett ligature ß = <C3><9F>
      print LOGF "Warning: Page title='$title' already encountered before!\n";
      next;
    }
    $id2title{$id} = $title;
    $title2id{$title} = $id;

    if ($title =~ /^Template:/) {
      my $text = ${$page->text};

      # We're storing template text for future inclusion, therefore,
      # remove all <noinclude> text and keep all <includeonly> text
      # (but eliminate <includeonly> tags per se).
      # However, if <onlyinclude> ... </onlyinclude> parts are present,
      # then only keep them and discard the rest of the template body.
      # This is because using <onlyinclude> on a text fragment is
      # equivalent to enclosing it in <includeonly> tags **AND**
      # enclosing all the rest of the template body in <noinclude> tags.
      # These definitions can easily span several lines, hence the "/s" modifiers.

      my $onlyincludeAccumulator;
      while ($text =~ /<onlyinclude>(.*?)<\/onlyinclude>/sg) {
        my $onlyincludeFragment = $1;
        $onlyincludeAccumulator .= "$onlyincludeFragment\n";
      }
      if ( defined($onlyincludeAccumulator)) {
        $text = $onlyincludeAccumulator;
      } else {
        # If there are no <onlyinclude> fragments, simply eliminate
        # <noinclude> fragments and keep <includeonly> ones.
        $text =~ s/<noinclude>(?:.*?)<\/noinclude>/\n/sg;
        $text =~ s/<includeonly>(.*?)<\/includeonly>/$1/sg;
      }

      $templates{$id} = $text;
    }
  }

  my $timeStr = &getTimeAsString();
  print LOGF "[$timeStr] Prescanning complete - prescanned $counter pages\n";
}
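
# Illustrative inclusion rules (hypothetical template bodies):
#   "Intro <noinclude>doc text</noinclude><includeonly>body</includeonly>"
#     is stored roughly as "Intro \nbody"  (noinclude dropped, includeonly tags removed)
#   "junk <onlyinclude>A</onlyinclude> junk <onlyinclude>B</onlyinclude> junk"
#     is stored as "A\nB\n"                (only the onlyinclude fragments survive)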

sub transform() {
  # re-open the input XML file
  my $pages = Parse::MediaWikiDump::Pages->new($file);

  my $page;
  while (defined($page = $pages->page)) {
    my $id = $page->id;

    my $timeStr = &getTimeAsString();
    print LOGF "[$timeStr] Transforming page id=$id\n";

    if ( defined( &isRedirect($page) ) ) {
      next; # we've already loaded all redirects in the prescanning phase
    }

    if ( ! &isNamespaceOkForTransforming($page) ) {
      next; # we're only interested in pages from certain namespaces
    }

    my $title = $page->title;
    &normalizeTitle(\$title);

    # see the comment about empty titles in function 'prescan'
    if (length($title) == 0) {
      print LOGF "Skipping page with empty title id=$id\n";
      next;
    }

    my $text = ${$page->text};

    my $orgLength = length($text); # text length BEFORE any transformations

    # The check for stub must be done BEFORE any further processing,
    # because stub indicators are templates, and templates are substituted.
    my $isStub = 0;
    if ( $text =~ m/stub}}/i ) {
      $isStub = 1;
    }

    my @categories;
    my @internalLinks;
    my @urls;

    &includeTemplates(\$text);

    my @relatedArticles;
    # This function only examines the contents of '$text', but doesn't change it.
    &identifyRelatedArticles(\$text, \@relatedArticles, $id);

    # We process categories directly, because '$page->categories' ignores
    # categories inherited from included templates
    &extractCategories(\$text, \@categories, $id);

    # Categories are listed at the end of articles, and therefore may mistakenly
    # be added to the list of related articles (which often appear in the last
    # section such as "See also"). To avoid this, we explicitly remove all categories
    # from the list of related links, and only then record the list of related links
    # to the file.
    &removeElements(\@relatedArticles, \@categories);
    &recordRelatedArticles($id, \@relatedArticles);

    &extractInternalLinks(\$text, \@internalLinks, $id, 1, 1);
    &extractUrls(\$text, \@urls);

    &postprocessText(\$text, 1);

    my $newLength = length($text); # text length AFTER all transformations

    &writePage($id, \$title, \$text, $orgLength, $newLength, $isStub, \@categories, \@internalLinks, \@urls);

    &updateStatistics(\@categories, \@internalLinks);

    if ($title =~ /^Category:/) {
      &updateCategoryHierarchy($id, \@categories);
    }
  }
}

sub updateStatistics(\@\@) {
  my ($refToCategories, $refToInternalLinks) = @_;

  my $cat;
  foreach $cat (@$refToCategories) {
    $statCategories{$cat}++;
  }

  my $link;
  foreach $link (@$refToInternalLinks) {
    $statIncomingLinks{$link}++;
  }
}

sub updateCategoryHierarchy($\@) {
  # The list of categories passed as a parameter is actually the list of parent categories
  # for the current category
  my ($childId, $refToParentCategories) = @_;

  my $parentCat;
  foreach $parentCat (@$refToParentCategories) {
    if ( exists($catHierarchy{$parentCat}) ) {
      push(@{$catHierarchy{$parentCat}}, $childId);
    } else {
      # create a new array with '$childId' as the only child (for now) of '$parentCat'
      my @arr;
      push(@arr, $childId);
      $catHierarchy{$parentCat} = [ @arr ];
    }
  }
}

sub writePage($\$\$$$$\@\@\@) {
  my ($id, $refToTitle, $refToText, $orgLength, $newLength, $isStub,
      $refToCategories, $refToInternalLinks, $refToUrls) = @_;

  my $numCategories = scalar(@$refToCategories);
  my $numLinks = scalar(@$refToInternalLinks);
  my $numUrls = scalar(@$refToUrls);

  print OUTF "<page id=\"$id\" orglength=\"$orgLength\" newlength=\"$newLength\" stub=\"$isStub\" " .
             "categories=\"$numCategories\" outlinks=\"$numLinks\" urls=\"$numUrls\">\n";

  my $encodedTitle = $$refToTitle;
  &encodeXmlChars(\$encodedTitle);
  print OUTF "<title>$encodedTitle</title>\n";

  print OUTF "<categories>";
  print OUTF join(" ", @$refToCategories);
  print OUTF "</categories>\n";

  print OUTF "<links>";
  print OUTF join(" ", @$refToInternalLinks);
  print OUTF "</links>\n";

  print OUTF "<urls>\n";

  my $url;
  foreach $url (@$refToUrls) {
    &encodeXmlChars(\$url);
    print OUTF "$url\n";
  }
  print OUTF "</urls>\n";

  # text has already undergone 'encodeXmlChars' in function 'postprocessText'
  print OUTF "<text>\n$$refToText\n</text>\n";

  print OUTF "</page>\n";
}

# Maps a title into the id, and performs redirection if necessary.
# Assumption: the argument was already normalized using 'normalizeTitle'
sub resolveLink(\$) {
  my ($refToTitle) = @_;

  # safety precaution
  return undef if (length($$refToTitle) == 0);

  my $targetId; # result
  my $targetTitle = $$refToTitle;

  if ( exists($redir{$$refToTitle}) ) { # this link is a redirect
    $targetTitle = $redir{$$refToTitle};

    # check if this is a double redirect
    if ( exists($redir{$targetTitle}) ) {
      $targetTitle = undef; # double redirects are not allowed and are ignored
      print LOGF "Warning: link '$$refToTitle' caused double redirection and was ignored\n";
    } else {
      print LOGF "Link '$$refToTitle' was redirected to '$targetTitle'\n";
    }
  }

  if ( defined($targetTitle) ) {
    if ( exists($title2id{$targetTitle}) ) {
      $targetId = $title2id{$targetTitle};
    } else {
      # target not found
      print LOGF "Warning: link '$$refToTitle' cannot be matched to an id\n";
      $targetId = undef;
    }
  } else {
    $targetId = undef;
  }

  $targetId; # return value
}

sub includeTemplates(\$) {
  my ($refToText) = @_;

  # Using the while loop forces templates to be included recursively
  # (i.e., includes the body of templates that themselves were included
  # on the previous iteration).
  # Template definitions can easily span several lines, hence the "/s" modifier.

  # Templates are frequently nested. Occasionally, parsing mistakes may cause template insertion
  # to enter an infinite loop, for instance when trying to instantiate Template:Country
  # {{country_{{{1}}}|{{{2}}}|{{{2}}}|size={{{size|}}}|name={{{name|}}}}}
  # which is repeatedly trying to insert template "country_", which is again resolved to
  # Template:Country. The straightforward solution of keeping track of templates that were
  # already inserted for the current article would not work, because the same template
  # may legally be used more than once, with different parameters in different parts of
  # the article. Therefore, we simply limit the number of iterations of nested template
  # inclusion.

  my $templateRecursionLevels = 0;

  # We also require that the body of a template does not contain the template opening sequence
  # (two successive opening braces - "\{\{"). We use negative lookahead to achieve this.
  while ( ($templateRecursionLevels < $maxTemplateRecursionLevels) &&
          $$refToText =~ s/\{\{
                           (?:\s*) # optional whitespace before the template name is ignored
                           (
                            (?:
                                (?!
                                   \{\{
                                )
                                .
                            )*?
                           )
                           # OLD code and comments
                           # (?:\s*)   # optional whitespace before the template name is ignored
                           # ([^\{]*?) # Occasionally, templates are nested,
                           #           # e.g., {{localurl:{{NAMESPACE}}:{{PAGENAME}}}}
                           #           # In order to prevent incorrect parsing, e.g.,
                           #           # "{{localurl:{{NAMESPACE}}", we require that the
                           #           # template name does not include opening braces,
                           #           # hence "[^\{]" (any char except opening brace).
                           # END OF OLD code and comments
                           \}\}
                          /&instantiateTemplate($1)/segx
        ) {
    $templateRecursionLevels++;
  }

  # Since we limit the number of levels of template recursion, we might end up with several
  # un-instantiated templates. In this case we simply eliminate them - however, we do so
  # later, in function 'postprocessText()', after extracting categories, links and URLs.
}
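
# Illustrative walk-through (hypothetical input): in "{{infobox|name={{PAGENAME}}}}"
# the innermost "{{PAGENAME}}" is instantiated first (the negative lookahead prevents a
# match from spanning a nested "{{"), and the enclosing "{{infobox|...}}" is instantiated
# on the next iteration of the while loop; whatever remains un-instantiated after
# $maxTemplateRecursionLevels iterations is stripped later in 'postprocessText()'.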

BEGIN {
  # Making variables static for the function to avoid recompilation of regular expressions
  # every time the function is called.

  my $specialSeparator = "\.pAr\.";
  my $specialSeparatorRegex = qr/$specialSeparator/;

  sub parseTemplateInvocation(\$\$\%) {
    my ($refToTemplateInvocation, $refToTemplateTitle, $refToParameterHash) = @_;

    # Template definitions (especially those with parameters) can easily span several lines,
    # hence the "/s" modifier. The template name extends up to the first pipeline symbol (if any).
    # Template parameters go after the "|" symbol.
    if ($$refToTemplateInvocation =~ /^([^|]*)\|(.*)$/sx) {
      $$refToTemplateTitle = $1; # single out the template name itself
      my $paramsList = $2;

      # Template parameters often contain URLs, internal links, or just other useful text,
      # whereas the template serves for presenting it in some nice way.
      # Parameters are separated by "|" symbols. However, we cannot simply split the string
      # on "|" symbols, since these frequently appear inside internal links. Therefore, we split
      # on those "|" symbols that are not inside [[...]]. It's obviously sufficient to check that
      # brackets are not improperly nested on one side of "|", so we use lookahead.
      # We first replace all "|" symbols that are not inside [[...]] with a special separator that
      # we invented, which will hopefully not normally appear in the text (.pAr.).
      # Next, we use 'split' to break the string on this new separator.

      $paramsList =~ s/\|            # split on pipeline symbol, such that
                       (?:           # non-capturing grouper that encloses 2 options
                         (?=         # zero-width lookahead - option #1
                           [^\]]*$   # there are no closing brackets up to the end
                                     # of the string (i.e., all the characters up to
                                     # the end of the string are not closing brackets)
                         )           # end of first lookahead (= end of option #1)
                        |            # or
                         (?=         # another zero-width lookahead - option #2
                           [^\]]* \[ # the nearest opening bracket on the right is not preceded
                                     # by a closing bracket (i.e., all the characters that
                                     # precede it are not closing brackets)
                         )           # end of second lookahead (= end of option #2)
                       )             # end of the outer grouper
                      /$specialSeparator/sxg; # replace matching symbols with a special separator
                                              # /s means string can contain newline chars

      my @parameters = split(/$specialSeparatorRegex/, $paramsList);

      # Parameters can be either named or unnamed. In the latter case, their name is defined by their
      # ordinal position (1, 2, 3, ...).

      my $unnamedParameterCounter = 0;

      # It's legal for unnamed parameters to be skipped, in which case they will get default
      # values (if available) during actual instantiation. That is, {{template_name|a||c}} means
      # parameter 1 gets the value 'a', parameter 2 value is not defined, and parameter 3 gets the value 'c'.
      # This case is correctly handled by function 'split', and does not require any special handling.
      my $param;
      foreach $param (@parameters) {
        # Spaces before or after a parameter value are normally ignored, UNLESS the parameter contains
        # a link (to prevent the link from being glued to the following text after template substitution)

        # Parameter values may contain "=" symbols, hence the parameter name extends up to
        # the first such symbol.
        # It is legal for a parameter to be specified several times, in which case the last assignment
        # takes precedence. Example: "{{t|a|b|c|2=B}}" is equivalent to "{{t|a|B|c}}".
        # Therefore, we don't check if the parameter has been assigned a value before, because
        # anyway the last assignment should override any previous ones.
        if ($param =~ /^([^=]*)=(.*)$/s) {
          # This is a named parameter.
          # This case also handles parameter assignments like "2=xxx", where the number of an unnamed
          # parameter ("2") is specified explicitly - this is handled transparently.

          my $parameterName = $1;
          my $parameterValue = $2;

          &trimWhitespaceBothSides(\$parameterName);
          if ($parameterValue !~ /\]\]/) { # if the value does not contain a link, trim whitespace
            &trimWhitespaceBothSides(\$parameterValue);
          }

          $$refToParameterHash{$parameterName} = $parameterValue;
        } else {
          # this is an unnamed parameter
          $unnamedParameterCounter++;

          if ($param !~ /\]\]/) { # if the value does not contain a link, trim whitespace
            &trimWhitespaceBothSides(\$param);
          }

          $$refToParameterHash{$unnamedParameterCounter} = $param;
        }
      }
    } else {
      # Template invocation does not contain a pipeline symbol, hence take the entire
      # invocation text as the template title.
      $$refToTemplateTitle = $$refToTemplateInvocation;
    }
  }

} # end of BEGIN block
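
# Illustrative parses (hypothetical invocations) of what 'parseTemplateInvocation' produces:
#   "t|a|b|c|2=B"    => title "t", parameters ( 1 => 'a', 2 => 'B', 3 => 'c' )
#                       (the named "2=B" overrides the second unnamed parameter, see above)
#   "t|text=[[x|y]]" => title "t", parameters ( 'text' => '[[x|y]]' )
#                       (the "|" inside [[...]] is not treated as a separator)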

sub instantiateTemplate($) {
  my ($templateInvocation) = @_;

  my $result = "";

  print LOGF "Instantiating template=$templateInvocation\n";

  my $templateTitle;
  my %templateParams;
  &parseTemplateInvocation(\$templateInvocation, \$templateTitle, \%templateParams);

  &computeFullyQualifiedTemplateTitle(\$templateTitle);

  &includeTemplateText(\$templateTitle, \%templateParams, \$result);

  $result; # return value
}

sub includeTemplateText(\$\%\$) {
  my ($refToTemplateTitle, $refToParameterHash, $refToResult) = @_;

  &normalizeTitle($refToTemplateTitle);
  my $includedPageId = &resolveLink($refToTemplateTitle);

  if ( defined($includedPageId) && exists($templates{$includedPageId}) ) {
    # OK, perform the actual inclusion with parameter substitution

    $$refToResult = $templates{$includedPageId};

    # Perform parameter substitution.
    # A parameter call ( {{{...}}} ) may span over a newline, hence the /s modifier.

    # Parameters may be nested (see comments below), hence we do the substitution iteratively
    # in a while loop. We also limit the maximum number of iterations to avoid too long or
    # even endless loops (in case of malformed input).
    my $parameterRecursionLevels = 0;

    # We also require that the body of a parameter does not contain the parameter opening sequence
    # (three successive opening braces - "\{\{\{"). We use negative lookahead to achieve this.
    while ( ($parameterRecursionLevels < $maxParameterRecursionLevels) &&
            $$refToResult =~ s/\{\{\{
                               (
                                (?:
                                    (?!
                                       \{\{\{
                                    )
                                    .
                                )*?
                               )
                               # OLD code and comments
                               # ([^\{]*?) # Occasionally, parameters are nested because
                               #           # they are dependent on other parameters,
                               #           # e.g., {{{Author|{{{PublishYear|}}}}}}
                               #           # (here, the default value for 'Author' is
                               #           # dependent on 'PublishYear').
                               #           # In order to prevent incorrect parsing, e.g.,
                               #           # "{{{Author|{{{PublishYear|}}}", we require that the
                               #           # parameter name does not include opening braces,
                               #           # hence "[^\{]" (any char except opening brace).
                               # END OF OLD code and comments
                               \}\}\}
                              /&substituteParameter($1, $refToParameterHash)/segx
          ) {
      $parameterRecursionLevels++;
    }
  } else {
    # The page being included cannot be identified - perhaps we skipped it (because currently
    # we only allow for inclusion of pages in the Template namespace), or perhaps it's
    # a variable name like {{NUMBEROFARTICLES}}. Just remove this inclusion directive and
    # replace it with a space
    print LOGF "Template '$$refToTemplateTitle' is not available for inclusion\n";
    $$refToResult = " ";
  }
}

sub substituteParameter($\%) {
  my ($parameter, $refToParameterHash) = @_;

  my $result;

  if ($parameter =~ /^([^|]*)\|(.*)$/) {
    # This parameter has a default value
    my $paramName = $1;
    my $defaultValue = $2;

    if ( defined($$refToParameterHash{$paramName}) ) {
      $result = $$refToParameterHash{$paramName}; # use parameter value specified in template invocation
    } else { # use the default value
      $result = $defaultValue;
    }
  } else {
    # parameter without a default value

    if ( defined($$refToParameterHash{$parameter}) ) {
      $result = $$refToParameterHash{$parameter}; # use parameter value specified in template invocation
    } else {
      # Parameter not specified in template invocation and does not have a default value -
      # do not perform substitution and keep the parameter in 3 braces
      # (these are Wiki rules for templates, see http://meta.wikimedia.org/wiki/Help:Template ).
      $result = "{{{$parameter}}}";
    }
  }

  # Surplus parameters - i.e., those assigned values in template invocation but not used
  # in the template body - are simply ignored.

  $result; # return value
}
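
# Illustrative resolutions (hypothetical): with parameters ( 1 => 'X' ) from the invocation,
#   {{{1}}}          => 'X'
#   {{{2|fallback}}} => 'fallback'   (absent parameter, default value used)
#   {{{2}}}          => '{{{2}}}'    (absent parameter, no default - left untouched)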

sub computeFullyQualifiedTemplateTitle(\$) {
  my ($refToTemplateTitle) = @_;

  # Determine the namespace of the page being included through the template mechanism

  my $namespaceSpecified = 0;

  if ($$refToTemplateTitle =~ /^:(.*)$/) {
    # Leading colon by itself implies main namespace, so strip this colon
    $$refToTemplateTitle = $1;
    $namespaceSpecified = 1;
  } elsif ($$refToTemplateTitle =~ /^([^:]*):/) {
    # colon found but not in the first position - check if it designates a known namespace
    my $prefix = $1;
    &normalizeNamespace(\$prefix);
    $namespaceSpecified = &isKnownNamespace(\$prefix);
  }

  # The case when the page title does not contain a colon at all also falls here.

  if ($namespaceSpecified) {
    # OK, the title of the page being included is fully qualified with a namespace
  } else {
    # The title of the page being included is NOT in the main namespace and lacks
    # any other explicit designation of the namespace - therefore, it is resolved
    # to the Template namespace (that's the default for the template inclusion mechanism).
    $$refToTemplateTitle = "Template:$$refToTemplateTitle";
  }
}
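
# Illustrative qualifications (hypothetical template titles):
#   "Infobox"      => "Template:Infobox"  (no namespace - Template is the default)
#   ":Main Page"   => "Main Page"         (leading colon forces the main namespace)
#   "Category:Foo" => "Category:Foo"      (already qualified with a known namespace)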

sub extractCategories(\$\@$) {
  my ($refToText, $refToCategoriesArray, $id) = @_;

  # Remember that namespace names are case-insensitive, hence we're matching with "/i".
  # The first parameter to 'collectCategory' is passed by value rather than by reference,
  # because it might be dangerous to pass a reference to $1 in case it might get modified
  # (with unclear consequences).
  $$refToText =~ s/\[\[(?:\s*)(Category:.*?)\]\]/&collectCategory($1, $refToCategoriesArray)/ieg;

  # We don't accumulate categories directly in a hash table, since this would not preserve
  # their original order of appearance.
  &removeDuplicatesAndSelf($refToCategoriesArray, $id);
}

sub collectCategory($\@) {
  my ($catName, $refToCategoriesArray) = @_;

  if ($catName =~ /^(.*)\|/) {
    # Some categories contain a sort key, e.g., [[Category:Whatever|*]] or [[Category:Whatever| ]]
    # In such a case, take only the category name itself.
    $catName = $1;
  }

  &normalizeTitle(\$catName);

  my $catId = &resolveLink(\$catName);
  if ( defined($catId) ) {
    push(@$refToCategoriesArray, $catId);
  } else {
    print LOGF "Warning: unknown category '$catName'\n";
  }

  # The return value is just a space, because we remove categories from the text
  # after we collected them
  " ";
}
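
# Illustrative inputs (hypothetical) showing the sort key being discarded:
#   "[[Category:Novels|*]]" and "[[Category:Novels| ]]" both collect the id of "Category:Novels"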

sub extractInternalLinks(\$\@$$$) {
  my ($refToText, $refToInternalLinksArray, $id,
      $whetherToLogAnchorText, $whetherToRemoveDuplicates) = @_;

  # For each internal link outgoing from the current article, this hash table maps
  # the target id into the anchor text associated with it. Naturally, we only
  # collect anchor text for links that can be resolved to a page id.
  my %anchorTexts;

  # Link definitions may span over adjacent lines and therefore contain line breaks,
  # hence we use the /s modifier.
  # Occasionally, links are nested, e.g.,
  # [[Image:kanner_kl2.jpg|frame|right|Dr. [[Leo Kanner]] introduced the label ''early infantile autism'' in [[1943]].]]
  # In order to prevent incorrect parsing, e.g., "[[Image:kanner_kl2.jpg|frame|right|Dr. [[Leo Kanner]]",
  # we extract links in several iterations of the while loop, while the link definition requires that
  # each pair [[...]] does not contain any opening brackets.

  1 while ( $$refToText =~ s/
                             (\w*)     # words may be glued to the beginning of the link,
                                       # in which case they become part of the link
                                       # e.g., "ex-[[Giuseppe Mazzini|Mazzinian]]"
                             \[\[
                             ([^\[]*?) # the link text can be any chars except an opening bracket,
                                       # this ensures we correctly parse nested links (see comments above)
                             \]\]
                             (\w*)     # words may be glued to the end of the link,
                                       # in which case they become part of the link
                                       # e.g., "[[public transport]]ation"
                            /&collectInternalLink($1, $2, $3, $refToInternalLinksArray, \%anchorTexts)/segx
          );

  if ($whetherToRemoveDuplicates) {
    &removeDuplicatesAndSelf($refToInternalLinksArray, $id);
  }

  if ($whetherToLogAnchorText) {
    &logAnchorText(\%anchorTexts, $id);
  }
}

sub logAnchorText(\%$) {
  my ($refToAnchorTextsHash, $curPageId) = @_;

  # Remember that we use a hash table to associate anchor text with target page ids.
  # Therefore, if the current page has several links to another page (it happens), then we only
  # keep the anchor text of the last one (and override the previous ones) - we can live with it.
  # Consequently, we do not need to remove duplicates as there are none.
  # However, we still remove the links that point from the page to itself.
  my $targetId;
  my $anchorText;
  while ( ($targetId, $anchorText) = each(%$refToAnchorTextsHash) ) {
    if ($targetId != $curPageId) {
      &postprocessText(\$anchorText, 0); # anchor text doesn't need escaping of XML characters,
                                         # hence the second function parameter is 0
      $anchorText =~ s/\n/ /g; # replace all newlines with spaces

      # make sure that something is left of anchor text after postprocessing
      if (length($anchorText) > 0) {
        print ANCHORF "$targetId\t$curPageId\t$anchorText\n";
      }
    }
  }
}

sub collectInternalLink($$$\@\%) {
  my ($prefix, $link, $suffix, $refToInternalLinksArray, $refToAnchorTextHash) = @_;

  my $originalLink = $link;
  my $result = "";

  # strip leading whitespace, if any
  $link =~ s/^\s*//;

  # Link definitions may span over adjacent lines and therefore contain line breaks,
  # hence we use the /s modifier on most matchings.

  # There are some special cases when the link may be preceded with a colon.
  # Known cases:
  # - Linking to a category (as opposed to actually assigning the current article
  #   to a category) is performed using special syntax [[:Category:...]]
  # - Linking to other languages, e.g., [[:fr:Wikipedia:Aide]]
  #   (without the leading colon, the link will go to the side menu)
  # - Linking directly to the description page of an image, e.g., [[:Image:wiki.png]]
  # In all such cases, we strip the leading colon.
  if ($link =~ /^
                :    # colon at the beginning of the link name
                (.*) # the rest of the link text
                $
               /sx) {
    # just strip this initial colon (as well as any whitespace preceding it)
    $link = $1;
  }

  # Alternative text may be available after the pipeline symbol.
  # If the pipeline symbol is only used for masking parts of
  # the link name for presentation, we still consider that the author of the page
  # deemed the resulting text important, hence we always set this variable when
  # the pipeline symbol is present.
  my $alternativeTextAvailable = 0;

  # Some links contain several pipeline symbols, e.g.,
  # [[Image:Zerzan.jpeg|thumb|right|[[John Zerzan]]]]
  # It seems that the extra pipeline symbols are parameters, so we just eliminate them.
  if ($link =~ /^(.*)\|([^|]*)$/s) { # first, extract the link up to the last pipeline symbol
    $link = $1;   # the part before the last pipeline
    $result = $2; # the part after the last pipeline, this is usually an alternative text for this link

    $alternativeTextAvailable = 1; # pipeline found, see comment above

    # Now check if there are pipeline symbols remaining.
    # Note that this time we're looking for the shortest match,
    # to take the part of the text up to the first pipeline symbol.
    if ($link =~ /^([^|]*)\|(.*)$/s) {
      $link = $1;
      # $2 contains the parameters, which we don't really need
    }

    if (length($result) == 0) {
      if ($link !~ /\#/) {
        # If the "|" symbol is not followed by some text, then it masks the namespace
        # as well as any text in parentheses at the end of the link title.
        # However, pipeline masking is only invoked if the link does not contain an anchor,
        # hence the additional condition in the 'if' statement.
        &performPipelineMasking(\$link, \$result);
      } else {
        # If the link contains an anchor, then masking is not invoked, and we take the entire link
        $result = $link;
      }
    }
  } else {
    # the link text does not contain the pipeline, so take it as-is
    $result = $link;
  }

  if ($link =~ /^(.*)\#(.*)$/s) {
    # The link contains an anchor, so adjust the link to point to the page as a whole.
    $link = $1;
    my $anchor = $2;
    # Check if the link points to an anchor on the current page, and if so - ignore it.
    if (length($link) == 0 && ! $alternativeTextAvailable) {
      # This is indeed a link pointing to an anchor on the current page.
      # The link is thus cleared, so that it will not be resolved and collected later.
      # For anchors to the same page, discard the leading '#' symbol, and take
      # the rest as the text - but only if no alternative text was provided for this link.
      $result = $anchor;
    }
  }

  # Now collect the link, or links if the original link is in the date format
  # and specifies both day and year. In the latter case, the function for date
  # normalization may also modify the link text ($result), and may collect more
  # than one link (one for the day, another one for the year).
  my $dateRecognized = 0;

  # Alternative text (specified after pipeline) blocks normalization of dates.
  # We also perform a quick check - if the link does not start with a digit,
  # then it surely does not contain a date
  if ( ($link =~ /^\d/) && (! $alternativeTextAvailable)) {
    $dateRecognized = &normalizeDates(\$link, \$result, $refToInternalLinksArray, $refToAnchorTextHash);
  }

  # If a date (either day or day + year) was recognized, then no further processing is necessary
  if (! $dateRecognized) {
    &normalizeTitle(\$link);
    my $targetId = &resolveAndCollectInternalLink(\$link, $refToInternalLinksArray);

    # Wikipedia pages contain many links to other Wiki projects (especially Wikipedia in
    # other languages). While these links are not resolved to valid pages, we also want
    # to ignore their text. However, simply discarding the text of all links that cannot
    # be resolved would be overly aggressive, as authors frequently define phrases as links
    # to articles that don't yet exist, in the hope that they will be added later.
    # Therefore, we formulate the following conditions that must hold simultaneously
    # for discarding the text of a link:
    # 1) the link was not resolved to a valid id
    # 2) the link does not contain alternative text (if it did, then the text is probably
    #    important enough to be retained)
    # 3) the link contains a colon - this is a very simple heuristic for identifying links to
    #    other Wiki projects, other languages, or simply other namespaces within current Wikipedia.
    #    While this method is not fool-proof (there are regular pages in the main namespace
    #    that contain a colon in their title), we believe this is a reasonable tradeoff.
    if ( !defined($targetId) && ! $alternativeTextAvailable && $link =~ /:/ ) {
      $result = "";
      print LOGF "Discarding text for link '$originalLink'\n";
    } else {
      # finally, add the text originally attached to the left and/or to the right of the link
      # (if the link represents a date, then it has no text glued to it, so it's OK to only
      # use the prefix and suffix here)
      $result = $prefix . $result . $suffix;
    }

    if ( defined($targetId) ) {
      # If the current page has several links to another page, then we only take the anchor
      # of the last one (and override the previous ones) - we can live with it.
      $$refToAnchorTextHash{$targetId} = $result;
    }
  }

  $result; # return value
}

sub performPipelineMasking(\$\$) {
  my ($refToLink, $refToResult) = @_;

  # First check for presence of a namespace
  if ($$refToLink =~ /^([^:]*):(.*)$/) {
    my $namespaceCandidate = $1;
    my $rest = $2;

    &normalizeNamespace(\$namespaceCandidate);
    if ( &isKnownNamespace(\$namespaceCandidate) ) {
      $$refToResult = $rest; # take the link text without the namespace
    } else {
      $$refToResult = $$refToLink; # otherwise, take the entire link text (for now)
    }
  } else {
    $$refToResult = $$refToLink; # otherwise, take the entire link text (for now)
  }

  # Now check if there are parentheses at the end of the link text
  # (we now operate on $$refToResult, because we might have stripped the leading
  # namespace in the previous test).
  if ($$refToResult =~ /^          # the beginning of the string
                        (.*)       # the text up to the last pair of parentheses
                        \(         # opening parenthesis
                        (?:[^()]*) # the text in the parentheses
                        \)         # closing parenthesis
                        (?:\s*)    # optional trailing whitespace, just in case
                        $          # end of string
                       /x) {
    $$refToResult = $1; # discard the text in parentheses at the end of the string
  }
}
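
# Illustrative masking (hypothetical links; "Wikipedia" assumed to be a known namespace):
#   "[[Wikipedia:Manual of Style|]]" => target "Wikipedia:Manual of Style", text "Manual of Style"
#   "[[Boston (band)|]]"             => target "Boston (band)", text "Boston "
#                                       (the trailing "(band)" qualifier is discarded)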

sub resolveAndCollectInternalLink(\$\@) {
  my ($refToLink, $refToInternalLinksArray) = @_;

  my $targetId = &resolveLink($refToLink);
  if ( defined($targetId) ) {
    push(@$refToInternalLinksArray, $targetId);
  } else {
    # Some cases in this category that obviously won't be resolved to legal ids:
    # - Links to namespaces that we don't currently handle
    #   (other than those for which 'isNamespaceOk' returns true);
    #   media and sound files fall in this category
    # - Links to other languages, e.g., [[de:...]]
    # - Links to other Wiki projects, e.g., [[Wiktionary:...]]
    print LOGF "Warning: unknown link '$$refToLink'\n";
  }

  $targetId; # return value
}

# Dates can appear in several formats
# 1) [[July 20]], [[1969]]
# 2) [[20 July]] [[1969]]
# 3) [[1969]]-[[07-20]]
# 4) [[1969-07-20]]
# The first one is handled correctly without any special treatment,
# so we don't even check for it here.
# In (2) and (3), we only normalize the day, because it will be parsed separately from the year.
# This function is only invoked if the link has no alternative text available, therefore,
# we're free to override the result text.
sub normalizeDates(\$\$\@\%) {
  my ($refToLink, $refToResultText, $refToInternalLinksArray, $refToAnchorTextHash) = @_;

  my $dateRecognized = 0;

  if ($$refToLink =~ /^(\d\d)\s*([A-Za-z]+)$/) {
    my $day = $1;
    my $month = ucfirst(lc($2));

    if ( defined($monthToNumDays{$month}) &&
         1 <= $day && $day <= $monthToNumDays{$month} ) {
      $dateRecognized = 1;

      $$refToLink = "$month $day";
      $$refToResultText = "$month $day";

      my $targetId = &resolveAndCollectInternalLink($refToLink, $refToInternalLinksArray);
      if ( defined($targetId) ) {
        $$refToAnchorTextHash{$targetId} = $$refToResultText;
      }
    } else {
      # this doesn't look like a valid date, leave as-is
    }
  } elsif ($$refToLink =~ /^(\d\d)\-(\d\d)$/) {
    my $monthNum = int($1);
    my $day = $2;

    if ( defined($numberToMonth{$monthNum}) ) {
      my $month = $numberToMonth{$monthNum};
      if (1 <= $day && $day <= $monthToNumDays{$month}) {
        $dateRecognized = 1;

        $$refToLink = "$month $day";
        # we add a leading space, to separate the preceding year ("[[1969]]-" in the example)
        # from the day that we're creating
        $$refToResultText = " $month $day";

        my $targetId = &resolveAndCollectInternalLink($refToLink, $refToInternalLinksArray);
        if ( defined($targetId) ) {
          $$refToAnchorTextHash{$targetId} = $$refToResultText;
        }
      } else {
        # this doesn't look like a valid date, leave as-is
      }
    } else {
      # this doesn't look like a valid date, leave as-is
    }
  } elsif ($$refToLink =~ /^(\d\d\d\d)\-(\d\d)\-(\d\d)$/) {
    my $year = $1;
    my $monthNum = int($2);
    my $day = $3;

    if ( defined($numberToMonth{$monthNum}) ) {
      my $month = $numberToMonth{$monthNum};
      if (1 <= $day && $day <= $monthToNumDays{$month}) {
        $dateRecognized = 1;

        $$refToLink = "$month $day";
        # the link text is combined from the day and the year
        $$refToResultText = "$month $day, $year";

        my $targetId;

        # collect the link for the day
        $targetId = &resolveAndCollectInternalLink($refToLink, $refToInternalLinksArray);
        if ( defined($targetId) ) {
          $$refToAnchorTextHash{$targetId} = $$refToLink;
        }

        # collect the link for the year
        $targetId = &resolveAndCollectInternalLink(\$year, $refToInternalLinksArray);
        if ( defined($targetId) ) {
          $$refToAnchorTextHash{$targetId} = $year;
        }
      } else {
        # this doesn't look like a valid date, leave as-is
      }
    } else {
      # this doesn't look like a valid date, leave as-is
    }
  }

  $dateRecognized; # return value
}
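
# Illustrative normalizations (hypothetical links):
#   "[[20 July]]"    => link "July 20", text "July 20"
#   "[[07-20]]"      => link "July 20", text " July 20"          (leading space, see above)
#   "[[1969-07-20]]" => links "July 20" and "1969", text "July 20, 1969"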

sub extractUrls(\$\@) {
  my ($refToText, $refToUrlsArray) = @_;

  # First we handle the case of URLs enclosed in single brackets, with or without the description,
  # and with optional leading and/or trailing whitespace.
  # Examples: [http://www.cnn.com], [ http://www.cnn.com ], [http://www.cnn.com CNN Web site]
  $$refToText =~ s/\[(?:\s*)($urlProtocols(?:[^\[\]]*))\]/&collectUrlFromBrackets($1, $refToUrlsArray)/eg;

  # Now we handle standalone URLs (those not enclosed in brackets).
  # The $urlTerminator is matched via positive lookahead (?=...) in order not to remove
  # the terminator symbol itself, but rather only the URL.
  $$refToText =~ s/($urlProtocols(?:.*?))(?=$urlTerminator)/&collectStandaloneUrl($1, $refToUrlsArray)/eg;

  &removeDuplicatesAndSelf($refToUrlsArray, undef);
}
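
# Illustrative extractions (hypothetical text): from
#   "See [http://www.cnn.com CNN] or visit http://example.org today"
# the URLs "http://www.cnn.com" and "http://example.org" are collected; the bracketed
# form keeps its description ("CNN") in the text, the standalone URL is replaced with a space.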

sub collectUrlFromBrackets($\@) {
  my ($url, $refToUrlsArray) = @_;

  my $text;
  # Assumption: leading whitespace has already been stripped
  if ( $url =~ /^($urlProtocols(?:.*?))($urlTerminator(?:.*))$/ ) { # description available
    push(@$refToUrlsArray, $1);
    $text = $2;
  } else { # no description
    push(@$refToUrlsArray, $url);
    $text = " ";
  }

  $text; # return value
}

sub collectStandaloneUrl($\@) {
  my ($url, $refToUrlsArray) = @_;

  push(@$refToUrlsArray, $url); # collect the URL as-is

  " "; # return value - replace the URL with a space
}
sub postprocessText(\$$) {
|
|
my ($refToText, $whetherToEncodeXmlChars) = @_;
|
|
|
|
# Eliminate all <includeonly> and <onlyinclude> fragments, because this text
|
|
# will not be included anywhere, as we already handled all inclusion directives
|
|
# in function 'includeTemplates'.
|
|
# This block can easily span several lines, hence the "/s" modifier.
|
|
$$refToText =~ s/<includeonly>(.*?)<\/includeonly>/ /sg;
|
|
$$refToText =~ s/<onlyinclude>(.*?)<\/onlyinclude>/ /sg;
|
|
|
|
# <noinclude> fragments remain, but remove the tags per se
|
|
# We block the code below, as <noinclude> tags will anyway be thrown away later,
|
|
# when we eliminate all remaining tags.
|
|
### This block can easily span several lines, hence the "/s" modifier
|
|
### $$refToText =~ s/<noinclude>(.*?)<\/noinclude>/$1/sg;
|
|
|
|
# replace <br> and <br /> directives with new paragraph
|
|
$$refToText =~ s/<br(?:\s*)(?:[\/]?)>/\n\n/g;
|
|
|
|
# Remove tables, as they often carry a lot of noise
|
|
&eliminateTables($refToText);
|
|
|
|
# Since we limit the number of levels of template recursion, we might end up with several
|
|
# un-instantiated templates. In this case we simply eliminate them now.
|
|
# Because templates may be nested, we eliminate them iteratively by starting from the most
|
|
# nested one (hence the 'while' loop).
|
|
# OLD comments and code:
|
|
# For the same reason, we also require that the body of a template does not contain
|
|
# opening braces (hence "[^\{]", any char except opening brace).
|
|
# 1 while ($$refToText =~ s/\{\{(?:[^\{]*?)\}\}/ /sg);
|
|
# END OF old comments and code
|
|
# We also require that the body of a template does not contain the template opening sequence
|
|
# (two successive opening braces - "\{\{"). We use negative lookahead to achieve this.
|
|
1 while ($$refToText =~ s/\{\{
|
|
(?:
|
|
(?:
|
|
(?!
|
|
\{\{
|
|
)
|
|
.
|
|
)*?
|
|
)
|
|
\}\}
|
|
/ /sgx);
|
|
|
|
# Remove any other <...> tags - but keep the text they enclose
|
|
# (the tags are replaced with spaces to prevent adjacent pieces of text
|
|
# from being glued together).
|
|
# Comments (<!-- ... -->) also fall into this category, and since they can easily span several lines,
|
|
# we use the "/s" modifier.
|
|
$$refToText =~ s/<(?:.*?)>/ /sg;
|
|
|
|
# Change markup on bold/italics emphasis. We probably don't need to distinguish
|
|
# these 3 types of emphasis, so we just replace all of them with a generic <em> tag.
|
|
# IMPORTANT: If 'encodeXmlChars' has beeen called before this line, then remember that
|
|
# the apostrophes were already quoted to "'"
|
|
$$refToText =~ s/'''''(.*?)'''''/$1/g;
|
|
$$refToText =~ s/'''(.*?)'''/$1/g;
|
|
$$refToText =~ s/''(.*?)''/$1/g;
|
|
|
|
# Eliminate long sequences of newlines and whitespace.
|
|
# Note that we don't want to replace sequences of spaces only, as this might make the text
|
|
# less readable. Instead, we only eliminate sequences of whitespace that contain at least
|
|
# two newlines.
|
|
$$refToText =~ s/(?:\s*)\n(?:\s*)\n(?:\s*)/\n\n/g;

  # Eliminate XML entities such as "&nbsp;", "&times;" etc. - otherwise,
  # in C++ code they will give rise to spurious words "nbsp", "times" etc.
  # Note that the standard entities - &amp;, &quot;, &apos;, &lt; and &gt; -
  # are handled by the XML parser. All other entities, such as &nbsp;, are passed
  # by the XML parser to the upper level (in case of Wikipedia pages,
  # to the rendering engine).
  # Note that in the raw XML text, these entities look like "&amp;nbsp;"
  # (i.e., with a leading "&amp;"). The XML parser replaces "&amp;" with "&",
  # so here in the code we see the entities as "&nbsp;".
  $$refToText =~ s{&                 # the entity starts with "&"
                   ((?:\#?)(?:\w+))  # an optional '#' sign (as in &#945;), followed by
                                     # an uninterrupted sequence of letters and/or digits
                   ;                 # the entity ends with a semicolon
                  }{&logReplacedXmlEntity($1)}egx;  # entities are replaced with a space
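  # A sketch of the effect (not executed): given the post-parser text
  # "Alpha&nbsp;Omega", the substitution matches "&nbsp;", writes
  # "ENTITY: &nbsp;" to the log via 'logReplacedXmlEntity', and replaces
  # the entity with a space, yielding "Alpha Omega".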

  if ($whetherToEncodeXmlChars) {
    # encode text for XML
    &encodeXmlChars($refToText);
  }

  # NOTE that the following operations introduce XML tags, so they must appear
  # after the original text underwent character encoding with 'encodeXmlChars' !!

  # Change markup for section headers.
  # Note that section headers may only begin at the very first position in the line
  # (not even after a space). Therefore, each header markup in the following commands
  # is prefixed with "^" to make sure it begins at the beginning of the line.
  # Since the text (e.g., article body) may contain multiple lines, we use
  # the "/m" modifier to allow matching "^" at embedded "\n" positions.
  $$refToText =~ s/^=====(.*?)=====/<h4>$1<\/h4>/mg;
  $$refToText =~ s/^====(.*?)====/<h3>$1<\/h3>/mg;
  $$refToText =~ s/^===(.*?)===/<h2>$1<\/h2>/mg;
  $$refToText =~ s/^==(.*?)==/<h1>$1<\/h1>/mg;
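  # E.g. (illustrative only): the line "== History ==" becomes
  # "<h1> History </h1>" (spaces inside the markers are preserved), while
  # "=== Early years ===" becomes "<h2> Early years </h2>". The longest
  # markers are handled first, so "==" never clips a deeper header.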
}

sub logReplacedXmlEntity($) {
  my ($xmlEntity) = @_;

  print LOGF "ENTITY: &$xmlEntity;\n";

  return " ";  # return value - entities are replaced with a space
}

BEGIN {
  # Making variables static for the function to avoid recompilation of regular expressions
  # every time the function is called.

  # Table definitions can easily span several lines, hence the "/s" modifier

  my $tableOpeningSequence1 = qr{<table>                      # either just <table>
                                 |                            # or
                                 <table(?:\s+)(?:[^<>]*)>}ix; # "<table" followed by at least one space
                                                              # (to prevent "<tablexxx"), followed by
                                                              # some optional text, e.g., table parameters
                                                              # as in "<table border=0>"
  # In the above definition, prohibiting '<' and '>' chars ([^<>]) ensures
  # that we do not consume more than necessary, so that in the example
  #   "<table border=0> aaa <table> bbb </table> ccc </table>"
  # $1 is NOT extended to be "> aaa <table"

  my $tableClosingSequence1 = qr/<\/table>/i;
  # my $nonNestedTableRegex1 =
  #   qr{$tableOpeningSequence1        # opening sequence
  #      (
  #        (?:                         # non-capturing grouper
  #          (?!                       # lookahead negation
  #            $tableOpeningSequence1  # that's what we don't want to find inside a table definition
  #          )
  #          .                         # any character (such that there is no table opening sequence
  #                                    # after it because of the lookahead condition)
  #        )*?                         # shortest match of such characters, up to the closing of a table
  #      )
  #      $tableClosingSequence1}sx;    # closing sequence

  my $tableOpeningSequence2 = qr/\{\|/;
  my $tableClosingSequence2 = qr/\|\}/;
  # my $nonNestedTableRegex2 =
  #   qr{$tableOpeningSequence2        # opening sequence
  #      (
  #        (?:                         # non-capturing grouper
  #          (?!                       # lookahead negation
  #            $tableOpeningSequence2  # that's what we don't want to find inside a table definition
  #          )
  #          .                         # any character (such that there is no table opening sequence
  #                                    # after it because of the lookahead condition)
  #        )*?                         # shortest match of such characters, up to the closing of a table
  #      )
  #      $tableClosingSequence2}sx;    # closing sequence

  sub eliminateTables(\$) {
    my ($refToText) = @_;

    # Sadly, these patterns became too complex and caused segmentation faults,
    # hence we fall back to only handling non-nested tables :(
    # # Sometimes, tables are nested, therefore we use a while loop to eliminate them
    # # recursively, while requiring that any table we eliminate does not contain nested tables.
    # # For simplicity, we assume that tables of the two kinds (e.g., <table> ... </table> and {| ... |})
    # # are not nested in one another.

    $$refToText =~ s/$tableOpeningSequence1(.*?)$tableClosingSequence1/\n/sg;
    $$refToText =~ s/$tableOpeningSequence2(.*?)$tableClosingSequence2/\n/sg;
  }
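
  # Illustrative sketch (comment only, not executed): given
  #   'before {| border="1"\n| cell\n|} after'
  # the second substitution above replaces the whole "{| ... |}" block with a
  # single newline, yielding "before \n after". Nested tables leave residue:
  # in "{| outer {| inner |} rest |}" the non-greedy match stops at the first
  # "|}", so " rest |}" survives - the price of the non-nested fallback.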

} # end of BEGIN block

# If specified, 'elementToRemove' contains an element that needs to be removed as well.
# For links, this ensures that a page does not link to itself. For categories, this
# ensures that a page is not categorized under itself. This parameter is obviously
# irrelevant for filtering URLs.
# 'elementToRemove' must be a numeric value (not a string), since we test it with '==' (not 'eq').
sub removeDuplicatesAndSelf(\@$) {
  my ($refToArray, $elementToRemove) = @_;

  my %seen = ();
  my @uniq;

  my $item;
  foreach $item (@$refToArray) {
    if ( defined($elementToRemove) && ($item == $elementToRemove) ) {
      print LOGF "Warning: current page links to or categorizes itself - " .
                 "link discarded ($elementToRemove)\n";
      next;
    }
    push(@uniq, $item) unless $seen{$item}++;
  }

  # overwrite the original array with the new one that does not contain duplicates
  @$refToArray = @uniq;
}
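
# Illustrative usage (a sketch, not executed; 3 stands for the current page id):
#   my @links = (5, 7, 5, 3, 7);
#   &removeDuplicatesAndSelf(\@links, 3);
#   # @links is now (5, 7): duplicates are dropped, and the self-link 3 is
#   # discarded with a warning in the log.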

# Removes elements of the second list from the first list.
# For efficiency purposes, the second list is converted into a hash.
sub removeElements(\@\@) {
  my ($refToArray, $refToElementsToRemove) = @_;

  my %elementsToRemove = ();
  my @result;

  # Construct the hash table for fast lookups
  my $item;
  foreach $item (@$refToElementsToRemove) {
    $elementsToRemove{$item} = 1;
  }

  foreach $item (@$refToArray) {
    if ( ! defined($elementsToRemove{$item}) ) {
      push(@result, $item);
    }
  }

  # overwrite the original array with the new one
  @$refToArray = @result;
}
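
# Illustrative usage (a sketch, not executed):
#   my @links    = (10, 20, 30, 40);
#   my @stopList = (20, 40);
#   &removeElements(\@links, \@stopList);   # @links is now (10, 30)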

sub getTimeAsString() {
  my $tm = localtime();
  # format the current time as HH:MM:SS
  return sprintf("%02d:%02d:%02d", $tm->hour, $tm->min, $tm->sec);
}

sub trimWhitespaceBothSides(\$) {
  my ($stringRef) = @_;

  # remove leading whitespace
  $$stringRef =~ s/^\s*//;
  # remove trailing whitespace
  $$stringRef =~ s/\s*$//;
}
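
# Illustrative usage (a sketch, not executed):
#   my $s = "  some text \n";
#   &trimWhitespaceBothSides(\$s);   # $s is now "some text"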

# There are 3 kinds of related links that we look for:
# 1) Standalone (usually at the beginning of the article or a section of it)
#    Ex: Main articles: ...
# 2) Inlined - text in parentheses inside the body of the article
#    Ex: medicine (see also: [[Health]])
# 3) Dedicated section
#    Ex: == See also ==
#
# In all calls to 'extractInternalLinks':
# - The penultimate argument is 0, since we don't need to log anchor text here.
#   Anchor text will be handled when we analyze all the internal links in
#   the entire article (and not just look for related links).
# - The last argument is 0 in order not to remove duplicates on every invocation
#   of 'extractInternalLinks'. This is because duplicates in related links are
#   not very common, but performing duplicate removal each time is expensive.
#   Instead, we remove duplicates once at the very end.
sub identifyRelatedArticles(\$\@$) {
  my ($refToText, $refToRelatedArticles, $id) = @_;

  # We split the text into a set of lines. This also creates a copy of the original text -
  # this is important, since the function 'extractInternalLinks' modifies its argument,
  # so we'd better use it on a copy of the real article body.
  my @text = split("\n", $$refToText);
  my $line;

  # Standalone
  foreach $line (@text) {
    # We require that standalone designators occur at the beginning of the line
    # (after at most a few characters, such as whitespace or a colon),
    # and not just anywhere in the line. Otherwise, we would collect as related
    # those links that just happen to occur in the same line with an unrelated
    # string that represents a standalone designator.
    if ($line =~ /^(?:.{0,5})(${relatedWording_Standalone}.*)$/) {
      my $str = $1; # we extract links from the rest of the line
      print LOGF "Related(S): $id => $str\n";
      &extractInternalLinks(\$str, $refToRelatedArticles, $id, 0, 0);
      print LOGF "Related(S): $id ==> @$refToRelatedArticles\n";
    }
  }

  # Inlined (in parentheses)
  foreach $line (@text) {
    while ($line =~ /\((?:\s*)(${relatedWording_Inline}.*?)\)/g) {
      my $str = $1;
      print LOGF "Related(I): $id => $str\n";
      &extractInternalLinks(\$str, $refToRelatedArticles, $id, 0, 0);
      print LOGF "Related(I): $id ==> @$refToRelatedArticles\n";
    }
  }

  # Section
  # Sections can be at any level - "==", "===", "====" - it doesn't matter,
  # so it suffices to look for two consecutive "=" signs
  my $relatedSectionFound = 0;
  foreach $line (@text) {
    if ($relatedSectionFound) { # we're in the related section now
      if ($line =~ /==(?:.*?)==/) { # we just encountered the next section - exit the loop
        last;
      } else { # collect the links from the current line
        print LOGF "Related(N): $id => $line\n";
        # 'extractInternalLinks' may modify its argument ('$line'), but that's OK,
        # as we do no further processing on '$line' or '@text'
        &extractInternalLinks(\$line, $refToRelatedArticles, $id, 0, 0);
        print LOGF "Related(N): $id ==> @$refToRelatedArticles\n";
      }
    } else { # we haven't yet found the related section
      if ($line =~ /==(.*?)==/) { # found some section header - let's check it
        my $sectionHeader = $1;
        if ($sectionHeader =~ /$relatedWording_Section/) {
          $relatedSectionFound = 1;
          next; # proceed to the next line
        } else {
          next; # unrelated section - just proceed to the next line
        }
      } else {
        next; # just proceed to the next line - nothing to do
      }
    }
  }

  &removeDuplicatesAndSelf($refToRelatedArticles, $id);
}
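
# Illustrative sketch (not executed; assumes page id 42):
#   my $body = "Intro.\n== See also ==\n* [[Foo]]\n* [[Bar]]\n== Notes ==\n";
#   my @related = ();
#   &identifyRelatedArticles(\$body, \@related, 42);
#   # @related now holds whatever ids 'extractInternalLinks' resolves for
#   # [[Foo]] and [[Bar]], with duplicates and self-links removed.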

sub recordRelatedArticles($\@) {
  my ($id, $refToRelatedArticles) = @_;

  my $size = scalar(@$refToRelatedArticles);
  return if ($size == 0);

  print RELATEDF "$id\t", join(" ", @$refToRelatedArticles), "\n";
}
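
# For example (illustrative only): for page 1234 with related pages 5, 6 and 7,
# the line written to the related-links file is:
#   1234<TAB>5 6 7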


########################################################################

sub printUsage()
{
  print "Wikiprep version $version, Copyright (C) 2007 Evgeniy Gabrilovich\n" .
        "Wikiprep comes with ABSOLUTELY NO WARRANTY; for details type '$0 -license'.\n" .
        "This is free software, and you are welcome to redistribute it\n" .
        "under certain conditions; type '$0 -license' for details.\n" .
        "Type '$0 -version' for version information.\n\n" .
        "Usage: $0 -f <XML file with page dump>\n" .
        "  e.g., $0 -f pages_articles.xml\n\n";
}