|
|
- ###############################################################################
- #
- # wikiprep.pl - Preprocess Wikipedia XML dumps
- # Copyright (C) 2007 Evgeniy Gabrilovich
- # The author can be contacted by electronic mail at gabr@cs.technion.ac.il
- #
- # This program is free software; you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation; either version 2 of the License, or
- # (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with this program; if not, write to the Free Software
- # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA,
- # or see <http://www.gnu.org/licenses/> and
- # <http://www.fsf.org/licensing/licenses/info/GPLv2.html>
- #
- ###############################################################################
-
-
- use strict;
- use warnings;
-
- use File::Basename;
- use Getopt::Long;
- use Time::localtime;
- use XML::Parser;
-
- my $licenseFile = "COPYING";
- my $version = "2.02";
-
- if (@ARGV < 1) {
- &printUsage();
- exit 0;
- }
-
- my $file;
- my $showLicense = 0;
- my $showVersion = 0;
-
- GetOptions('f=s' => \$file,
- 'license' => \$showLicense,
- 'version' => \$showVersion);
-
- if ($showLicense) {
- if (-e $licenseFile) {
- print "See file $licenseFile for more details.\n"
- } else {
- print "Please see <http://www.gnu.org/licenses/> and <http://www.fsf.org/licensing/licenses/info/GPLv2.html>\n";
- }
- exit 0;
- }
- if ($showVersion) {
- print "Wikiprep version $version\n";
- exit 0;
- }
- if (!defined($file)) {
- &printUsage();
- exit 0;
- }
- if (! -e $file) {
- die "Input file '$file' cannot be opened for reading\n";
- }
-
-
- ##### Global definitions #####
-
- my %XmlEntities = ('&' => 'amp', '"' => 'quot', "'" => 'apos', '<' => 'lt', '>' => 'gt');
-
- # The URL protocol (e.g., http) matched here may be in either case, hence we use the /i modifier.
- my $urlProtocols = qr/http:\/\/|https:\/\/|telnet:\/\/|gopher:\/\/|file:\/\/|wais:\/\/|ftp:\/\/|mailto:|news:/i;
- # A URL terminator may be either one of a list of characters OR end of string (that is, '$').
- # This last part is necessary to handle URLs at the very end of a string when there is no "\n"
- # or any other subsequent character.
- my $urlTerminator = qr/[\[\]\{\}\s\n\|\"<>]|$/;
-
- my $relatedWording_Standalone =
- qr/Main(?:\s+)article(?:s?)|Further(?:\s+)information|Related(?:\s+)article(?:s?)|Related(?:\s+)topic(?:s?)|See(?:\s+)main(?:\s+)article(?:s?)|See(?:\s+)article(?:s?)|See(?:\s+)also|For(?:\s+)(?:more|further)/i;
- ## For(?:\s+)more(?:\s+)(?:background|details)(?:\s+)on(?:\s+)this(?:\s+)topic,(?:\s+)see
- my $relatedWording_Inline = qr/See[\s:]|See(?:\s+)also|For(?:\s+)(?:more|further)/i;
- my $relatedWording_Section = qr/Further(?:\s+)information|See(?:\s+)also|Related(?:\s+)article(?:s?)|Related(?:\s+)topic(?:s?)/i;
-
- my %monthToNumDays = ('January' => 31, 'February' => 29, 'March' => 31, 'April' => 30,
- 'May' => 31, 'June' => 30, 'July' => 31, 'August' => 31,
- 'September' => 30, 'October' => 31, 'November' => 30, 'December' => 31);
- my %numberToMonth = (1 => 'January', 2 => 'February', 3 => 'March', 4 => 'April',
- 5 => 'May', 6 => 'June', 7 => 'July', 8 => 'August',
- 9 => 'September', 10 => 'October', 11 => 'November', 12 => 'December');
-
- my $maxTemplateRecursionLevels = 5;
- my $maxParameterRecursionLevels = 5;
-
- ##### Global variables #####
-
- my %namespaces;
- # we only process pages in these namespaces + the main namespace (which has an empty name)
- my %okNamespacesForPrescanning = ('Template' => 1, 'Category' => 1);
- my %okNamespacesForTransforming = ('Category' => 1); # we don't use templates as concepts
-
- my %id2title;
- my %title2id;
- my %redir;
- my %templates; # template bodies for insertion
- my %catHierarchy; # each category is associated with a list of its immediate descendants
- my %statCategories; # number of pages classified under each category
- my %statIncomingLinks; # number of links incoming to each page
-
-
- my ($fileBasename, $filePath, $fileSuffix) = fileparse($file, ".xml");
- my $outputFile = "$filePath/$fileBasename.hgw$fileSuffix";
- my $logFile = "$filePath/$fileBasename.log";
- my $anchorTextFile = "$filePath/$fileBasename.anchor_text";
- my $relatedLinksFile = "$filePath/$fileBasename.related_links";
-
- open(OUTF, "> $outputFile") or die "Cannot open $outputFile";
- open(LOGF, "> $logFile") or die "Cannot open $logFile";
- open(ANCHORF, "> $anchorTextFile") or die "Cannot open $anchorTextFile";
- open(RELATEDF, "> $relatedLinksFile") or die "Cannot open $relatedLinksFile";
-
- binmode(STDOUT, ':utf8');
- binmode(STDERR, ':utf8');
- binmode(OUTF, ':utf8');
- binmode(LOGF, ':utf8');
- binmode(ANCHORF, ':utf8');
-
- print ANCHORF "# Line format: <Target page id> <Source page id> <Anchor text (up to the end of the line)>\n\n\n";
- print RELATEDF "# Line format: <Page id> <List of ids of related articles>\n\n\n";
-
- ©XmlFileHeader();
- &loadNamespaces();
- &prescan();
-
- my $numTitles = scalar( keys(%id2title) );
- print "Loaded $numTitles titles\n";
- my $numRedirects = scalar( keys(%redir) );
- print "Loaded $numRedirects redirects\n";
- my $numTemplates = scalar( keys(%templates) );
- print "Loaded $numTemplates templates\n";
-
- &transform();
- &closeXmlFile();
-
- &writeStatistics();
- &writeCategoryHierarchy();
-
- close(LOGF);
- close(ANCHORF);
- close(RELATEDF);
-
- # Hogwarts needs the anchor text file to be sorted in the increading order of target page id.
- # The file is originally sorted by source page id (second field in each line).
- # We now use stable (-s) numeric (-n) sort on the first field (-k 1,1).
- # This way, the resultant file will be sorted on the target page id (first field) as primary key,
- # and on the source page id (second field) as secondary key.
- system("sort -s -n -k 1,1 $anchorTextFile > $anchorTextFile.sorted");
-
-
- ##### Subroutines #####
-
- sub normalizeTitle(\$) {
- my ($refToStr) = @_;
-
- # remove leading whitespace and underscores
- $$refToStr =~ s/^[\s_]+//;
- # remove trailing whitespace and underscores
- $$refToStr =~ s/[\s_]+$//;
- # replace sequences of whitespace and underscore chars with a single space
- $$refToStr =~ s/[\s_]+/ /g;
-
- if ($$refToStr =~ /^([^:]*):(\s*)(\S(?:.*))/) {
- my $prefix = $1;
- my $optionalWhitespace = $2;
- my $rest = $3;
-
- my $namespaceCandidate = $prefix;
- &normalizeNamespace(\$namespaceCandidate); # this must be done before the call to 'isKnownNamespace'
- if ( &isKnownNamespace(\$namespaceCandidate) ) {
- # If the prefix designates a known namespace, then it might follow by optional
- # whitespace that should be removed to get the canonical page name
- # (e.g., "Category: Births" should become "Category:Births").
- $$refToStr = $namespaceCandidate . ":" . ucfirst($rest);
- } else {
- # No namespace, just capitalize first letter.
- # If the part before the colon is not a known namespace, then we must not remove the space
- # after the colon (if any), e.g., "3001: The_Final_Odyssey" != "3001:The_Final_Odyssey".
- # However, to get the canonical page name we must contract multiple spaces into one,
- # because "3001: The_Final_Odyssey" != "3001: The_Final_Odyssey".
- $$refToStr = ucfirst($prefix) . ":" .
- (length($optionalWhitespace) > 0 ? " " : "") . $rest;
- }
- } else {
- # no namespace, just capitalize first letter
- $$refToStr = ucfirst($$refToStr);
- }
- }
-
- sub normalizeNamespace(\$) {
- my ($refToStr) = @_;
-
- $$refToStr = ucfirst( lc($$refToStr) );
- }
-
- # Checks if the prefix of the page name before the colon is actually one of the
- # 16+2+2 namespaces defined in the XML file.
- # Assumption: the argument was already normalized using 'normalizeNamespace'
- sub isKnownNamespace(\$) {
- my ($refToStr) = @_;
-
- defined( $namespaces{$$refToStr} ); # return value
- }
-
- # The correct form to create a redirect is #REDIRECT [[ link ]],
- # and function 'Parse::MediaWikiDump::page->redirect' only supports this form.
- # However, it seems that Wikipedia can also tolerate a variety of other forms, such as
- # REDIRECT|REDIRECTS|REDIRECTED|REDIRECTION, then an optional ":", optional "to" or optional "=".
- # Therefore, we use our own function to handle these cases as well.
- # If the page is a redirect, the function returns the title of the target page;
- # otherwise, it returns 'undef'.
- sub isRedirect($) {
- my ($page) = @_;
-
- # quick check
- return undef if ( ${$page->text} !~ /^#REDIRECT/i );
-
- if ( ${$page->text} =~ m{^\#REDIRECT # Redirect must start with "#REDIRECT"
- # (the backslash is needed before "#" here, because
- # "#" has special meaning with /x modifier)
- (?:S|ED|ION)? # The word may be in any of these forms,
- # i.e., REDIRECT|REDIRECTS|REDIRECTED|REDIRECTION
- (?:\s*) # optional whitespace
- (?: :|\sTO|=)? # optional colon, "TO" or "="
- # (in case of "TO", we expect a whitespace before it,
- # so that it's not glued to the preceding word)
- (?:\s*) # optional whitespace
- \[\[([^\]]*)\]\] # the link itself
- }ix ) { # matching is case-insensitive, hence /i
- my $target = $1;
-
- if ($target =~ /^(.*)\#(?:.*)$/) {
- # The link contains an anchor. Anchors are not allowed in REDIRECT pages, and therefore
- # we adjust the link to point to the page as a whole (that's how Wikipedia works).
- $target = $1;
- }
-
- return $target;
- }
-
- # OK, it's probably either a malformed redirect link, or something else
- return undef;
- }
-
- sub isNamespaceOkForPrescanning($) {
- my ($page) = @_;
-
- &isNamespaceOk($page, \%okNamespacesForPrescanning);
- }
-
- sub isNamespaceOkForTransforming($) {
- my ($page) = @_;
-
- &isNamespaceOk($page, \%okNamespacesForTransforming);
- }
-
- sub isNamespaceOk($\%) {
- my ($page, $refToNamespaceHash) = @_;
-
- my $result = 1;
-
- # main namespace is OK, so we only check pages that belong to other namespaces
-
- if ($page->namespace ne '') {
- my $namespace = $page->namespace;
- &normalizeNamespace(\$namespace);
- if ( &isKnownNamespace(\$namespace) ) {
- $result = defined( $$refToNamespaceHash{$namespace} );
- } else {
- # the prefix before ":" in the page title is not a known namespace,
- # therefore, the page belongs to the main namespace and is OK
- }
- }
-
- $result; # return value
- }
-
- sub encodeXmlChars(\$) {
- my ($refToStr) = @_;
-
- $$refToStr =~ s/([&"'<>])/&$XmlEntities{$1};/g;
- }
-
- sub copyXmlFileHeader() {
- open(INF, "< $file") or die "Cannot open $file";
- while (<INF>) { # copy lines up to "</siteinfo>"
- if (/^<mediawiki /) {
- # The top level element - mediawiki - contains a lot of attributes (e.g., schema)
- # that are no longer applicable to the XML file after our transformation.
- # Therefore, we simply write an opening tag <mediawiki> without any attributes.
- print OUTF "<mediawiki>\n";
- } else {
- # All other lines (up to </siteinfo>) are copied as-is
- print OUTF;
- }
- last if (/<\/siteinfo>/);
- }
- close(INF); # this file will later be reopened by "Parse::MediaWikiDump"
- }
-
- sub closeXmlFile() {
- print OUTF "</mediawiki>\n";
- close(OUTF);
- }
-
- sub writeStatistics() {
- my $statCategoriesFile = "$filePath/$fileBasename.stat.categories";
- my $statIncomingLinksFile = "$filePath/$fileBasename.stat.inlinks";
-
- open(STAT_CATS, "> $statCategoriesFile") or die "Cannot open $statCategoriesFile";
- print STAT_CATS "# Line format: <CategoryId (= page id)> <Number of pages in this category>\n",
- "# Here we count the *pages* that belong to this category, i.e., articles AND\n",
- "# sub-categories of this category (but not the articles in the sub-categories).\n",
- "\n\n";
-
- my $cat;
- foreach $cat ( sort { $statCategories{$b} <=> $statCategories{$a} }
- keys(%statCategories) ) {
- print STAT_CATS "$cat\t$statCategories{$cat}\n";
- }
- close(STAT_CATS);
-
- open(STAT_INLINKS, "> $statIncomingLinksFile") or die "Cannot open $statIncomingLinksFile";
- print STAT_INLINKS "# Line format: <Target page id> <Number of links to it from other pages>\n\n\n";
-
- my $destination;
- foreach $destination ( sort { $statIncomingLinks{$b} <=> $statIncomingLinks{$a} }
- keys(%statIncomingLinks) ) {
- print STAT_INLINKS "$destination\t$statIncomingLinks{$destination}\n";
- }
-
- close(STAT_INLINKS);
- }
-
- sub writeCategoryHierarchy() {
- my $catHierarchyFile = "$filePath/$fileBasename.cat_hier";
-
- open(CAT_HIER, "> $catHierarchyFile") or die "Cannot open $catHierarchyFile";
- print CAT_HIER "# Line format: <Category id> <List of ids of immediate descendants>\n\n\n";
-
- my $cat;
- foreach $cat ( sort { $catHierarchy{$a} <=> $catHierarchy{$b} }
- keys(%catHierarchy) ) {
- print CAT_HIER "$cat\t", join(" ", @{$catHierarchy{$cat}}), "\n";
- }
-
- close(CAT_HIER);
- }
-
- sub loadNamespaces() {
- # re-open the input XML file
- my $pages = Parse::MediaWikiDump::Pages->new($file);
-
- # load namespaces
- my $refNamespaces = $pages->namespaces;
-
- # namespace names are case-insensitive, so we force them
- # to canonical form to facilitate future comparisons
- my $ns;
- foreach $ns (@$refNamespaces) {
- my @namespaceData = @$ns;
- my $namespaceId = $namespaceData[0];
- my $namespaceName = $namespaceData[1];
- &normalizeNamespace(\$namespaceName);
- $namespaces{$namespaceName} = $namespaceId;
- }
- }
-
- # build id <-> title mappings and redirection table,
- # as well as load templates
- sub prescan() {
- # re-open the input XML file
- my $pages = Parse::MediaWikiDump::Pages->new($file);
-
- my $counter = 0;
-
- my $page;
- while (defined($page = $pages->page)) {
- my $id = $page->id;
-
- $counter++;
-
- if ($counter % 1000 == 0) {
- my $timeStr = &getTimeAsString();
- print LOGF "[$timeStr] Prescanning page id=$id\n";
- }
-
- my $title = $page->title;
- &normalizeTitle(\$title);
-
- if (length($title) == 0) {
- # This is a defense against pages whose title only contains UTF-8 chars that
- # are reduced to an empty string. Right now I can think of one such case -
- # <C2><A0> which represents the non-breaking space. In this particular case,
- # this page is a redirect to [[Non-nreaking space]], but having in the system
- # a redirect page with an empty title causes numerous problems, so we'll live
- # happier without it.
- print LOGF "Skipping page with empty title id=$id\n";
- next;
- }
-
- my $redirect = &isRedirect($page);
- if (defined($redirect)) {
- &normalizeTitle(\$redirect);
- next if (length($redirect) == 0); # again, same precaution here - see comments above
- $redir{$title} = $redirect;
-
- # nothing more to do for redirect pages
- next;
- }
-
- if ( ! &isNamespaceOkForPrescanning($page) ) {
- next; # we're only interested in certain namespaces
- }
- # if we get here, then either the page belongs to the main namespace OR
- # it belongs to one of the namespaces we're interested in
-
- if ( exists($id2title{$id}) ) {
- print LOGF "Warning: Page id=$id already encountered before!\n";
- next;
- }
- if ( exists($title2id{$title}) ) {
- # A page could have been encountered before with a different spelling.
- # Examples: = <C2><A0> (nonbreakable space), ß = <C3><9F> (German Eszett ligature)
- print LOGF "Warning: Page title='$title' already encountered before!\n";
- next;
- }
- $id2title{$id} = $title;
- $title2id{$title} = $id;
-
- if ($title =~ /^Template:/) {
- my $text = ${$page->text};
-
- # We're storing template text for future inclusion, therefore,
- # remove all <noinclude> text and keep all <includeonly> text
- # (but eliminate <includeonly> tags per se).
- # However, if <onlyinclude> ... </onlyinclude> parts are present,
- # then only keep them and discard the rest of the template body.
- # This is because using <onlyinclude> on a text fragment is
- # equivalent to enclosing it in <includeonly> tags **AND**
- # enclosing all the rest of the template body in <noinclude> tags.
- # These definitions can easily span several lines, hence the "/s" modifiers.
-
- my $onlyincludeAccumulator;
- while ($text =~ /<onlyinclude>(.*?)<\/onlyinclude>/sg) {
- my $onlyincludeFragment = $1;
- $onlyincludeAccumulator .= "$onlyincludeFragment\n";
- }
- if ( defined($onlyincludeAccumulator)) {
- $text = $onlyincludeAccumulator;
- } else {
- # If there are no <onlyinclude> fragments, simply eliminate
- # <noinclude> fragments and keep <includeonly> ones.
- $text =~ s/<noinclude>(?:.*?)<\/noinclude>/\n/sg;
- $text =~ s/<includeonly>(.*?)<\/includeonly>/$1/sg;
- }
-
- $templates{$id} = $text;
- }
- }
-
- my $timeStr = &getTimeAsString();
- print LOGF "[$timeStr] Prescanning complete - prescanned $counter pages\n";
- }
-
- sub transform() {
- # re-open the input XML file
- my $pages = Parse::MediaWikiDump::Pages->new($file);
-
- my $page;
- while (defined($page = $pages->page)) {
- my $id = $page->id;
-
- my $timeStr = &getTimeAsString();
- print LOGF "[$timeStr] Transforming page id=$id\n";
-
- if ( defined( &isRedirect($page) ) ) {
- next; # we've already loaded all redirects in the prescanning phase
- }
-
- if ( ! &isNamespaceOkForTransforming($page) ) {
- next; # we're only interested in pages from certain namespaces
- }
-
- my $title = $page->title;
- &normalizeTitle(\$title);
-
- # see the comment about empty titles in function 'prescan'
- if (length($title) == 0) {
- print LOGF "Skipping page with empty title id=$id\n";
- next;
- }
-
- my $text = ${$page->text};
-
- my $orgLength = length($text); # text length BEFORE any transformations
-
- # The check for stub must be done BEFORE any further processing,
- # because stubs indicators are templates, and templates are substituted.
- my $isStub = 0;
- if ( $text =~ m/stub}}/i ) {
- $isStub = 1;
- }
-
- my @categories;
- my @internalLinks;
- my @urls;
-
- &includeTemplates(\$text);
-
- my @relatedArticles;
- # This function only examines the contents of '$text', but doesn't change it.
- &identifyRelatedArticles(\$text, \@relatedArticles, $id);
-
- # We process categories directly, because '$page->categories' ignores
- # categories inherited from included templates
- &extractCategories(\$text, \@categories, $id);
-
- # Categories are listed at the end of articles, and therefore may mistakenly
- # be added to the list of related articles (which often appear in the last
- # section such as "See also"). To avoid this, we explicitly remove all categories
- # from the list of related links, and only then record the list of related links
- # to the file.
- &removeElements(\@relatedArticles, \@categories);
- &recordRelatedArticles($id, \@relatedArticles);
-
- &extractInternalLinks(\$text, \@internalLinks, $id, 1, 1);
- &extractUrls(\$text, \@urls);
-
- &postprocessText(\$text, 1);
-
- my $newLength = length($text); # text length AFTER all transformations
-
- &writePage($id, \$title, \$text, $orgLength, $newLength, $isStub, \@categories, \@internalLinks, \@urls);
-
- &updateStatistics(\@categories, \@internalLinks);
-
- if ($title =~ /^Category:/) {
- &updateCategoryHierarchy($id, \@categories);
- }
- }
- }
-
- sub updateStatistics(\@\@) {
- my ($refToCategories, $refToInternalLinks) = @_;
-
- my $cat;
- foreach $cat (@$refToCategories) {
- $statCategories{$cat}++;
- }
-
- my $link;
- foreach $link (@$refToInternalLinks) {
- $statIncomingLinks{$link}++;
- }
- }
-
- sub updateCategoryHierarchy($\@) {
- # The list of categories passed as a parameter is actually the list of parent categories
- # for the current category
- my ($childId, $refToParentCategories) = @_;
-
- my $parentCat;
- foreach $parentCat (@$refToParentCategories) {
- if ( exists($catHierarchy{$parentCat}) ) {
- push(@{$catHierarchy{$parentCat}}, $childId);
- } else {
- # create a new array with '$childId' as the only child (for now) of '$parentCat'
- my @arr;
- push(@arr, $childId);
- $catHierarchy{$parentCat} = [ @arr ];
- }
- }
- }
-
- sub writePage($\$\$$$$\@\@\@) {
- my ($id, $refToTitle, $refToText, $orgLength, $newLength, $isStub,
- $refToCategories, $refToInternalLinks, $refToUrls) = @_;
-
- my $numCategories = scalar(@$refToCategories);
- my $numLinks = scalar(@$refToInternalLinks);
- my $numUrls = scalar(@$refToUrls);
-
- print OUTF "<page id=\"$id\" orglength=\"$orgLength\" newlength=\"$newLength\" stub=\"$isStub\" " .
- "categories=\"$numCategories\" outlinks=\"$numLinks\" urls=\"$numUrls\">\n";
-
- my $encodedTitle = $$refToTitle;
- &encodeXmlChars(\$encodedTitle);
- print OUTF "<title>$encodedTitle</title>\n";
-
- print OUTF "<categories>";
- print OUTF join(" ", @$refToCategories);
- print OUTF "</categories>\n";
-
- print OUTF "<links>";
- print OUTF join(" ", @$refToInternalLinks);
- print OUTF "</links>\n";
-
- print OUTF "<urls>\n";
-
- my $url;
- foreach $url (@$refToUrls) {
- &encodeXmlChars(\$url);
- print OUTF "$url\n";
- }
- print OUTF "</urls>\n";
-
- # text has already undergone 'encodeXmlChars' in function 'postprocessText'
- print OUTF "<text>\n$$refToText\n</text>\n";
-
- print OUTF "</page>\n";
- }
-
- # Maps a title into the id, and performs redirection if necessary.
- # Assumption: the argument was already normalized using 'normalizeTitle'
- sub resolveLink(\$) {
- my ($refToTitle) = @_;
-
- # safety precaution
- return undef if (length($$refToTitle) == 0);
-
- my $targetId; # result
- my $targetTitle = $$refToTitle;
-
- if ( exists($redir{$$refToTitle}) ) { # this link is a redirect
- $targetTitle = $redir{$$refToTitle};
-
- # check if this is a double redirect
- if ( exists($redir{$targetTitle}) ) {
- $targetTitle = undef; # double redirects are not allowed and are ignored
- print LOGF "Warning: link '$$refToTitle' caused double redirection and was ignored\n";
- } else {
- print LOGF "Link '$$refToTitle' was redirected to '$targetTitle'\n";
- }
- }
-
- if ( defined($targetTitle) ) {
- if ( exists($title2id{$targetTitle}) ) {
- $targetId = $title2id{$targetTitle};
- } else {
- # target not found
- print LOGF "Warning: link '$$refToTitle' cannot be matched to an id\n";
- $targetId = undef;
- }
- } else {
- $targetId = undef;
- }
-
- $targetId; # return value
- }
-
- sub includeTemplates(\$) {
- my ($refToText) = @_;
-
- # Using the while loop forces templates to be included recursively
- # (i.e., includes the body of templates that themselves were included
- # on the previous iteration ).
- # Template definitions can easily span several lines, hence the "/s" modifier.
-
- # Templates are frequently nested. Occasionally, parsing mistakes may cause template insertion
- # to enter an infinite loop, for instance when trying to instantiate Template:Country
- # {{country_{{{1}}}|{{{2}}}|{{{2}}}|size={{{size|}}}|name={{{name|}}}}}
- # which is repeatedly trying to insert template "country_", which is again resolved to
- # Template:Country. The straightforward solution of keeping track of templates that were
- # already inserted for the current article would not work, because the same template
- # may legally be used more than once, with different parameters in different parts of
- # the article. Therefore, we simply limit the number of iterations of nested template
- # inclusion.
-
- my $templateRecursionLevels = 0;
-
- # We also require that the body of a template does not contain the template opening sequence
- # (two successive opening braces - "\{\{"). We use negative lookahead to achieve this.
- while ( ($templateRecursionLevels < $maxTemplateRecursionLevels) &&
- $$refToText =~ s/\{\{
- (?:\s*) # optional whitespace before the template name is ignored
- (
- (?:
- (?!
- \{\{
- )
- .
- )*?
- )
- # OLD code and comments
- # (?:\s*) # optional whitespace before the template name is ignored
- # ([^\{]*?) # Occasionally, templates are nested,
- # # e.g., {{localurl:{{NAMESPACE}}:{{PAGENAME}}}}
- # # In order to prevent incorrect parsing, e.g.,
- # # "{{localurl:{{NAMESPACE}}", we require that the
- # # template name does not include opening braces,
- # # hence "[^\{]" (any char except opening brace).
- # END OF OLD code and comments
- \}\}
- /&instantiateTemplate($1)/segx
- ) {
- $templateRecursionLevels++;
- }
-
- # Since we limit the number of levels of template recursion, we might end up with several
- # un-instantiated templates. In this case we simply eliminate them - however, we do so
- # later, in function 'postprocessText()', after extracting categories, links and URLs.
- }
-
- BEGIN {
- # Making variables static for the function to avoid recompilation of regular expressions
- # every time the function is called.
-
- my $specialSeparator = "\.pAr\.";
- my $specialSeparatorRegex = qr/$specialSeparator/;
-
- sub parseTemplateInvocation(\$\$\%) {
- my ($refToTemplateInvocation, $refToTemplateTitle, $refToParameterHash) = @_;
-
- # Template definitions (especially those with parameters) can easily span several lines,
- # hence the "/s" modifier. The template name extends up to the first pipeline symbol (if any).
- # Template parameters go after the "|" symbol.
- if ($$refToTemplateInvocation =~ /^([^|]*)\|(.*)$/sx) {
- $$refToTemplateTitle = $1; # single out the template name itself
- my $paramsList = $2;
-
- # Template parameters often contain URLs, internal links, or just other useful text,
- # whereas the template serves for presenting it in some nice way.
- # Parameters are separated by "|" symbols. However, we cannot simply split the string
- # on "|" symbols, since these frequently appear inside internal links. Therefore, we split
- # on those "|" symbols that are not inside [[...]]. It's obviously sufficient to check that
- # brackets are not improperly nested on one side of "|", so we use lookahead.
- # We first replace all "|" symbols that are not inside [[...]] with a special separator that
- # we invented, which will hopefully not normally appear in the text (.pAr.).
- # Next, we use 'split' to break the string on this new separator.
-
- $paramsList =~ s/\| # split on pipeline symbol, such that
- (?: # non-capturing grouper that encloses 2 options
- (?= # zero-width lookahead - option #1
- [^\]]*$ # there are no closing brackets up to the end
- # of the string (i.e., all the characters up to
- # the end of the string are not closing brackets)
- ) # end of first lookahead (= end of option #1)
- | # or
- (?= # another zero-width lookahead - option #2
- [^\]]* \[ # the nearest opening bracket on the right is not preceded
- # by a closing bracket (i.e., all the characters that
- # precede it are not closing brackets
- ) # end of second lookahead (= end of option #2)
- ) # end of the outer grouper
- /$specialSeparator/sxg; # replace matching symbols with a special separator
- # /s means string can contain newline chars
-
- my @parameters = split(/$specialSeparatorRegex/, $paramsList);
-
- # Parameters can be either named or unnamed. In the latter case, their name is defined by their
- # ordinal position (1, 2, 3, ...).
-
- my $unnamedParameterCounter = 0;
-
- # It's legal for unnamed parameters to be skipped, in which case they will get default
- # values (if available) during actual instantiation. That is {{template_name|a||c}} means
- # parameter 1 gets the value 'a', parameter 2 value is not defined, and parameter 3 gets the value 'c'.
- # This case is correctly handled by function 'split', and does not require any special handling.
- my $param;
- foreach $param (@parameters) {
- # Spaces before or after a parameter value are normally ignored, UNLESS the parameter contains
- # a link (to prevent possible gluing the link to the following text after template substitution)
-
- # Parameter values may contain "=" symbols, hence the parameter name extends up to
- # the first such symbol.
- # It is legal for a parameter to be specified several times, in which case the last assignment
- # takes precedence. Example: "{{t|a|b|c|2=B}}" is equivalent to "{{t|a|B|c}}".
- # Therefore, we don't check if the parameter has been assigned a value before, because
- # anyway the last assignment should override any previous ones.
- if ($param =~ /^([^=]*)=(.*)$/s) {
- # This is a named parameter.
- # This case also handles parameter assignments like "2=xxx", where the number of an unnamed
- # parameter ("2") is specified explicitly - this is handled transparently.
-
- my $parameterName = $1;
- my $parameterValue = $2;
-
- &trimWhitespaceBothSides(\$parameterName);
- if ($parameterValue !~ /\]\]/) { # if the value does not contain a link, trim whitespace
- &trimWhitespaceBothSides(\$parameterValue);
- }
-
- $$refToParameterHash{$parameterName} = $parameterValue;
- } else {
- # this is an unnamed parameter
- $unnamedParameterCounter++;
-
- if ($param !~ /\]\]/) { # if the value does not contain a link, trim whitespace
- &trimWhitespaceBothSides(\$param);
- }
-
- $$refToParameterHash{$unnamedParameterCounter} = $param;
- }
- }
- } else {
- # Template invocation does not contain a pipeline symbol, hence take the entire
- # invocation text as the template title.
- $$refToTemplateTitle = $$refToTemplateInvocation;
- }
- }
-
- } # end of BEGIN block
-
-
- sub instantiateTemplate($) {
- my ($templateInvocation) = @_;
-
- my $result = "";
-
- print LOGF "Instantiating template=$templateInvocation\n";
-
- my $templateTitle;
- my %templateParams;
- &parseTemplateInvocation(\$templateInvocation, \$templateTitle, \%templateParams);
-
- &computeFullyQualifiedTemplateTitle(\$templateTitle);
-
- &includeTemplateText(\$templateTitle, \%templateParams, \$result);
-
- $result; # return value
- }
-
- sub includeTemplateText(\$\%\$) {
- my ($refToTemplateTitle, $refToParameterHash, $refToResult) = @_;
-
- &normalizeTitle($refToTemplateTitle);
- my $includedPageId = &resolveLink($refToTemplateTitle);
-
- if ( defined($includedPageId) && exists($templates{$includedPageId}) ) {
- # OK, perform the actual inclusion with parameter substitution
-
- $$refToResult = $templates{$includedPageId};
-
- # Perform parameter substitution
- # A parameter call ( {{{...}}} ) may span over a newline, hence the /s modifier
-
- # Parameters may be nested (see comments below), hence we do the substitution iteratively
- # in a while loop. We also limit the maximum number of iterations to avoid too long or
- # even endless loops (in case of malformed input).
- my $parameterRecursionLevels = 0;
-
- # We also require that the body of a parameter does not contain the parameter opening sequence
- # (three successive opening braces - "\{\{\{"). We use negative lookahead to achieve this.
- while ( ($parameterRecursionLevels < $maxParameterRecursionLevels) &&
- $$refToResult =~ s/\{\{\{
- (
- (?:
- (?!
- \{\{\{
- )
- .
- )*?
- )
-
- # OLD code and comments
- # ([^\{]*?) # Occasionally, parameters are nested because
- # # they are dependent on other parameters,
- # # e.g., {{{Author|{{{PublishYear|}}}}}}
- # # (here, the default value for 'Author' is
- # # dependent on 'PublishYear').
- # # In order to prevent incorrect parsing, e.g.,
- # # "{{{Author|{{{PublishYear|}}}", we require that the
- # # parameter name does not include opening braces,
- # # hence "[^\{]" (any char except opening brace).
- # END OF OLD code and comments
- \}\}\}
- /&substituteParameter($1, $refToParameterHash)/segx
- ) {
- $parameterRecursionLevels++;
- }
- } else {
- # The page being included cannot be identified - perhaps we skipped it (because currently
- # we only allow for inclusion of pages in the Template namespace), or perhaps it's
- # a variable name like {{NUMBEROFARTICLES}}. Just remove this inclusion directive and
- # replace it with a space
- print LOGF "Template '$$refToTemplateTitle' is not available for inclusion\n";
- $$refToResult = " ";
- }
- }
-
- sub substituteParameter($\%) {
- my ($parameter, $refToParameterHash) = @_;
-
- my $result;
-
- if ($parameter =~ /^([^|]*)\|(.*)$/) {
- # This parameter has a default value
- my $paramName = $1;
- my $defaultValue = $2;
-
- if ( defined($$refToParameterHash{$paramName}) ) {
- $result = $$refToParameterHash{$paramName}; # use parameter value specified in template invocation
- } else { # use the default value
- $result = $defaultValue;
- }
- } else {
- # parameter without a default value
-
- if ( defined($$refToParameterHash{$parameter}) ) {
- $result = $$refToParameterHash{$parameter}; # use parameter value specified in template invocation
- } else {
- # Parameter not specified in template invocation and does not have a default value -
- # do not perform substitution and keep the parameter in 3 braces
- # (these are Wiki rules for templates, see http://meta.wikimedia.org/wiki/Help:Template ).
- $result = "{{{$parameter}}}";
- }
- }
-
- # Surplus parameters - i.e., those assigned values in template invocation but not used
- # in the template body - are simply ignored.
-
- $result; # return value
- }
-
- sub computeFullyQualifiedTemplateTitle(\$) {
- my ($refToTemplateTitle) = @_;
-
- # Determine the namespace of the page being included through the template mechanism
-
- my $namespaceSpecified = 0;
-
- if ($$refToTemplateTitle =~ /^:(.*)$/) {
- # Leading colon by itself implies main namespace, so strip this colon
- $$refToTemplateTitle = $1;
- $namespaceSpecified = 1;
- } elsif ($$refToTemplateTitle =~ /^([^:]*):/) {
- # colon found but not in the first position - check if it designates a known namespace
- my $prefix = $1;
- &normalizeNamespace(\$prefix);
- $namespaceSpecified = &isKnownNamespace(\$prefix);
- }
-
- # The case when the page title does not contain a colon at all also falls here.
-
- if ($namespaceSpecified) {
- # OK, the title of the page being included is fully qualified with a namespace
- } else {
- # The title of the page being included is NOT in the main namespace and lacks
- # any other explicit designation of the namespace - therefore, it is resolved
- # to the Template namespace (that's the default for the template inclusion mechanism).
- $$refToTemplateTitle = "Template:$$refToTemplateTitle";
- }
- }
-
- sub extractCategories(\$\@$) {
- my ($refToText, $refToCategoriesArray, $id) = @_;
-
- # Remember that namespace names are case-insensitive, hence we're matching with "/i".
- # The first parameter to 'collectCategory' is passed by value rather than by reference,
- # because it might be dangerous to pass a reference to $1 in case it might get modified
- # (with unclear consequences).
- $$refToText =~ s/\[\[(?:\s*)(Category:.*?)\]\]/&collectCategory($1, $refToCategoriesArray)/ieg;
-
- # We don't accumulate categories directly in a hash table, since this would not preserve
- # their original order of appearance.
- &removeDuplicatesAndSelf($refToCategoriesArray, $id);
- }
-
- sub collectCategory($\@) {
- my ($catName, $refToCategoriesArray) = @_;
-
- if ($catName =~ /^(.*)\|/) {
- # Some categories contain a sort key, e.g., [[Category:Whatever|*]] or [[Category:Whatever| ]]
- # In such a case, take only the category name itself.
- $catName = $1;
- }
-
- &normalizeTitle(\$catName);
-
- my $catId = &resolveLink(\$catName);
- if ( defined($catId) ) {
- push(@$refToCategoriesArray, $catId);
- } else {
- print LOGF "Warning: unknown category '$catName'\n";
- }
-
- # The return value is just a space, because we remove categories from the text
- # after we collected them
- " ";
- }
-
- sub extractInternalLinks(\$\@$$$) {
- my ($refToText, $refToInternalLinksArray, $id,
- $whetherToLogAnchorText, $whetherToRemoveDuplicates) = @_;
-
- # For each internal link outgoing form the current article, this hash table maps
- # the target id into the anchor text associated with it. Naturally, we only
- # collect anchor text for links that can be resolved to a page id.
- my %anchorTexts;
-
- # Link definitions may span over adjacent lines and therefore contain line breaks,
- # hence we use the /s modifier.
- # Occasionally, links are nested, e.g.,
- # [[Image:kanner_kl2.jpg|frame|right|Dr. [[Leo Kanner]] introduced the label ''early infantile autism'' in [[1943]].]]
- # In order to prevent incorrect parsing, e.g., "[[Image:kanner_kl2.jpg|frame|right|Dr. [[Leo Kanner]]",
- # we extract links in several iterations of the while loop, while the link definition requires that
- # each pair [[...]] does not contain any opening braces.
-
- 1 while ( $$refToText =~ s/
- (\w*) # words may be glued to the beginning of the link,
- # in which case they become part of the link
- # e.g., "ex-[[Giuseppe Mazzini|Mazzinian]] "
- \[\[
- ([^\[]*?) # the link text can be any chars except an opening bracket,
- # this ensures we correctly parse nested links (see comments above)
- \]\]
- (\w*) # words may be glued to the end of the link,
- # in which case they become part of the link
- # e.g., "[[public transport]]ation"
- /&collectInternalLink($1, $2, $3, $refToInternalLinksArray, \%anchorTexts)/segx
- );
-
- if ($whetherToRemoveDuplicates) {
- &removeDuplicatesAndSelf($refToInternalLinksArray, $id);
- }
-
- if ($whetherToLogAnchorText) {
- &logAnchorText(\%anchorTexts, $id);
- }
- }
-
- sub logAnchorText(\%$) {
- my ($refToAnchorTextsHash, $curPageId) = @_;
-
- # Remember that we use a hash table to associate anchor text with target page ids.
- # Therefore, if the current page has several links to another page (it happens), then we only
- # keep the anchor text of the last one (and override the previous ones) - we can live with it.
- # Consequently, we do not need to remove duplicates as there are none.
- # However, we still remove the links that point from the page to itself.
- my $targetId;
- my $anchorText;
- while ( ($targetId, $anchorText) = each(%$refToAnchorTextsHash) ) {
- if ($targetId != $curPageId) {
- &postprocessText(\$anchorText, 0); # anchor text doesn't need escaping of XML characters,
- # hence the second function parameter is 0
- $anchorText =~ s/\n/ /g; # replace all newlines with spaces
-
- # make sure that something is left of anchor text after postprocessing
- if (length($anchorText) > 0) {
- print ANCHORF "$targetId\t$curPageId\t$anchorText\n";
- }
- }
- }
- }
-
- sub collectInternalLink($$$\@\%) {
- my ($prefix, $link, $suffix, $refToInternalLinksArray, $refToAnchorTextHash) = @_;
-
- my $originalLink = $link;
- my $result = "";
-
- # strip leading whitespace, if any
- $link =~ s/^\s*//;
-
- # Link definitions may span over adjacent lines and therefore contain line breaks,
- # hence we use the /s modifier on most matchings.
-
- # There are some special cases when the link may be preceded with a colon.
- # Known cases:
- # - Linking to a category (as opposed to actually assigning the current article
- # to a category) is performed using special syntax [[:Category:...]]
- # - Linking to other languages, e.g., [[:fr:Wikipedia:Aide]]
- # (without the leading colon, the link will go to the side menu
- # - Linking directly to the description page of an image, e.g., [[:Image:wiki.png]]
- # In all such cases, we strip the leading colon.
- if ($link =~ /^
- : # colon at the beginnning of the link name
- (.*) # the rest of the link text
- $
- /sx) {
- # just strip this initial colon (as well as any whitespace preceding it)
- $link = $1;
- }
-
- # Alternative text may be available after the pipeline symbol.
- # If the pipeline symbol is only used for masking parts of
- # the link name for presentation, we still consider that the author of the page
- # deemed the resulting text important, hence we always set this variable when
- # the pipeline symbol is present.
- my $alternativeTextAvailable = 0;
-
- # Some links contain several pipeline symbols, e.g.,
- # [[Image:Zerzan.jpeg|thumb|right|[[John Zerzan]]]]
- # It seems that the extra pipeline symbols are parameters, so we just eliminate them.
- if ($link =~ /^(.*)\|([^|]*)$/s) { # first, extract the link up to the last pipeline symbol
- $link = $1; # the part before the last pipeline
- $result = $2; # the part after the last pipeline, this is usually an alternative text for this link
-
- $alternativeTextAvailable = 1; # pipeline found, see comment above
-
- # Now check if there are pipeline symbols remaining.
- # Note that this time we're looking for the shortest match,
- # to take the part of the text up to the first pipeline symbol.
- if ($link =~ /^([^|]*)\|(.*)$/s) {
- $link = $1;
- # $2 contains the parameters, which we don't really need
- }
-
- if (length($result) == 0) {
- if ($link !~ /\#/) {
- # If the "|" symbol is not followed by some text, then it masks the namespace
- # as well as any text in parentheses at the end of the link title.
- # However, pipeline masking is only invoked if the link does not contain an anchor,
- # hence the additional condition in the 'if' statement.
- &performPipelineMasking(\$link, \$result);
- } else {
- # If the link contains an anchor, then masking is not invoked, and we take the entire link
- $result = $link;
- }
- }
- } else {
- # the link text does not contain the pipeline, so take it as-is
- $result = $link;
- }
-
- if ($link =~ /^(.*)\#(.*)$/s) {
- # The link contains an anchor, so adjust the link to point to the page as a whole.
- $link = $1;
- my $anchor = $2;
- # Check if the link points to an anchor on the current page, and if so - ignore it.
- if (length($link) == 0 && ! $alternativeTextAvailable) {
- # This is indeed a link pointing to an anchor on the current page.
- # The link is thus cleared, so that it will not be resolved and collected later.
- # For anchors to the same page, discard the leading '#' symbol, and take
- # the rest as the text - but only if no alternative text was provided for this link.
- $result = $anchor;
- }
- }
-
- # Now collect the link, or links if the original link is in the date format
- # and specifies both day and year. In the latter case, the function for date
- # normalization may also modify the link text ($result), and may collect more
- # than one link (one for the day, another one for the year).
- my $dateRecognized = 0;
-
- # Alternative text (specified after pipeline) blocks normalization of dates.
- # We also perform a quick check - if the link does not start with a digit,
- # then it surely does not contain a date
- if ( ($link =~ /^\d/) && (! $alternativeTextAvailable)) {
- $dateRecognized = &normalizeDates(\$link, \$result, $refToInternalLinksArray, $refToAnchorTextHash);
- }
-
- # If a date (either day or day + year) was recognized, then no further processing is necessary
- if (! $dateRecognized) {
- &normalizeTitle(\$link);
- my $targetId = &resolveAndCollectInternalLink(\$link, $refToInternalLinksArray);
-
- # Wikipedia pages contain many links to other Wiki projects (especially Wikipedia in
- # other languages). While these links are not resolved to valid pages, we also want
- # to ignore their text. However, simply discarding the text of all links that cannot
- # be resolved would be overly aggressive, as authors frequently define phrases as links
- # to articles that don't yet exist, in the hope that they will be added later.
- # Therefore, we formulate the following conditions that must hold simultaneously
- # for discarding the text of a link:
- # 1) the link was not resolved to a valid id
- # 2) the link does not contain alternative text (if it did, then the text is probably
- # important enough to be retained)
- # 3) the link contains a colon - this is a very simple heuristics for identifying links to
- # other Wiki projects, other languages, or simply other namespaces within current Wikipedia.
- # While this method is not fool-proof (there are regular pages in the main namespace
- # that contain a colon in their title), we believe this is a reasonable tradeoff.
- if ( !defined($targetId) && ! $alternativeTextAvailable && $link =~ /:/ ) {
- $result = "";
- print LOGF "Discarding text for link '$originalLink'\n";
- } else {
- # finally, add the text originally attached to the left and/or to the right of the link
- # (if the link represents a date, then it has not text glued to it, so it's OK to only
- # use the prefix and suffix here)
- $result = $prefix . $result . $suffix;
- }
-
- if ( defined($targetId) ) {
- # If the current page has several links to another page, then we only take the anchor
- # of the last one (and override the previous ones) - we can live with it.
- $$refToAnchorTextHash{$targetId} = $result;
- }
- }
-
- $result; #return value
- }
-
- sub performPipelineMasking(\$\$) {
- my ($refToLink, $refToResult) = @_;
-
- # First check for presence of a namespace
- if ($$refToLink =~ /^([^:]*):(.*)$/) {
- my $namespaceCandidate = $1;
- my $rest = $2;
-
- &normalizeNamespace(\$namespaceCandidate);
- if ( &isKnownNamespace(\$namespaceCandidate) ) {
- $$refToResult = $rest; # take the link text without the namespace
- } else {
- $$refToResult = $$refToLink; # otherwise, take the entire link text (for now)
- }
- } else {
- $$refToResult = $$refToLink; # otherwise, take the entire link text (for now)
- }
-
- # Now check if there are parentheses at the end of the link text
- # (we now operate on $$refToResult, because we might have stripped the leading
- # namespace in the previous test).
- if ($$refToResult =~ /^ # the beginning of the string
- (.*) # the text up to the last pair of parentheses
- \( # opening parenthesis
- (?:[^()]*) # the text in the parentheses
- \) # closing parenthesis
- (?:\s*) # optional trailing whitespace, just in case
- $ # end of string
- /x) {
- $$refToResult = $1; # discard the text in parentheses at the end of the string
- }
- }
-
- sub resolveAndCollectInternalLink(\$\@) {
- my ($refToLink, $refToInternalLinksArray) = @_;
-
- my $targetId = &resolveLink($refToLink);
- if ( defined($targetId) ) {
- push(@$refToInternalLinksArray, $targetId);
- } else {
- # Some cases in this category that obviously won't be resolved to legal ids:
- # - Links to namespaces that we don't currently handle
- # (other than those for which 'isNamespaceOK' returns true);
- # media and sound files fall in this category
- # - Links to other languages, e.g., [[de:...]]
- # - Links to other Wiki projects, e.g., [[Wiktionary:...]]
- print LOGF "Warning: unknown link '$$refToLink'\n";
- }
-
- $targetId; # return value
- }
-
- # Dates can appear in several formats
- # 1) [[July 20]], [[1969]]
- # 2) [[20 July]] [[1969]]
- # 3) [[1969]]-[[07-20]]
- # 4) [[1969-07-20]]
- # The first one is handled correctly without any special treatment,
- # so we don't even check for it here.
- # In (2) and (3), we only normalize the day, because it will be parsed separately from the year.
- # This function is only invoked if the link has no alternative text available, therefore,
- # we're free to override the result text.
- sub normalizeDates(\$\$\@\%) {
- my ($refToLink, $refToResultText, $refToInternalLinksArray, $refToAnchorTextHash) = @_;
-
- my $dateRecognized = 0;
-
- if ($$refToLink =~ /^(\d\d)\s*([A-Za-z]+)$/) {
- my $day = $1;
- my $month = ucfirst(lc($2));
-
- if ( defined($monthToNumDays{$month}) &&
- 1 <= $day && $day <= $monthToNumDays{$month} ) {
- $dateRecognized = 1;
-
- $$refToLink = "$month $day";
- $$refToResultText = "$month $day";
-
- my $targetId = &resolveAndCollectInternalLink($refToLink, $refToInternalLinksArray);
- if ( defined($targetId) ) {
- $$refToAnchorTextHash{$targetId} = $$refToResultText;
- }
- } else {
- # this doesn't look like a valid date, leave as-is
- }
- } elsif ($$refToLink =~ /^(\d\d)\-(\d\d)$/) {
- my $monthNum = int($1);
- my $day = $2;
-
- if ( defined($numberToMonth{$monthNum}) ) {
- my $month = $numberToMonth{$monthNum};
- if (1 <= $day && $day <= $monthToNumDays{$month}) {
- $dateRecognized = 1;
-
- $$refToLink = "$month $day";
- # we add a leading space, to separate the preceding year ("[[1969]]-" in the example")
- # from the day that we're creating
- $$refToResultText = " $month $day";
-
- my $targetId = &resolveAndCollectInternalLink($refToLink, $refToInternalLinksArray);
- if ( defined($targetId) ) {
- $$refToAnchorTextHash{$targetId} = $$refToResultText;
- }
- } else {
- # this doesn't look like a valid date, leave as-is
- }
- } else {
- # this doesn't look like a valid date, leave as-is
- }
- } elsif ($$refToLink =~ /^(\d\d\d\d)\-(\d\d)\-(\d\d)$/) {
- my $year = $1;
- my $monthNum = int($2);
- my $day = $3;
-
- if ( defined($numberToMonth{$monthNum}) ) {
- my $month = $numberToMonth{$monthNum};
- if (1 <= $day && $day <= $monthToNumDays{$month}) {
- $dateRecognized = 1;
-
- $$refToLink = "$month $day";
- # the link text is combined from the day and the year
- $$refToResultText = "$month $day, $year";
-
- my $targetId;
-
- # collect the link for the day
- $targetId = &resolveAndCollectInternalLink($refToLink, $refToInternalLinksArray);
- if ( defined($targetId) ) {
- $$refToAnchorTextHash{$targetId} = $$refToLink;
- }
-
- # collect the link for the year
- $targetId = &resolveAndCollectInternalLink(\$year, $refToInternalLinksArray);
- if ( defined($targetId) ) {
- $$refToAnchorTextHash{$targetId} = $year;
- }
- } else {
- # this doesn't look like a valid date, leave as-is
- }
- } else {
- # this doesn't look like a valid date, leave as-is
- }
- }
-
- $dateRecognized; # return value
- }
-
- sub extractUrls(\$\@) {
- my ($refToText, $refToUrlsArray) = @_;
-
- # First we handle the case of URLs enclosed in single brackets, with or without the description,
- # and with optional leading and/or trailing whitespace
- # Examples: [http://www.cnn.com], [ http://www.cnn.com ], [http://www.cnn.com CNN Web site]
- $$refToText =~ s/\[(?:\s*)($urlProtocols(?:[^\[\]]*))\]/&collectUrlFromBrackets($1, $refToUrlsArray)/eg;
-
- # Now we handle standalone URLs (those not enclosed in brackets)
- # The $urlTemrinator is matched via positive lookahead (?=...) in order not to remove
- # the terminator symbol itself, but rather only the URL.
- $$refToText =~ s/($urlProtocols(?:.*?))$urlTerminator/&collectStandaloneUrl($1, $refToUrlsArray)/eg;
-
- &removeDuplicatesAndSelf($refToUrlsArray, undef);
- }
-
- sub collectUrlFromBrackets($\@) {
- my ($url, $refToUrlsArray) = @_;
-
- my $text;
- # Assumption: leading whitespace has already been stripped
- if ( $url =~ /^($urlProtocols(?:.*?))($urlTerminator(?:.*))$/ ) { # description available
- push(@$refToUrlsArray, $1);
- $text = $2;
- } else { # no description
- push(@$refToUrlsArray, $url);
- $text = " ";
- }
-
- $text; # return value
- }
-
- sub collectStandaloneUrl($\@) {
- my ($url, $refToUrlsArray) = @_;
-
- push(@$refToUrlsArray, $url); # collect the URL as-is
-
- " "; # return value - replace the URL with a space
- }
-
- sub postprocessText(\$$) {
- my ($refToText, $whetherToEncodeXmlChars) = @_;
-
- # Eliminate all <includeonly> and <onlyinclude> fragments, because this text
- # will not be included anywhere, as we already handled all inclusion directives
- # in function 'includeTemplates'.
- # This block can easily span several lines, hence the "/s" modifier.
- $$refToText =~ s/<includeonly>(.*?)<\/includeonly>/ /sg;
- $$refToText =~ s/<onlyinclude>(.*?)<\/onlyinclude>/ /sg;
-
- # <noinclude> fragments remain, but remove the tags per se
- # We block the code below, as <noinclude> tags will anyway be thrown away later,
- # when we eliminate all remaining tags.
- ### This block can easily span several lines, hence the "/s" modifier
- ### $$refToText =~ s/<noinclude>(.*?)<\/noinclude>/$1/sg;
-
- # replace <br> and <br /> directives with new paragraph
- $$refToText =~ s/<br(?:\s*)(?:[\/]?)>/\n\n/g;
-
- # Remove tables, as they often carry a lot of noise
- &eliminateTables($refToText);
-
- # Since we limit the number of levels of template recursion, we might end up with several
- # un-instantiated templates. In this case we simply eliminate them now.
- # Because templates may be nested, we eliminate them iteratively by starting from the most
- # nested one (hence the 'while' loop).
- # OLD comments and code:
- # For the same reason, we also require that the body of a template does not contain
- # opening braces (hence "[^\{]", any char except opening brace).
- # 1 while ($$refToText =~ s/\{\{(?:[^\{]*?)\}\}/ /sg);
- # END OF old comments and code
- # We also require that the body of a template does not contain the template opening sequence
- # (two successive opening braces - "\{\{"). We use negative lookahead to achieve this.
- 1 while ($$refToText =~ s/\{\{
- (?:
- (?:
- (?!
- \{\{
- )
- .
- )*?
- )
- \}\}
- / /sgx);
-
- # Remove any other <...> tags - but keep the text they enclose
- # (the tags are replaced with spaces to prevent adjacent pieces of text
- # from being glued together).
- # Comments (<!-- ... -->) also fall into this category, and since they can easily span several lines,
- # we use the "/s" modifier.
- $$refToText =~ s/<(?:.*?)>/ /sg;
-
- # Change markup on bold/italics emphasis. We probably don't need to distinguish
- # these 3 types of emphasis, so we just replace all of them with a generic <em> tag.
- # IMPORTANT: If 'encodeXmlChars' has beeen called before this line, then remember that
- # the apostrophes were already quoted to "'"
- $$refToText =~ s/'''''(.*?)'''''/$1/g;
- $$refToText =~ s/'''(.*?)'''/$1/g;
- $$refToText =~ s/''(.*?)''/$1/g;
-
- # Eliminate long sequences of newlines and whitespace.
- # Note that we don't want to replace sequences of spaces only, as this might make the text
- # less readable. Instead, we only eliminate sequences of whitespace that contain at least
- # two newlines.
- $$refToText =~ s/(?:\s*)\n(?:\s*)\n(?:\s*)/\n\n/g;
-
- # Eliminate XML entities such as " " , "×" etc. - otherwise,
- # in C++ code they will give rise to spurious words "nbsp", "times" etc.
- # Note that the standard entities - & , " , ' , < and >
- # are handled by the XML parser. All other entities, such as are passed
- # by the XML parser to the upper level (in case of Wikipedia pages,
- # to the rendering engine).
- # Note that in the raw XML text, these entities look like "&nbsp;"
- # (i.e., with leading "&"). XML parser replaces "&" with "&",
- # so here in the code we see the entities as " ".
- $$refToText =~ s{& # the entity starts with "&"
- ((?:\#?)(?:\w+)) # optional '#' sign (as in α), followed by
- # an uninterrupted sequence of letters and/or digits
- ; # the entity ends with a semicolon
- }{&logReplacedXmlEntity($1)}egx; # entities are replaced with a space
-
- if ($whetherToEncodeXmlChars) {
- # encode text for XML
- &encodeXmlChars($refToText);
- }
-
- # NOTE that the following operations introduce XML tags, so they must appear
- # after the original text underwent character encoding with 'encodeXmlChars' !!
-
- # Change markup for section headers.
- # Note that section headers may only begin at the very first position in the line
- # (not even after a space). Therefore, each header markup in the following commands
- # is prefixed with "^" to make sure it begins at the beginning of the line.
- # Since the text (e.g., article body) may contains multiple lines, we use
- # the "/m" modifier to allow matching "^" at embedded "\n" positions.
- $$refToText =~ s/^=====(.*?)=====/<h4>$1<\/h4>/mg;
- $$refToText =~ s/^====(.*?)====/<h3>$1<\/h3>/mg;
- $$refToText =~ s/^===(.*?)===/<h2>$1<\/h2>/mg;
- $$refToText =~ s/^==(.*?)==/<h1>$1<\/h1>/mg;
- }
-
- sub logReplacedXmlEntity($) {
- my ($xmlEntity) = @_;
-
- print LOGF "ENTITY: &$xmlEntity;\n";
-
- " "; # return value - entities are replaced with a space
- }
-
- BEGIN {
- # Making variables static for the function to avoid recompilation of regular expressions
- # every time the function is called.
-
- # Table definitions can easily span several lines, hence the "/s" modifier
-
- my $tableOpeningSequence1 = qr{<table> # either just <table>
- | # or
- <table(?:\s+)(?:[^<>]*)>}ix; # "<table" followed by at least one space
- # (to prevent "<tablexxx"), followed by
- # some optional text, e.g., table parameters
- # as in "<table border=0>"
- # In the above definition, prohibiting '<' and '>' chars ([^<>]) ensures
- # that we do not consume more than necessary, so that in the example
- # "<table border=0> aaa <table> bbb </table> ccc </table>"
- # $1 is NOT extended to be "> aaa <table"
-
- my $tableClosingSequence1 = qr/<\/table>/i;
- # my $nonNestedTableRegex1 =
- # qr{$tableOpeningSequence1 # opening sequence
- # (
- # (?: # non-capturing grouper
- # (?! # lookahead negation
- # $tableOpeningSequence1 # that's what we don't want to find inside a table definition
- # )
- # . # any character (such that there is no table opening sequence
- # # after it because of the lookahead condition)
- # )*? # shortest match of such characters, up to the closing of a table
- # )
- # $tableClosingSequence1}sx; # closing sequence
-
- my $tableOpeningSequence2 = qr/\{\|/;
- my $tableClosingSequence2 = qr/\|\}/;
- # my $nonNestedTableRegex2 =
- # qr{$tableOpeningSequence2 # opening sequence
- # (
- # (?: # non-capturing grouper
- # (?! # lookahead negation
- # $tableOpeningSequence2 # that's what we don't want to find inside a table definition
- # )
- # . # any character (such that there is no table opening sequence
- # # after it because of the lookahead condition)
- # )*? # shortest match of such characters, up to the closing of a table
- # )
- # $tableClosingSequence2}sx; # closing sequence
-
- sub eliminateTables(\$) {
- my ($refToText) = @_;
-
- # Sadly, these patterns became too complex and cause segmentation fault,
- # hence we fall back to only handling non-nested tables :(
- # # Sometimes, tables are nested, therefore we use a while loop to eliminate them
- # # recursively, while requiring that any table we eliminate does not contain nested tables.
- # # For simplicity, we assume that tables of the two kinds (e.g., <table> ... </table> and {| ... |})
- # # are not nested in one another.
-
- $$refToText =~ s/$tableOpeningSequence1(.*?)$tableClosingSequence1/\n/sg;
- $$refToText =~ s/$tableOpeningSequence2(.*?)$tableClosingSequence2/\n/sg;
- }
-
- } # end of BEGIN block
-
- # If specified, 'elementToRemove' contains an element that needs to be removed as well.
- # For links, this ensures that a page does not link to itself. For categories, this
- # ensures that a page is not categorized to itself. This parameter is obviously
- # irrelevant for filtering URLs.
- # 'elementToRemove' must be a numeric value (not string), since we're testing it with '==' (not 'eq')
- sub removeDuplicatesAndSelf(\@$) {
- my ($refToArray, $elementToRemove) = @_;
-
- my %seen = ();
- my @uniq;
-
- my $item;
- foreach $item (@$refToArray) {
- if ( defined($elementToRemove) && ($item == $elementToRemove) ) {
- printf LOGF "Warning: current page links or categorizes to itself - " .
- "link discarded ($elementToRemove)\n";
- next;
- }
- push(@uniq, $item) unless $seen{$item}++;
- }
-
- # overwrite the original array with the new one that does not contain duplicates
- @$refToArray = @uniq;
- }
-
- # Removes elements of the second list from the first list.
- # For efficiency purposes, the second list is converted into a hash.
- sub removeElements(\@\@) {
- my ($refToArray, $refToElementsToRemove) = @_;
-
- my %elementsToRemove = ();
- my @result;
-
- # Construct the hash table for fast lookups
- my $item;
- foreach $item (@$refToElementsToRemove) {
- $elementsToRemove{$item} = 1;
- }
-
- foreach $item (@$refToArray) {
- if ( ! defined($elementsToRemove{$item}) ) {
- push(@result, $item);
- }
- }
-
- # overwrite the original array with the new one
- @$refToArray = @result;
- }
-
- sub getTimeAsString() {
- my $tm = localtime();
- my $result = sprintf("%02d:%02d:%02d", $tm->hour, $tm->min, $tm->sec);
- }
-
- sub trimWhitespaceBothSides(\$) {
- my ($stringRef) = @_;
-
- # remove leading whitespace
- $$stringRef =~ s/^\s*//;
- # remove trailing whitespace
- $$stringRef =~ s/\s*$//;
- }
-
- # There are 3 kinds of related links that we look for:
- # 1) Standalone (usually, at the beginning of the article or a section of it)
- # Ex: Main articles: ...
- # 2) Inlined - text in parentheses inside the body of the article
- # Ex: medicine (see also: [[Health]])
- # 3) Dedicated section
- # Ex: == See also ==
- #
- # In all calls to 'extractInternalLinks':
- # - The penultimate argument is 0, since we don't need to log anchor text here.
- # Anchor text will be handled when we analyze all the internal links in
- # the entire article (and not just look for related links).
- # - The last argument is 0 in order not to remove duplicates on every invocation
- # of 'extractInternalLinks'. This is because duplicates in related links are
- # not very common, but performing duplicate removal each time is expensive.
- # Instead, we remove duplicates once at the very end.
- sub identifyRelatedArticles(\$\@$) {
- my ($refToText, $refToRelatedArticles, $id) = @_;
-
- # We split the text into a set of lines. This also creates a copy of the original text -
- # this is important, since the function 'extractInternalLinks' modifies its argument,
- # so we'd better use it on a copy of the real article body.
- my @text = split("\n", $$refToText);
- my $line;
-
- # Standalone
- foreach $line (@text) {
- # We require that stanalone designators occur at the beginning of the line
- # (after at most a few characters, such as a whitespace or a colon),
- # and not just anywhere in the line. Otherwise, we would collect as related
- # those links that just happen to occur in the same line with an unrelated
- # string that represents a standalone designator.
- if ($line =~ /^(?:.{0,5})(${relatedWording_Standalone}.*)$/) {
- my $str = $1; # We extract links from the rest of the line
- print LOGF "Related(S): $id => $str\n";
- &extractInternalLinks(\$str, $refToRelatedArticles, $id, 0, 0);
- print LOGF "Related(S): $id ==> @$refToRelatedArticles\n";
- }
- }
-
- # Inlined (in parentheses)
- foreach $line (@text) {
- while ($line =~ /\((?:\s*)(${relatedWording_Inline}.*?)\)/g) {
- my $str = $1;
- print LOGF "Related(I): $id => $str\n";
- &extractInternalLinks(\$str, $refToRelatedArticles, $id, 0, 0);
- print LOGF "Related(I): $id ==> @$refToRelatedArticles\n";
- }
- }
-
- # Section
- # Sections can be at any level - "==", "===", "====" - it doesn't matter,
- # so it suffices to look for two consecutive "=" signs
- my $relatedSectionFound = 0;
- foreach $line (@text) {
- if ($relatedSectionFound) { # we're in the related section now
- if ($line =~ /==(?:.*?)==/) { # we just encountered the next section - exit the loop
- last;
- } else { # collect the links from the current line
- print LOGF "Related(N): $id => $line\n";
- # 'extractInternalLinks' may mofidy its argument ('$line'), but it's OK
- # as we do not do any further processing to '$line' or '@text'
- &extractInternalLinks(\$line, $refToRelatedArticles, $id, 0, 0);
- print LOGF "Related(N): $id ==> @$refToRelatedArticles\n";
- }
- } else { # we haven't yet found the related section
- if ($line =~ /==(.*?)==/) { # found some section header - let's check it
- my $sectionHeader = $1;
- if ($sectionHeader =~ /$relatedWording_Section/) {
- $relatedSectionFound = 1;
- next; # proceed to the next line
- } else {
- next; # unrelated section - just proceed to the next line
- }
- } else {
- next; # just proceed to the next line - nothing to do
- }
- }
- }
-
- &removeDuplicatesAndSelf($refToRelatedArticles, $id);
- }
-
- sub recordRelatedArticles($\@) {
- my ($id, $refToRelatedArticles) = @_;
-
- my $size = scalar(@$refToRelatedArticles);
- return if ($size == 0);
-
- print RELATEDF "$id\t", join(" ", @$refToRelatedArticles), "\n";
- }
-
-
- ########################################################################
-
- sub printUsage()
- {
- print "Wikiprep version $version, Copyright (C) 2007 Evgeniy Gabrilovich\n" .
- "Wikiprep comes with ABSOLUTELY NO WARRANTY; for details type '$0 -license'.\n" .
- "This is free software, and you are welcome to redistribute it\n" .
- "under certain conditions; type '$0 -license' for details.\n" .
- "Type '$0 -version' for version information.\n\n" .
- "Usage: $0 -f <XML file with page dump>\n" .
- " e.g., $0 -f pages_articles.xml\n\n";
- }
|