laywerrobot/gensim_w2v/wikiprep.pl

###############################################################################
#
# wikiprep.pl - Preprocess Wikipedia XML dumps
# Copyright (C) 2007 Evgeniy Gabrilovich
# The author can be contacted by electronic mail at gabr@cs.technion.ac.il
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA,
#    or see <http://www.gnu.org/licenses/> and
#    <http://www.fsf.org/licensing/licenses/info/GPLv2.html>
#
###############################################################################


use strict;
use warnings;

use File::Basename;
use Getopt::Long;
use Time::localtime;
use XML::Parser;

my $licenseFile = "COPYING";
my $version = "2.02";

# Without any command-line arguments there is nothing to do: show the usage text and leave.
unless (@ARGV) {
  &printUsage();
  exit 0;
}

my ($file, $showLicense, $showVersion) = (undef, 0, 0);

# -f <file>  : the XML dump to process
# -license   : print licensing information and exit
# -version   : print the program version and exit
GetOptions('f=s'     => \$file,
           'license' => \$showLicense,
           'version' => \$showVersion);

# The informational switches are handled first, before any input file is looked at.
if ($showLicense) {
  my $licenseNotice = (-e $licenseFile)
    ? "See file $licenseFile for more details.\n"
    : "Please see <http://www.gnu.org/licenses/> and <http://www.fsf.org/licensing/licenses/info/GPLv2.html>\n";
  print $licenseNotice;
  exit 0;
}
if ($showVersion) {
  print "Wikiprep version $version\n";
  exit 0;
}

# From here on an input file is mandatory, and it must exist.
unless (defined($file)) {
  &printUsage();
  exit 0;
}
die "Input file '$file' cannot be opened for reading\n" unless (-e $file);


##### Global definitions #####

# Maps each character that is special in XML to the name of its entity;
# 'encodeXmlChars' wraps that name in "&" and ";" to build the replacement text.
my %XmlEntities = ('&' => 'amp', '"' => 'quot', "'" => 'apos', '<' => 'lt', '>' => 'gt');

# The URL protocol (e.g., http) matched here may be in either case, hence we use the /i modifier.
my $urlProtocols = qr/http:\/\/|https:\/\/|telnet:\/\/|gopher:\/\/|file:\/\/|wais:\/\/|ftp:\/\/|mailto:|news:/i;
# A URL terminator may be either one of a list of characters OR end of string (that is, '$').
# This last part is necessary to handle URLs at the very end of a string when there is no "\n"
# or any other subsequent character.
my $urlTerminator = qr/[\[\]\{\}\s\n\|\"<>]|$/;

# Case-insensitive patterns for phrases such as "Main article" or "See also", in three
# variants (standalone, inline, section heading).
# NOTE(review): presumably these drive the detection of links to related articles - the
# code that uses them is outside this part of the file; confirm.
my $relatedWording_Standalone =
  qr/Main(?:\s+)article(?:s?)|Further(?:\s+)information|Related(?:\s+)article(?:s?)|Related(?:\s+)topic(?:s?)|See(?:\s+)main(?:\s+)article(?:s?)|See(?:\s+)article(?:s?)|See(?:\s+)also|For(?:\s+)(?:more|further)/i;
  ## For(?:\s+)more(?:\s+)(?:background|details)(?:\s+)on(?:\s+)this(?:\s+)topic,(?:\s+)see
my $relatedWording_Inline = qr/See[\s:]|See(?:\s+)also|For(?:\s+)(?:more|further)/i;
my $relatedWording_Section = qr/Further(?:\s+)information|See(?:\s+)also|Related(?:\s+)article(?:s?)|Related(?:\s+)topic(?:s?)/i;

# Number of days in each month, keyed by the English month name.
# February is listed with 29 days, i.e., its leap-year length.
my %monthToNumDays = ('January' => 31, 'February' => 29, 'March' => 31, 'April' => 30,
                      'May' => 31, 'June' => 30, 'July' => 31, 'August' => 31,
                      'September' => 30, 'October' => 31, 'November' => 30, 'December' => 31);
# Month number (1..12) to English month name.
my %numberToMonth = (1 => 'January', 2 => 'February', 3 => 'March', 4 => 'April',
                     5 => 'May', 6 => 'June', 7 => 'July', 8 => 'August',
                     9 => 'September', 10 => 'October', 11 => 'November', 12 => 'December');

# Upper bounds on the number of substitution passes performed when expanding nested
# templates ('includeTemplates') and nested template parameters ('includeTemplateText').
my $maxTemplateRecursionLevels = 5;
my $maxParameterRecursionLevels = 5;

##### Global variables #####

# Normalized namespace name => namespace id, filled in by 'loadNamespaces'.
my %namespaces;
# we only process pages in these namespaces + the main namespace (which has an empty name)
my %okNamespacesForPrescanning = ('Template' => 1, 'Category' => 1);
my %okNamespacesForTransforming = ('Category' => 1); # we don't use templates as concepts

# The next four tables are populated by 'prescan'.
my %id2title;           # page id => normalized page title
my %title2id;           # normalized page title => page id
my %redir;              # normalized title of a redirect page => normalized title of its target
my %templates;          # template bodies for insertion
my %catHierarchy;       # each category is associated with a list of its immediate descendants
my %statCategories;     # number of pages classified under each category
my %statIncomingLinks;  # number of links incoming to each page


# All output files are placed next to the input file and share its base name;
# a trailing ".xml" (if present) is split off the input name and re-attached to the main output.
my ($fileBasename, $filePath, $fileSuffix) = fileparse($file, ".xml");
my $outputFile = "$filePath/$fileBasename.hgw$fileSuffix";
my $logFile = "$filePath/$fileBasename.log";
my $anchorTextFile = "$filePath/$fileBasename.anchor_text";
my $relatedLinksFile = "$filePath/$fileBasename.related_links";

# Three-argument open: the file name is never parsed for mode characters or trimmed,
# and "$!" tells the user why the open failed.
# The bareword handles are kept because the subroutines below print to them by name.
open(OUTF, '>', $outputFile) or die "Cannot open $outputFile: $!";
open(LOGF, '>', $logFile) or die "Cannot open $logFile: $!";
open(ANCHORF, '>', $anchorTextFile) or die "Cannot open $anchorTextFile: $!";
open(RELATEDF, '>', $relatedLinksFile) or die "Cannot open $relatedLinksFile: $!";

binmode(STDOUT,  ':utf8');
binmode(STDERR,  ':utf8');
binmode(OUTF,    ':utf8');
binmode(LOGF,    ':utf8');
binmode(ANCHORF, ':utf8');

print ANCHORF  "# Line format: <Target page id>  <Source page id>  <Anchor text (up to the end of the line)>\n\n\n";
print RELATEDF "# Line format: <Page id>  <List of ids of related articles>\n\n\n";

# Pass 1: copy the dump header, then collect namespaces, titles, redirects and templates.
&copyXmlFileHeader();
&loadNamespaces();
&prescan();

my $numTitles = scalar( keys(%id2title) );
print "Loaded $numTitles titles\n";
my $numRedirects = scalar( keys(%redir) );
print "Loaded $numRedirects redirects\n";
my $numTemplates = scalar( keys(%templates) );
print "Loaded $numTemplates templates\n";

# Pass 2: rewrite every page of interest and finish the output XML file.
&transform();
&closeXmlFile();

&writeStatistics();
&writeCategoryHierarchy();

# Buffered write errors only surface at close time. A damaged log is merely reported,
# whereas damaged data files are fatal.
close(LOGF) or warn "Cannot close $logFile: $!";
close(ANCHORF) or die "Cannot close $anchorTextFile: $!";
close(RELATEDF) or die "Cannot close $relatedLinksFile: $!";

# Hogwarts needs the anchor text file to be sorted in the increasing order of target page id.
# The file is originally sorted by source page id (second field in each line).
# We now use stable (-s) numeric (-n) sort on the first field (-k 1,1).
# This way, the resultant file will be sorted on the target page id (first field) as primary key,
# and on the source page id (second field) as secondary key.
# The command is run in list form (no shell), so file names containing spaces or shell
# metacharacters are passed through untouched; the result is written with "-o".
my @sortCommand = ('sort', '-s', '-n', '-k', '1,1',
                   '-o', "$anchorTextFile.sorted", $anchorTextFile);
system(@sortCommand) == 0
  or die "Cannot sort $anchorTextFile (sort status: $?)";


##### Subroutines #####

# Brings a page title to its canonical form, modifying the referenced string in place:
# surrounding whitespace/underscores are stripped, inner runs of them become one space,
# the first letter is capitalized, and a known namespace prefix is canonicalized.
# Parameter: reference to the title string.
sub normalizeTitle(\$) {
  my ($titleRef) = @_;

  for ($$titleRef) {
    s/^[\s_]+//;       # drop leading whitespace and underscores
    s/[\s_]+$//;       # drop trailing whitespace and underscores
    s/[\s_]+/ /g;      # squeeze inner runs of whitespace/underscores into one space
  }

  # Split on the first colon: candidate prefix, whitespace after the colon, remainder.
  my ($head, $gap, $tail) = ( $$titleRef =~ /^([^:]*):(\s*)(\S(?:.*))/ );

  if ( !defined($tail) ) {
    # no colon-separated prefix at all, just capitalize the first letter
    $$titleRef = ucfirst($$titleRef);
  } else {
    my $namespaceCandidate = $head;
    # normalization has to precede the lookup in 'isKnownNamespace'
    &normalizeNamespace(\$namespaceCandidate);

    # Known namespace: whitespace after the colon is dropped to obtain the canonical
    #   name (e.g., "Category:  Births" becomes "Category:Births").
    # Unknown prefix: the title belongs to no namespace, so a space after the colon is
    #   significant ("3001: The_Final_Odyssey" != "3001:The_Final_Odyssey"), but several
    #   spaces are still contracted into a single one.
    $$titleRef = &isKnownNamespace(\$namespaceCandidate)
      ? $namespaceCandidate . ":" . ucfirst($tail)
      : ucfirst($head) . ":" . (length($gap) > 0 ? " " : "") . $tail;
  }
}

# Rewrites the referenced namespace name in place so that only its first letter is
# in upper case and all the other letters are in lower case.
# Parameter: reference to the namespace name.
sub normalizeNamespace(\$) {
  my $nameRef = shift;

  my $lowered = lc( ${$nameRef} );
  ${$nameRef} = ucfirst($lowered);
}

# Tells whether the prefix of a page name (the part before the colon) is one of the
# namespaces loaded from the XML file into %namespaces.
# Parameter: reference to the candidate namespace name.
# Assumption: the argument was already normalized using 'normalizeNamespace'
sub isKnownNamespace(\$) {
  my $candidateRef = shift;

  return defined( $namespaces{ ${$candidateRef} } );
}

# Detects redirect pages and extracts the title of the page they point to.
# The canonical form of a redirect is #REDIRECT [[ link ]], and function
# 'Parse::MediaWikiDump::page->redirect' only supports this form. Wikipedia, however,
# also tolerates REDIRECT|REDIRECTS|REDIRECTED|REDIRECTION, followed by an optional ":",
# an optional "to" or an optional "=", so these variants are handled here as well.
# Parameter: a page object whose 'text' method returns a reference to the page text.
# Returns the title of the target page for a redirect, and 'undef' otherwise.
sub isRedirect($) {
  my ($page) = @_;

  my $textRef = $page->text;

  # quick check - the vast majority of pages are rejected right here
  return undef unless ( $$textRef =~ /^#REDIRECT/i );

  my ($target) =
     ( $$textRef =~ m{^\#REDIRECT         # Redirect must start with "#REDIRECT"
                                               #   (the backslash is needed before "#" here, because
                                               #    "#" has special meaning with /x modifier)
                           (?:S|ED|ION)?       # The word may be in any of these forms,
                                               #   i.e., REDIRECT|REDIRECTS|REDIRECTED|REDIRECTION
                           (?:\s*)             # optional whitespace
                           (?: :|\sTO|=)?      # optional colon, "TO" or "="
                                               #   (in case of "TO", we expect a whitespace before it,
                                               #    so that it's not glued to the preceding word)
                           (?:\s*)             # optional whitespace
                           \[\[([^\]]*)\]\]    # the link itself
                          }ix );               # matching is case-insensitive, hence /i

  # no match: it's probably either a malformed redirect link, or something else
  return undef unless ( defined($target) );

  # Anchors are not allowed in REDIRECT pages, so a link that carries one is adjusted
  # to point to the page as a whole (that's how Wikipedia works).
  $target = $1 if ( $target =~ /^(.*)\#(?:.*)$/ );

  return $target;
}

# Tells whether the given page should be handled in the prescanning pass, i.e., whether
# 'isNamespaceOk' accepts it against %okNamespacesForPrescanning.
sub isNamespaceOkForPrescanning($) {
  my $page = shift;

  return &isNamespaceOk($page, \%okNamespacesForPrescanning);
}

# Tells whether the given page should be handled in the transformation pass, i.e., whether
# 'isNamespaceOk' accepts it against %okNamespacesForTransforming.
sub isNamespaceOkForTransforming($) {
  my $page = shift;

  return &isNamespaceOk($page, \%okNamespacesForTransforming);
}

# Decides whether a page is acceptable given a set of allowed namespaces.
# Parameters: a page object (its 'namespace' method yields the namespace name) and
#             a reference to a hash whose keys are the allowed namespace names.
# Returns a true value if the page is acceptable, and a false value otherwise.
sub isNamespaceOk($\%) {
  my ($page, $refToNamespaceHash) = @_;

  my $namespace = $page->namespace;

  # the main namespace (which has an empty name) is always OK
  return 1 if ($namespace eq '');

  &normalizeNamespace(\$namespace);

  # If the prefix before ":" in the page title is not a known namespace,
  # then the page belongs to the main namespace and is OK.
  return 1 unless ( &isKnownNamespace(\$namespace) );

  # a genuine namespace - it must be one of the allowed ones
  return defined( $refToNamespaceHash->{$namespace} );
}

# Replaces, in place, every character that is special in XML (& " ' < >) with the
# corresponding entity reference, as listed in %XmlEntities.
# Parameter: reference to the string to be encoded.
sub encodeXmlChars(\$) {
  my $strRef = shift;

  ${$strRef} =~ s/([&"'<>])/&$XmlEntities{$1};/g;
}

# Copies the header of the input XML file - everything up to and including the line
# that contains "</siteinfo>" - to the output file (handle OUTF).
# Takes no parameters; reads the file named by $file and dies if it cannot be opened.
sub copyXmlFileHeader() {
  # Three-argument open keeps the file name from being interpreted as part of the mode.
  # The ':utf8' layer mirrors the layer set on OUTF: without it the header would be read
  # as raw bytes and every non-ASCII character (e.g., in localized namespace names)
  # would be encoded a second time when printed to OUTF.
  open(INF, '<:utf8', $file) or die "Cannot open $file: $!";

  # A lexical line variable is used so that the caller's $_ is left untouched.
  while ( defined(my $line = <INF>) ) { # copy lines up to "</siteinfo>"
    if ($line =~ /^<mediawiki /) {
      # The top level element - mediawiki - contains a lot of attributes (e.g., schema)
      # that are no longer applicable to the XML file after our transformation.
      # Therefore, we simply write an opening tag <mediawiki> without any attributes.
      print OUTF "<mediawiki>\n";
    } else {
      # All other lines (up to </siteinfo>) are copied as-is
      print OUTF $line;
    }
    last if ($line =~ /<\/siteinfo>/);
  }
  close(INF); # this file will later be reopened by "Parse::MediaWikiDump"
}

# Writes the closing tag of the top-level element and closes the output XML file (OUTF).
# Dies if the file cannot be closed: buffered write errors (e.g., a full disk) are only
# reported at this point, and ignoring them would leave a silently truncated output.
sub closeXmlFile() {
  print OUTF "</mediawiki>\n";
  close(OUTF) or die "Cannot close the output XML file: $!";
}

# Writes two statistics files next to the input file:
#   <basename>.stat.categories - number of pages classified under each category
#   <basename>.stat.inlinks    - number of links incoming to each page
# Both are listed in decreasing order of the count.
# Takes no parameters; dies if either file cannot be opened or closed.
sub writeStatistics() {
  my $statCategoriesFile = "$filePath/$fileBasename.stat.categories";
  my $statIncomingLinksFile = "$filePath/$fileBasename.stat.inlinks";

  # Three-argument open: the file name is never interpreted as part of the mode,
  # and "$!" reports the reason of a failure.
  open(STAT_CATS, '>', $statCategoriesFile) or die "Cannot open $statCategoriesFile: $!";
  print STAT_CATS "# Line format: <CategoryId (= page id)>  <Number of pages in this category>\n",
                  "# Here we count the *pages* that belong to this category, i.e., articles AND\n",
                  "# sub-categories of this category (but not the articles in the sub-categories).\n",
                  "\n\n";

  foreach my $cat ( sort { $statCategories{$b} <=> $statCategories{$a} }
                    keys(%statCategories) ) {
    print STAT_CATS "$cat\t$statCategories{$cat}\n";
  }
  # write errors are buffered and only show up when the handle is closed
  close(STAT_CATS) or die "Cannot close $statCategoriesFile: $!";

  open(STAT_INLINKS, '>', $statIncomingLinksFile) or die "Cannot open $statIncomingLinksFile: $!";
  print STAT_INLINKS "# Line format: <Target page id>  <Number of links to it from other pages>\n\n\n";

  foreach my $destination ( sort { $statIncomingLinks{$b} <=> $statIncomingLinks{$a} }
                            keys(%statIncomingLinks) ) {
    print STAT_INLINKS "$destination\t$statIncomingLinks{$destination}\n";
  }

  close(STAT_INLINKS) or die "Cannot close $statIncomingLinksFile: $!";
}

# Writes the category hierarchy to <basename>.cat_hier next to the input file:
# one line per category, holding the category id and the ids of its immediate descendants.
# Takes no parameters; dies if the file cannot be opened or closed.
sub writeCategoryHierarchy() {
  my $catHierarchyFile = "$filePath/$fileBasename.cat_hier";

  open(CAT_HIER, '>', $catHierarchyFile) or die "Cannot open $catHierarchyFile: $!";
  print CAT_HIER "# Line format: <Category id>  <List of ids of immediate descendants>\n\n\n";

  # Categories are listed in increasing order of their id. The values of %catHierarchy
  # are array references, so they must not be used as sort keys: comparing them with
  # "<=>" compares memory addresses and yields an arbitrary order.
  # NOTE(review): assumes the keys are numeric page ids, as the line format above states.
  foreach my $cat ( sort { $a <=> $b } keys(%catHierarchy) ) {
    print CAT_HIER "$cat\t", join(" ", @{$catHierarchy{$cat}}), "\n";
  }

  # write errors are buffered and only show up when the handle is closed
  close(CAT_HIER) or die "Cannot close $catHierarchyFile: $!";
}

# Fills %namespaces (normalized namespace name => namespace id) with the namespaces
# reported for the input XML file.
sub loadNamespaces() {
  # re-open the input XML file
  my $dump = Parse::MediaWikiDump::Pages->new($file);

  # Each entry is an array reference that holds the namespace id followed by its name.
  for my $entry ( @{ $dump->namespaces } ) {
    my ($namespaceId, $namespaceName) = @{$entry}[0, 1];

    # namespace names are case-insensitive, so they are stored in
    # canonical form to facilitate future comparisons
    &normalizeNamespace(\$namespaceName);
    $namespaces{$namespaceName} = $namespaceId;
  }
}

# First pass over the dump: builds the id <-> title mappings (%id2title, %title2id)
# and the redirection table (%redir), and stores the bodies of templates (%templates)
# for later inclusion.
sub prescan() {
  # re-open the input XML file
  my $pages = Parse::MediaWikiDump::Pages->new($file);

  my $counter = 0;

  while ( defined(my $page = $pages->page) ) {
    my $id = $page->id;

    # leave a progress mark in the log once every 1000 pages
    if (++$counter % 1000 == 0) {
      my $timeStr = &getTimeAsString();
      print LOGF "[$timeStr] Prescanning page id=$id\n";
    }

    my $title = $page->title;
    &normalizeTitle(\$title);

    unless ( length($title) ) {
      # This is a defense against pages whose title only contains UTF-8 chars that
      # are reduced to an empty string. Right now I can think of one such case -
      # <C2><A0> which represents the non-breaking space. In this particular case,
      # this page is a redirect to [[Non-breaking space]], but having in the system
      # a redirect page with an empty title causes numerous problems, so we'll live
      # happier without it.
      print LOGF "Skipping page with empty title id=$id\n";
      next;
    }

    # Redirect pages only contribute an entry to the redirection table.
    my $redirect = &isRedirect($page);
    if ( defined($redirect) ) {
      &normalizeTitle(\$redirect);
      # same precaution against empty titles as above
      $redir{$title} = $redirect if ( length($redirect) > 0 );
      next;
    }

    # we're only interested in the main namespace and in certain other namespaces
    next unless ( &isNamespaceOkForPrescanning($page) );

    if ( exists($id2title{$id}) ) {
      print LOGF "Warning: Page id=$id already encountered before!\n";
      next;
    }
    if ( exists($title2id{$title}) ) {
      # A page could have been encountered before with a different spelling.
      # Examples: &nbsp; = <C2><A0> (nonbreakable space), &szlig; = <C3><9F> (German Eszett ligature)
      print LOGF "Warning: Page title='$title' already encountered before!\n";
      next;
    }
    ($id2title{$id}, $title2id{$title}) = ($title, $id);

    # the rest of the loop body only concerns templates
    next unless ($title =~ /^Template:/);

    my $text = ${$page->text};

    # The template text is stored for future inclusion, so it is reduced to the part
    # that actually gets included:
    #  - if <onlyinclude> ... </onlyinclude> parts are present, only they are kept and the
    #    rest of the template body is discarded (using <onlyinclude> on a fragment is
    #    equivalent to enclosing it in <includeonly> tags **AND** enclosing all the rest
    #    of the template body in <noinclude> tags);
    #  - otherwise all <noinclude> text is removed and all <includeonly> text is kept
    #    (without the <includeonly> tags per se).
    # These definitions can easily span several lines, hence the "/s" modifiers.
    my @onlyincludeFragments = ( $text =~ /<onlyinclude>(.*?)<\/onlyinclude>/sg );
    if (@onlyincludeFragments) {
      $text = join( "", map { "$_\n" } @onlyincludeFragments );
    } else {
      $text =~ s/<noinclude>(?:.*?)<\/noinclude>/\n/sg;
      $text =~ s/<includeonly>(.*?)<\/includeonly>/$1/sg;
    }

    $templates{$id} = $text;
  }

  my $timeStr = &getTimeAsString();
  print LOGF "[$timeStr] Prescanning complete - prescanned $counter pages\n";
}

# Second pass over the dump: every page of interest gets its templates expanded, its
# categories, links and URLs extracted, and its text post-processed; the result is
# written to the output file, and the statistics and category hierarchy are updated.
sub transform() {
  # re-open the input XML file
  my $pages = Parse::MediaWikiDump::Pages->new($file);

  while ( defined(my $page = $pages->page) ) {
    my $id = $page->id;

    my $timeStr = &getTimeAsString();
    print LOGF "[$timeStr] Transforming page id=$id\n";

    # all redirects have already been loaded in the prescanning phase
    next if ( defined( &isRedirect($page) ) );

    # we're only interested in pages from certain namespaces
    next unless ( &isNamespaceOkForTransforming($page) );

    my $title = $page->title;
    &normalizeTitle(\$title);

    # see the comment about empty titles in function 'prescan'
    if (length($title) == 0) {
      print LOGF "Skipping page with empty title id=$id\n";
      next;
    }

    my $text = ${$page->text};

    # text length BEFORE any transformations
    my $orgLength = length($text);

    # The check for stub must be done BEFORE any further processing,
    # because stubs indicators are templates, and templates are substituted.
    my $isStub = ( $text =~ m/stub}}/i ) ? 1 : 0;

    &includeTemplates(\$text);

    my (@categories, @internalLinks, @urls, @relatedArticles);

    # This function only examines the contents of '$text', but doesn't change it.
    &identifyRelatedArticles(\$text, \@relatedArticles, $id);

    # We process categories directly, because '$page->categories' ignores
    # categories inherited from included templates
    &extractCategories(\$text, \@categories, $id);

    # Categories are listed at the end of articles, and therefore may mistakenly
    # be added to the list of related articles (which often appear in the last
    # section such as "See also"). To avoid this, all categories are explicitly removed
    # from the list of related links before that list is recorded to the file.
    &removeElements(\@relatedArticles, \@categories);
    &recordRelatedArticles($id, \@relatedArticles);

    &extractInternalLinks(\$text, \@internalLinks, $id, 1, 1);
    &extractUrls(\$text, \@urls);

    &postprocessText(\$text, 1);

    # text length AFTER all transformations
    my $newLength = length($text);

    &writePage($id, \$title, \$text, $orgLength, $newLength, $isStub, \@categories, \@internalLinks, \@urls);

    &updateStatistics(\@categories, \@internalLinks);

    # for a category page, its categories are its parents in the hierarchy
    &updateCategoryHierarchy($id, \@categories) if ($title =~ /^Category:/);
  }
}

# Accounts for one page in the global statistics: increments the page counter of each
# of its categories (%statCategories) and the incoming-link counter of each page it
# links to (%statIncomingLinks).
# Parameters: references to the arrays of the page's categories and internal links.
sub updateStatistics(\@\@) {
  my ($refToCategories, $refToInternalLinks) = @_;

  $statCategories{$_}++    for @{$refToCategories};
  $statIncomingLinks{$_}++ for @{$refToInternalLinks};
}

# Registers a category as an immediate descendant of each of its parent categories
# in %catHierarchy.
# Parameters: the id of the child category, and a reference to the array of its
#             categories, which are actually the parent categories of the child.
sub updateCategoryHierarchy($\@) {
  my ($childId, $refToParentCategories) = @_;

  for my $parentCat ( @{$refToParentCategories} ) {
    # The list of children springs into existence (through autovivification) the first
    # time a parent category is seen; later children are appended to the same list.
    push( @{ $catHierarchy{$parentCat} }, $childId );
  }
}

# Writes one transformed page to the output XML file (handle OUTF).
# Parameters:
#   $id                 - page id
#   $refToTitle         - reference to the (normalized) page title
#   $refToText          - reference to the page text, already XML-encoded
#   $orgLength          - text length before the transformations
#   $newLength          - text length after the transformations
#   $isStub             - 1 if the page is a stub, 0 otherwise
#   $refToCategories    - reference to the array of category ids
#   $refToInternalLinks - reference to the array of ids of linked pages
#   $refToUrls          - reference to the array of external URLs
# None of the arguments is modified.
sub writePage($\$\$$$$\@\@\@) {
  my ($id, $refToTitle, $refToText, $orgLength, $newLength, $isStub,
      $refToCategories, $refToInternalLinks, $refToUrls) = @_;

  my $numCategories = scalar(@$refToCategories);
  my $numLinks = scalar(@$refToInternalLinks);
  my $numUrls = scalar(@$refToUrls);

  print OUTF "<page id=\"$id\" orglength=\"$orgLength\" newlength=\"$newLength\" stub=\"$isStub\" " .
             "categories=\"$numCategories\" outlinks=\"$numLinks\" urls=\"$numUrls\">\n";

  # the title is encoded in a copy, so that the caller's string stays intact
  my $encodedTitle = $$refToTitle;
  &encodeXmlChars(\$encodedTitle);
  print OUTF "<title>$encodedTitle</title>\n";

  print OUTF "<categories>";
  print OUTF join(" ", @$refToCategories);
  print OUTF "</categories>\n";

  print OUTF "<links>";
  print OUTF join(" ", @$refToInternalLinks);
  print OUTF "</links>\n";

  print OUTF "<urls>\n";

  foreach my $url (@$refToUrls) {
    # The loop variable is an alias of the array element, hence encoding it directly
    # would rewrite the caller's array (and encode the URLs twice if they are ever
    # written again). As with the title, a copy is encoded instead.
    my $encodedUrl = $url;
    &encodeXmlChars(\$encodedUrl);
    print OUTF "$encodedUrl\n";
  }
  print OUTF "</urls>\n";

  # text has already undergone 'encodeXmlChars' in function 'postprocessText'
  print OUTF "<text>\n$$refToText\n</text>\n";

  print OUTF "</page>\n";
}

# Translates a page title into the id of the page, following a redirect if necessary.
# Parameter: reference to the title.
# Returns the page id, or 'undef' if the title is empty, leads to a double redirect,
# or is not known.
# Assumption: the argument was already normalized using 'normalizeTitle'
sub resolveLink(\$) {
  my ($refToTitle) = @_;

  # safety precaution
  return undef if (length($$refToTitle) == 0);

  my $targetTitle = $$refToTitle;

  if ( exists($redir{$$refToTitle}) ) { # this link is a redirect
    $targetTitle = $redir{$$refToTitle};

    # double redirects are not allowed and are ignored
    if ( exists($redir{$targetTitle}) ) {
      print LOGF "Warning: link '$$refToTitle' caused double redirection and was ignored\n";
      return undef;
    }

    print LOGF "Link '$$refToTitle' was redirected to '$targetTitle'\n";
  }

  return $title2id{$targetTitle} if ( exists($title2id{$targetTitle}) );

  # target not found
  print LOGF "Warning: link '$$refToTitle' cannot be matched to an id\n";
  return undef;
}

# Expands template invocations ( {{...}} ) in the referenced text, in place.
# Parameter: reference to the text of the page.
# Each pass of the loop below replaces every invocation whose body contains no further
# "{{" with whatever 'instantiateTemplate' returns for that body. Passes are repeated
# as long as the previous pass replaced something, but at most
# $maxTemplateRecursionLevels times.
sub includeTemplates(\$) {
  my ($refToText) = @_;

  # Using the while loop forces templates to be included recursively
  # (i.e., includes the body of templates that themselves were included
  # on the previous iteration ).
  # Template definitions can easily span several lines, hence the "/s" modifier.

  # Templates are frequently nested. Occasionally, parsing mistakes may cause template insertion
  # to enter an infinite loop, for instance when trying to instantiate Template:Country
  # {{country_{{{1}}}|{{{2}}}|{{{2}}}|size={{{size|}}}|name={{{name|}}}}}
  # which is repeatedly trying to insert template "country_", which is again resolved to
  # Template:Country. The straightforward solution of keeping track of templates that were
  # already inserted for the current article would not work, because the same template
  # may legally be used more than once, with different parameters in different parts of
  # the article. Therefore, we simply limit the number of iterations of nested template
  # inclusion.

  my $templateRecursionLevels = 0;

  # We also require that the body of a template does not contain the template opening sequence
  # (two successive opening braces - "\{\{"). We use negative lookahead to achieve this.
  # The substitution uses /e (the replacement is evaluated as code) and /g (all the
  # invocations found in one pass are replaced); its result - the number of replacements
  # made - serves as the loop condition.
  while ( ($templateRecursionLevels < $maxTemplateRecursionLevels) &&
          $$refToText =~ s/\{\{
                                (?:\s*)        # optional whitespace before the template name is ignored
                                (
                                  (?:
                                      (?!
                                          \{\{
                                      )
                                      .
                                  )*?
                                )
# OLD code and comments
#                                (?:\s*)        # optional whitespace before the template name is ignored
#                                ([^\{]*?)      # Occasionally, templates are nested,
#                                               # e.g., {{localurl:{{NAMESPACE}}:{{PAGENAME}}}}
#                                               # In order to prevent incorrect parsing, e.g.,
#                                               # "{{localurl:{{NAMESPACE}}", we require that the
#                                               # template name does not include opening braces,
#                                               # hence "[^\{]" (any char except opening brace).
# END OF OLD code and comments
                           \}\}
                          /&instantiateTemplate($1)/segx
        ) {
    # one more pass has been completed
    $templateRecursionLevels++;
  }

  # Since we limit the number of levels of template recursion, we might end up with several
  # un-instantiated templates. In this case we simply eliminate them - however, we do so
  # later, in function 'postprocessText()', after extracting categories, links and URLs.
}

BEGIN {
  # Making variables static for the function to avoid recompilation of regular expressions
  # every time the function is called.

  # The separator that temporarily replaces the "|" symbols between template parameters.
  my $specialSeparator = '.pAr.';
  # The separator must be matched literally, hence \Q...\E: left unescaped, its dots match
  # any character, and ordinary text such as "upArrow" would be taken for a separator,
  # splitting a parameter in two.
  my $specialSeparatorRegex = qr/\Q$specialSeparator\E/;

  # Splits the text of a template invocation (what is found between "{{" and "}}")
  # into the template name and its parameters.
  # Parameters:
  #   $refToTemplateInvocation - reference to the invocation text (not modified)
  #   $refToTemplateTitle      - reference to a scalar that receives the template name
  #   $refToParameterHash      - reference to a hash that receives the parameters:
  #                              named ones under their name, unnamed ones under their
  #                              ordinal position (1, 2, 3, ...)
  sub parseTemplateInvocation(\$\$\%) {
    my ($refToTemplateInvocation, $refToTemplateTitle, $refToParameterHash) = @_;

    # Template definitions (especially those with parameters) can easily span several lines,
    # hence the "/s" modifier. The template name extends up to the first pipeline symbol (if any).
    # Template parameters go after the "|" symbol.
    if ($$refToTemplateInvocation =~ /^([^|]*)\|(.*)$/sx) {
      $$refToTemplateTitle = $1;  # single out the template name itself
      my $paramsList = $2;

      # Template parameters often contain URLs, internal links, or just other useful text,
      # whereas the template serves for presenting it in some nice way.
      # Parameters are separated by "|" symbols. However, we cannot simply split the string
      # on "|" symbols, since these frequently appear inside internal links. Therefore, we split
      # on those "|" symbols that are not inside [[...]]. It's obviously sufficient to check that
      # brackets are not improperly nested on one side of "|", so we use lookahead.
      # We first replace all "|" symbols that are not inside [[...]] with a special separator that
      # we invented, which will hopefully not normally appear in the text (.pAr.).
      # Next, we use 'split' to break the string on this new separator.

      $paramsList =~ s/\|                       # split on pipeline symbol, such that
                          (?:                   # non-capturing grouper that encloses 2 options
                              (?=               #   zero-width lookahead - option #1
                                  [^\]]*$       #     there are no closing brackets up to the end
                                                #     of the string (i.e., all the characters up to
                                                #     the end of the string are not closing brackets)
                              )                 #   end of first lookahead (= end of option #1)
                              |                 #   or
                              (?=               #   another zero-width lookahead - option #2
                                  [^\]]* \[     #     the nearest opening bracket on the right is not preceded
                                                #     by a closing bracket (i.e., all the characters that
                                                #     precede it are not closing brackets
                              )                 #   end of second lookahead  (= end of option #2)
                          )                     # end of the outer grouper
                      /$specialSeparator/sxg;   # replace matching symbols with a special separator
                                                # /s means string can contain newline chars

      my @parameters = split($specialSeparatorRegex, $paramsList);

      # Parameters can be either named or unnamed. In the latter case, their name is defined by their
      # ordinal position (1, 2, 3, ...).

      my $unnamedParameterCounter = 0;

      # It's legal for unnamed parameters to be skipped, in which case they will get default
      # values (if available) during actual instantiation. That is {{template_name|a||c}} means
      # parameter 1 gets the value 'a', parameter 2 value is not defined, and parameter 3 gets the value 'c'.
      # This case is correctly handled by function 'split', and does not require any special handling.
      foreach my $param (@parameters) {
        # Spaces before or after a parameter value are normally ignored, UNLESS the parameter contains
        # a link (to prevent possible gluing the link to the following text after template substitution)

        # Parameter values may contain "=" symbols, hence the parameter name extends up to
        # the first such symbol.
        # It is legal for a parameter to be specified several times, in which case the last assignment
        # takes precedence. Example: "{{t|a|b|c|2=B}}" is equivalent to "{{t|a|B|c}}".
        # Therefore, we don't check if the parameter has been assigned a value before, because
        # anyway the last assignment should override any previous ones.
        if ($param =~ /^([^=]*)=(.*)$/s) {
          # This is a named parameter.
          # This case also handles parameter assignments like "2=xxx", where the number of an unnamed
          # parameter ("2") is specified explicitly - this is handled transparently.

          my $parameterName = $1;
          my $parameterValue = $2;

          &trimWhitespaceBothSides(\$parameterName);
          if ($parameterValue !~ /\]\]/) { # if the value does not contain a link, trim whitespace
            &trimWhitespaceBothSides(\$parameterValue);
          }

          $$refToParameterHash{$parameterName} = $parameterValue;
        } else {
          # this is an unnamed parameter
          $unnamedParameterCounter++;

          if ($param !~ /\]\]/) { # if the value does not contain a link, trim whitespace
            &trimWhitespaceBothSides(\$param);
          }

          $$refToParameterHash{$unnamedParameterCounter} = $param;
        }
      }
    } else {
      # Template invocation does not contain a pipeline symbol, hence take the entire
      # invocation text as the template title.
      $$refToTemplateTitle = $$refToTemplateInvocation;
    }
  }

} # end of BEGIN block


# Produces the text that replaces a single template invocation.
# Parameter: the text of the invocation (what is found between "{{" and "}}").
# Returns the text produced by 'includeTemplateText' for the parsed invocation,
# or an empty string if that function leaves the result untouched.
sub instantiateTemplate($) {
  my ($templateInvocation) = @_;

  print LOGF "Instantiating template=$templateInvocation\n";

  # split the invocation into the template name and its parameters
  my ($templateTitle, %templateParams);
  &parseTemplateInvocation(\$templateInvocation, \$templateTitle, \%templateParams);

  &computeFullyQualifiedTemplateTitle(\$templateTitle);

  my $expansion = "";
  &includeTemplateText(\$templateTitle, \%templateParams, \$expansion);

  return $expansion;
}

# Looks up the body of a template and substitutes the invocation parameters into it.
#
# Parameters:
#   $refToTemplateTitle - reference to the (fully qualified) template title; it is
#                         normalized in place by 'normalizeTitle'
#   $refToParameterHash - reference to the hash of parameters given in the invocation
#   $refToResult        - reference to the scalar that receives the resulting text
#
# If the template cannot be resolved to a page id, or its text is not stored in
# %templates, the result is a single space and a message is written to the log.
sub includeTemplateText(\$\%\$) {
  my ($refToTemplateTitle, $refToParameterHash, $refToResult) = @_;

  &normalizeTitle($refToTemplateTitle);
  my $templateId = &resolveLink($refToTemplateTitle);

  unless ( defined($templateId) && exists($templates{$templateId}) ) {
    # The page being included cannot be identified - perhaps we skipped it (because currently
    # we only allow for inclusion of pages in the Template namespace), or perhaps it's
    # a variable name like {{NUMBEROFARTICLES}}. Just remove this inclusion directive and
    # replace it with a space
    print LOGF "Template '$$refToTemplateTitle' is not available for inclusion\n";
    $$refToResult = " ";
    return;
  }

  # OK, perform the actual inclusion with parameter substitution
  $$refToResult = $templates{$templateId};

  # A parameter call ( {{{...}}} ) may span over a newline, hence the "s" modifier.
  #
  # Parameters may be nested because they depend on other parameters, e.g.,
  # {{{Author|{{{PublishYear|}}}}}} (the default value for 'Author' depends on 'PublishYear').
  # To parse such input correctly, every pass only replaces parameter calls whose body
  # does not itself contain the opening sequence of three braces (enforced below with
  # a negative lookahead), i.e., the innermost ones. Passes are repeated for as long as
  # something gets replaced, but at most $maxParameterRecursionLevels times, to avoid
  # too long or even endless loops in case of malformed input.
  for (my $pass = 0; $pass < $maxParameterRecursionLevels; $pass++) {
    my $replaced = ($$refToResult =~ s/\{\{\{
                                        (
                                          (?:
                                              (?!
                                                  \{\{\{
                                              )
                                              .
                                          )*?
                                        )
                                       \}\}\}
                                      /&substituteParameter($1, $refToParameterHash)/segx);
    last unless $replaced;
  }
}

# Computes the replacement text for a single parameter call found in a template body.
#
# Parameters:
#   $parameter          - the text found between "{{{" and "}}}": either a bare parameter
#                         name, or "name|default value"
#   $refToParameterHash - reference to the hash of parameter values given in the
#                         template invocation
#
# Returns:
#   - the value assigned to the parameter in the template invocation, if there is one;
#   - otherwise the default value, if the parameter call specifies one;
#   - otherwise the parameter call itself, kept in 3 braces
#     (these are Wiki rules for templates, see http://meta.wikimedia.org/wiki/Help:Template ).
#
# Surplus parameters - i.e., those assigned values in template invocation but not used
# in the template body - are simply ignored.
sub substituteParameter($\%) {
  my ($parameter, $refToParameterHash) = @_;

  my $paramName = $parameter;
  my $defaultValue;  # remains undefined when the call specifies no default value

  # The name extends up to the first pipeline symbol; everything after it is the default.
  # A parameter call may span several lines, so the default value may contain newlines -
  # hence the /s modifier (without it, a multi-line default would not be recognized at all
  # and the whole "name|default" text would be mistaken for the parameter name).
  if ($parameter =~ /^([^|]*)\|(.*)$/s) {
    $paramName = $1;
    $defaultValue = $2;
  }

  if ( defined($$refToParameterHash{$paramName}) ) {
    return $$refToParameterHash{$paramName};  # value specified in template invocation
  }

  if ( defined($defaultValue) ) {
    return $defaultValue;
  }

  # Parameter not specified in template invocation and does not have a default value -
  # do not perform substitution and keep the parameter in 3 braces
  return "{{{$parameter}}}";
}

# Makes sure that the title of a page included through the template mechanism carries
# a namespace. The title is modified in place through the reference.
#
#   ":Title"           -> "Title"  (a leading colon by itself implies the main namespace)
#   "Known ns:Title"   -> unchanged (the prefix is checked with 'isKnownNamespace')
#   anything else      -> "Template:" is prepended (that's the default namespace
#                         for the template inclusion mechanism)
sub computeFullyQualifiedTemplateTitle(\$) {
  my ($refToTemplateTitle) = @_;

  # Leading colon: strip it, the title is then considered fully qualified
  if ($$refToTemplateTitle =~ /^:(.*)$/) {
    $$refToTemplateTitle = $1;
    return;
  }

  # Colon found but not in the first position - the title is fully qualified only if
  # the text before the colon designates a known namespace
  if ($$refToTemplateTitle =~ /^([^:]*):/) {
    my $candidateNamespace = $1;
    &normalizeNamespace(\$candidateNamespace);
    return if &isKnownNamespace(\$candidateNamespace);
  }

  # No colon at all, or the prefix is not a known namespace
  $$refToTemplateTitle = "Template:$$refToTemplateTitle";
}

# Collects the categories assigned in the text and removes the category links from it.
#
# Parameters:
#   $refToText            - reference to the text; each [[Category:...]] link in it is
#                           replaced with the value returned by 'collectCategory'
#   $refToCategoriesArray - reference to the array that accumulates the category ids
#   $id                   - passed to 'removeDuplicatesAndSelf' as the element to remove
sub extractCategories(\$\@$) {
  my ($refToText, $refToCategoriesArray, $id) = @_;

  # Namespace names are case-insensitive, hence the "i" modifier.
  # The category text ($1) is handed to 'collectCategory' by value rather than by
  # reference, because it might be dangerous to pass a reference to $1 in case it
  # gets modified (with unclear consequences).
  $$refToText =~ s{\[\[(?:\s*)(Category:.*?)\]\]}
                  {&collectCategory($1, $refToCategoriesArray)}ieg;

  # Categories are accumulated in an array rather than directly in a hash table, since
  # a hash would not preserve their original order of appearance; duplicates are
  # therefore removed afterwards.
  &removeDuplicatesAndSelf($refToCategoriesArray, $id);
}

# Resolves one category link and records its id.
#
# Parameters:
#   $catName              - the text of the category link, e.g. "Category:Whatever|*"
#   $refToCategoriesArray - reference to the array to which the resolved id is appended
#
# Returns a single space: it is used as the replacement text for the category link,
# because categories are removed from the text after they have been collected.
sub collectCategory($\@) {
  my ($catName, $refToCategoriesArray) = @_;

  # Some categories contain a sort key, e.g., [[Category:Whatever|*]] or [[Category:Whatever| ]]
  # In such a case, take only the category name itself.
  $catName = $1 if $catName =~ /^(.*)\|/;

  &normalizeTitle(\$catName);
  my $categoryId = &resolveLink(\$catName);

  if ( !defined($categoryId) ) {
    print LOGF "Warning: unknown category '$catName'\n";
  } else {
    push(@$refToCategoriesArray, $categoryId);
  }

  return " ";
}

# Replaces every internal link ( [[...]] ) in the text with the value returned by
# 'collectInternalLink', which also collects the ids of the link targets.
#
# Parameters:
#   $refToText                 - reference to the text, modified in place
#   $refToInternalLinksArray   - reference to the array that accumulates target ids
#   $id                        - passed on to 'removeDuplicatesAndSelf' and 'logAnchorText'
#   $whetherToLogAnchorText    - if true, the collected anchor texts are logged
#   $whetherToRemoveDuplicates - if true, 'removeDuplicatesAndSelf' is applied to the
#                                collected ids
sub extractInternalLinks(\$\@$$$) {
  my ($refToText, $refToInternalLinksArray, $id,
      $whetherToLogAnchorText, $whetherToRemoveDuplicates) = @_;

  # Maps the id of each link target into the anchor text associated with it
  # (filled in by 'collectInternalLink').
  my %anchorTexts;

  # Link definitions may span over adjacent lines and therefore contain line breaks,
  # hence the "s" modifier.
  # Occasionally, links are nested, e.g.,
  # [[Image:kanner_kl2.jpg|frame|right|Dr. [[Leo Kanner]] introduced the label ''early infantile autism'' in [[1943]].]]
  # To avoid cutting such a link at the first closing pair of brackets, a link is only
  # matched if its text contains no opening bracket, so each pass handles the innermost
  # links, and passes are repeated until nothing is left to replace.
  my $linksReplaced;
  do {
    $linksReplaced = ($$refToText =~ s/
                             (\w*)            # words may be glued to the beginning of the link,
                                              # in which case they become part of the link
                                              # e.g., "ex-[[Giuseppe Mazzini|Mazzinian]] "
                             \[\[
                                   ([^\[]*?)  # the link text can be any chars except an opening bracket,
                                              # this ensures we correctly parse nested links (see comments above)
                             \]\]
                             (\w*)            # words may be glued to the end of the link,
                                              # in which case they become part of the link
                                              # e.g., "[[public transport]]ation"
                            /&collectInternalLink($1, $2, $3, $refToInternalLinksArray, \%anchorTexts)/segx);
  } while ($linksReplaced);

  &removeDuplicatesAndSelf($refToInternalLinksArray, $id) if $whetherToRemoveDuplicates;

  &logAnchorText(\%anchorTexts, $id) if $whetherToLogAnchorText;
}

# Writes the collected anchor texts to ANCHORF, one line per link target, in the format
# "<target id> TAB <current page id> TAB <anchor text>".
#
# Parameters:
#   $refToAnchorTextsHash - reference to the hash mapping target page ids to anchor text
#   $curPageId            - id of the page the links originate from
#
# Since the anchor texts are keyed by target id, several links to the same page leave
# only one entry, so there are no duplicates to remove here. Links that point from
# the page to itself are skipped, and so are anchors whose text is empty after
# postprocessing.
sub logAnchorText(\%$) {
  my ($refToAnchorTextsHash, $curPageId) = @_;

  foreach my $targetId (keys %$refToAnchorTextsHash) {
    next if $targetId == $curPageId;  # link from the page to itself

    # work on a copy, so that the hash itself is left untouched
    my $anchorText = $$refToAnchorTextsHash{$targetId};

    # anchor text doesn't need escaping of XML characters, hence the second argument is 0
    &postprocessText(\$anchorText, 0);
    $anchorText =~ s/\n/ /g;  # replace all newlines with spaces

    # make sure that something is left of anchor text after postprocessing
    next unless length($anchorText) > 0;

    print ANCHORF "$targetId\t$curPageId\t$anchorText\n";
  }
}

# Processes a single internal link and computes the text that replaces it in the article.
#
# Parameters:
#   $prefix                  - word characters glued to the beginning of the link (may be empty)
#   $link                    - the text found between "[[" and "]]"
#   $suffix                  - word characters glued to the end of the link (may be empty)
#   $refToInternalLinksArray - reference to the array that accumulates the ids of link targets
#   $refToAnchorTextHash     - reference to the hash mapping target ids to anchor text
#
# Returns the replacement text for the link. This is
#   - the text produced by 'normalizeDates', if the link was recognized as a date
#     (prefix and suffix are not added in this case);
#   - an empty string, if the link could not be resolved, has no alternative text and
#     contains a colon (prefix and suffix are dropped as well);
#   - otherwise the link text (or its alternative text) surrounded by prefix and suffix.
#
# Side effects: target ids are appended to the links array by
# 'resolveAndCollectInternalLink' (directly or through 'normalizeDates'), the anchor text
# hash is updated for resolved targets, and discarded links are reported to LOGF.
sub collectInternalLink($$$\@\%) {
  my ($prefix, $link, $suffix, $refToInternalLinksArray, $refToAnchorTextHash) = @_;

  my $originalLink = $link;  # kept only for the log message below
  my $result = "";

  # strip leading whitespace, if any
  $link =~ s/^\s*//;

  # Link definitions may span over adjacent lines and therefore contain line breaks,
  # hence we use the /s modifier on most matchings.

  # There are some special cases when the link may be preceded with a colon.
  # Known cases:
  # - Linking to a category (as opposed to actually assigning the current article
  #   to a category) is performed using special syntax [[:Category:...]]
  # - Linking to other languages, e.g., [[:fr:Wikipedia:Aide]]
  #   (without the leading colon, the link will go to the side menu)
  # - Linking directly to the description page of an image, e.g., [[:Image:wiki.png]]
  # In all such cases, we strip the leading colon.
  if ($link =~ /^
                   :        # colon at the beginnning of the link name
                   (.*)     # the rest of the link text
                $
               /sx) {
    # just strip this initial colon (any whitespace preceding it has already been
    # stripped above)
    $link = $1;
  }

  # Alternative text may be available after the pipeline symbol.
  # If the pipeline symbol is only used for masking parts of
  # the link name for presentation, we still consider that the author of the page
  # deemed the resulting text important, hence we always set this variable when
  # the pipeline symbol is present.
  my $alternativeTextAvailable = 0;

  # Some links contain several pipeline symbols, e.g.,
  # [[Image:Zerzan.jpeg|thumb|right|[[John Zerzan]]]]
  # It seems that the extra pipeline symbols are parameters, so we just eliminate them.
  if ($link =~ /^(.*)\|([^|]*)$/s) { # first, extract the link up to the last pipeline symbol
    $link = $1;    # the part before the last pipeline
    $result = $2;  # the part after the last pipeline, this is usually an alternative text for this link

    $alternativeTextAvailable = 1; # pipeline found, see comment above

    # Now check if there are pipeline symbols remaining.
    # Note that this time the name part may not contain a pipeline symbol ("[^|]*"),
    # so we take the part of the text up to the first pipeline symbol.
    if ($link =~ /^([^|]*)\|(.*)$/s) {
      $link = $1;
      # $2 contains the parameters, which we don't really need
    }

    if (length($result) == 0) {
      if ($link !~ /\#/) {
        # If the "|" symbol is not followed by some text, then it masks the namespace
        # as well as any text in parentheses at the end of the link title.
        # However, pipeline masking is only invoked if the link does not contain an anchor,
        # hence the additional condition in the 'if' statement.
        &performPipelineMasking(\$link, \$result);
      } else {
        # If the link contains an anchor, then masking is not invoked, and we take the entire link
        $result = $link;
      }
    }
  } else {
    # the link text does not contain the pipeline, so take it as-is
    $result = $link;
  }

  if ($link =~ /^(.*)\#(.*)$/s) {
    # The link contains an anchor, so adjust the link to point to the page as a whole.
    $link = $1;
    my $anchor = $2;
    # Check if the link points to an anchor on the current page, and if so - ignore it.
    if (length($link) == 0 && ! $alternativeTextAvailable) {
      # This is indeed a link pointing to an anchor on the current page
      # (nothing is left of the link once the anchor is removed).
      # For anchors to the same page, discard the leading '#' symbol, and take
      # the rest as the text - but only if no alternative text was provided for this link.
      $result = $anchor;
    }
  }

  # Now collect the link, or links if the original link is in the date format
  # and specifies both day and year. In the latter case, the function for date
  # normalization may also modify the link text ($result), and may collect more
  # than one link (one for the day, another one for the year).
  my $dateRecognized = 0;

  # Alternative text (specified after pipeline) blocks normalization of dates.
  # We also perform a quick check - if the link does not start with a digit,
  # then it surely does not contain a date
  if ( ($link =~ /^\d/) && (! $alternativeTextAvailable)) {
    $dateRecognized = &normalizeDates(\$link, \$result, $refToInternalLinksArray, $refToAnchorTextHash);
  }

  # If a date (either day or day + year) was recognized, then no further processing is necessary
  if (! $dateRecognized) {
    &normalizeTitle(\$link);
    my $targetId = &resolveAndCollectInternalLink(\$link, $refToInternalLinksArray);

    # Wikipedia pages contain many links to other Wiki projects (especially Wikipedia in
    # other languages). While these links are not resolved to valid pages, we also want
    # to ignore their text. However, simply discarding the text of all links that cannot
    # be resolved would be overly aggressive, as authors frequently define phrases as links
    # to articles that don't yet exist, in the hope that they will be added later.
    # Therefore, we formulate the following conditions that must hold simultaneously
    # for discarding the text of a link:
    # 1) the link was not resolved to a valid id
    # 2) the link does not contain alternative text (if it did, then the text is probably
    #    important enough to be retained)
    # 3) the link contains a colon - this is a very simple heuristic for identifying links to
    #    other Wiki projects, other languages, or simply other namespaces within current Wikipedia.
    #    While this method is not fool-proof (there are regular pages in the main namespace
    #    that contain a colon in their title), we believe this is a reasonable tradeoff.
    if ( !defined($targetId) && ! $alternativeTextAvailable && $link =~ /:/ ) {
      # NOTE: the prefix and the suffix glued to the link are discarded here, too
      $result = "";
      print LOGF "Discarding text for link '$originalLink'\n";
    } else {
      # finally, add the text originally attached to the left and/or to the right of the link
      # (if the link represents a date, then it has no text glued to it, so it's OK to only
      # use the prefix and suffix here)
      $result = $prefix . $result . $suffix;
    }

    if ( defined($targetId) ) {
      # If the current page has several links to another page, then we only take the anchor
      # of the last one (and override the previous ones) - we can live with it.
      $$refToAnchorTextHash{$targetId} = $result;
    }
  }

  $result;  #return value
}

# Computes the text displayed for a link that ends with a bare pipeline symbol
# (as in "[[Namespace:Title (qualifier)|]]"): the namespace prefix as well as any text
# in parentheses at the end of the title are masked.
#
# Parameters:
#   $refToLink   - reference to the link text (not modified)
#   $refToResult - reference to the scalar that receives the text to display
sub performPipelineMasking(\$\$) {
  my ($refToLink, $refToResult) = @_;

  # Unless a known namespace is found below, the entire link text is taken (for now)
  my $visibleText = $$refToLink;

  # First check for presence of a namespace
  if ($$refToLink =~ /^([^:]*):(.*)$/) {
    my ($candidate, $remainder) = ($1, $2);

    &normalizeNamespace(\$candidate);
    if ( &isKnownNamespace(\$candidate) ) {
      $visibleText = $remainder; # take the link text without the namespace
    }
  }

  $$refToResult = $visibleText;

  # Now check if there are parentheses at the end of the text (the check is applied to
  # the result rather than to the link, because the leading namespace might have been
  # stripped above).
  if ($$refToResult =~ /^                  # the beginning of the string
                          (.*)             # the text up to the last pair of parentheses
                          \(               # opening parenthesis
                              (?:[^()]*)   #   the text in the parentheses
                          \)               # closing parenthesis
                          (?:\s*)          # optional trailing whitespace, just in case
                        $                  # end of string
                       /x) {
    $$refToResult = $1; # discard the text in parentheses at the end of the string
  }
}

# Resolves a link with 'resolveLink' and, on success, appends the resulting id to the
# array of internal links.
#
# Parameters:
#   $refToLink               - reference to the link title
#   $refToInternalLinksArray - reference to the array that accumulates target ids
#
# Returns the value returned by 'resolveLink' (undefined if the link is unknown).
sub resolveAndCollectInternalLink(\$\@) {
  my ($refToLink, $refToInternalLinksArray) = @_;

  my $resolvedId = &resolveLink($refToLink);

  unless ( defined($resolvedId) ) {
    # Some cases in this category that obviously won't be resolved to legal ids:
    # - Links to namespaces that we don't currently handle
    #   (other than those for which 'isNamespaceOK' returns true);
    #   media and sound files fall in this category
    # - Links to other languages, e.g., [[de:...]]
    # - Links to other Wiki projects, e.g., [[Wiktionary:...]]
    print LOGF "Warning: unknown link '$$refToLink'\n";
    return $resolvedId;
  }

  push(@$refToInternalLinksArray, $resolvedId);

  return $resolvedId;
}

# Recognizes links written in a date format and rewrites them so that they point to
# the page of the day ("Month day") and, where the link includes one, of the year.
#
# Dates can appear in several formats
# 1) [[July 20]], [[1969]]
# 2) [[20 July]] [[1969]]
# 3) [[1969]]-[[07-20]]
# 4) [[1969-07-20]]
# The first one is handled correctly without any special treatment,
# so we don't even check for it here.
# In (2) and (3), we only normalize the day, because it will be parsed separately from the year.
# This function is only invoked if the link has no alternative text available, therefore,
# we're free to override the result text.
#
# The day is converted to a number (just like the month number), so that a leading zero
# does not end up in the link title or in the text: "07-05" yields "July 5" rather
# than "July 05".
#
# Parameters:
#   $refToLink               - reference to the link title; overwritten with "Month day"
#                              if a date is recognized
#   $refToResultText         - reference to the text that replaces the link; overwritten
#                              if a date is recognized
#   $refToInternalLinksArray - reference to the array that accumulates target ids
#   $refToAnchorTextHash     - reference to the hash mapping target ids to anchor text
#
# Returns 1 if the link was recognized as a valid date, and 0 otherwise (in which case
# nothing is modified).
sub normalizeDates(\$\$\@\%) {
  my ($refToLink, $refToResultText, $refToInternalLinksArray, $refToAnchorTextHash) = @_;

  my $dateRecognized = 0;

  if ($$refToLink =~ /^(\d\d)\s*([A-Za-z]+)$/) {
    # format (2): "20 July"
    my $day = int($1);  # int() drops a leading zero, if any
    my $month = ucfirst(lc($2));

    if ( defined($monthToNumDays{$month}) &&
         1 <= $day && $day <= $monthToNumDays{$month} ) {
      $dateRecognized = 1;

      $$refToLink = "$month $day";
      $$refToResultText = "$month $day";

      my $targetId = &resolveAndCollectInternalLink($refToLink, $refToInternalLinksArray);
      if ( defined($targetId) ) {
        $$refToAnchorTextHash{$targetId} = $$refToResultText;
      }
    } else {
      # this doesn't look like a valid date, leave as-is
    }
  } elsif ($$refToLink =~ /^(\d\d)\-(\d\d)$/) {
    # format (3): "07-20" (the year is a separate link)
    my $monthNum = int($1);
    my $day = int($2);  # int() drops a leading zero, if any

    if ( defined($numberToMonth{$monthNum}) ) {
      my $month = $numberToMonth{$monthNum};
      if (1 <= $day && $day <= $monthToNumDays{$month}) {
        $dateRecognized = 1;

        $$refToLink = "$month $day";
        # we add a leading space, to separate the preceding year ("[[1969]]-" in the example)
        # from the day that we're creating
        $$refToResultText = " $month $day";

        my $targetId = &resolveAndCollectInternalLink($refToLink, $refToInternalLinksArray);
        if ( defined($targetId) ) {
          $$refToAnchorTextHash{$targetId} = $$refToResultText;
        }
      } else {
        # this doesn't look like a valid date, leave as-is
      }
    } else {
      # this doesn't look like a valid date, leave as-is
    }
  } elsif ($$refToLink =~ /^(\d\d\d\d)\-(\d\d)\-(\d\d)$/) {
    # format (4): "1969-07-20"
    my $year = $1;
    my $monthNum = int($2);
    my $day = int($3);  # int() drops a leading zero, if any

    if ( defined($numberToMonth{$monthNum}) ) {
      my $month = $numberToMonth{$monthNum};
      if (1 <= $day && $day <= $monthToNumDays{$month}) {
        $dateRecognized = 1;

        $$refToLink = "$month $day";
        # the link text is combined from the day and the year
        $$refToResultText = "$month $day, $year";

        my $targetId;

        # collect the link for the day
        $targetId = &resolveAndCollectInternalLink($refToLink, $refToInternalLinksArray);
        if ( defined($targetId) ) {
          $$refToAnchorTextHash{$targetId} = $$refToLink;
        }

        # collect the link for the year
        $targetId = &resolveAndCollectInternalLink(\$year, $refToInternalLinksArray);
        if ( defined($targetId) ) {
          $$refToAnchorTextHash{$targetId} = $year;
        }
      } else {
        # this doesn't look like a valid date, leave as-is
      }
    } else {
      # this doesn't look like a valid date, leave as-is
    }
  }

  return $dateRecognized;
}

# Collects the URLs found in the text and replaces them with the text returned by
# the corresponding 'collect...' function.
#
# Parameters:
#   $refToText      - reference to the text, modified in place
#   $refToUrlsArray - reference to the array that accumulates the URLs; duplicates are
#                     removed from it at the end
sub extractUrls(\$\@) {
  my ($refToText, $refToUrlsArray) = @_;

  # Pass 1: URLs enclosed in single brackets, with or without the description,
  # and with optional leading and/or trailing whitespace
  # Examples: [http://www.cnn.com], [ http://www.cnn.com  ], [http://www.cnn.com  CNN Web site]
  $$refToText =~ s{\[(?:\s*)($urlProtocols(?:[^\[\]]*))\]}
                  {&collectUrlFromBrackets($1, $refToUrlsArray)}eg;

  # Pass 2: standalone URLs (those not enclosed in brackets).
  # NOTE(review): $urlTerminator is defined elsewhere in the file and is expected to be
  # a positive lookahead (?=...), so that only the URL is removed and not the symbol
  # that terminates it.
  $$refToText =~ s{($urlProtocols(?:.*?))$urlTerminator}
                  {&collectStandaloneUrl($1, $refToUrlsArray)}eg;

  # There is no "self" element to remove in the case of URLs, hence 'undef'
  &removeDuplicatesAndSelf($refToUrlsArray, undef);
}

# Collects a URL that was enclosed in single brackets.
#
# Parameters:
#   $url            - the text found in the brackets (the URL, optionally followed by
#                     a description); leading whitespace is assumed to have been stripped
#   $refToUrlsArray - reference to the array to which the URL is appended
#
# Returns the text that replaces the bracketed URL: the part starting at the URL
# terminator if there is one (i.e., the description), or a single space otherwise.
sub collectUrlFromBrackets($\@) {
  my ($url, $refToUrlsArray) = @_;

  if ( $url =~ /^($urlProtocols(?:.*?))($urlTerminator(?:.*))$/ ) {
    # description available - only the first part is the URL itself
    my ($address, $description) = ($1, $2);
    push(@$refToUrlsArray, $address);
    return $description;
  }

  # no description - the entire text is the URL
  push(@$refToUrlsArray, $url);
  return " ";
}

# Collects a URL that appears in the text outside of brackets.
#
# Parameters:
#   $url            - the URL, appended as-is to the array
#   $refToUrlsArray - reference to the array that accumulates the URLs
#
# Returns a single space, which replaces the URL in the text.
sub collectStandaloneUrl($\@) {
  my ($url, $refToUrlsArray) = @_;

  push(@$refToUrlsArray, $url);

  return " ";
}

# Cleans up Wiki markup in the text (in place): removes inclusion-only fragments, tables,
# leftover templates, tags, emphasis quotes and XML entities, compresses blank lines, and
# finally converts section headers to <h1>..<h4> tags.
#
# Parameters:
#   $refToText               - reference to the text, modified in place
#   $whetherToEncodeXmlChars - if true, 'encodeXmlChars' is applied to the text before
#                              the header tags are introduced
#
# The order of the steps below matters: tags are removed before the text is (optionally)
# XML-encoded, and the header tags are only introduced after the encoding.
sub postprocessText(\$$) {
  my ($refToText, $whetherToEncodeXmlChars) = @_;

  # Eliminate all <includeonly> and <onlyinclude> fragments, because this text
  # will not be included anywhere, as we already handled all inclusion directives
  # in function 'includeTemplates'.
  # This block can easily span several lines, hence the "/s" modifier.
  $$refToText =~ s/<includeonly>(.*?)<\/includeonly>/ /sg;
  $$refToText =~ s/<onlyinclude>(.*?)<\/onlyinclude>/ /sg;

  # <noinclude> fragments remain, but remove the tags per se
  # The code below is disabled, as <noinclude> tags will anyway be thrown away later,
  # when we eliminate all remaining tags.
  ### This block can easily span several lines, hence the "/s" modifier
  ### $$refToText =~ s/<noinclude>(.*?)<\/noinclude>/$1/sg;

  # replace <br> and <br /> directives with new paragraph
  # (the match is case-sensitive; other spellings are removed later as ordinary tags)
  $$refToText =~ s/<br(?:\s*)(?:[\/]?)>/\n\n/g;

  # Remove tables, as they often carry a lot of noise
  &eliminateTables($refToText);

  # Since we limit the number of levels of template recursion, we might end up with several
  # un-instantiated templates. In this case we simply eliminate them now.
  # Because templates may be nested, we eliminate them iteratively by starting from the most
  # nested one (hence the 'while' loop).
  #    OLD comments and code:
  #    For the same reason, we also require that the body of a template does not contain
  #    opening braces (hence "[^\{]", any char except opening brace).
  #    1 while ($$refToText =~ s/\{\{(?:[^\{]*?)\}\}/ /sg);
  #    END OF old comments and code
  # We also require that the body of a template does not contain the template opening sequence
  # (two successive opening braces - "\{\{"). We use negative lookahead to achieve this.
  1 while ($$refToText =~ s/\{\{
                                 (?:
                                     (?:
                                         (?!
                                             \{\{
                                         )
                                         .
                                     )*?
                                 )
                            \}\}
                           / /sgx);

  # Remove any other <...> tags - but keep the text they enclose
  # (the tags are replaced with spaces to prevent adjacent pieces of text
  # from being glued together).
  # Comments (<!-- ... -->) also fall into this category, and since they can easily span several lines,
  # we use the "/s" modifier.
  $$refToText =~ s/<(?:.*?)>/ /sg;

  # Remove the markup of bold/italics emphasis (5, 3 or 2 apostrophes on both sides),
  # keeping only the emphasized text. The three kinds of emphasis are not distinguished,
  # and no replacement tag is emitted. Longer apostrophe sequences are handled first.
  # Without the "/s" modifier, emphasis that spans a line break is left untouched.
  # IMPORTANT: If 'encodeXmlChars' has been called before this line, then remember that
  # the apostrophes were already quoted to "&apos;"
  $$refToText =~ s/'''''(.*?)'''''/$1/g;
  $$refToText =~ s/'''(.*?)'''/$1/g;
  $$refToText =~ s/''(.*?)''/$1/g;

  # Eliminate long sequences of newlines and whitespace.
  # Note that we don't want to replace sequences of spaces only, as this might make the text
  # less readable. Instead, we only eliminate sequences of whitespace that contain at least
  # two newlines; each such sequence becomes exactly two newlines.
  $$refToText =~ s/(?:\s*)\n(?:\s*)\n(?:\s*)/\n\n/g;

  # Eliminate XML entities such as "&nbsp;" , "&times;" etc. - otherwise,
  # in C++ code they will give rise to spurious words "nbsp", "times" etc.
  # Note that the standard entities - &amp; , &quot; , &apos; , &lt; and &gt;
  # are handled by the XML parser. All other entities, such as &nbsp; are passed
  # by the XML parser to the upper level (in case of Wikipedia pages,
  # to the rendering engine).
  # Note that in the raw XML text, these entities look like "&amp;nbsp;"
  # (i.e., with leading "&amp;"). XML parser replaces "&amp;" with "&",
  # so here in the code we see the entities as "&nbsp;".
  # Every entity found is also reported to the log by 'logReplacedXmlEntity'.
  $$refToText =~ s{&                 # the entity starts with "&"
                   ((?:\#?)(?:\w+))  # optional '#' sign (as in &#945;), followed by
                                     # an uninterrupted sequence of letters and/or digits
                   ;                 # the entity ends with a semicolon
                  }{&logReplacedXmlEntity($1)}egx;   # entities are replaced with a space

  if ($whetherToEncodeXmlChars) {
    # encode text for XML
    &encodeXmlChars($refToText);
  }

  # NOTE that the following operations introduce XML tags, so they must appear
  # after the original text underwent character encoding with 'encodeXmlChars' !!

  # Change markup for section headers.
  # Note that section headers may only begin at the very first position in the line
  # (not even after a space). Therefore, each header markup in the following commands
  # is prefixed with "^" to make sure it begins at the beginning of the line.
  # Since the text (e.g., article body) may contain multiple lines, we use
  # the "/m" modifier to allow matching "^" at embedded "\n" positions.
  # Headers with more "=" signs are converted first, so that they are not mistaken
  # for headers of a higher level; note the mapping: "==" becomes <h1>, "=====" becomes <h4>.
  $$refToText =~ s/^=====(.*?)=====/<h4>$1<\/h4>/mg;
  $$refToText =~ s/^====(.*?)====/<h3>$1<\/h3>/mg;
  $$refToText =~ s/^===(.*?)===/<h2>$1<\/h2>/mg;
  $$refToText =~ s/^==(.*?)==/<h1>$1<\/h1>/mg;
}

# Reports an XML entity that is being removed from the text to the log.
#
# Parameters:
#   $xmlEntity - the name of the entity, without the leading "&" and the trailing ";"
#
# Returns a single space, which replaces the entity in the text.
sub logReplacedXmlEntity($) {
  my ($entityName) = @_;

  print LOGF "ENTITY: &$entityName;\n";

  return " ";
}

BEGIN {
  # The regular expressions below are private to 'eliminateTables'. They are defined
  # in a BEGIN block so that they are compiled only once and are already available
  # whenever the function is called.

  # HTML-style tables: <table ...> ... </table>
  my $htmlTableOpen = qr{<table>                         # either just <table>
                                 |                               # or
                                 <table(?:\s+)(?:[^<>]*)>}ix;    # "<table" followed by at least one space
                                                                 # (to prevent "<tablexxx"), followed by
                                                                 # some optional text, e.g., table parameters
                                                                 # as in "<table border=0>"
  # Prohibiting the '<' and '>' chars ([^<>]) in the opening sequence ensures that it
  # does not consume more than the opening tag itself, e.g., in
  #   "<table border=0> aaa <table> bbb </table> ccc </table>"
  # the opening sequence is not extended up to the second "<table>".
  my $htmlTableClose = qr/<\/table>/i;

  # Wiki-style tables: {| ... |}
  my $wikiTableOpen = qr/\{\|/;
  my $wikiTableClose = qr/\|\}/;

  # Removes tables from the text (in place), replacing each of them with a newline.
  #
  # Only non-nested tables are handled properly: a table is taken to extend from an
  # opening sequence to the nearest closing sequence. Earlier versions used lookahead
  # patterns to eliminate nested tables from the innermost one outwards, but those
  # patterns became too complex and caused segmentation faults.
  sub eliminateTables(\$) {
    my ($refToText) = @_;

    # Table definitions can easily span several lines, hence the "/s" modifier
    $$refToText =~ s/$htmlTableOpen(.*?)$htmlTableClose/\n/sg;
    $$refToText =~ s/$wikiTableOpen(.*?)$wikiTableClose/\n/sg;
  }

} # end of BEGIN block

# If specified, 'elementToRemove' contains an element that needs to be removed as well.
# For links, this ensures that a page does not link to itself. For categories, this
# ensures that a page is not categorized to itself. This parameter is obviously
# irrelevant for filtering URLs.
# 'elementToRemove' must be a numeric value (not string), since we're testing it with '==' (not 'eq')
sub removeDuplicatesAndSelf(\@$) {
  # Arguments:
  #   $refToArray      - reference to the array to clean up (modified in place)
  #   $elementToRemove - optional numeric value; every occurrence of it is dropped
  #                      and reported in the log (pass undef to disable)
  # The first occurrence of each remaining element is kept, in the original order.
  my ($refToArray, $elementToRemove) = @_;

  my %seen;
  my @uniq;

  foreach my $item (@$refToArray) {
    if ( defined($elementToRemove) && ($item == $elementToRemove) ) {
      # Plain 'print' is used on purpose: the message is a complete string with
      # interpolated data, so it must never be interpreted as a printf format.
      print LOGF "Warning: current page links or categorizes to itself - " .
                 "link discarded ($elementToRemove)\n";
      next;
    }
    # post-increment yields 0 (false) only the first time an item is encountered
    push(@uniq, $item) unless $seen{$item}++;
  }

  # overwrite the original array with the new one that does not contain duplicates
  @$refToArray = @uniq;
}

# Removes elements of the second list from the first list.
# For efficiency purposes, the second list is converted into a hash.
sub removeElements(\@\@) {
  # Arguments:
  #   $refToArray            - reference to the array to filter (modified in place)
  #   $refToElementsToRemove - reference to the array of values to drop (not modified)
  my ($refToArray, $refToElementsToRemove) = @_;

  # Index the unwanted values once, so that each membership test is a hash lookup
  my %isUnwanted = map { $_ => 1 } @$refToElementsToRemove;

  # Keep, in their original order, only the elements absent from the index,
  # and store the survivors back into the caller's array
  @$refToArray = grep { ! defined($isUnwanted{$_}) } @$refToArray;
}

# Returns the current local time as a zero-padded "hh:mm:ss" string.
sub getTimeAsString() {
  # 'localtime' here is the object-returning version (the file loads Time::localtime),
  # hence the accessor calls below
  my $now = localtime();
  my @fields = ($now->hour, $now->min, $now->sec);

  return sprintf("%02d:%02d:%02d", @fields);
}

# Strips whitespace from both ends of a string, in place.
#   Argument: reference to the string to trim.
#   Returns:  the result of the trailing-whitespace substitution.
sub trimWhitespaceBothSides(\$) {
    my ($target) = @_;

    # drop the whitespace run at the start of the string ...
    ${$target} =~ s/^\s*//;

    # ... and then the one at its end
    return ${$target} =~ s/\s*$//;
}

# There are 3 kinds of related links that we look for:
# 1) Standalone (usually, at the beginning of the article or a section of it)
#    Ex: Main articles: ...
# 2) Inlined - text in parentheses inside the body of the article
#    Ex: medicine (see also: [[Health]])
# 3) Dedicated section
#    Ex: == See also ==
#
# In all calls to 'extractInternalLinks':
# - The penultimate argument is 0, since we don't need to log anchor text here.
#   Anchor text will be handled when we analyze all the internal links in
#   the entire article (and not just look for related links).
# - The last argument is 0 in order not to remove duplicates on every invocation
#   of 'extractInternalLinks'. This is because duplicates in related links are
#   not very common, but performing duplicate removal each time is expensive.
#   Instead, we remove duplicates once at the very end.
sub identifyRelatedArticles(\$\@$) {
  # Arguments:
  #   $refToText            - reference to the article text (only read here)
  #   $refToRelatedArticles - reference to the array that receives the related links
  #   $id                   - id of the current page; used in the log messages and
  #                           as the "self" element in the final cleanup
  my ($refToText, $refToRelatedArticles, $id) = @_;

  # Work on a line-by-line copy of the article body. The copy matters, because
  # 'extractInternalLinks' modifies the string it is given, and the real article
  # text must stay intact.
  my @text = split("\n", $$refToText);

  # Pass 1 - standalone designators.
  # The designator must occur at the beginning of the line (preceded by at most
  # a few characters, such as a whitespace or a colon) rather than anywhere in it.
  # Otherwise we would collect as related those links that merely share a line
  # with an unrelated string that looks like a standalone designator.
  foreach my $line (@text) {
    next unless ($line =~ /^(?:.{0,5})(${relatedWording_Standalone}.*)$/);

    my $str = $1; # links are taken from the designator up to the end of the line
    print LOGF "Related(S): $id => $str\n";
    &extractInternalLinks(\$str, $refToRelatedArticles, $id, 0, 0);
    print LOGF "Related(S): $id ==> @$refToRelatedArticles\n";
  }

  # Pass 2 - inlined designators (text in parentheses); a line may contain several
  foreach my $line (@text) {
    while ($line =~ /\((?:\s*)(${relatedWording_Inline}.*?)\)/g) {
      my $str = $1;
      print LOGF "Related(I): $id => $str\n";
      &extractInternalLinks(\$str, $refToRelatedArticles, $id, 0, 0);
      print LOGF "Related(I): $id ==> @$refToRelatedArticles\n";
    }
  }

  # Pass 3 - dedicated section.
  # A section header may be of any level ("==", "===", "====", ...), so looking
  # for two consecutive "=" signs on both sides of the title is sufficient.
  my $relatedSectionFound = 0;
  foreach my $line (@text) {
    if (! $relatedSectionFound) {
      # Until the related section is located, only section headers are of interest
      if ($line =~ /==(.*?)==/) {
        my $sectionHeader = $1;
        $relatedSectionFound = 1 if ($sectionHeader =~ /$relatedWording_Section/);
      }
      next; # a header line itself carries no links to collect
    }

    # Inside the related section: the next header of any kind terminates it
    last if ($line =~ /==(?:.*?)==/);

    print LOGF "Related(N): $id => $line\n";
    # 'extractInternalLinks' may modify its argument ('$line'), which is harmless
    # because neither '$line' nor '@text' is processed any further
    &extractInternalLinks(\$line, $refToRelatedArticles, $id, 0, 0);
    print LOGF "Related(N): $id ==> @$refToRelatedArticles\n";
  }

  # Duplicates (and links to the page itself) are removed once, here at the end
  &removeDuplicatesAndSelf($refToRelatedArticles, $id);
}

# Writes the related articles of a page to the RELATEDF filehandle as one line:
# the page id, a tab, and the space-separated list of related article ids.
# Nothing is written when the list is empty.
sub recordRelatedArticles($\@) {
  my ($id, $refToRelatedArticles) = @_;

  # pages without related articles leave no trace in the output
  return unless (@$refToRelatedArticles);

  my $relatedIds = join(" ", @$refToRelatedArticles);
  print RELATEDF "$id\t", $relatedIds, "\n";
}


########################################################################

# Prints the version/copyright banner followed by the command-line usage summary
# to the currently selected output handle.
sub printUsage()
{
  # version, warranty and licensing notice
  my @banner = (
    "Wikiprep version $version, Copyright (C) 2007 Evgeniy Gabrilovich\n",
    "Wikiprep comes with ABSOLUTELY NO WARRANTY; for details type '$0 -license'.\n",
    "This is free software, and you are welcome to redistribute it\n",
    "under certain conditions; type '$0 -license' for details.\n",
    "Type '$0 -version' for version information.\n\n",
  );

  # how to invoke the script
  my @usage = (
    "Usage: $0 -f <XML file with page dump>\n",
    "       e.g., $0 -f pages_articles.xml\n\n",
  );

  # emitted as a single string, exactly as one concatenated message
  print join("", @banner, @usage);
}