# alpcentaur / laywerrobot

###############################################################################
#
# wikiprep.pl - Preprocess Wikipedia XML dumps
# Copyright (C) 2007 Evgeniy Gabrilovich
# The author can be contacted by electronic mail at gabr@cs.technion.ac.il
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA,
#    or see <http://www.gnu.org/licenses/> and
#    <http://www.fsf.org/licensing/licenses/info/GPLv2.html>
#
###############################################################################

use strict;use warnings;
use File::Basename;use Getopt::Long;use Time::localtime;use XML::Parser;
# Name of the license file looked up in the current directory, and the
# version string reported by the --version switch.
my $licenseFile = "COPYING";
my $version     = "2.02";

# Without any arguments there is nothing to do but explain the usage.
# (Subroutines are called with a leading '&' throughout this file because
# they carry prototypes and are defined further down.)
unless (@ARGV) {
  &printUsage();
  exit 0;
}

# Command-line settings:
#   -f <file>   the XML dump to process
#   --license   print licensing information and quit
#   --version   print the program version and quit
my $file;
my $showLicense = 0;
my $showVersion = 0;

GetOptions(
  'f=s'     => \$file,
  'license' => \$showLicense,
  'version' => \$showVersion,
);

if ($showLicense) {
  # Point at the local license file when there is one, otherwise at the web.
  my $licenseNotice = (-e $licenseFile)
    ? "See file $licenseFile for more details.\n"
    : "Please see <http://www.gnu.org/licenses/> and <http://www.fsf.org/licensing/licenses/info/GPLv2.html>\n";
  print $licenseNotice;
  exit 0;
}

if ($showVersion) {
  print "Wikiprep version $version\n";
  exit 0;
}

# An input file is mandatory for everything that follows.
unless (defined($file)) {
  &printUsage();
  exit 0;
}

die "Input file '$file' cannot be opened for reading\n" unless (-e $file);

##### Global definitions #####

# Characters that have to be escaped in XML output, mapped to the name of
# the entity that replaces them (see 'encodeXmlChars').
my %XmlEntities = (
  '&' => 'amp',
  '"' => 'quot',
  "'" => 'apos',
  '<' => 'lt',
  '>' => 'gt',
);

# The URL protocol (e.g., http) matched here may be in either case, hence we use the /i modifier.
my $urlProtocols = qr/http:\/\/|https:\/\/|telnet:\/\/|gopher:\/\/|file:\/\/|wais:\/\/|ftp:\/\/|mailto:|news:/i;

# A URL terminator may be either one of a list of characters OR end of string (that is, '$').
# This last part is necessary to handle URLs at the very end of a string when there is no "\n"
# or any other subsequent character.
my $urlTerminator = qr/[\[\]\{\}\s\n\|\"<>]|$/;

# Wordings that introduce references to related articles, in three flavours
# (standalone, inline, section heading).
my $relatedWording_Standalone =
  qr/Main(?:\s+)article(?:s?)|Further(?:\s+)information|Related(?:\s+)article(?:s?)|Related(?:\s+)topic(?:s?)|See(?:\s+)main(?:\s+)article(?:s?)|See(?:\s+)article(?:s?)|See(?:\s+)also|For(?:\s+)(?:more|further)/i;
  ## For(?:\s+)more(?:\s+)(?:background|details)(?:\s+)on(?:\s+)this(?:\s+)topic,(?:\s+)see
my $relatedWording_Inline = qr/See[\s:]|See(?:\s+)also|For(?:\s+)(?:more|further)/i;
my $relatedWording_Section = qr/Further(?:\s+)information|See(?:\s+)also|Related(?:\s+)article(?:s?)|Related(?:\s+)topic(?:s?)/i;

# Number of days per month (29 for February).
my %monthToNumDays = (
  'January'   => 31, 'February' => 29, 'March'    => 31, 'April'    => 30,
  'May'       => 31, 'June'     => 30, 'July'     => 31, 'August'   => 31,
  'September' => 30, 'October'  => 31, 'November' => 30, 'December' => 31,
);

# Month number (1-12) to month name.
my %numberToMonth = (
  1 => 'January',   2  => 'February', 3  => 'March',    4  => 'April',
  5 => 'May',       6  => 'June',     7  => 'July',     8  => 'August',
  9 => 'September', 10 => 'October',  11 => 'November', 12 => 'December',
);

# Upper bounds on the number of passes made when expanding nested templates
# and nested template parameters.
my $maxTemplateRecursionLevels = 5;
my $maxParameterRecursionLevels = 5;

##### Global variables #####

# Normalized namespace name => namespace id (filled by 'loadNamespaces').
my %namespaces;

# we only process pages in these namespaces + the main namespace (which has an empty name)
my %okNamespacesForPrescanning = ('Template' => 1, 'Category' => 1);
my %okNamespacesForTransforming = ('Category' => 1); # we don't use templates as concepts

my %id2title;           # page id => normalized title
my %title2id;           # normalized title => page id
my %redir;              # normalized title of a redirect page => normalized target title
my %templates;          # template bodies for insertion
my %catHierarchy;       # each category is associated with a list of its immediate descendants
my %statCategories;     # number of pages classified under each category
my %statIncomingLinks;  # number of links incoming to each page

# All output files live next to the input file and share its base name.
my ($fileBasename, $filePath, $fileSuffix) = fileparse($file, ".xml");
my $outputFile = "$filePath/$fileBasename.hgw$fileSuffix";
my $logFile = "$filePath/$fileBasename.log";
my $anchorTextFile = "$filePath/$fileBasename.anchor_text";
my $relatedLinksFile = "$filePath/$fileBasename.related_links";

# Three-argument open: the file name is never interpreted as part of the
# open mode, and the system error is reported when opening fails.
open(OUTF, '>', $outputFile) or die "Cannot open $outputFile: $!";
open(LOGF, '>', $logFile) or die "Cannot open $logFile: $!";
open(ANCHORF, '>', $anchorTextFile) or die "Cannot open $anchorTextFile: $!";
open(RELATEDF, '>', $relatedLinksFile) or die "Cannot open $relatedLinksFile: $!";

# Everything we write is UTF-8.
binmode(STDOUT,   ':utf8');
binmode(STDERR,   ':utf8');
binmode(OUTF,     ':utf8');
binmode(LOGF,     ':utf8');
binmode(ANCHORF,  ':utf8');
binmode(RELATEDF, ':utf8');

print ANCHORF  "# Line format: <Target page id>  <Source page id>  <Anchor text (up to the end of the line)>\n\n\n";
print RELATEDF "# Line format: <Page id>  <List of ids of related articles>\n\n\n";

# Phase 1: copy the XML header, then collect namespaces, titles, redirects
# and template bodies.
&copyXmlFileHeader();
&loadNamespaces();
&prescan();

my $numTitles = scalar( keys(%id2title) );
print "Loaded $numTitles titles\n";
my $numRedirects = scalar( keys(%redir) );
print "Loaded $numRedirects redirects\n";
my $numTemplates = scalar( keys(%templates) );
print "Loaded $numTemplates templates\n";

# Phase 2: transform the pages and finish the output XML file
# ('closeXmlFile' also closes OUTF).
&transform();
&closeXmlFile();

&writeStatistics();
&writeCategoryHierarchy();

# Buffered write errors only surface when a handle is closed, so report them.
close(LOGF) or warn "Cannot close $logFile: $!\n";
close(ANCHORF) or warn "Cannot close $anchorTextFile: $!\n";
close(RELATEDF) or warn "Cannot close $relatedLinksFile: $!\n";

# Hogwarts needs the anchor text file to be sorted in the increasing order of target page id.
# The file is originally sorted by source page id (second field in each line).
# We now use stable (-s) numeric (-n) sort on the first field (-k 1,1).
# This way, the resultant file will be sorted on the target page id (first field) as primary key,
# and on the source page id (second field) as secondary key.
# The command is run in list form (no shell), so file names containing spaces
# or shell metacharacters are passed through untouched; '-o' names the output
# file in place of the shell redirection.
my @sortCommand = ('sort', '-s', '-n', '-k', '1,1',
                   '-o', "$anchorTextFile.sorted", $anchorTextFile);
system(@sortCommand) == 0
  or warn "Sorting of $anchorTextFile failed (status $?)\n";

##### Subroutines #####
# Brings a page title to its canonical form. The string is modified in place
# through the reference that is passed in.
#   - leading and trailing whitespace and underscores are removed
#   - inner runs of whitespace and underscores become a single space
#   - the first letter is capitalized
#   - a known namespace prefix is normalized and glued to the page name
sub normalizeTitle(\$) {
  my ($titleRef) = @_;

  for ($$titleRef) {
    s/^[\s_]+//;     # leading whitespace and underscores
    s/[\s_]+$//;     # trailing whitespace and underscores
    s/[\s_]+/ /g;    # inner sequences collapse to one space
  }

  if ($$titleRef =~ /^([^:]*):(\s*)(\S(?:.*))/) {
    my ($beforeColon, $gap, $afterColon) = ($1, $2, $3);

    my $namespace = $beforeColon;
    &normalizeNamespace(\$namespace); # this must be done before the call to 'isKnownNamespace'

    if ( &isKnownNamespace(\$namespace) ) {
      # A known namespace may be followed by optional whitespace, which is
      # dropped to get the canonical page name
      # (e.g., "Category:  Births" should become "Category:Births").
      $$titleRef = join(":", $namespace, ucfirst($afterColon));
    } else {
      # The part before the colon is not a namespace, so a space after the
      # colon is significant and is kept (as a single space), e.g.,
      # "3001: The_Final_Odyssey" != "3001:The_Final_Odyssey", whereas
      # "3001:   The_Final_Odyssey" is the same page as "3001: The_Final_Odyssey".
      my $separator = (length($gap) > 0) ? ": " : ":";
      $$titleRef = ucfirst($beforeColon) . $separator . $afterColon;
    }
  } else {
    # no colon at all, hence no namespace - just capitalize the first letter
    $$titleRef = ucfirst($$titleRef);
  }
}
# Puts a namespace name into canonical form, in place: everything is
# lowercased and then the first character is uppercased.
sub normalizeNamespace(\$) {
  my ($namespaceRef) = @_;

  my $lowered = lc($$namespaceRef);
  $$namespaceRef = ucfirst($lowered);
}
# Tells whether the prefix of a page name (the part before the colon) is one
# of the namespaces defined in the XML file, i.e. whether it is a key of
# %namespaces with a defined value.
# Assumption: the argument was already normalized using 'normalizeNamespace'
sub isKnownNamespace(\$) {
  my ($namespaceRef) = @_;

  return defined( $namespaces{ ${$namespaceRef} } );
}
# The correct form to create a redirect is #REDIRECT [[ link ]],
# and function 'Parse::MediaWikiDump::page->redirect' only supports this form.
# However, it seems that Wikipedia can also tolerate a variety of other forms, such as
# REDIRECT|REDIRECTS|REDIRECTED|REDIRECTION, then an optional ":", optional "to" or optional "=".
# Therefore, we use our own function to handle these cases as well.
# If the page is a redirect, the function returns the title of the target page;
# otherwise, it returns 'undef'.
sub isRedirect($) {
  my ($page) = @_;

  my $textRef = $page->text;

  # quick check
  return undef unless ( $$textRef =~ /^#REDIRECT/i );

  # A page that starts like a redirect but does not match the full pattern is
  # probably either a malformed redirect link, or something else.
  return undef unless (
    $$textRef =~ m{^\#REDIRECT         # Redirect must start with "#REDIRECT"
                                       #   (the backslash is needed before "#" here, because
                                       #    "#" has special meaning with /x modifier)
                   (?:S|ED|ION)?       # The word may be in any of these forms,
                                       #   i.e., REDIRECT|REDIRECTS|REDIRECTED|REDIRECTION
                   (?:\s*)             # optional whitespace
                   (?: :|\sTO|=)?      # optional colon, "TO" or "="
                                       #   (in case of "TO", we expect a whitespace before it,
                                       #    so that it's not glued to the preceding word)
                   (?:\s*)             # optional whitespace
                   \[\[([^\]]*)\]\]    # the link itself
                  }ix                  # matching is case-insensitive, hence /i
  );

  my $target = $1;

  # Anchors are not allowed in REDIRECT pages; if the link contains one, it is
  # cut off so that the link points to the page as a whole (that's how
  # Wikipedia works).
  $target = $1 if ($target =~ /^(.*)\#(?:.*)$/);

  return $target;
}
# Tells whether the page is of interest for the prescanning phase, i.e.
# whether 'isNamespaceOk' accepts it against %okNamespacesForPrescanning.
sub isNamespaceOkForPrescanning($) {
  my ($page) = @_;

  return &isNamespaceOk($page, \%okNamespacesForPrescanning);
}
# Tells whether the page is of interest for the transformation phase, i.e.
# whether 'isNamespaceOk' accepts it against %okNamespacesForTransforming.
sub isNamespaceOkForTransforming($) {
  my ($page) = @_;

  return &isNamespaceOk($page, \%okNamespacesForTransforming);
}
# Decides whether a page should be processed, given a hash of acceptable
# namespace names.
#   $page               - page object; only its 'namespace' method is used
#   $refToNamespaceHash - reference to a hash keyed by normalized namespace names
# Returns a true value for pages of the main namespace, for pages whose
# prefix is not a known namespace, and for pages whose namespace is a key of
# the hash with a defined value; a false value otherwise.
sub isNamespaceOk($\%) {
  my ($page, $refToNamespaceHash) = @_;

  my $namespace = $page->namespace;

  # main namespace is OK, so we only check pages that belong to other namespaces
  return 1 if ($namespace eq '');

  &normalizeNamespace(\$namespace);

  # the prefix before ":" in the page title is not a known namespace,
  # therefore, the page belongs to the main namespace and is OK
  return 1 unless ( &isKnownNamespace(\$namespace) );

  return defined( $refToNamespaceHash->{$namespace} );
}
# Replaces, in place, the characters & " ' < > with the corresponding XML
# entities; the entity names are looked up in %XmlEntities.
sub encodeXmlChars(\$) {
  my ($textRef) = @_;

  ${$textRef} =~ s/([&"'<>])/'&' . $XmlEntities{$1} . ';'/ge;
}
# Copies the header of the input XML file (everything up to and including
# the line that contains "</siteinfo>") to the output file OUTF.
# The input is read as UTF-8 so that header text is not encoded a second
# time when it is printed to OUTF, which has a ':utf8' layer.
# Dies if the input file cannot be opened.
sub copyXmlFileHeader() {
  # Three-argument open with a lexical handle: the file name is never parsed
  # for mode characters, and the handle cannot clash with other code.
  open(my $inputFh, '<:encoding(UTF-8)', $file) or die "Cannot open $file: $!";

  while (my $line = <$inputFh>) { # copy lines up to "</siteinfo>"
    if ($line =~ /^<mediawiki /) {
      # The top level element - mediawiki - contains a lot of attributes (e.g., schema)
      # that are no longer applicable to the XML file after our transformation.
      # Therefore, we simply write an opening tag <mediawiki> without any attributes.
      print OUTF "<mediawiki>\n";
    } else {
      # All other lines (up to </siteinfo>) are copied as-is
      print OUTF $line;
    }
    last if ($line =~ /<\/siteinfo>/);
  }

  close($inputFh); # this file will later be reopened by "Parse::MediaWikiDump"
}
# Writes the closing tag of the top-level element and closes the output file.
# Dies if closing fails: buffered write errors (for instance, a full disk)
# only surface at this point, and would otherwise leave a truncated output
# file unnoticed.
sub closeXmlFile() {
  print OUTF "</mediawiki>\n";
  close(OUTF) or die "Cannot close the output file: $!";
}
# Writes two statistics files next to the input file:
#   <basename>.stat.categories - number of pages per category
#   <basename>.stat.inlinks    - number of incoming links per page
# Both are sorted by decreasing count. Dies if a file cannot be opened or
# closed.
sub writeStatistics() {
  my $statCategoriesFile = "$filePath/$fileBasename.stat.categories";
  my $statIncomingLinksFile = "$filePath/$fileBasename.stat.inlinks";

  # Lexical handles and three-argument open: the handles stay private to this
  # function and the file names are never interpreted as open modes.
  open(my $catsFh, '>', $statCategoriesFile)
    or die "Cannot open $statCategoriesFile: $!";
  print $catsFh "# Line format: <CategoryId (= page id)>  <Number of pages in this category>\n",
                "# Here we count the *pages* that belong to this category, i.e., articles AND\n",
                "# sub-categories of this category (but not the articles in the sub-categories).\n",
                "\n\n";

  foreach my $cat ( sort { $statCategories{$b} <=> $statCategories{$a} }
                    keys(%statCategories) ) {
    print $catsFh "$cat\t$statCategories{$cat}\n";
  }
  close($catsFh) or die "Cannot close $statCategoriesFile: $!";

  open(my $inlinksFh, '>', $statIncomingLinksFile)
    or die "Cannot open $statIncomingLinksFile: $!";
  print $inlinksFh "# Line format: <Target page id>  <Number of links to it from other pages>\n\n\n";

  foreach my $destination ( sort { $statIncomingLinks{$b} <=> $statIncomingLinks{$a} }
                            keys(%statIncomingLinks) ) {
    print $inlinksFh "$destination\t$statIncomingLinks{$destination}\n";
  }
  close($inlinksFh) or die "Cannot close $statIncomingLinksFile: $!";
}
# Writes the category hierarchy to <basename>.cat_hier next to the input
# file: one line per category, holding the category id and the ids of its
# immediate descendants. Lines are sorted by increasing category id.
# Dies if the file cannot be opened or closed.
sub writeCategoryHierarchy() {
  my $catHierarchyFile = "$filePath/$fileBasename.cat_hier";

  open(my $hierFh, '>', $catHierarchyFile)
    or die "Cannot open $catHierarchyFile: $!";
  print $hierFh "# Line format: <Category id>  <List of ids of immediate descendants>\n\n\n";

  # Sort on the category ids themselves. The values of %catHierarchy are
  # array references, so comparing them numerically would compare memory
  # addresses and yield an arbitrary order that changes from run to run.
  # NOTE(review): assumes the keys are numeric page ids, as the line format
  # above says - confirm against 'extractCategories'.
  foreach my $cat ( sort { $a <=> $b } keys(%catHierarchy) ) {
    print $hierFh "$cat\t", join(" ", @{$catHierarchy{$cat}}), "\n";
  }

  close($hierFh) or die "Cannot close $catHierarchyFile: $!";
}
# Fills %namespaces (normalized namespace name => namespace id) from the
# namespace list reported for the input XML file.
sub loadNamespaces() {
  # re-open the input XML file
  my $pages = Parse::MediaWikiDump::Pages->new($file);

  # Every entry of the list is an array reference holding the namespace id
  # followed by the namespace name.
  foreach my $entry ( @{ $pages->namespaces } ) {
    my ($namespaceId, $namespaceName) = @{$entry}[0, 1];

    # namespace names are case-insensitive, so we force them
    # to canonical form to facilitate future comparisons
    &normalizeNamespace(\$namespaceName);
    $namespaces{$namespaceName} = $namespaceId;
  }
}
# First pass over the dump: builds the id <-> title mappings (%id2title,
# %title2id) and the redirection table (%redir), and loads the bodies of
# templates (%templates, keyed by page id). Progress is written to LOGF.
sub prescan() {
  # re-open the input XML file
  my $pages = Parse::MediaWikiDump::Pages->new($file);

  my $counter = 0;

  while ( defined(my $page = $pages->page) ) {
    my $id = $page->id;

    $counter++;

    # report progress once per 1000 pages
    unless ($counter % 1000) {
      my $now = &getTimeAsString();
      print LOGF "[$now] Prescanning page id=$id\n";
    }

    my $title = $page->title;
    &normalizeTitle(\$title);

    if (length($title) == 0) {
      # This is a defense against pages whose title only contains UTF-8 chars that
      # are reduced to an empty string. Right now I can think of one such case -
      # <C2><A0> which represents the non-breaking space. In this particular case,
      # this page is a redirect to [[Non-breaking space]], but having in the system
      # a redirect page with an empty title causes numerous problems, so we'll live
      # happier without it.
      print LOGF "Skipping page with empty title id=$id\n";
      next;
    }

    my $redirect = &isRedirect($page);
    if (defined($redirect)) {
      &normalizeTitle(\$redirect);
      # an empty target is dropped for the same reason as an empty title above
      $redir{$title} = $redirect if (length($redirect) > 0);

      # nothing more to do for redirect pages
      next;
    }

    # From here on, the page belongs either to the main namespace or to one
    # of the namespaces we're interested in.
    next unless ( &isNamespaceOkForPrescanning($page) );

    if ( exists($id2title{$id}) ) {
      print LOGF "Warning: Page id=$id already encountered before!\n";
      next;
    }
    if ( exists($title2id{$title}) ) {
      # A page could have been encountered before with a different spelling.
      # Examples: &nbsp; = <C2><A0> (nonbreakable space), &szlig; = <C3><9F> (German Eszett ligature)
      print LOGF "Warning: Page title='$title' already encountered before!\n";
      next;
    }

    $id2title{$id} = $title;
    $title2id{$title} = $id;

    # the rest of the loop body only concerns templates
    next unless ($title =~ /^Template:/);

    my $text = ${$page->text};

    # The template text is stored for future inclusion, so it is reduced to
    # what inclusion would actually insert:
    #  - if <onlyinclude> ... </onlyinclude> parts are present, only they are
    #    kept (each followed by a newline) and the rest of the body is
    #    discarded, because <onlyinclude> is equivalent to enclosing the
    #    fragment in <includeonly> tags AND all the rest in <noinclude> tags;
    #  - otherwise <noinclude> fragments are replaced by a newline, and
    #    <includeonly> fragments are kept without the tags themselves.
    # These definitions can easily span several lines, hence the "/s" modifiers.
    my @onlyincludeFragments = ($text =~ /<onlyinclude>(.*?)<\/onlyinclude>/sg);
    if (@onlyincludeFragments) {
      $text = join("", map { "$_\n" } @onlyincludeFragments);
    } else {
      $text =~ s/<noinclude>(?:.*?)<\/noinclude>/\n/sg;
      $text =~ s/<includeonly>(.*?)<\/includeonly>/$1/sg;
    }

    $templates{$id} = $text;
  }

  my $timeStr = &getTimeAsString();
  print LOGF "[$timeStr] Prescanning complete - prescanned $counter pages\n";
}
# Second pass over the dump: every page that is neither a redirect nor in an
# unwanted namespace gets its templates included, its categories, links and
# URLs extracted, its text post-processed, and is then written to the output
# file. Statistics and the category hierarchy are updated along the way.
sub transform() {
  # re-open the input XML file
  my $pages = Parse::MediaWikiDump::Pages->new($file);

  while ( defined(my $page = $pages->page) ) {
    my $id = $page->id;

    my $now = &getTimeAsString();
    print LOGF "[$now] Transforming page id=$id\n";

    # we've already loaded all redirects in the prescanning phase
    next if defined( &isRedirect($page) );

    # we're only interested in pages from certain namespaces
    next unless ( &isNamespaceOkForTransforming($page) );

    my $title = $page->title;
    &normalizeTitle(\$title);

    # see the comment about empty titles in function 'prescan'
    if (length($title) == 0) {
      print LOGF "Skipping page with empty title id=$id\n";
      next;
    }

    my $text = ${$page->text};

    my $orgLength = length($text);  # text length BEFORE any transformations

    # The check for stub must be done BEFORE any further processing,
    # because stubs indicators are templates, and templates are substituted.
    my $isStub = ($text =~ m/stub}}/i) ? 1 : 0;

    my (@categories, @internalLinks, @urls, @relatedArticles);

    &includeTemplates(\$text);

    # This function only examines the contents of '$text', but doesn't change it.
    &identifyRelatedArticles(\$text, \@relatedArticles, $id);

    # We process categories directly, because '$page->categories' ignores
    # categories inherited from included templates
    &extractCategories(\$text, \@categories, $id);

    # Categories are listed at the end of articles, and therefore may mistakenly
    # be added to the list of related articles (which often appear in the last
    # section such as "See also"). To avoid this, we explicitly remove all categories
    # from the list of related links, and only then record the list of related links
    # to the file.
    &removeElements(\@relatedArticles, \@categories);
    &recordRelatedArticles($id, \@relatedArticles);

    &extractInternalLinks(\$text, \@internalLinks, $id, 1, 1);
    &extractUrls(\$text, \@urls);

    &postprocessText(\$text, 1);

    my $newLength = length($text);  # text length AFTER all transformations

    &writePage($id, \$title, \$text, $orgLength, $newLength, $isStub,
               \@categories, \@internalLinks, \@urls);

    &updateStatistics(\@categories, \@internalLinks);

    # for a category page, its own categories are its parents in the hierarchy
    &updateCategoryHierarchy($id, \@categories) if ($title =~ /^Category:/);
  }
}
# Counts one more page for each of the given categories (%statCategories)
# and one more incoming link for each of the given link targets
# (%statIncomingLinks).
sub updateStatistics(\@\@) {
  my ($refToCategories, $refToInternalLinks) = @_;

  $statCategories{$_}++    for @{$refToCategories};
  $statIncomingLinks{$_}++ for @{$refToInternalLinks};
}
# Registers a category as an immediate descendant of each of its parents in
# %catHierarchy.
#   $childId               - id of the current category
#   $refToParentCategories - the categories the current category belongs to,
#                            which are its parent categories
sub updateCategoryHierarchy($\@) {
  my ($childId, $refToParentCategories) = @_;

  for my $parentCat (@{$refToParentCategories}) {
    # The first child of a parent creates the list through autovivification;
    # later children are appended to it.
    push @{ $catHierarchy{$parentCat} }, $childId;
  }
}
# Writes one transformed page to the output file OUTF as a <page> element
# holding the title, the category ids, the ids of linked pages, the URLs and
# the text.
# The title is encoded on a copy; the URLs are XML-encoded in place, i.e. in
# the caller's array. The text is written as it is.
sub writePage($\$\$$$$\@\@\@) {
  my ($id, $refToTitle, $refToText, $orgLength, $newLength, $isStub,
      $refToCategories, $refToInternalLinks, $refToUrls) = @_;

  my $numCategories = @{$refToCategories};
  my $numLinks      = @{$refToInternalLinks};
  my $numUrls       = @{$refToUrls};

  print OUTF qq{<page id="$id" orglength="$orgLength" newlength="$newLength" stub="$isStub" },
             qq{categories="$numCategories" outlinks="$numLinks" urls="$numUrls">\n};

  my $encodedTitle = ${$refToTitle};
  &encodeXmlChars(\$encodedTitle);
  print OUTF "<title>$encodedTitle</title>\n";

  print OUTF "<categories>", join(" ", @{$refToCategories}), "</categories>\n";

  print OUTF "<links>", join(" ", @{$refToInternalLinks}), "</links>\n";

  print OUTF "<urls>\n";
  foreach my $url (@{$refToUrls}) {
    &encodeXmlChars(\$url);
    print OUTF "$url\n";
  }
  print OUTF "</urls>\n";

  # text has already undergone 'encodeXmlChars' in function 'postprocessText'
  print OUTF "<text>\n", ${$refToText}, "\n</text>\n";

  print OUTF "</page>\n";
}
# Maps a title into the id, and performs redirection if necessary.
# Returns the page id, or 'undef' when the title is empty, when it leads to
# a double redirect, or when no page with the resulting title is known.
# Every redirection and every failure is reported to LOGF.
# Assumption: the argument was already normalized using 'normalizeTitle'
sub resolveLink(\$) {
  my ($refToTitle) = @_;

  # safety precaution
  return undef if (length($$refToTitle) == 0);

  my $targetTitle = $$refToTitle;

  if ( exists($redir{$targetTitle}) ) { # this link is a redirect
    $targetTitle = $redir{$targetTitle};

    # double redirects are not allowed and are ignored
    if ( exists($redir{$targetTitle}) ) {
      print LOGF "Warning: link '$$refToTitle' caused double redirection and was ignored\n";
      return undef;
    }

    print LOGF "Link '$$refToTitle' was redirected to '$targetTitle'\n";
  }

  return $title2id{$targetTitle} if ( exists($title2id{$targetTitle}) );

  # target not found
  print LOGF "Warning: link '$$refToTitle' cannot be matched to an id\n";
  return undef;
}
# Replaces, in place, template invocations ( {{...}} ) in the text with the
# result of 'instantiateTemplate'.
#
# The substitution is repeated so that templates are included recursively
# (i.e., the body of a template that was included on the previous pass is
# itself expanded on the next one).
#
# Templates are frequently nested. Occasionally, parsing mistakes may cause template insertion
# to enter an infinite loop, for instance when trying to instantiate Template:Country
# {{country_{{{1}}}|{{{2}}}|{{{2}}}|size={{{size|}}}|name={{{name|}}}}}
# which is repeatedly trying to insert template "country_", which is again resolved to
# Template:Country. The straightforward solution of keeping track of templates that were
# already inserted for the current article would not work, because the same template
# may legally be used more than once, with different parameters in different parts of
# the article. Therefore, we simply limit the number of iterations of nested template
# inclusion to $maxTemplateRecursionLevels.
#
# Since the number of passes is limited, we might end up with several
# un-instantiated templates. In this case we simply eliminate them - however, we do so
# later, in function 'postprocessText()', after extracting categories, links and URLs.
sub includeTemplates(\$) {
  my ($refToText) = @_;

  # We require that the body of a template does not contain the template
  # opening sequence (two successive opening braces - "\{\{"), which is
  # achieved with negative lookahead; this way the innermost invocations of
  # nested templates, e.g., {{localurl:{{NAMESPACE}}:{{PAGENAME}}}}, are
  # expanded first. (An older version of the pattern used "[^\{]*?" for the
  # body, i.e. it did not allow any opening brace at all.)
  # Template definitions can easily span several lines, hence the "/s" modifier.
  for (1 .. $maxTemplateRecursionLevels) {
    # stop as soon as a pass does not find anything to expand
    last unless
      $$refToText =~ s/\{\{
                           (?:\s*)        # optional whitespace before the template name is ignored
                           (
                             (?:
                                 (?!
                                     \{\{
                                 )
                                 .
                             )*?
                           )
                       \}\}
                      /&instantiateTemplate($1)/segx;
  }
}
BEGIN {
  # These variables are kept in the enclosing block so that they are static
  # for the function: the regular expression is compiled once, not every
  # time the function is called.

  # Separator that temporarily stands for the parameter-separating "|" symbols.
  my $specialSeparator = '.pAr.';
  # The separator is quoted with \Q...\E: its dots must match literally.
  # Unquoted, they are wildcards, and parameters would also be split on
  # ordinary text such as "xpAry".
  my $specialSeparatorRegex = qr/\Q$specialSeparator\E/;

  # Splits a template invocation (the text between "{{" and "}}") into the
  # template name and its parameters.
  #   $refToTemplateInvocation - reference to the invocation text (read only)
  #   $refToTemplateTitle      - reference to a scalar that receives the template name
  #   $refToParameterHash      - reference to a hash that receives the parameters;
  #                              unnamed parameters are keyed by their ordinal
  #                              position (1, 2, 3, ...)
  sub parseTemplateInvocation(\$\$\%) {
    my ($refToTemplateInvocation, $refToTemplateTitle, $refToParameterHash) = @_;

    # Template definitions (especially those with parameters) can easily span several lines,
    # hence the "/s" modifier. The template name extends up to the first pipeline symbol (if any).
    # Template parameters go after the "|" symbol.
    if ($$refToTemplateInvocation =~ /^([^|]*)\|(.*)$/sx) {
      $$refToTemplateTitle = $1;  # single out the template name itself
      my $paramsList = $2;

      # Template parameters often contain URLs, internal links, or just other useful text,
      # whereas the template serves for presenting it in some nice way.
      # Parameters are separated by "|" symbols. However, we cannot simply split the string
      # on "|" symbols, since these frequently appear inside internal links. Therefore, we split
      # on those "|" symbols that are not inside [[...]]. It's obviously sufficient to check that
      # brackets are not improperly nested on one side of "|", so we use lookahead.
      # We first replace all "|" symbols that are not inside [[...]] with a special separator that
      # we invented, which will hopefully not normally appear in the text (.pAr.).
      # Next, we use 'split' to break the string on this new separator.

      $paramsList =~ s/\|                       # split on pipeline symbol, such that
                          (?:                   # non-capturing grouper that encloses 2 options
                              (?=               #   zero-width lookahead - option #1
                                  [^\]]*$       #     there are no closing brackets up to the end
                                                #     of the string (i.e., all the characters up to
                                                #     the end of the string are not closing brackets)
                              )                 #   end of first lookahead (= end of option #1)
                              |                 #   or
                              (?=               #   another zero-width lookahead - option #2
                                  [^\]]* \[     #     the nearest opening bracket on the right is not preceded
                                                #     by a closing bracket (i.e., all the characters that
                                                #     precede it are not closing brackets
                              )                 #   end of second lookahead  (= end of option #2)
                          )                     # end of the outer grouper
                      /$specialSeparator/sxg;   # replace matching symbols with a special separator
                                                # (the s modifier means string can contain newline chars)

      my @parameters = split(/$specialSeparatorRegex/, $paramsList);

      # Parameters can be either named or unnamed. In the latter case, their name is defined by their
      # ordinal position (1, 2, 3, ...).

      my $unnamedParameterCounter = 0;

      # It's legal for unnamed parameters to be skipped, in which case they will get default
      # values (if available) during actual instantiation. That is {{template_name|a||c}} means
      # parameter 1 gets the value 'a', parameter 2 value is not defined, and parameter 3 gets the value 'c'.
      # This case is correctly handled by function 'split', and does not require any special handling.
      foreach my $param (@parameters) {
        # Spaces before or after a parameter value are normally ignored, UNLESS the parameter contains
        # a link (to prevent possible gluing the link to the following text after template substitution)

        # Parameter values may contain "=" symbols, hence the parameter name extends up to
        # the first such symbol.
        # It is legal for a parameter to be specified several times, in which case the last assignment
        # takes precedence. Example: "{{t|a|b|c|2=B}}" is equivalent to "{{t|a|B|c}}".
        # Therefore, we don't check if the parameter has been assigned a value before, because
        # anyway the last assignment should override any previous ones.
        if ($param =~ /^([^=]*)=(.*)$/s) {
          # This is a named parameter.
          # This case also handles parameter assignments like "2=xxx", where the number of an unnamed
          # parameter ("2") is specified explicitly - this is handled transparently.

          my $parameterName = $1;
          my $parameterValue = $2;

          &trimWhitespaceBothSides(\$parameterName);
          if ($parameterValue !~ /\]\]/) { # if the value does not contain a link, trim whitespace
            &trimWhitespaceBothSides(\$parameterValue);
          }

          $$refToParameterHash{$parameterName} = $parameterValue;
        } else {
          # this is an unnamed parameter
          $unnamedParameterCounter++;

          if ($param !~ /\]\]/) { # if the value does not contain a link, trim whitespace
            &trimWhitespaceBothSides(\$param);
          }

          $$refToParameterHash{$unnamedParameterCounter} = $param;
        }
      }
    } else {
      # Template invocation does not contain a pipeline symbol, hence take the entire
      # invocation text as the template title.
      $$refToTemplateTitle = $$refToTemplateInvocation;
    }
  }

} # end of BEGIN block

# Expands one template invocation (the text found between "{{" and "}}") and
# returns the text that replaces it. The invocation is reported to LOGF.
sub instantiateTemplate($) {
  my ($templateInvocation) = @_;

  print LOGF "Instantiating template=$templateInvocation\n";

  # split the invocation into the template name and its parameters
  my ($templateTitle, %templateParams);
  &parseTemplateInvocation(\$templateInvocation, \$templateTitle, \%templateParams);

  &computeFullyQualifiedTemplateTitle(\$templateTitle);

  # 'includeTemplateText' stores the expansion in the scalar it is given
  my $expansion = "";
  &includeTemplateText(\$templateTitle, \%templateParams, \$expansion);

  return $expansion;
}
# Stores the text of the requested template in $$refToResult, with parameter
# calls ( {{{...}}} ) replaced according to the supplied parameter hash.
#
# Parameters:
#   $refToTemplateTitle - reference to the template title; it is normalized
#                         in place by 'normalizeTitle'
#   $refToParameterHash - reference to the hash of parameters given in the
#                         template invocation
#   $refToResult        - reference to the scalar that receives the output
#
# If the title cannot be resolved to a page id, or no template text is stored
# for that id in %templates, the result is a single space and the event is
# logged. Per the original author's note this happens for pages that were
# skipped or for variable names such as {{NUMBEROFARTICLES}}.
sub includeTemplateText(\$\%\$) {
  my ($refToTemplateTitle, $refToParameterHash, $refToResult) = @_;

  &normalizeTitle($refToTemplateTitle);
  my $includedPageId = &resolveLink($refToTemplateTitle);

  my $templateKnown = defined($includedPageId) && exists($templates{$includedPageId});

  if (! $templateKnown) {
    # Just remove this inclusion directive and replace it with a space
    print LOGF "Template '$$refToTemplateTitle' is not available for inclusion\n";
    $$refToResult = " ";
  } else {
    # Start from the raw template text, then substitute its parameters
    $$refToResult = $templates{$includedPageId};

    # A parameter call may span a newline, hence the "s" flag.
    # Parameters can be nested, e.g. a default value that is itself another
    # parameter call, so the pattern only matches a call whose body does not
    # contain another opening triple brace (negative lookahead). Nested calls
    # are therefore resolved from the inside out, one level per pass.
    my $innermostParameterCall = qr/
        \{\{\{
        (                   # parameter body, handed to substituteParameter
          (?:
            (?! \{\{\{ )    # no nested opening sequence starts here
            .
          )*?
        )
        \}\}\}
    /sx;

    # The number of passes is capped to avoid very long or endless loops on
    # malformed input; we also stop as soon as a pass changes nothing.
    for (my $pass = 0; $pass < $maxParameterRecursionLevels; $pass++) {
      last unless $$refToResult =~
        s/$innermostParameterCall/&substituteParameter($1, $refToParameterHash)/eg;
    }
  }
}
# Computes the replacement text for one template parameter call.
#
# Parameters:
#   $parameter          - the text found between "{{{" and "}}}", either
#                         "name" or "name|default value"
#   $refToParameterHash - reference to the hash of parameters given in the
#                         template invocation (parameter name => value)
# Returns:
#   - the value supplied in the invocation, if there is one;
#   - otherwise the default value, if the call specifies one (it may be empty);
#   - otherwise the call itself, kept in triple braces (these are the Wiki
#     rules for templates, see http://meta.wikimedia.org/wiki/Help:Template ).
#
# Surplus parameters - i.e., those assigned values in the template invocation
# but not used in the template body - are simply ignored.
sub substituteParameter($\%) {
  my ($parameter, $refToParameterHash) = @_;

  my $paramName    = $parameter;
  my $defaultValue;   # stays undefined when the call has no default

  # Split at the first pipeline symbol. The caller matches parameter bodies
  # with the /s modifier, so a default value may contain newlines; /s and the
  # \A...\z anchors make sure such multi-line defaults are recognized and
  # preserved in full instead of being treated as part of the parameter name.
  if ($parameter =~ /\A([^|]*)\|(.*)\z/s) {
    $paramName    = $1;
    $defaultValue = $2;
  }

  # A value specified in the template invocation always wins
  if ( defined($$refToParameterHash{$paramName}) ) {
    return $$refToParameterHash{$paramName};
  }

  return $defaultValue if defined($defaultValue);

  # No value and no default: do not perform substitution
  return "{{{$parameter}}}";
}
# Rewrites, in place, the title of a page included through the template
# mechanism so that it carries an explicit namespace.
#
# Parameters:
#   $refToTemplateTitle - reference to the title taken from the invocation
#
# Rules:
#   - a leading colon by itself implies the main namespace; the colon is
#     stripped and nothing is prepended;
#   - a prefix before the first colon that 'isKnownNamespace' accepts (after
#     'normalizeNamespace') means the title is already fully qualified;
#   - in every other case, including titles without any colon, the title is
#     resolved to the Template namespace, which is the default for inclusion.
sub computeFullyQualifiedTemplateTitle(\$) {
  my ($refToTemplateTitle) = @_;

  my $hasNamespace = 0;

  if ($$refToTemplateTitle =~ /^:(.*)$/) {
    # explicit main namespace - drop the colon
    $$refToTemplateTitle = $1;
    $hasNamespace = 1;
  } elsif ($$refToTemplateTitle =~ /^([^:]*):/) {
    # colon found but not in the first position - is the prefix a namespace?
    my $candidate = $1;
    &normalizeNamespace(\$candidate);
    $hasNamespace = &isKnownNamespace(\$candidate);
  }

  $$refToTemplateTitle = "Template:$$refToTemplateTitle" unless $hasNamespace;
}
# Collects the categories assigned in the text and removes the category
# links from the text.
#
# Parameters:
#   $refToText            - reference to the text; modified in place
#   $refToCategoriesArray - reference to the array receiving category ids
#   $id                   - id of the current page; dropped from the array so
#                           that a page is not categorized to itself
sub extractCategories(\$\@$) {
  my ($refToText, $refToCategoriesArray, $id) = @_;

  # Namespace names are case-insensitive, hence the "i" flag.
  my $categoryLink = qr/\[\[(?:\s*)(Category:.*?)\]\]/i;

  # The captured text is passed to 'collectCategory' by value rather than as
  # a reference to $1, which could be modified with unclear consequences.
  $$refToText =~ s/$categoryLink/&collectCategory($1, $refToCategoriesArray)/eg;

  # Categories are accumulated in an array rather than directly in a hash
  # table in order to preserve their order of appearance; duplicates are
  # therefore removed afterwards.
  &removeDuplicatesAndSelf($refToCategoriesArray, $id);
}
# Resolves one category link and records its id.
#
# Parameters:
#   $catName              - text of the category link, e.g. "Category:Whatever"
#                           or "Category:Whatever|sort key"
#   $refToCategoriesArray - reference to the array receiving the category id
# Returns:
#   a single space, which the caller uses as the replacement text, so that
#   the category link disappears from the article once it has been collected.
sub collectCategory($\@) {
  my ($catName, $refToCategoriesArray) = @_;

  # Some categories contain a sort key, e.g., [[Category:Whatever|*]] or
  # [[Category:Whatever| ]] - keep only the category name itself.
  $catName = $1 if $catName =~ /^(.*)\|/;

  &normalizeTitle(\$catName);
  my $catId = &resolveLink(\$catName);

  if ( !defined($catId) ) {
    print LOGF "Warning: unknown category '$catName'\n";
  } else {
    push(@$refToCategoriesArray, $catId);
  }

  return " ";
}
# Replaces every internal link ( [[...]] ) in the text with the value returned
# by 'collectInternalLink' and collects the ids of the link targets.
#
# Parameters:
#   $refToText                 - reference to the text; modified in place
#   $refToInternalLinksArray   - reference to the array receiving target ids
#   $id                        - id of the current page
#   $whetherToLogAnchorText    - if true, anchor texts are passed to
#                                'logAnchorText'
#   $whetherToRemoveDuplicates - if true, duplicates and links to the current
#                                page are removed from the array
sub extractInternalLinks(\$\@$$$) {
  my ($refToText, $refToInternalLinksArray, $id,
      $whetherToLogAnchorText, $whetherToRemoveDuplicates) = @_;

  # For each internal link outgoing from the current article, this hash table
  # maps the target id into the anchor text associated with it. Naturally,
  # anchor text is only collected for links that can be resolved to a page id.
  my %anchorTexts;

  # Link definitions may span adjacent lines, hence the "s" flag.
  #
  # Words may be glued to either side of a link, in which case they become
  # part of the link text: "ex-[[Giuseppe Mazzini|Mazzinian]] " or
  # "[[public transport]]ation".
  #
  # Links may also be nested, typically inside image captions such as
  #   [[Image:kanner_kl2.jpg|frame|right|Dr. [[Leo Kanner]] introduced ...]]
  # To avoid stopping at the wrong closing brackets, the link body must not
  # contain an opening bracket, and the substitution is repeated until no link
  # is left, so nested links are processed from the inside out.
  my $innermostLink = qr/
      (\w*)          # word characters glued to the beginning of the link
      \[\[
        ([^\[]*?)    # link body: any characters except an opening bracket
      \]\]
      (\w*)          # word characters glued to the end of the link
  /sx;

  1 while ( $$refToText =~
              s/$innermostLink/&collectInternalLink($1, $2, $3, $refToInternalLinksArray, \%anchorTexts)/eg );

  &removeDuplicatesAndSelf($refToInternalLinksArray, $id) if $whetherToRemoveDuplicates;

  &logAnchorText(\%anchorTexts, $id) if $whetherToLogAnchorText;
}
# Writes the collected anchor texts of the current page to the ANCHORF handle,
# one line per target: "<target id> TAB <current page id> TAB <anchor text>".
#
# Parameters:
#   $refToAnchorTextsHash - reference to a hash (target page id => anchor text)
#   $curPageId            - id of the page the links originate from
#
# Because a hash associates anchor text with target ids, several links to the
# same target leave only the anchor text of the last one, so there are no
# duplicates to remove. Links pointing from the page to itself are skipped.
sub logAnchorText(\%$) {
  my ($refToAnchorTextsHash, $curPageId) = @_;

  while ( my ($targetId, $anchorText) = each(%$refToAnchorTextsHash) ) {
    next if $targetId == $curPageId;

    # anchor text doesn't need escaping of XML characters,
    # hence the second function parameter is 0
    &postprocessText(\$anchorText, 0);

    $anchorText =~ tr/\n/ /;  # replace all newlines with spaces

    # make sure that something is left of anchor text after postprocessing
    next unless length($anchorText) > 0;

    print ANCHORF "$targetId\t$curPageId\t$anchorText\n";
  }
}
# Processes one internal link: determines its target and the text that should
# replace it in the article, collects the target id and remembers the anchor
# text.
#
# Parameters:
#   $prefix                  - word characters glued to the beginning of the link
#   $link                    - the text between "[[" and "]]"
#   $suffix                  - word characters glued to the end of the link
#   $refToInternalLinksArray - reference to the array receiving target ids
#   $refToAnchorTextHash     - reference to a hash (target id => anchor text)
# Returns:
#   the text that replaces the link in the article (possibly empty).
sub collectInternalLink($$$\@\%) {
  my ($prefix, $link, $suffix, $refToInternalLinksArray, $refToAnchorTextHash) = @_;

  my $originalLink = $link;   # kept for the log message only
  my $result = "";

  # strip leading whitespace, if any
  $link =~ s/^\s*//;

  # The link may be preceded with a colon. Known cases:
  # - Linking to a category (as opposed to assigning the current article to
  #   it) uses the special syntax [[:Category:...]]
  # - Linking to other languages, e.g., [[:fr:Wikipedia:Aide]] (without the
  #   leading colon, the link would go to the side menu)
  # - Linking directly to the description page of an image, [[:Image:wiki.png]]
  # In all such cases, we strip the leading colon.
  $link =~ s/^://;

  # Alternative text may be available after the pipeline symbol. Even if the
  # pipeline is only used for masking parts of the link name for presentation,
  # the author of the page deemed the resulting text important, hence this
  # flag is set whenever a pipeline symbol is present.
  my $alternativeTextAvailable = 0;

  # Some links contain several pipeline symbols, e.g.,
  #   [[Image:Zerzan.jpeg|thumb|right|[[John Zerzan]]]]
  # The extra pieces seem to be parameters, so we just eliminate them: the
  # link is what precedes the FIRST pipeline, and the text is what follows
  # the LAST one.
  my $lastPipe = rindex($link, '|');
  if ($lastPipe < 0) {
    # the link text does not contain the pipeline, so take it as-is
    $result = $link;
  } else {
    $alternativeTextAvailable = 1;

    $result = substr($link, $lastPipe + 1);  # usually the alternative text
    $link   = substr($link, 0, $lastPipe);

    # whatever lies between the first and the last pipeline are parameters,
    # which we don't really need
    my $firstPipe = index($link, '|');
    if ($firstPipe >= 0) {
      $link = substr($link, 0, $firstPipe);
    }

    if (length($result) == 0) {
      if (index($link, '#') < 0) {
        # If the "|" symbol is not followed by some text, then it masks the
        # namespace as well as any text in parentheses at the end of the link
        # title. Masking is only invoked if the link contains no anchor.
        &performPipelineMasking(\$link, \$result);
      } else {
        # the link contains an anchor: no masking, take the entire link
        $result = $link;
      }
    }
  }

  # If the link contains an anchor, make it point to the page as a whole
  # (everything before the last '#').
  my $lastHash = rindex($link, '#');
  if ($lastHash >= 0) {
    my $anchor = substr($link, $lastHash + 1);
    $link = substr($link, 0, $lastHash);

    # An empty page part means the anchor is on the current page. Use the
    # anchor (without the leading '#') as the text - but only if no
    # alternative text was provided for this link.
    if (length($link) == 0 && ! $alternativeTextAvailable) {
      $result = $anchor;
    }
  }

  # Now collect the link, or links if the original link is in the date format
  # and specifies both day and year. In the latter case, date normalization
  # may also modify the link text ($result), and may collect more than one
  # link (one for the day, another one for the year).
  # Alternative text (specified after pipeline) blocks normalization of dates.
  # As a quick check, a link not starting with a digit cannot contain a date.
  my $dateRecognized = 0;
  if ( ($link =~ /^\d/) && (! $alternativeTextAvailable) ) {
    $dateRecognized = &normalizeDates(\$link, \$result, $refToInternalLinksArray, $refToAnchorTextHash);
  }

  # If a date was recognized, then no further processing is necessary
  return $result if $dateRecognized;

  &normalizeTitle(\$link);
  my $targetId = &resolveAndCollectInternalLink(\$link, $refToInternalLinksArray);

  # Wikipedia pages contain many links to other Wiki projects (especially
  # Wikipedia in other languages). These are not resolved to valid pages, and
  # we want to ignore their text. Discarding the text of every unresolved link
  # would be overly aggressive, as authors frequently link phrases to articles
  # that don't exist yet. Therefore the text is discarded only when all of the
  # following hold:
  # 1) the link was not resolved to a valid id
  # 2) the link has no alternative text (otherwise the text is probably
  #    important enough to be retained)
  # 3) the link contains a colon - a very simple heuristic for links to other
  #    Wiki projects, languages or namespaces. It is not fool-proof (regular
  #    pages may contain a colon in their title), but a reasonable tradeoff.
  my $discardText = !defined($targetId) && ! $alternativeTextAvailable && $link =~ /:/;

  if ($discardText) {
    $result = "";
    print LOGF "Discarding text for link '$originalLink'\n";
  } else {
    # finally, add the text originally attached to the left and/or to the
    # right of the link
    $result = $prefix . $result . $suffix;
  }

  if ( defined($targetId) ) {
    # If the current page has several links to another page, only the anchor
    # of the last one is kept (overriding the previous ones).
    $$refToAnchorTextHash{$targetId} = $result;
  }

  return $result;
}
# Implements the "pipe trick": derives the displayed text of a link whose
# pipeline symbol is not followed by any text.
#
# Parameters:
#   $refToLink   - reference to the link target (only read)
#   $refToResult - reference to the scalar receiving the displayed text
#
# The displayed text is the link target without a leading known namespace and
# without a trailing parenthesized part.
sub performPipelineMasking(\$\$) {
  my ($refToLink, $refToResult) = @_;

  # By default, take the entire link text (for now)
  $$refToResult = $$refToLink;

  # If the part before the first colon is a known namespace, drop it
  if ($$refToLink =~ /^([^:]*):(.*)$/) {
    my ($namespaceCandidate, $rest) = ($1, $2);

    &normalizeNamespace(\$namespaceCandidate);
    $$refToResult = $rest if &isKnownNamespace(\$namespaceCandidate);
  }

  # Now discard parentheses at the end of the text, if any. We operate on
  # $$refToResult, because the leading namespace might have been stripped.
  $$refToResult = $1
    if $$refToResult =~ /^
                          (.*)         # the text up to the last pair of parentheses
                          \(           # opening parenthesis
                            (?:[^()]*) #   the text in the parentheses
                          \)           # closing parenthesis
                          (?:\s*)      # optional trailing whitespace, just in case
                        $
                        /x;
}
# Resolves a link to a page id and, on success, appends the id to the array
# of internal links.
#
# Parameters:
#   $refToLink               - reference to the (normalized) link title
#   $refToInternalLinksArray - reference to the array receiving target ids
# Returns:
#   the target page id, or undef if the link could not be resolved.
sub resolveAndCollectInternalLink(\$\@) {
  my ($refToLink, $refToInternalLinksArray) = @_;

  my $targetId = &resolveLink($refToLink);

  if ( !defined($targetId) ) {
    # Some cases that obviously won't be resolved to legal ids:
    # - Links to namespaces that we don't currently handle
    #   (other than those for which 'isNamespaceOK' returns true);
    #   media and sound files fall in this category
    # - Links to other languages, e.g., [[de:...]]
    # - Links to other Wiki projects, e.g., [[Wiktionary:...]]
    print LOGF "Warning: unknown link '$$refToLink'\n";
  } else {
    push(@$refToInternalLinksArray, $targetId);
  }

  return $targetId;
}
# Dates can appear in several formats
# 1) [[July 20]], [[1969]]
# 2) [[20 July]] [[1969]]
# 3) [[1969]]-[[07-20]]
# 4) [[1969-07-20]]
# The first one is handled correctly without any special treatment,
# so we don't even check for it here.
# In (2) and (3), we only normalize the day, because it will be parsed
# separately from the year.
# This function is only invoked if the link has no alternative text available,
# therefore, we're free to override the result text.
#
# Parameters:
#   $refToLink               - reference to the link title; rewritten to
#                              "<Month> <day>" when a date is recognized
#   $refToResultText         - reference to the text replacing the link;
#                              rewritten when a date is recognized
#   $refToInternalLinksArray - reference to the array receiving target ids
#   $refToAnchorTextHash     - reference to a hash (target id => anchor text)
# Returns:
#   1 if the link was recognized as a date (and fully handled here),
#   0 otherwise (the arguments are then left untouched).
#
# The day is converted with int() in every branch, exactly like the month
# number, so that zero-padded input ("05 July", "07-05", "1969-07-05") yields
# "July 5" rather than "July 05" in both the link title and the link text.
sub normalizeDates(\$\$\@\%) {
  my ($refToLink, $refToResultText, $refToInternalLinksArray, $refToAnchorTextHash) = @_;

  my $dateRecognized = 0;

  if ($$refToLink =~ /^(\d\d)\s*([A-Za-z]+)$/) {
    # Format (2): day followed by month name
    my $day = int($1);
    my $month = ucfirst(lc($2));

    if ( defined($monthToNumDays{$month}) &&
         1 <= $day && $day <= $monthToNumDays{$month} ) {
      $dateRecognized = 1;

      $$refToLink = "$month $day";
      $$refToResultText = "$month $day";

      my $targetId = &resolveAndCollectInternalLink($refToLink, $refToInternalLinksArray);
      if ( defined($targetId) ) {
        $$refToAnchorTextHash{$targetId} = $$refToResultText;
      }
    } else {
      # this doesn't look like a valid date, leave as-is
    }
  } elsif ($$refToLink =~ /^(\d\d)\-(\d\d)$/) {
    # Format (3): numeric month and day, the year is a separate link
    my $monthNum = int($1);
    my $day = int($2);

    if ( defined($numberToMonth{$monthNum}) ) {
      my $month = $numberToMonth{$monthNum};
      if (1 <= $day && $day <= $monthToNumDays{$month}) {
        $dateRecognized = 1;

        $$refToLink = "$month $day";
        # we add a leading space, to separate the preceding year
        # ("[[1969]]-" in the example) from the day that we're creating
        $$refToResultText = " $month $day";

        my $targetId = &resolveAndCollectInternalLink($refToLink, $refToInternalLinksArray);
        if ( defined($targetId) ) {
          $$refToAnchorTextHash{$targetId} = $$refToResultText;
        }
      } else {
        # this doesn't look like a valid date, leave as-is
      }
    } else {
      # this doesn't look like a valid date, leave as-is
    }
  } elsif ($$refToLink =~ /^(\d\d\d\d)\-(\d\d)\-(\d\d)$/) {
    # Format (4): full ISO-like date, yields two links (day and year)
    my $year = $1;
    my $monthNum = int($2);
    my $day = int($3);

    if ( defined($numberToMonth{$monthNum}) ) {
      my $month = $numberToMonth{$monthNum};
      if (1 <= $day && $day <= $monthToNumDays{$month}) {
        $dateRecognized = 1;

        $$refToLink = "$month $day";
        # the link text is combined from the day and the year
        $$refToResultText = "$month $day, $year";

        my $targetId;

        # collect the link for the day
        $targetId = &resolveAndCollectInternalLink($refToLink, $refToInternalLinksArray);
        if ( defined($targetId) ) {
          $$refToAnchorTextHash{$targetId} = $$refToLink;
        }

        # collect the link for the year
        $targetId = &resolveAndCollectInternalLink(\$year, $refToInternalLinksArray);
        if ( defined($targetId) ) {
          $$refToAnchorTextHash{$targetId} = $year;
        }
      } else {
        # this doesn't look like a valid date, leave as-is
      }
    } else {
      # this doesn't look like a valid date, leave as-is
    }
  }

  return $dateRecognized;
}
# Collects the external URLs found in the text and removes them from the text.
#
# Parameters:
#   $refToText      - reference to the text; modified in place
#   $refToUrlsArray - reference to the array receiving the URLs
#
# The patterns rely on the file-level $urlProtocols and $urlTerminator.
sub extractUrls(\$\@) {
  my ($refToText, $refToUrlsArray) = @_;

  # URLs enclosed in single brackets, with or without a description, and with
  # optional leading and/or trailing whitespace. Examples:
  #   [http://www.cnn.com], [ http://www.cnn.com  ], [http://www.cnn.com  CNN Web site]
  my $bracketedUrl = qr/\[(?:\s*)($urlProtocols(?:[^\[\]]*))\]/;

  # Standalone URLs (those not enclosed in brackets). According to the
  # original author's note, $urlTerminator is matched via positive lookahead,
  # so that only the URL is removed and not the terminator symbol itself.
  my $standaloneUrl = qr/($urlProtocols(?:.*?))$urlTerminator/;

  # Bracketed URLs must be handled first
  $$refToText =~ s/$bracketedUrl/&collectUrlFromBrackets($1, $refToUrlsArray)/eg;
  $$refToText =~ s/$standaloneUrl/&collectStandaloneUrl($1, $refToUrlsArray)/eg;

  # There is no "self" element to remove for URLs, hence undef
  &removeDuplicatesAndSelf($refToUrlsArray, undef);
}
# Handles the contents of a bracketed external link: records the URL and
# returns the text that should replace the whole bracketed construct.
#
# Parameters:
#   $url            - the text found between the brackets, i.e. the URL
#                     optionally followed by a description
#                     (assumption: leading whitespace was already stripped)
#   $refToUrlsArray - reference to the array receiving the URL
# Returns:
#   the description (starting at the URL terminator) if one is available,
#   otherwise a single space.
sub collectUrlFromBrackets($\@) {
  my ($url, $refToUrlsArray) = @_;

  # Without a description the whole text is the URL
  my ($address, $description) = ($url, " ");

  if ( $url =~ /^($urlProtocols(?:.*?))($urlTerminator(?:.*))$/ ) {
    # description available
    ($address, $description) = ($1, $2);
  }

  push(@$refToUrlsArray, $address);

  return $description;
}
# Records a standalone URL (one that is not enclosed in brackets).
#
# Parameters:
#   $url            - the URL, collected as-is
#   $refToUrlsArray - reference to the array receiving the URL
# Returns:
#   a single space, used by the caller as the replacement for the URL.
sub collectStandaloneUrl($\@) {
  my ($url, $refToUrlsArray) = @_;

  push @{$refToUrlsArray}, $url;

  return " ";
}
# Cleans up wiki text after templates, links and URLs have been handled.
# The order of the steps below matters.
#
# Parameters:
#   $refToText               - reference to the text; modified in place
#   $whetherToEncodeXmlChars - if true, 'encodeXmlChars' is applied to the
#                              text before section header tags are inserted
sub postprocessText(\$$) {
  my ($refToText, $whetherToEncodeXmlChars) = @_;

  # Eliminate all <includeonly> and <onlyinclude> fragments, because this
  # text will not be included anywhere, as all inclusion directives were
  # already handled in function 'includeTemplates'.
  # Such a block can easily span several lines, hence the "/s" modifier.
  foreach my $tag ('includeonly', 'onlyinclude') {
    $$refToText =~ s/<$tag>(.*?)<\/$tag>/ /sg;
  }

  # <noinclude> fragments remain. Their tags are not treated specially here,
  # since they are thrown away below together with all remaining tags.

  # replace <br> and <br /> directives with new paragraph
  $$refToText =~ s/<br(?:\s*)(?:[\/]?)>/\n\n/g;

  # Remove tables, as they often carry a lot of noise
  &eliminateTables($refToText);

  # Since the number of levels of template recursion is limited, several
  # un-instantiated templates may be left; they are simply eliminated now.
  # Because templates may be nested, they are eliminated iteratively starting
  # from the most nested one: the body of a matched template must not contain
  # the template opening sequence (two successive opening braces), which is
  # enforced with a negative lookahead.
  my $innermostTemplate = qr/
      \{\{
      (?:
        (?:
          (?! \{\{ )   # no nested opening sequence starts here
          .
        )*?
      )
      \}\}
  /sx;
  1 while ($$refToText =~ s/$innermostTemplate/ /g);

  # Remove any other <...> tags - but keep the text they enclose (the tags
  # are replaced with spaces to prevent adjacent pieces of text from being
  # glued together). Comments (<!-- ... -->) also fall into this category,
  # and since they can easily span several lines, we use the "/s" modifier.
  $$refToText =~ s/<(?:.*?)>/ /sg;

  # Strip the bold/italics emphasis markers, keeping the emphasized text.
  # The three kinds of emphasis are not distinguished; the longest marker
  # must be handled first.
  # IMPORTANT: If 'encodeXmlChars' has been called before this point, then
  # remember that the apostrophes were already quoted to "&apos;"
  foreach my $marker ("'''''", "'''", "''") {
    $$refToText =~ s/$marker(.*?)$marker/$1/g;
  }

  # Eliminate long sequences of newlines and whitespace.
  # Sequences of spaces only are not replaced, as this might make the text
  # less readable; only whitespace runs containing at least two newlines are
  # collapsed.
  $$refToText =~ s/(?:\s*)\n(?:\s*)\n(?:\s*)/\n\n/g;

  # Eliminate XML entities such as "&nbsp;" , "&times;" etc. - otherwise,
  # in C++ code they will give rise to spurious words "nbsp", "times" etc.
  # The standard entities - &amp; , &quot; , &apos; , &lt; and &gt; - are
  # handled by the XML parser. All other entities are passed by the XML
  # parser to the upper level. In the raw XML text they look like
  # "&amp;nbsp;"; the XML parser replaces "&amp;" with "&", so here in the
  # code we see them as "&nbsp;".
  $$refToText =~ s{
                    &                  # the entity starts with an ampersand
                    ((?:\#?)(?:\w+))   # optional hash sign, then letters or digits
                    ;                  # the entity ends with a semicolon
                  }{&logReplacedXmlEntity($1)}egx;

  if ($whetherToEncodeXmlChars) {
    # encode text for XML
    &encodeXmlChars($refToText);
  }

  # NOTE that the following operations introduce XML tags, so they must appear
  # after the original text underwent character encoding with 'encodeXmlChars' !!

  # Change markup for section headers.
  # Section headers may only begin at the very first position in the line
  # (not even after a space), hence the "^" anchor; since the text may contain
  # multiple lines, the "/m" modifier lets "^" match after embedded newlines.
  # Deeper levels (more "=" signs) must be handled first.
  $$refToText =~ s/^=====(.*?)=====/<h4>$1<\/h4>/mg;
  $$refToText =~ s/^====(.*?)====/<h3>$1<\/h3>/mg;
  $$refToText =~ s/^===(.*?)===/<h2>$1<\/h2>/mg;
  $$refToText =~ s/^==(.*?)==/<h1>$1<\/h1>/mg;
}
# Logs an XML entity that is being removed from the text.
#
# Parameters:
#   $xmlEntity - the entity name without the leading "&" and trailing ";"
# Returns:
#   a single space, used by the caller as the replacement for the entity.
sub logReplacedXmlEntity($) {
  my $xmlEntity = shift;

  print LOGF "ENTITY: &$xmlEntity;\n";

  return " ";
}
BEGIN {
  # The patterns below are kept in lexical variables private to this block,
  # which makes them static for 'eliminateTables' and avoids recompiling the
  # regular expressions every time the function is called.

  # HTML-style table opening: either just "<table>", or "<table" followed by
  # at least one whitespace character (to prevent matching "<tablexxx") and
  # some optional text, e.g., table parameters as in "<table border=0>".
  # Prohibiting '<' and '>' inside the tag ensures that we do not consume more
  # than necessary, so that in
  #   "<table border=0> aaa <table> bbb </table> ccc </table>"
  # the opening tag is not extended to "<table border=0> aaa <table>".
  my $htmlTableOpen = qr{
                          <table>
                          |
                          <table(?:\s+)(?:[^<>]*)>
                        }ix;
  my $htmlTableClose = qr/<\/table>/i;

  # Wiki-style tables
  my $wikiTableOpen  = qr/\{\|/;
  my $wikiTableClose = qr/\|\}/;

  # Removes tables of both kinds from the text, replacing each with a newline.
  #
  # Parameters:
  #   $refToText - reference to the text; modified in place
  #
  # Only non-nested tables are handled: each match runs from an opening
  # sequence to the nearest closing sequence. An earlier approach, which used
  # lookahead-based patterns in a loop to eliminate nested tables from the
  # inside out, became too complex and caused segmentation faults, hence this
  # fallback.
  sub eliminateTables(\$) {
    my ($refToText) = @_;

    # Table definitions can easily span several lines, hence the "/s" modifier
    $$refToText =~ s/$htmlTableOpen(.*?)$htmlTableClose/\n/sg;
    $$refToText =~ s/$wikiTableOpen(.*?)$wikiTableClose/\n/sg;
  }

} # end of BEGIN block
# Removes duplicate elements from an array in place, keeping the first
# occurrence of each element and preserving the original order.
#
# If specified, 'elementToRemove' contains an element that needs to be removed
# as well. For links, this ensures that a page does not link to itself. For
# categories, this ensures that a page is not categorized to itself. This
# parameter is obviously irrelevant for filtering URLs (pass undef).
# 'elementToRemove' must be a numeric value (not string), since it is tested
# with '==' (not 'eq').
#
# Parameters:
#   $refToArray      - reference to the array to filter
#   $elementToRemove - element to drop entirely, or undef
sub removeDuplicatesAndSelf(\@$) {
  my ($refToArray, $elementToRemove) = @_;

  my $selfGiven = defined($elementToRemove);

  my %alreadyKept;
  my @kept;

  for my $candidate (@$refToArray) {
    if ( $selfGiven && ($candidate == $elementToRemove) ) {
      printf LOGF "Warning: current page links or categorizes to itself - " .
                  "link discarded ($elementToRemove)\n";
      next;
    }

    next if $alreadyKept{$candidate}++;
    push(@kept, $candidate);
  }

  # overwrite the original array with the new one that does not contain duplicates
  @$refToArray = @kept;
}
# Removes elements of the second list from the first list (in place).
# For efficiency purposes, the second list is converted into a hash, so that
# each membership test is a single lookup.
#
# Parameters:
#   $refToArray            - reference to the array to filter; the order of
#                            the remaining elements is preserved
#   $refToElementsToRemove - reference to the array of elements to remove
sub removeElements(\@\@) {
  my ($refToArray, $refToElementsToRemove) = @_;

  # Construct the hash table for fast lookups
  my %unwanted = map { $_ => 1 } @$refToElementsToRemove;

  # overwrite the original array with the filtered one
  @$refToArray = grep { ! defined($unwanted{$_}) } @$refToArray;
}
# Returns the current local time formatted as "HH:MM:SS".
# Relies on the object returned by Time::localtime's 'localtime'.
sub getTimeAsString() {
  my $now = localtime();

  return sprintf("%02d:%02d:%02d", $now->hour, $now->min, $now->sec);
}
# Strips leading and trailing whitespace from a string, in place.
#
# Parameters:
#   $stringRef - reference to the string to trim
sub trimWhitespaceBothSides(\$) {
  my $stringRef = shift;

  $$stringRef =~ s/^\s*//;   # leading whitespace
  $$stringRef =~ s/\s*$//;   # trailing whitespace
}
# Collects into the array referenced by the second argument the links that
# the article text (first argument) marks as related; the third argument is
# the id of the page being processed.
#
# There are 3 kinds of related links that we look for:
# 1) Standalone (usually, at the beginning of the article or a section of it)
#    Ex: Main articles: ...
# 2) Inlined - text in parentheses inside the body of the article
#    Ex: medicine (see also: [[Health]])
# 3) Dedicated section
#    Ex: == See also ==
#
# In all calls to 'extractInternalLinks':
# - The penultimate argument is 0, since we don't need to log anchor text here.
#   Anchor text will be handled when we analyze all the internal links in
#   the entire article (and not just look for related links).
# - The last argument is 0 in order not to remove duplicates on every invocation
#   of 'extractInternalLinks'. This is because duplicates in related links are
#   not very common, but performing duplicate removal each time is expensive.
#   Instead, we remove duplicates once at the very end.
sub identifyRelatedArticles(\$\@$) {
  my ($refToText, $refToRelatedArticles, $id) = @_;

  # Splitting the text into lines also gives us a private copy of it. That
  # matters because 'extractInternalLinks' modifies its argument, and the real
  # article body must stay intact.
  my @lines = split("\n", $$refToText);

  # 1) Standalone
  # A standalone designator only counts when it occurs at the beginning of the
  # line (after at most a few characters, such as a whitespace or a colon).
  # Accepting it anywhere in the line would make us collect as related those
  # links that merely share a line with an unrelated string that happens to
  # look like a standalone designator.
  for my $current (@lines) {
    next unless $current =~ /^(?:.{0,5})(${relatedWording_Standalone}.*)$/;

    my $str = $1; # links are extracted from the rest of the line
    print LOGF "Related(S): $id => $str\n";
    &extractInternalLinks(\$str, $refToRelatedArticles, $id, 0, 0);
    print LOGF "Related(S): $id ==> @$refToRelatedArticles\n";
  }

  # 2) Inlined (in parentheses) - a line may hold several such groups
  for my $current (@lines) {
    while ($current =~ /\((?:\s*)(${relatedWording_Inline}.*?)\)/g) {
      my $str = $1;
      print LOGF "Related(I): $id => $str\n";
      &extractInternalLinks(\$str, $refToRelatedArticles, $id, 0, 0);
      print LOGF "Related(I): $id ==> @$refToRelatedArticles\n";
    }
  }

  # 3) Section
  # Sections can be at any level - "==", "===", "====" - it doesn't matter,
  # so it suffices to look for two consecutive "=" signs
  my $insideRelatedSection = 0;
  for my $current (@lines) {
    if (! $insideRelatedSection) {
      # Still searching: only a section header can change our state
      if ($current =~ /==(.*?)==/) {
        my $sectionHeader = $1;
        $insideRelatedSection = 1 if ($sectionHeader =~ /$relatedWording_Section/);
      }
      next; # either way, there is nothing to collect from this line
    }

    # We're in the related section now; the next header ends it
    last if ($current =~ /==(?:.*?)==/);

    print LOGF "Related(N): $id => $current\n";
    # 'extractInternalLinks' may modify its argument (an element of '@lines'),
    # but it's OK as we do no further processing of that line or of '@lines'
    &extractInternalLinks(\$current, $refToRelatedArticles, $id, 0, 0);
    print LOGF "Related(N): $id ==> @$refToRelatedArticles\n";
  }

  &removeDuplicatesAndSelf($refToRelatedArticles, $id);
}
# Writes one record to the RELATEDF filehandle: the page id, a tab, the
# related articles separated by single spaces, and a terminating newline.
# A page with an empty list of related articles produces no output at all.
sub recordRelatedArticles($\@) {
  my ($id, $refToRelatedArticles) = @_;

  # nothing to record for an empty list
  return unless @$refToRelatedArticles;

  my $relatedList = join(" ", @$refToRelatedArticles);
  print RELATEDF "$id\t", $relatedList, "\n";
}

########################################################################
# Prints the version/copyright banner, the warranty and licensing notice,
# and a short usage summary to the currently selected output handle.
# The program name shown in the text is taken from $0.
sub printUsage() {
  my @usageLines = (
    "Wikiprep version $version, Copyright (C) 2007 Evgeniy Gabrilovich\n",
    "Wikiprep comes with ABSOLUTELY NO WARRANTY; for details type '$0 -license'.\n",
    "This is free software, and you are welcome to redistribute it\n",
    "under certain conditions; type '$0 -license' for details.\n",
    "Type '$0 -version' for version information.\n\n",
    "Usage: $0 -f <XML file with page dump>\n",
    "       e.g., $0 -f pages_articles.xml\n\n",
  );

  # Emit the message as one string, exactly as if the pieces were concatenated
  print join("", @usageLines);
}