############################################################################### # # wikiprep.pl - Preprocess Wikipedia XML dumps # Copyright (C) 2007 Evgeniy Gabrilovich # The author can be contacted by electronic mail at gabr@cs.technion.ac.il # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA, # or see and # # ############################################################################### use strict; use warnings; use File::Basename; use Getopt::Long; use Time::localtime; use XML::Parser; my $licenseFile = "COPYING"; my $version = "2.02"; if (@ARGV < 1) { &printUsage(); exit 0; } my $file; my $showLicense = 0; my $showVersion = 0; GetOptions('f=s' => \$file, 'license' => \$showLicense, 'version' => \$showVersion); if ($showLicense) { if (-e $licenseFile) { print "See file $licenseFile for more details.\n" } else { print "Please see and \n"; } exit 0; } if ($showVersion) { print "Wikiprep version $version\n"; exit 0; } if (!defined($file)) { &printUsage(); exit 0; } if (! -e $file) { die "Input file '$file' cannot be opened for reading\n"; } ##### Global definitions ##### my %XmlEntities = ('&' => 'amp', '"' => 'quot', "'" => 'apos', '<' => 'lt', '>' => 'gt'); # The URL protocol (e.g., http) matched here may be in either case, hence we use the /i modifier. my $urlProtocols = qr/http:\/\/|https:\/\/|telnet:\/\/|gopher:\/\/|file:\/\/|wais:\/\/|ftp:\/\/|mailto:|news:/i; # A URL terminator may be either one of a list of characters OR end of string (that is, '$'). # This last part is necessary to handle URLs at the very end of a string when there is no "\n" # or any other subsequent character. my $urlTerminator = qr/[\[\]\{\}\s\n\|\"<>]|$/; my $relatedWording_Standalone = qr/Main(?:\s+)article(?:s?)|Further(?:\s+)information|Related(?:\s+)article(?:s?)|Related(?:\s+)topic(?:s?)|See(?:\s+)main(?:\s+)article(?:s?)|See(?:\s+)article(?:s?)|See(?:\s+)also|For(?:\s+)(?:more|further)/i; ## For(?:\s+)more(?:\s+)(?:background|details)(?:\s+)on(?:\s+)this(?:\s+)topic,(?:\s+)see my $relatedWording_Inline = qr/See[\s:]|See(?:\s+)also|For(?:\s+)(?:more|further)/i; my $relatedWording_Section = qr/Further(?:\s+)information|See(?:\s+)also|Related(?:\s+)article(?:s?)|Related(?:\s+)topic(?:s?)/i; my %monthToNumDays = ('January' => 31, 'February' => 29, 'March' => 31, 'April' => 30, 'May' => 31, 'June' => 30, 'July' => 31, 'August' => 31, 'September' => 30, 'October' => 31, 'November' => 30, 'December' => 31); my %numberToMonth = (1 => 'January', 2 => 'February', 3 => 'March', 4 => 'April', 5 => 'May', 6 => 'June', 7 => 'July', 8 => 'August', 9 => 'September', 10 => 'October', 11 => 'November', 12 => 'December'); my $maxTemplateRecursionLevels = 5; my $maxParameterRecursionLevels = 5; ##### Global variables ##### my %namespaces; # we only process pages in these namespaces + the main namespace (which has an empty name) my %okNamespacesForPrescanning = ('Template' => 1, 'Category' => 1); my %okNamespacesForTransforming = ('Category' => 1); # we don't use templates as concepts my %id2title; my %title2id; my %redir; my %templates; # template bodies for insertion my %catHierarchy; # each category is associated with a list of its immediate descendants my %statCategories; # number of pages classified under each category my %statIncomingLinks; # number of links incoming to each page my ($fileBasename, $filePath, $fileSuffix) = fileparse($file, ".xml"); my $outputFile = "$filePath/$fileBasename.hgw$fileSuffix"; my $logFile = "$filePath/$fileBasename.log"; my $anchorTextFile = "$filePath/$fileBasename.anchor_text"; my $relatedLinksFile = "$filePath/$fileBasename.related_links"; open(OUTF, "> $outputFile") or die "Cannot open $outputFile"; open(LOGF, "> $logFile") or die "Cannot open $logFile"; open(ANCHORF, "> $anchorTextFile") or die "Cannot open $anchorTextFile"; open(RELATEDF, "> $relatedLinksFile") or die "Cannot open $relatedLinksFile"; binmode(STDOUT, ':utf8'); binmode(STDERR, ':utf8'); binmode(OUTF, ':utf8'); binmode(LOGF, ':utf8'); binmode(ANCHORF, ':utf8'); print ANCHORF "# Line format: \n\n\n"; print RELATEDF "# Line format: \n\n\n"; ©XmlFileHeader(); &loadNamespaces(); &prescan(); my $numTitles = scalar( keys(%id2title) ); print "Loaded $numTitles titles\n"; my $numRedirects = scalar( keys(%redir) ); print "Loaded $numRedirects redirects\n"; my $numTemplates = scalar( keys(%templates) ); print "Loaded $numTemplates templates\n"; &transform(); &closeXmlFile(); &writeStatistics(); &writeCategoryHierarchy(); close(LOGF); close(ANCHORF); close(RELATEDF); # Hogwarts needs the anchor text file to be sorted in the increading order of target page id. # The file is originally sorted by source page id (second field in each line). # We now use stable (-s) numeric (-n) sort on the first field (-k 1,1). # This way, the resultant file will be sorted on the target page id (first field) as primary key, # and on the source page id (second field) as secondary key. system("sort -s -n -k 1,1 $anchorTextFile > $anchorTextFile.sorted"); ##### Subroutines ##### sub normalizeTitle(\$) { my ($refToStr) = @_; # remove leading whitespace and underscores $$refToStr =~ s/^[\s_]+//; # remove trailing whitespace and underscores $$refToStr =~ s/[\s_]+$//; # replace sequences of whitespace and underscore chars with a single space $$refToStr =~ s/[\s_]+/ /g; if ($$refToStr =~ /^([^:]*):(\s*)(\S(?:.*))/) { my $prefix = $1; my $optionalWhitespace = $2; my $rest = $3; my $namespaceCandidate = $prefix; &normalizeNamespace(\$namespaceCandidate); # this must be done before the call to 'isKnownNamespace' if ( &isKnownNamespace(\$namespaceCandidate) ) { # If the prefix designates a known namespace, then it might follow by optional # whitespace that should be removed to get the canonical page name # (e.g., "Category: Births" should become "Category:Births"). $$refToStr = $namespaceCandidate . ":" . ucfirst($rest); } else { # No namespace, just capitalize first letter. # If the part before the colon is not a known namespace, then we must not remove the space # after the colon (if any), e.g., "3001: The_Final_Odyssey" != "3001:The_Final_Odyssey". # However, to get the canonical page name we must contract multiple spaces into one, # because "3001: The_Final_Odyssey" != "3001: The_Final_Odyssey". $$refToStr = ucfirst($prefix) . ":" . (length($optionalWhitespace) > 0 ? " " : "") . $rest; } } else { # no namespace, just capitalize first letter $$refToStr = ucfirst($$refToStr); } } sub normalizeNamespace(\$) { my ($refToStr) = @_; $$refToStr = ucfirst( lc($$refToStr) ); } # Checks if the prefix of the page name before the colon is actually one of the # 16+2+2 namespaces defined in the XML file. # Assumption: the argument was already normalized using 'normalizeNamespace' sub isKnownNamespace(\$) { my ($refToStr) = @_; defined( $namespaces{$$refToStr} ); # return value } # The correct form to create a redirect is #REDIRECT [[ link ]], # and function 'Parse::MediaWikiDump::page->redirect' only supports this form. # However, it seems that Wikipedia can also tolerate a variety of other forms, such as # REDIRECT|REDIRECTS|REDIRECTED|REDIRECTION, then an optional ":", optional "to" or optional "=". # Therefore, we use our own function to handle these cases as well. # If the page is a redirect, the function returns the title of the target page; # otherwise, it returns 'undef'. sub isRedirect($) { my ($page) = @_; # quick check return undef if ( ${$page->text} !~ /^#REDIRECT/i ); if ( ${$page->text} =~ m{^\#REDIRECT # Redirect must start with "#REDIRECT" # (the backslash is needed before "#" here, because # "#" has special meaning with /x modifier) (?:S|ED|ION)? # The word may be in any of these forms, # i.e., REDIRECT|REDIRECTS|REDIRECTED|REDIRECTION (?:\s*) # optional whitespace (?: :|\sTO|=)? # optional colon, "TO" or "=" # (in case of "TO", we expect a whitespace before it, # so that it's not glued to the preceding word) (?:\s*) # optional whitespace \[\[([^\]]*)\]\] # the link itself }ix ) { # matching is case-insensitive, hence /i my $target = $1; if ($target =~ /^(.*)\#(?:.*)$/) { # The link contains an anchor. Anchors are not allowed in REDIRECT pages, and therefore # we adjust the link to point to the page as a whole (that's how Wikipedia works). $target = $1; } return $target; } # OK, it's probably either a malformed redirect link, or something else return undef; } sub isNamespaceOkForPrescanning($) { my ($page) = @_; &isNamespaceOk($page, \%okNamespacesForPrescanning); } sub isNamespaceOkForTransforming($) { my ($page) = @_; &isNamespaceOk($page, \%okNamespacesForTransforming); } sub isNamespaceOk($\%) { my ($page, $refToNamespaceHash) = @_; my $result = 1; # main namespace is OK, so we only check pages that belong to other namespaces if ($page->namespace ne '') { my $namespace = $page->namespace; &normalizeNamespace(\$namespace); if ( &isKnownNamespace(\$namespace) ) { $result = defined( $$refToNamespaceHash{$namespace} ); } else { # the prefix before ":" in the page title is not a known namespace, # therefore, the page belongs to the main namespace and is OK } } $result; # return value } sub encodeXmlChars(\$) { my ($refToStr) = @_; $$refToStr =~ s/([&"'<>])/&$XmlEntities{$1};/g; } sub copyXmlFileHeader() { open(INF, "< $file") or die "Cannot open $file"; while () { # copy lines up to "" if (/^ without any attributes. print OUTF "\n"; } else { # All other lines (up to ) are copied as-is print OUTF; } last if (/<\/siteinfo>/); } close(INF); # this file will later be reopened by "Parse::MediaWikiDump" } sub closeXmlFile() { print OUTF "\n"; close(OUTF); } sub writeStatistics() { my $statCategoriesFile = "$filePath/$fileBasename.stat.categories"; my $statIncomingLinksFile = "$filePath/$fileBasename.stat.inlinks"; open(STAT_CATS, "> $statCategoriesFile") or die "Cannot open $statCategoriesFile"; print STAT_CATS "# Line format: \n", "# Here we count the *pages* that belong to this category, i.e., articles AND\n", "# sub-categories of this category (but not the articles in the sub-categories).\n", "\n\n"; my $cat; foreach $cat ( sort { $statCategories{$b} <=> $statCategories{$a} } keys(%statCategories) ) { print STAT_CATS "$cat\t$statCategories{$cat}\n"; } close(STAT_CATS); open(STAT_INLINKS, "> $statIncomingLinksFile") or die "Cannot open $statIncomingLinksFile"; print STAT_INLINKS "# Line format: \n\n\n"; my $destination; foreach $destination ( sort { $statIncomingLinks{$b} <=> $statIncomingLinks{$a} } keys(%statIncomingLinks) ) { print STAT_INLINKS "$destination\t$statIncomingLinks{$destination}\n"; } close(STAT_INLINKS); } sub writeCategoryHierarchy() { my $catHierarchyFile = "$filePath/$fileBasename.cat_hier"; open(CAT_HIER, "> $catHierarchyFile") or die "Cannot open $catHierarchyFile"; print CAT_HIER "# Line format: \n\n\n"; my $cat; foreach $cat ( sort { $catHierarchy{$a} <=> $catHierarchy{$b} } keys(%catHierarchy) ) { print CAT_HIER "$cat\t", join(" ", @{$catHierarchy{$cat}}), "\n"; } close(CAT_HIER); } sub loadNamespaces() { # re-open the input XML file my $pages = Parse::MediaWikiDump::Pages->new($file); # load namespaces my $refNamespaces = $pages->namespaces; # namespace names are case-insensitive, so we force them # to canonical form to facilitate future comparisons my $ns; foreach $ns (@$refNamespaces) { my @namespaceData = @$ns; my $namespaceId = $namespaceData[0]; my $namespaceName = $namespaceData[1]; &normalizeNamespace(\$namespaceName); $namespaces{$namespaceName} = $namespaceId; } } # build id <-> title mappings and redirection table, # as well as load templates sub prescan() { # re-open the input XML file my $pages = Parse::MediaWikiDump::Pages->new($file); my $counter = 0; my $page; while (defined($page = $pages->page)) { my $id = $page->id; $counter++; if ($counter % 1000 == 0) { my $timeStr = &getTimeAsString(); print LOGF "[$timeStr] Prescanning page id=$id\n"; } my $title = $page->title; &normalizeTitle(\$title); if (length($title) == 0) { # This is a defense against pages whose title only contains UTF-8 chars that # are reduced to an empty string. Right now I can think of one such case - # which represents the non-breaking space. In this particular case, # this page is a redirect to [[Non-nreaking space]], but having in the system # a redirect page with an empty title causes numerous problems, so we'll live # happier without it. print LOGF "Skipping page with empty title id=$id\n"; next; } my $redirect = &isRedirect($page); if (defined($redirect)) { &normalizeTitle(\$redirect); next if (length($redirect) == 0); # again, same precaution here - see comments above $redir{$title} = $redirect; # nothing more to do for redirect pages next; } if ( ! &isNamespaceOkForPrescanning($page) ) { next; # we're only interested in certain namespaces } # if we get here, then either the page belongs to the main namespace OR # it belongs to one of the namespaces we're interested in if ( exists($id2title{$id}) ) { print LOGF "Warning: Page id=$id already encountered before!\n"; next; } if ( exists($title2id{$title}) ) { # A page could have been encountered before with a different spelling. # Examples:   = (nonbreakable space), ß = <9F> (German Eszett ligature) print LOGF "Warning: Page title='$title' already encountered before!\n"; next; } $id2title{$id} = $title; $title2id{$title} = $id; if ($title =~ /^Template:/) { my $text = ${$page->text}; # We're storing template text for future inclusion, therefore, # remove all text and keep all text # (but eliminate tags per se). # However, if ... parts are present, # then only keep them and discard the rest of the template body. # This is because using on a text fragment is # equivalent to enclosing it in tags **AND** # enclosing all the rest of the template body in tags. # These definitions can easily span several lines, hence the "/s" modifiers. my $onlyincludeAccumulator; while ($text =~ /(.*?)<\/onlyinclude>/sg) { my $onlyincludeFragment = $1; $onlyincludeAccumulator .= "$onlyincludeFragment\n"; } if ( defined($onlyincludeAccumulator)) { $text = $onlyincludeAccumulator; } else { # If there are no fragments, simply eliminate # fragments and keep ones. $text =~ s/(?:.*?)<\/noinclude>/\n/sg; $text =~ s/(.*?)<\/includeonly>/$1/sg; } $templates{$id} = $text; } } my $timeStr = &getTimeAsString(); print LOGF "[$timeStr] Prescanning complete - prescanned $counter pages\n"; } sub transform() { # re-open the input XML file my $pages = Parse::MediaWikiDump::Pages->new($file); my $page; while (defined($page = $pages->page)) { my $id = $page->id; my $timeStr = &getTimeAsString(); print LOGF "[$timeStr] Transforming page id=$id\n"; if ( defined( &isRedirect($page) ) ) { next; # we've already loaded all redirects in the prescanning phase } if ( ! &isNamespaceOkForTransforming($page) ) { next; # we're only interested in pages from certain namespaces } my $title = $page->title; &normalizeTitle(\$title); # see the comment about empty titles in function 'prescan' if (length($title) == 0) { print LOGF "Skipping page with empty title id=$id\n"; next; } my $text = ${$page->text}; my $orgLength = length($text); # text length BEFORE any transformations # The check for stub must be done BEFORE any further processing, # because stubs indicators are templates, and templates are substituted. my $isStub = 0; if ( $text =~ m/stub}}/i ) { $isStub = 1; } my @categories; my @internalLinks; my @urls; &includeTemplates(\$text); my @relatedArticles; # This function only examines the contents of '$text', but doesn't change it. &identifyRelatedArticles(\$text, \@relatedArticles, $id); # We process categories directly, because '$page->categories' ignores # categories inherited from included templates &extractCategories(\$text, \@categories, $id); # Categories are listed at the end of articles, and therefore may mistakenly # be added to the list of related articles (which often appear in the last # section such as "See also"). To avoid this, we explicitly remove all categories # from the list of related links, and only then record the list of related links # to the file. &removeElements(\@relatedArticles, \@categories); &recordRelatedArticles($id, \@relatedArticles); &extractInternalLinks(\$text, \@internalLinks, $id, 1, 1); &extractUrls(\$text, \@urls); &postprocessText(\$text, 1); my $newLength = length($text); # text length AFTER all transformations &writePage($id, \$title, \$text, $orgLength, $newLength, $isStub, \@categories, \@internalLinks, \@urls); &updateStatistics(\@categories, \@internalLinks); if ($title =~ /^Category:/) { &updateCategoryHierarchy($id, \@categories); } } } sub updateStatistics(\@\@) { my ($refToCategories, $refToInternalLinks) = @_; my $cat; foreach $cat (@$refToCategories) { $statCategories{$cat}++; } my $link; foreach $link (@$refToInternalLinks) { $statIncomingLinks{$link}++; } } sub updateCategoryHierarchy($\@) { # The list of categories passed as a parameter is actually the list of parent categories # for the current category my ($childId, $refToParentCategories) = @_; my $parentCat; foreach $parentCat (@$refToParentCategories) { if ( exists($catHierarchy{$parentCat}) ) { push(@{$catHierarchy{$parentCat}}, $childId); } else { # create a new array with '$childId' as the only child (for now) of '$parentCat' my @arr; push(@arr, $childId); $catHierarchy{$parentCat} = [ @arr ]; } } } sub writePage($\$\$$$$\@\@\@) { my ($id, $refToTitle, $refToText, $orgLength, $newLength, $isStub, $refToCategories, $refToInternalLinks, $refToUrls) = @_; my $numCategories = scalar(@$refToCategories); my $numLinks = scalar(@$refToInternalLinks); my $numUrls = scalar(@$refToUrls); print OUTF "\n"; my $encodedTitle = $$refToTitle; &encodeXmlChars(\$encodedTitle); print OUTF "$encodedTitle\n"; print OUTF ""; print OUTF join(" ", @$refToCategories); print OUTF "\n"; print OUTF ""; print OUTF join(" ", @$refToInternalLinks); print OUTF "\n"; print OUTF "\n"; my $url; foreach $url (@$refToUrls) { &encodeXmlChars(\$url); print OUTF "$url\n"; } print OUTF "\n"; # text has already undergone 'encodeXmlChars' in function 'postprocessText' print OUTF "\n$$refToText\n\n"; print OUTF "\n"; } # Maps a title into the id, and performs redirection if necessary. # Assumption: the argument was already normalized using 'normalizeTitle' sub resolveLink(\$) { my ($refToTitle) = @_; # safety precaution return undef if (length($$refToTitle) == 0); my $targetId; # result my $targetTitle = $$refToTitle; if ( exists($redir{$$refToTitle}) ) { # this link is a redirect $targetTitle = $redir{$$refToTitle}; # check if this is a double redirect if ( exists($redir{$targetTitle}) ) { $targetTitle = undef; # double redirects are not allowed and are ignored print LOGF "Warning: link '$$refToTitle' caused double redirection and was ignored\n"; } else { print LOGF "Link '$$refToTitle' was redirected to '$targetTitle'\n"; } } if ( defined($targetTitle) ) { if ( exists($title2id{$targetTitle}) ) { $targetId = $title2id{$targetTitle}; } else { # target not found print LOGF "Warning: link '$$refToTitle' cannot be matched to an id\n"; $targetId = undef; } } else { $targetId = undef; } $targetId; # return value } sub includeTemplates(\$) { my ($refToText) = @_; # Using the while loop forces templates to be included recursively # (i.e., includes the body of templates that themselves were included # on the previous iteration ). # Template definitions can easily span several lines, hence the "/s" modifier. # Templates are frequently nested. Occasionally, parsing mistakes may cause template insertion # to enter an infinite loop, for instance when trying to instantiate Template:Country # {{country_{{{1}}}|{{{2}}}|{{{2}}}|size={{{size|}}}|name={{{name|}}}}} # which is repeatedly trying to insert template "country_", which is again resolved to # Template:Country. The straightforward solution of keeping track of templates that were # already inserted for the current article would not work, because the same template # may legally be used more than once, with different parameters in different parts of # the article. Therefore, we simply limit the number of iterations of nested template # inclusion. my $templateRecursionLevels = 0; # We also require that the body of a template does not contain the template opening sequence # (two successive opening braces - "\{\{"). We use negative lookahead to achieve this. while ( ($templateRecursionLevels < $maxTemplateRecursionLevels) && $$refToText =~ s/\{\{ (?:\s*) # optional whitespace before the template name is ignored ( (?: (?! \{\{ ) . )*? ) # OLD code and comments # (?:\s*) # optional whitespace before the template name is ignored # ([^\{]*?) # Occasionally, templates are nested, # # e.g., {{localurl:{{NAMESPACE}}:{{PAGENAME}}}} # # In order to prevent incorrect parsing, e.g., # # "{{localurl:{{NAMESPACE}}", we require that the # # template name does not include opening braces, # # hence "[^\{]" (any char except opening brace). # END OF OLD code and comments \}\} /&instantiateTemplate($1)/segx ) { $templateRecursionLevels++; } # Since we limit the number of levels of template recursion, we might end up with several # un-instantiated templates. In this case we simply eliminate them - however, we do so # later, in function 'postprocessText()', after extracting categories, links and URLs. } BEGIN { # Making variables static for the function to avoid recompilation of regular expressions # every time the function is called. my $specialSeparator = "\.pAr\."; my $specialSeparatorRegex = qr/$specialSeparator/; sub parseTemplateInvocation(\$\$\%) { my ($refToTemplateInvocation, $refToTemplateTitle, $refToParameterHash) = @_; # Template definitions (especially those with parameters) can easily span several lines, # hence the "/s" modifier. The template name extends up to the first pipeline symbol (if any). # Template parameters go after the "|" symbol. if ($$refToTemplateInvocation =~ /^([^|]*)\|(.*)$/sx) { $$refToTemplateTitle = $1; # single out the template name itself my $paramsList = $2; # Template parameters often contain URLs, internal links, or just other useful text, # whereas the template serves for presenting it in some nice way. # Parameters are separated by "|" symbols. However, we cannot simply split the string # on "|" symbols, since these frequently appear inside internal links. Therefore, we split # on those "|" symbols that are not inside [[...]]. It's obviously sufficient to check that # brackets are not improperly nested on one side of "|", so we use lookahead. # We first replace all "|" symbols that are not inside [[...]] with a special separator that # we invented, which will hopefully not normally appear in the text (.pAr.). # Next, we use 'split' to break the string on this new separator. $paramsList =~ s/\| # split on pipeline symbol, such that (?: # non-capturing grouper that encloses 2 options (?= # zero-width lookahead - option #1 [^\]]*$ # there are no closing brackets up to the end # of the string (i.e., all the characters up to # the end of the string are not closing brackets) ) # end of first lookahead (= end of option #1) | # or (?= # another zero-width lookahead - option #2 [^\]]* \[ # the nearest opening bracket on the right is not preceded # by a closing bracket (i.e., all the characters that # precede it are not closing brackets ) # end of second lookahead (= end of option #2) ) # end of the outer grouper /$specialSeparator/sxg; # replace matching symbols with a special separator # /s means string can contain newline chars my @parameters = split(/$specialSeparatorRegex/, $paramsList); # Parameters can be either named or unnamed. In the latter case, their name is defined by their # ordinal position (1, 2, 3, ...). my $unnamedParameterCounter = 0; # It's legal for unnamed parameters to be skipped, in which case they will get default # values (if available) during actual instantiation. That is {{template_name|a||c}} means # parameter 1 gets the value 'a', parameter 2 value is not defined, and parameter 3 gets the value 'c'. # This case is correctly handled by function 'split', and does not require any special handling. my $param; foreach $param (@parameters) { # Spaces before or after a parameter value are normally ignored, UNLESS the parameter contains # a link (to prevent possible gluing the link to the following text after template substitution) # Parameter values may contain "=" symbols, hence the parameter name extends up to # the first such symbol. # It is legal for a parameter to be specified several times, in which case the last assignment # takes precedence. Example: "{{t|a|b|c|2=B}}" is equivalent to "{{t|a|B|c}}". # Therefore, we don't check if the parameter has been assigned a value before, because # anyway the last assignment should override any previous ones. if ($param =~ /^([^=]*)=(.*)$/s) { # This is a named parameter. # This case also handles parameter assignments like "2=xxx", where the number of an unnamed # parameter ("2") is specified explicitly - this is handled transparently. my $parameterName = $1; my $parameterValue = $2; &trimWhitespaceBothSides(\$parameterName); if ($parameterValue !~ /\]\]/) { # if the value does not contain a link, trim whitespace &trimWhitespaceBothSides(\$parameterValue); } $$refToParameterHash{$parameterName} = $parameterValue; } else { # this is an unnamed parameter $unnamedParameterCounter++; if ($param !~ /\]\]/) { # if the value does not contain a link, trim whitespace &trimWhitespaceBothSides(\$param); } $$refToParameterHash{$unnamedParameterCounter} = $param; } } } else { # Template invocation does not contain a pipeline symbol, hence take the entire # invocation text as the template title. $$refToTemplateTitle = $$refToTemplateInvocation; } } } # end of BEGIN block sub instantiateTemplate($) { my ($templateInvocation) = @_; my $result = ""; print LOGF "Instantiating template=$templateInvocation\n"; my $templateTitle; my %templateParams; &parseTemplateInvocation(\$templateInvocation, \$templateTitle, \%templateParams); &computeFullyQualifiedTemplateTitle(\$templateTitle); &includeTemplateText(\$templateTitle, \%templateParams, \$result); $result; # return value } sub includeTemplateText(\$\%\$) { my ($refToTemplateTitle, $refToParameterHash, $refToResult) = @_; &normalizeTitle($refToTemplateTitle); my $includedPageId = &resolveLink($refToTemplateTitle); if ( defined($includedPageId) && exists($templates{$includedPageId}) ) { # OK, perform the actual inclusion with parameter substitution $$refToResult = $templates{$includedPageId}; # Perform parameter substitution # A parameter call ( {{{...}}} ) may span over a newline, hence the /s modifier # Parameters may be nested (see comments below), hence we do the substitution iteratively # in a while loop. We also limit the maximum number of iterations to avoid too long or # even endless loops (in case of malformed input). my $parameterRecursionLevels = 0; # We also require that the body of a parameter does not contain the parameter opening sequence # (three successive opening braces - "\{\{\{"). We use negative lookahead to achieve this. while ( ($parameterRecursionLevels < $maxParameterRecursionLevels) && $$refToResult =~ s/\{\{\{ ( (?: (?! \{\{\{ ) . )*? ) # OLD code and comments # ([^\{]*?) # Occasionally, parameters are nested because # # they are dependent on other parameters, # # e.g., {{{Author|{{{PublishYear|}}}}}} # # (here, the default value for 'Author' is # # dependent on 'PublishYear'). # # In order to prevent incorrect parsing, e.g., # # "{{{Author|{{{PublishYear|}}}", we require that the # # parameter name does not include opening braces, # # hence "[^\{]" (any char except opening brace). # END OF OLD code and comments \}\}\} /&substituteParameter($1, $refToParameterHash)/segx ) { $parameterRecursionLevels++; } } else { # The page being included cannot be identified - perhaps we skipped it (because currently # we only allow for inclusion of pages in the Template namespace), or perhaps it's # a variable name like {{NUMBEROFARTICLES}}. Just remove this inclusion directive and # replace it with a space print LOGF "Template '$$refToTemplateTitle' is not available for inclusion\n"; $$refToResult = " "; } } sub substituteParameter($\%) { my ($parameter, $refToParameterHash) = @_; my $result; if ($parameter =~ /^([^|]*)\|(.*)$/) { # This parameter has a default value my $paramName = $1; my $defaultValue = $2; if ( defined($$refToParameterHash{$paramName}) ) { $result = $$refToParameterHash{$paramName}; # use parameter value specified in template invocation } else { # use the default value $result = $defaultValue; } } else { # parameter without a default value if ( defined($$refToParameterHash{$parameter}) ) { $result = $$refToParameterHash{$parameter}; # use parameter value specified in template invocation } else { # Parameter not specified in template invocation and does not have a default value - # do not perform substitution and keep the parameter in 3 braces # (these are Wiki rules for templates, see http://meta.wikimedia.org/wiki/Help:Template ). $result = "{{{$parameter}}}"; } } # Surplus parameters - i.e., those assigned values in template invocation but not used # in the template body - are simply ignored. $result; # return value } sub computeFullyQualifiedTemplateTitle(\$) { my ($refToTemplateTitle) = @_; # Determine the namespace of the page being included through the template mechanism my $namespaceSpecified = 0; if ($$refToTemplateTitle =~ /^:(.*)$/) { # Leading colon by itself implies main namespace, so strip this colon $$refToTemplateTitle = $1; $namespaceSpecified = 1; } elsif ($$refToTemplateTitle =~ /^([^:]*):/) { # colon found but not in the first position - check if it designates a known namespace my $prefix = $1; &normalizeNamespace(\$prefix); $namespaceSpecified = &isKnownNamespace(\$prefix); } # The case when the page title does not contain a colon at all also falls here. if ($namespaceSpecified) { # OK, the title of the page being included is fully qualified with a namespace } else { # The title of the page being included is NOT in the main namespace and lacks # any other explicit designation of the namespace - therefore, it is resolved # to the Template namespace (that's the default for the template inclusion mechanism). $$refToTemplateTitle = "Template:$$refToTemplateTitle"; } } sub extractCategories(\$\@$) { my ($refToText, $refToCategoriesArray, $id) = @_; # Remember that namespace names are case-insensitive, hence we're matching with "/i". # The first parameter to 'collectCategory' is passed by value rather than by reference, # because it might be dangerous to pass a reference to $1 in case it might get modified # (with unclear consequences). $$refToText =~ s/\[\[(?:\s*)(Category:.*?)\]\]/&collectCategory($1, $refToCategoriesArray)/ieg; # We don't accumulate categories directly in a hash table, since this would not preserve # their original order of appearance. &removeDuplicatesAndSelf($refToCategoriesArray, $id); } sub collectCategory($\@) { my ($catName, $refToCategoriesArray) = @_; if ($catName =~ /^(.*)\|/) { # Some categories contain a sort key, e.g., [[Category:Whatever|*]] or [[Category:Whatever| ]] # In such a case, take only the category name itself. $catName = $1; } &normalizeTitle(\$catName); my $catId = &resolveLink(\$catName); if ( defined($catId) ) { push(@$refToCategoriesArray, $catId); } else { print LOGF "Warning: unknown category '$catName'\n"; } # The return value is just a space, because we remove categories from the text # after we collected them " "; } sub extractInternalLinks(\$\@$$$) { my ($refToText, $refToInternalLinksArray, $id, $whetherToLogAnchorText, $whetherToRemoveDuplicates) = @_; # For each internal link outgoing form the current article, this hash table maps # the target id into the anchor text associated with it. Naturally, we only # collect anchor text for links that can be resolved to a page id. my %anchorTexts; # Link definitions may span over adjacent lines and therefore contain line breaks, # hence we use the /s modifier. # Occasionally, links are nested, e.g., # [[Image:kanner_kl2.jpg|frame|right|Dr. [[Leo Kanner]] introduced the label ''early infantile autism'' in [[1943]].]] # In order to prevent incorrect parsing, e.g., "[[Image:kanner_kl2.jpg|frame|right|Dr. [[Leo Kanner]]", # we extract links in several iterations of the while loop, while the link definition requires that # each pair [[...]] does not contain any opening braces. 1 while ( $$refToText =~ s/ (\w*) # words may be glued to the beginning of the link, # in which case they become part of the link # e.g., "ex-[[Giuseppe Mazzini|Mazzinian]] " \[\[ ([^\[]*?) # the link text can be any chars except an opening bracket, # this ensures we correctly parse nested links (see comments above) \]\] (\w*) # words may be glued to the end of the link, # in which case they become part of the link # e.g., "[[public transport]]ation" /&collectInternalLink($1, $2, $3, $refToInternalLinksArray, \%anchorTexts)/segx ); if ($whetherToRemoveDuplicates) { &removeDuplicatesAndSelf($refToInternalLinksArray, $id); } if ($whetherToLogAnchorText) { &logAnchorText(\%anchorTexts, $id); } } sub logAnchorText(\%$) { my ($refToAnchorTextsHash, $curPageId) = @_; # Remember that we use a hash table to associate anchor text with target page ids. # Therefore, if the current page has several links to another page (it happens), then we only # keep the anchor text of the last one (and override the previous ones) - we can live with it. # Consequently, we do not need to remove duplicates as there are none. # However, we still remove the links that point from the page to itself. my $targetId; my $anchorText; while ( ($targetId, $anchorText) = each(%$refToAnchorTextsHash) ) { if ($targetId != $curPageId) { &postprocessText(\$anchorText, 0); # anchor text doesn't need escaping of XML characters, # hence the second function parameter is 0 $anchorText =~ s/\n/ /g; # replace all newlines with spaces # make sure that something is left of anchor text after postprocessing if (length($anchorText) > 0) { print ANCHORF "$targetId\t$curPageId\t$anchorText\n"; } } } } sub collectInternalLink($$$\@\%) { my ($prefix, $link, $suffix, $refToInternalLinksArray, $refToAnchorTextHash) = @_; my $originalLink = $link; my $result = ""; # strip leading whitespace, if any $link =~ s/^\s*//; # Link definitions may span over adjacent lines and therefore contain line breaks, # hence we use the /s modifier on most matchings. # There are some special cases when the link may be preceded with a colon. # Known cases: # - Linking to a category (as opposed to actually assigning the current article # to a category) is performed using special syntax [[:Category:...]] # - Linking to other languages, e.g., [[:fr:Wikipedia:Aide]] # (without the leading colon, the link will go to the side menu # - Linking directly to the description page of an image, e.g., [[:Image:wiki.png]] # In all such cases, we strip the leading colon. if ($link =~ /^ : # colon at the beginnning of the link name (.*) # the rest of the link text $ /sx) { # just strip this initial colon (as well as any whitespace preceding it) $link = $1; } # Alternative text may be available after the pipeline symbol. # If the pipeline symbol is only used for masking parts of # the link name for presentation, we still consider that the author of the page # deemed the resulting text important, hence we always set this variable when # the pipeline symbol is present. my $alternativeTextAvailable = 0; # Some links contain several pipeline symbols, e.g., # [[Image:Zerzan.jpeg|thumb|right|[[John Zerzan]]]] # It seems that the extra pipeline symbols are parameters, so we just eliminate them. if ($link =~ /^(.*)\|([^|]*)$/s) { # first, extract the link up to the last pipeline symbol $link = $1; # the part before the last pipeline $result = $2; # the part after the last pipeline, this is usually an alternative text for this link $alternativeTextAvailable = 1; # pipeline found, see comment above # Now check if there are pipeline symbols remaining. # Note that this time we're looking for the shortest match, # to take the part of the text up to the first pipeline symbol. if ($link =~ /^([^|]*)\|(.*)$/s) { $link = $1; # $2 contains the parameters, which we don't really need } if (length($result) == 0) { if ($link !~ /\#/) { # If the "|" symbol is not followed by some text, then it masks the namespace # as well as any text in parentheses at the end of the link title. # However, pipeline masking is only invoked if the link does not contain an anchor, # hence the additional condition in the 'if' statement. &performPipelineMasking(\$link, \$result); } else { # If the link contains an anchor, then masking is not invoked, and we take the entire link $result = $link; } } } else { # the link text does not contain the pipeline, so take it as-is $result = $link; } if ($link =~ /^(.*)\#(.*)$/s) { # The link contains an anchor, so adjust the link to point to the page as a whole. $link = $1; my $anchor = $2; # Check if the link points to an anchor on the current page, and if so - ignore it. if (length($link) == 0 && ! $alternativeTextAvailable) { # This is indeed a link pointing to an anchor on the current page. # The link is thus cleared, so that it will not be resolved and collected later. # For anchors to the same page, discard the leading '#' symbol, and take # the rest as the text - but only if no alternative text was provided for this link. $result = $anchor; } } # Now collect the link, or links if the original link is in the date format # and specifies both day and year. In the latter case, the function for date # normalization may also modify the link text ($result), and may collect more # than one link (one for the day, another one for the year). my $dateRecognized = 0; # Alternative text (specified after pipeline) blocks normalization of dates. # We also perform a quick check - if the link does not start with a digit, # then it surely does not contain a date if ( ($link =~ /^\d/) && (! $alternativeTextAvailable)) { $dateRecognized = &normalizeDates(\$link, \$result, $refToInternalLinksArray, $refToAnchorTextHash); } # If a date (either day or day + year) was recognized, then no further processing is necessary if (! $dateRecognized) { &normalizeTitle(\$link); my $targetId = &resolveAndCollectInternalLink(\$link, $refToInternalLinksArray); # Wikipedia pages contain many links to other Wiki projects (especially Wikipedia in # other languages). While these links are not resolved to valid pages, we also want # to ignore their text. However, simply discarding the text of all links that cannot # be resolved would be overly aggressive, as authors frequently define phrases as links # to articles that don't yet exist, in the hope that they will be added later. # Therefore, we formulate the following conditions that must hold simultaneously # for discarding the text of a link: # 1) the link was not resolved to a valid id # 2) the link does not contain alternative text (if it did, then the text is probably # important enough to be retained) # 3) the link contains a colon - this is a very simple heuristics for identifying links to # other Wiki projects, other languages, or simply other namespaces within current Wikipedia. # While this method is not fool-proof (there are regular pages in the main namespace # that contain a colon in their title), we believe this is a reasonable tradeoff. if ( !defined($targetId) && ! $alternativeTextAvailable && $link =~ /:/ ) { $result = ""; print LOGF "Discarding text for link '$originalLink'\n"; } else { # finally, add the text originally attached to the left and/or to the right of the link # (if the link represents a date, then it has not text glued to it, so it's OK to only # use the prefix and suffix here) $result = $prefix . $result . $suffix; } if ( defined($targetId) ) { # If the current page has several links to another page, then we only take the anchor # of the last one (and override the previous ones) - we can live with it. $$refToAnchorTextHash{$targetId} = $result; } } $result; #return value } sub performPipelineMasking(\$\$) { my ($refToLink, $refToResult) = @_; # First check for presence of a namespace if ($$refToLink =~ /^([^:]*):(.*)$/) { my $namespaceCandidate = $1; my $rest = $2; &normalizeNamespace(\$namespaceCandidate); if ( &isKnownNamespace(\$namespaceCandidate) ) { $$refToResult = $rest; # take the link text without the namespace } else { $$refToResult = $$refToLink; # otherwise, take the entire link text (for now) } } else { $$refToResult = $$refToLink; # otherwise, take the entire link text (for now) } # Now check if there are parentheses at the end of the link text # (we now operate on $$refToResult, because we might have stripped the leading # namespace in the previous test). if ($$refToResult =~ /^ # the beginning of the string (.*) # the text up to the last pair of parentheses \( # opening parenthesis (?:[^()]*) # the text in the parentheses \) # closing parenthesis (?:\s*) # optional trailing whitespace, just in case $ # end of string /x) { $$refToResult = $1; # discard the text in parentheses at the end of the string } } sub resolveAndCollectInternalLink(\$\@) { my ($refToLink, $refToInternalLinksArray) = @_; my $targetId = &resolveLink($refToLink); if ( defined($targetId) ) { push(@$refToInternalLinksArray, $targetId); } else { # Some cases in this category that obviously won't be resolved to legal ids: # - Links to namespaces that we don't currently handle # (other than those for which 'isNamespaceOK' returns true); # media and sound files fall in this category # - Links to other languages, e.g., [[de:...]] # - Links to other Wiki projects, e.g., [[Wiktionary:...]] print LOGF "Warning: unknown link '$$refToLink'\n"; } $targetId; # return value } # Dates can appear in several formats # 1) [[July 20]], [[1969]] # 2) [[20 July]] [[1969]] # 3) [[1969]]-[[07-20]] # 4) [[1969-07-20]] # The first one is handled correctly without any special treatment, # so we don't even check for it here. # In (2) and (3), we only normalize the day, because it will be parsed separately from the year. # This function is only invoked if the link has no alternative text available, therefore, # we're free to override the result text. sub normalizeDates(\$\$\@\%) { my ($refToLink, $refToResultText, $refToInternalLinksArray, $refToAnchorTextHash) = @_; my $dateRecognized = 0; if ($$refToLink =~ /^(\d\d)\s*([A-Za-z]+)$/) { my $day = $1; my $month = ucfirst(lc($2)); if ( defined($monthToNumDays{$month}) && 1 <= $day && $day <= $monthToNumDays{$month} ) { $dateRecognized = 1; $$refToLink = "$month $day"; $$refToResultText = "$month $day"; my $targetId = &resolveAndCollectInternalLink($refToLink, $refToInternalLinksArray); if ( defined($targetId) ) { $$refToAnchorTextHash{$targetId} = $$refToResultText; } } else { # this doesn't look like a valid date, leave as-is } } elsif ($$refToLink =~ /^(\d\d)\-(\d\d)$/) { my $monthNum = int($1); my $day = $2; if ( defined($numberToMonth{$monthNum}) ) { my $month = $numberToMonth{$monthNum}; if (1 <= $day && $day <= $monthToNumDays{$month}) { $dateRecognized = 1; $$refToLink = "$month $day"; # we add a leading space, to separate the preceding year ("[[1969]]-" in the example") # from the day that we're creating $$refToResultText = " $month $day"; my $targetId = &resolveAndCollectInternalLink($refToLink, $refToInternalLinksArray); if ( defined($targetId) ) { $$refToAnchorTextHash{$targetId} = $$refToResultText; } } else { # this doesn't look like a valid date, leave as-is } } else { # this doesn't look like a valid date, leave as-is } } elsif ($$refToLink =~ /^(\d\d\d\d)\-(\d\d)\-(\d\d)$/) { my $year = $1; my $monthNum = int($2); my $day = $3; if ( defined($numberToMonth{$monthNum}) ) { my $month = $numberToMonth{$monthNum}; if (1 <= $day && $day <= $monthToNumDays{$month}) { $dateRecognized = 1; $$refToLink = "$month $day"; # the link text is combined from the day and the year $$refToResultText = "$month $day, $year"; my $targetId; # collect the link for the day $targetId = &resolveAndCollectInternalLink($refToLink, $refToInternalLinksArray); if ( defined($targetId) ) { $$refToAnchorTextHash{$targetId} = $$refToLink; } # collect the link for the year $targetId = &resolveAndCollectInternalLink(\$year, $refToInternalLinksArray); if ( defined($targetId) ) { $$refToAnchorTextHash{$targetId} = $year; } } else { # this doesn't look like a valid date, leave as-is } } else { # this doesn't look like a valid date, leave as-is } } $dateRecognized; # return value } sub extractUrls(\$\@) { my ($refToText, $refToUrlsArray) = @_; # First we handle the case of URLs enclosed in single brackets, with or without the description, # and with optional leading and/or trailing whitespace # Examples: [http://www.cnn.com], [ http://www.cnn.com ], [http://www.cnn.com CNN Web site] $$refToText =~ s/\[(?:\s*)($urlProtocols(?:[^\[\]]*))\]/&collectUrlFromBrackets($1, $refToUrlsArray)/eg; # Now we handle standalone URLs (those not enclosed in brackets) # The $urlTemrinator is matched via positive lookahead (?=...) in order not to remove # the terminator symbol itself, but rather only the URL. $$refToText =~ s/($urlProtocols(?:.*?))$urlTerminator/&collectStandaloneUrl($1, $refToUrlsArray)/eg; &removeDuplicatesAndSelf($refToUrlsArray, undef); } sub collectUrlFromBrackets($\@) { my ($url, $refToUrlsArray) = @_; my $text; # Assumption: leading whitespace has already been stripped if ( $url =~ /^($urlProtocols(?:.*?))($urlTerminator(?:.*))$/ ) { # description available push(@$refToUrlsArray, $1); $text = $2; } else { # no description push(@$refToUrlsArray, $url); $text = " "; } $text; # return value } sub collectStandaloneUrl($\@) { my ($url, $refToUrlsArray) = @_; push(@$refToUrlsArray, $url); # collect the URL as-is " "; # return value - replace the URL with a space } sub postprocessText(\$$) { my ($refToText, $whetherToEncodeXmlChars) = @_; # Eliminate all and fragments, because this text # will not be included anywhere, as we already handled all inclusion directives # in function 'includeTemplates'. # This block can easily span several lines, hence the "/s" modifier. $$refToText =~ s/(.*?)<\/includeonly>/ /sg; $$refToText =~ s/(.*?)<\/onlyinclude>/ /sg; # fragments remain, but remove the tags per se # We block the code below, as tags will anyway be thrown away later, # when we eliminate all remaining tags. ### This block can easily span several lines, hence the "/s" modifier ### $$refToText =~ s/(.*?)<\/noinclude>/$1/sg; # replace
and
directives with new paragraph $$refToText =~ s//\n\n/g; # Remove tables, as they often carry a lot of noise &eliminateTables($refToText); # Since we limit the number of levels of template recursion, we might end up with several # un-instantiated templates. In this case we simply eliminate them now. # Because templates may be nested, we eliminate them iteratively by starting from the most # nested one (hence the 'while' loop). # OLD comments and code: # For the same reason, we also require that the body of a template does not contain # opening braces (hence "[^\{]", any char except opening brace). # 1 while ($$refToText =~ s/\{\{(?:[^\{]*?)\}\}/ /sg); # END OF old comments and code # We also require that the body of a template does not contain the template opening sequence # (two successive opening braces - "\{\{"). We use negative lookahead to achieve this. 1 while ($$refToText =~ s/\{\{ (?: (?: (?! \{\{ ) . )*? ) \}\} / /sgx); # Remove any other <...> tags - but keep the text they enclose # (the tags are replaced with spaces to prevent adjacent pieces of text # from being glued together). # Comments () also fall into this category, and since they can easily span several lines, # we use the "/s" modifier. $$refToText =~ s/<(?:.*?)>/ /sg; # Change markup on bold/italics emphasis. We probably don't need to distinguish # these 3 types of emphasis, so we just replace all of them with a generic tag. # IMPORTANT: If 'encodeXmlChars' has beeen called before this line, then remember that # the apostrophes were already quoted to "'" $$refToText =~ s/'''''(.*?)'''''/$1/g; $$refToText =~ s/'''(.*?)'''/$1/g; $$refToText =~ s/''(.*?)''/$1/g; # Eliminate long sequences of newlines and whitespace. # Note that we don't want to replace sequences of spaces only, as this might make the text # less readable. Instead, we only eliminate sequences of whitespace that contain at least # two newlines. $$refToText =~ s/(?:\s*)\n(?:\s*)\n(?:\s*)/\n\n/g; # Eliminate XML entities such as " " , "×" etc. - otherwise, # in C++ code they will give rise to spurious words "nbsp", "times" etc. # Note that the standard entities - & , " , ' , < and > # are handled by the XML parser. All other entities, such as   are passed # by the XML parser to the upper level (in case of Wikipedia pages, # to the rendering engine). # Note that in the raw XML text, these entities look like "&nbsp;" # (i.e., with leading "&"). XML parser replaces "&" with "&", # so here in the code we see the entities as " ". $$refToText =~ s{& # the entity starts with "&" ((?:\#?)(?:\w+)) # optional '#' sign (as in α), followed by # an uninterrupted sequence of letters and/or digits ; # the entity ends with a semicolon }{&logReplacedXmlEntity($1)}egx; # entities are replaced with a space if ($whetherToEncodeXmlChars) { # encode text for XML &encodeXmlChars($refToText); } # NOTE that the following operations introduce XML tags, so they must appear # after the original text underwent character encoding with 'encodeXmlChars' !! # Change markup for section headers. # Note that section headers may only begin at the very first position in the line # (not even after a space). Therefore, each header markup in the following commands # is prefixed with "^" to make sure it begins at the beginning of the line. # Since the text (e.g., article body) may contains multiple lines, we use # the "/m" modifier to allow matching "^" at embedded "\n" positions. $$refToText =~ s/^=====(.*?)=====/

$1<\/h4>/mg; $$refToText =~ s/^====(.*?)====/

$1<\/h3>/mg; $$refToText =~ s/^===(.*?)===/

$1<\/h2>/mg; $$refToText =~ s/^==(.*?)==/

$1<\/h1>/mg; } sub logReplacedXmlEntity($) { my ($xmlEntity) = @_; print LOGF "ENTITY: &$xmlEntity;\n"; " "; # return value - entities are replaced with a space } BEGIN { # Making variables static for the function to avoid recompilation of regular expressions # every time the function is called. # Table definitions can easily span several lines, hence the "/s" modifier my $tableOpeningSequence1 = qr{ # either just
| # or ]*)>}ix; # "" # In the above definition, prohibiting '<' and '>' chars ([^<>]) ensures # that we do not consume more than necessary, so that in the example # "
aaa
bbb
ccc " # $1 is NOT extended to be "> aaa /i; # my $nonNestedTableRegex1 = # qr{$tableOpeningSequence1 # opening sequence # ( # (?: # non-capturing grouper # (?! # lookahead negation # $tableOpeningSequence1 # that's what we don't want to find inside a table definition # ) # . # any character (such that there is no table opening sequence # # after it because of the lookahead condition) # )*? # shortest match of such characters, up to the closing of a table # ) # $tableClosingSequence1}sx; # closing sequence my $tableOpeningSequence2 = qr/\{\|/; my $tableClosingSequence2 = qr/\|\}/; # my $nonNestedTableRegex2 = # qr{$tableOpeningSequence2 # opening sequence # ( # (?: # non-capturing grouper # (?! # lookahead negation # $tableOpeningSequence2 # that's what we don't want to find inside a table definition # ) # . # any character (such that there is no table opening sequence # # after it because of the lookahead condition) # )*? # shortest match of such characters, up to the closing of a table # ) # $tableClosingSequence2}sx; # closing sequence sub eliminateTables(\$) { my ($refToText) = @_; # Sadly, these patterns became too complex and cause segmentation fault, # hence we fall back to only handling non-nested tables :( # # Sometimes, tables are nested, therefore we use a while loop to eliminate them # # recursively, while requiring that any table we eliminate does not contain nested tables. # # For simplicity, we assume that tables of the two kinds (e.g., ...
and {| ... |}) # # are not nested in one another. $$refToText =~ s/$tableOpeningSequence1(.*?)$tableClosingSequence1/\n/sg; $$refToText =~ s/$tableOpeningSequence2(.*?)$tableClosingSequence2/\n/sg; } } # end of BEGIN block # If specified, 'elementToRemove' contains an element that needs to be removed as well. # For links, this ensures that a page does not link to itself. For categories, this # ensures that a page is not categorized to itself. This parameter is obviously # irrelevant for filtering URLs. # 'elementToRemove' must be a numeric value (not string), since we're testing it with '==' (not 'eq') sub removeDuplicatesAndSelf(\@$) { my ($refToArray, $elementToRemove) = @_; my %seen = (); my @uniq; my $item; foreach $item (@$refToArray) { if ( defined($elementToRemove) && ($item == $elementToRemove) ) { printf LOGF "Warning: current page links or categorizes to itself - " . "link discarded ($elementToRemove)\n"; next; } push(@uniq, $item) unless $seen{$item}++; } # overwrite the original array with the new one that does not contain duplicates @$refToArray = @uniq; } # Removes elements of the second list from the first list. # For efficiency purposes, the second list is converted into a hash. sub removeElements(\@\@) { my ($refToArray, $refToElementsToRemove) = @_; my %elementsToRemove = (); my @result; # Construct the hash table for fast lookups my $item; foreach $item (@$refToElementsToRemove) { $elementsToRemove{$item} = 1; } foreach $item (@$refToArray) { if ( ! defined($elementsToRemove{$item}) ) { push(@result, $item); } } # overwrite the original array with the new one @$refToArray = @result; } sub getTimeAsString() { my $tm = localtime(); my $result = sprintf("%02d:%02d:%02d", $tm->hour, $tm->min, $tm->sec); } sub trimWhitespaceBothSides(\$) { my ($stringRef) = @_; # remove leading whitespace $$stringRef =~ s/^\s*//; # remove trailing whitespace $$stringRef =~ s/\s*$//; } # There are 3 kinds of related links that we look for: # 1) Standalone (usually, at the beginning of the article or a section of it) # Ex: Main articles: ... # 2) Inlined - text in parentheses inside the body of the article # Ex: medicine (see also: [[Health]]) # 3) Dedicated section # Ex: == See also == # # In all calls to 'extractInternalLinks': # - The penultimate argument is 0, since we don't need to log anchor text here. # Anchor text will be handled when we analyze all the internal links in # the entire article (and not just look for related links). # - The last argument is 0 in order not to remove duplicates on every invocation # of 'extractInternalLinks'. This is because duplicates in related links are # not very common, but performing duplicate removal each time is expensive. # Instead, we remove duplicates once at the very end. sub identifyRelatedArticles(\$\@$) { my ($refToText, $refToRelatedArticles, $id) = @_; # We split the text into a set of lines. This also creates a copy of the original text - # this is important, since the function 'extractInternalLinks' modifies its argument, # so we'd better use it on a copy of the real article body. my @text = split("\n", $$refToText); my $line; # Standalone foreach $line (@text) { # We require that stanalone designators occur at the beginning of the line # (after at most a few characters, such as a whitespace or a colon), # and not just anywhere in the line. Otherwise, we would collect as related # those links that just happen to occur in the same line with an unrelated # string that represents a standalone designator. if ($line =~ /^(?:.{0,5})(${relatedWording_Standalone}.*)$/) { my $str = $1; # We extract links from the rest of the line print LOGF "Related(S): $id => $str\n"; &extractInternalLinks(\$str, $refToRelatedArticles, $id, 0, 0); print LOGF "Related(S): $id ==> @$refToRelatedArticles\n"; } } # Inlined (in parentheses) foreach $line (@text) { while ($line =~ /\((?:\s*)(${relatedWording_Inline}.*?)\)/g) { my $str = $1; print LOGF "Related(I): $id => $str\n"; &extractInternalLinks(\$str, $refToRelatedArticles, $id, 0, 0); print LOGF "Related(I): $id ==> @$refToRelatedArticles\n"; } } # Section # Sections can be at any level - "==", "===", "====" - it doesn't matter, # so it suffices to look for two consecutive "=" signs my $relatedSectionFound = 0; foreach $line (@text) { if ($relatedSectionFound) { # we're in the related section now if ($line =~ /==(?:.*?)==/) { # we just encountered the next section - exit the loop last; } else { # collect the links from the current line print LOGF "Related(N): $id => $line\n"; # 'extractInternalLinks' may mofidy its argument ('$line'), but it's OK # as we do not do any further processing to '$line' or '@text' &extractInternalLinks(\$line, $refToRelatedArticles, $id, 0, 0); print LOGF "Related(N): $id ==> @$refToRelatedArticles\n"; } } else { # we haven't yet found the related section if ($line =~ /==(.*?)==/) { # found some section header - let's check it my $sectionHeader = $1; if ($sectionHeader =~ /$relatedWording_Section/) { $relatedSectionFound = 1; next; # proceed to the next line } else { next; # unrelated section - just proceed to the next line } } else { next; # just proceed to the next line - nothing to do } } } &removeDuplicatesAndSelf($refToRelatedArticles, $id); } sub recordRelatedArticles($\@) { my ($id, $refToRelatedArticles) = @_; my $size = scalar(@$refToRelatedArticles); return if ($size == 0); print RELATEDF "$id\t", join(" ", @$refToRelatedArticles), "\n"; } ######################################################################## sub printUsage() { print "Wikiprep version $version, Copyright (C) 2007 Evgeniy Gabrilovich\n" . "Wikiprep comes with ABSOLUTELY NO WARRANTY; for details type '$0 -license'.\n" . "This is free software, and you are welcome to redistribute it\n" . "under certain conditions; type '$0 -license' for details.\n" . "Type '$0 -version' for version information.\n\n" . "Usage: $0 -f \n" . " e.g., $0 -f pages_articles.xml\n\n"; }