You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1720 lines
68 KiB

4 years ago
  1. ###############################################################################
  2. #
  3. # wikiprep.pl - Preprocess Wikipedia XML dumps
  4. # Copyright (C) 2007 Evgeniy Gabrilovich
  5. # The author can be contacted by electronic mail at gabr@cs.technion.ac.il
  6. #
  7. # This program is free software; you can redistribute it and/or modify
  8. # it under the terms of the GNU General Public License as published by
  9. # the Free Software Foundation; either version 2 of the License, or
  10. # (at your option) any later version.
  11. #
  12. # This program is distributed in the hope that it will be useful,
  13. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. # GNU General Public License for more details.
  16. #
  17. # You should have received a copy of the GNU General Public License
  18. # along with this program; if not, write to the Free Software
  19. # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA,
  20. # or see <http://www.gnu.org/licenses/> and
  21. # <http://www.fsf.org/licensing/licenses/info/GPLv2.html>
  22. #
  23. ###############################################################################
  24. use strict;
  25. use warnings;
  26. use File::Basename;
  27. use Getopt::Long;
  28. use Time::localtime;
  29. use XML::Parser;
  30. my $licenseFile = "COPYING";
  31. my $version = "2.02";
  32. if (@ARGV < 1) {
  33. &printUsage();
  34. exit 0;
  35. }
  36. my $file;
  37. my $showLicense = 0;
  38. my $showVersion = 0;
  39. GetOptions('f=s' => \$file,
  40. 'license' => \$showLicense,
  41. 'version' => \$showVersion);
  42. if ($showLicense) {
  43. if (-e $licenseFile) {
  44. print "See file $licenseFile for more details.\n"
  45. } else {
  46. print "Please see <http://www.gnu.org/licenses/> and <http://www.fsf.org/licensing/licenses/info/GPLv2.html>\n";
  47. }
  48. exit 0;
  49. }
  50. if ($showVersion) {
  51. print "Wikiprep version $version\n";
  52. exit 0;
  53. }
  54. if (!defined($file)) {
  55. &printUsage();
  56. exit 0;
  57. }
  58. if (! -e $file) {
  59. die "Input file '$file' cannot be opened for reading\n";
  60. }
  61. ##### Global definitions #####
  62. my %XmlEntities = ('&' => 'amp', '"' => 'quot', "'" => 'apos', '<' => 'lt', '>' => 'gt');
  63. # The URL protocol (e.g., http) matched here may be in either case, hence we use the /i modifier.
  64. my $urlProtocols = qr/http:\/\/|https:\/\/|telnet:\/\/|gopher:\/\/|file:\/\/|wais:\/\/|ftp:\/\/|mailto:|news:/i;
  65. # A URL terminator may be either one of a list of characters OR end of string (that is, '$').
  66. # This last part is necessary to handle URLs at the very end of a string when there is no "\n"
  67. # or any other subsequent character.
  68. my $urlTerminator = qr/[\[\]\{\}\s\n\|\"<>]|$/;
  69. my $relatedWording_Standalone =
  70. qr/Main(?:\s+)article(?:s?)|Further(?:\s+)information|Related(?:\s+)article(?:s?)|Related(?:\s+)topic(?:s?)|See(?:\s+)main(?:\s+)article(?:s?)|See(?:\s+)article(?:s?)|See(?:\s+)also|For(?:\s+)(?:more|further)/i;
  71. ## For(?:\s+)more(?:\s+)(?:background|details)(?:\s+)on(?:\s+)this(?:\s+)topic,(?:\s+)see
  72. my $relatedWording_Inline = qr/See[\s:]|See(?:\s+)also|For(?:\s+)(?:more|further)/i;
  73. my $relatedWording_Section = qr/Further(?:\s+)information|See(?:\s+)also|Related(?:\s+)article(?:s?)|Related(?:\s+)topic(?:s?)/i;
  74. my %monthToNumDays = ('January' => 31, 'February' => 29, 'March' => 31, 'April' => 30,
  75. 'May' => 31, 'June' => 30, 'July' => 31, 'August' => 31,
  76. 'September' => 30, 'October' => 31, 'November' => 30, 'December' => 31);
  77. my %numberToMonth = (1 => 'January', 2 => 'February', 3 => 'March', 4 => 'April',
  78. 5 => 'May', 6 => 'June', 7 => 'July', 8 => 'August',
  79. 9 => 'September', 10 => 'October', 11 => 'November', 12 => 'December');
  80. my $maxTemplateRecursionLevels = 5;
  81. my $maxParameterRecursionLevels = 5;
  82. ##### Global variables #####
  83. my %namespaces;
  84. # we only process pages in these namespaces + the main namespace (which has an empty name)
  85. my %okNamespacesForPrescanning = ('Template' => 1, 'Category' => 1);
  86. my %okNamespacesForTransforming = ('Category' => 1); # we don't use templates as concepts
  87. my %id2title;
  88. my %title2id;
  89. my %redir;
  90. my %templates; # template bodies for insertion
  91. my %catHierarchy; # each category is associated with a list of its immediate descendants
  92. my %statCategories; # number of pages classified under each category
  93. my %statIncomingLinks; # number of links incoming to each page
  94. my ($fileBasename, $filePath, $fileSuffix) = fileparse($file, ".xml");
  95. my $outputFile = "$filePath/$fileBasename.hgw$fileSuffix";
  96. my $logFile = "$filePath/$fileBasename.log";
  97. my $anchorTextFile = "$filePath/$fileBasename.anchor_text";
  98. my $relatedLinksFile = "$filePath/$fileBasename.related_links";
  99. open(OUTF, "> $outputFile") or die "Cannot open $outputFile";
  100. open(LOGF, "> $logFile") or die "Cannot open $logFile";
  101. open(ANCHORF, "> $anchorTextFile") or die "Cannot open $anchorTextFile";
  102. open(RELATEDF, "> $relatedLinksFile") or die "Cannot open $relatedLinksFile";
  103. binmode(STDOUT, ':utf8');
  104. binmode(STDERR, ':utf8');
  105. binmode(OUTF, ':utf8');
  106. binmode(LOGF, ':utf8');
  107. binmode(ANCHORF, ':utf8');
  108. print ANCHORF "# Line format: <Target page id> <Source page id> <Anchor text (up to the end of the line)>\n\n\n";
  109. print RELATEDF "# Line format: <Page id> <List of ids of related articles>\n\n\n";
  110. &copyXmlFileHeader();
  111. &loadNamespaces();
  112. &prescan();
  113. my $numTitles = scalar( keys(%id2title) );
  114. print "Loaded $numTitles titles\n";
  115. my $numRedirects = scalar( keys(%redir) );
  116. print "Loaded $numRedirects redirects\n";
  117. my $numTemplates = scalar( keys(%templates) );
  118. print "Loaded $numTemplates templates\n";
  119. &transform();
  120. &closeXmlFile();
  121. &writeStatistics();
  122. &writeCategoryHierarchy();
  123. close(LOGF);
  124. close(ANCHORF);
  125. close(RELATEDF);
  126. # Hogwarts needs the anchor text file to be sorted in the increading order of target page id.
  127. # The file is originally sorted by source page id (second field in each line).
  128. # We now use stable (-s) numeric (-n) sort on the first field (-k 1,1).
  129. # This way, the resultant file will be sorted on the target page id (first field) as primary key,
  130. # and on the source page id (second field) as secondary key.
  131. system("sort -s -n -k 1,1 $anchorTextFile > $anchorTextFile.sorted");
  132. ##### Subroutines #####
  133. sub normalizeTitle(\$) {
  134. my ($refToStr) = @_;
  135. # remove leading whitespace and underscores
  136. $$refToStr =~ s/^[\s_]+//;
  137. # remove trailing whitespace and underscores
  138. $$refToStr =~ s/[\s_]+$//;
  139. # replace sequences of whitespace and underscore chars with a single space
  140. $$refToStr =~ s/[\s_]+/ /g;
  141. if ($$refToStr =~ /^([^:]*):(\s*)(\S(?:.*))/) {
  142. my $prefix = $1;
  143. my $optionalWhitespace = $2;
  144. my $rest = $3;
  145. my $namespaceCandidate = $prefix;
  146. &normalizeNamespace(\$namespaceCandidate); # this must be done before the call to 'isKnownNamespace'
  147. if ( &isKnownNamespace(\$namespaceCandidate) ) {
  148. # If the prefix designates a known namespace, then it might follow by optional
  149. # whitespace that should be removed to get the canonical page name
  150. # (e.g., "Category: Births" should become "Category:Births").
  151. $$refToStr = $namespaceCandidate . ":" . ucfirst($rest);
  152. } else {
  153. # No namespace, just capitalize first letter.
  154. # If the part before the colon is not a known namespace, then we must not remove the space
  155. # after the colon (if any), e.g., "3001: The_Final_Odyssey" != "3001:The_Final_Odyssey".
  156. # However, to get the canonical page name we must contract multiple spaces into one,
  157. # because "3001: The_Final_Odyssey" != "3001: The_Final_Odyssey".
  158. $$refToStr = ucfirst($prefix) . ":" .
  159. (length($optionalWhitespace) > 0 ? " " : "") . $rest;
  160. }
  161. } else {
  162. # no namespace, just capitalize first letter
  163. $$refToStr = ucfirst($$refToStr);
  164. }
  165. }
  166. sub normalizeNamespace(\$) {
  167. my ($refToStr) = @_;
  168. $$refToStr = ucfirst( lc($$refToStr) );
  169. }
  170. # Checks if the prefix of the page name before the colon is actually one of the
  171. # 16+2+2 namespaces defined in the XML file.
  172. # Assumption: the argument was already normalized using 'normalizeNamespace'
  173. sub isKnownNamespace(\$) {
  174. my ($refToStr) = @_;
  175. defined( $namespaces{$$refToStr} ); # return value
  176. }
  177. # The correct form to create a redirect is #REDIRECT [[ link ]],
  178. # and function 'Parse::MediaWikiDump::page->redirect' only supports this form.
  179. # However, it seems that Wikipedia can also tolerate a variety of other forms, such as
  180. # REDIRECT|REDIRECTS|REDIRECTED|REDIRECTION, then an optional ":", optional "to" or optional "=".
  181. # Therefore, we use our own function to handle these cases as well.
  182. # If the page is a redirect, the function returns the title of the target page;
  183. # otherwise, it returns 'undef'.
  184. sub isRedirect($) {
  185. my ($page) = @_;
  186. # quick check
  187. return undef if ( ${$page->text} !~ /^#REDIRECT/i );
  188. if ( ${$page->text} =~ m{^\#REDIRECT # Redirect must start with "#REDIRECT"
  189. # (the backslash is needed before "#" here, because
  190. # "#" has special meaning with /x modifier)
  191. (?:S|ED|ION)? # The word may be in any of these forms,
  192. # i.e., REDIRECT|REDIRECTS|REDIRECTED|REDIRECTION
  193. (?:\s*) # optional whitespace
  194. (?: :|\sTO|=)? # optional colon, "TO" or "="
  195. # (in case of "TO", we expect a whitespace before it,
  196. # so that it's not glued to the preceding word)
  197. (?:\s*) # optional whitespace
  198. \[\[([^\]]*)\]\] # the link itself
  199. }ix ) { # matching is case-insensitive, hence /i
  200. my $target = $1;
  201. if ($target =~ /^(.*)\#(?:.*)$/) {
  202. # The link contains an anchor. Anchors are not allowed in REDIRECT pages, and therefore
  203. # we adjust the link to point to the page as a whole (that's how Wikipedia works).
  204. $target = $1;
  205. }
  206. return $target;
  207. }
  208. # OK, it's probably either a malformed redirect link, or something else
  209. return undef;
  210. }
  211. sub isNamespaceOkForPrescanning($) {
  212. my ($page) = @_;
  213. &isNamespaceOk($page, \%okNamespacesForPrescanning);
  214. }
  215. sub isNamespaceOkForTransforming($) {
  216. my ($page) = @_;
  217. &isNamespaceOk($page, \%okNamespacesForTransforming);
  218. }
  219. sub isNamespaceOk($\%) {
  220. my ($page, $refToNamespaceHash) = @_;
  221. my $result = 1;
  222. # main namespace is OK, so we only check pages that belong to other namespaces
  223. if ($page->namespace ne '') {
  224. my $namespace = $page->namespace;
  225. &normalizeNamespace(\$namespace);
  226. if ( &isKnownNamespace(\$namespace) ) {
  227. $result = defined( $$refToNamespaceHash{$namespace} );
  228. } else {
  229. # the prefix before ":" in the page title is not a known namespace,
  230. # therefore, the page belongs to the main namespace and is OK
  231. }
  232. }
  233. $result; # return value
  234. }
  235. sub encodeXmlChars(\$) {
  236. my ($refToStr) = @_;
  237. $$refToStr =~ s/([&"'<>])/&$XmlEntities{$1};/g;
  238. }
  239. sub copyXmlFileHeader() {
  240. open(INF, "< $file") or die "Cannot open $file";
  241. while (<INF>) { # copy lines up to "</siteinfo>"
  242. if (/^<mediawiki /) {
  243. # The top level element - mediawiki - contains a lot of attributes (e.g., schema)
  244. # that are no longer applicable to the XML file after our transformation.
  245. # Therefore, we simply write an opening tag <mediawiki> without any attributes.
  246. print OUTF "<mediawiki>\n";
  247. } else {
  248. # All other lines (up to </siteinfo>) are copied as-is
  249. print OUTF;
  250. }
  251. last if (/<\/siteinfo>/);
  252. }
  253. close(INF); # this file will later be reopened by "Parse::MediaWikiDump"
  254. }
  255. sub closeXmlFile() {
  256. print OUTF "</mediawiki>\n";
  257. close(OUTF);
  258. }
  259. sub writeStatistics() {
  260. my $statCategoriesFile = "$filePath/$fileBasename.stat.categories";
  261. my $statIncomingLinksFile = "$filePath/$fileBasename.stat.inlinks";
  262. open(STAT_CATS, "> $statCategoriesFile") or die "Cannot open $statCategoriesFile";
  263. print STAT_CATS "# Line format: <CategoryId (= page id)> <Number of pages in this category>\n",
  264. "# Here we count the *pages* that belong to this category, i.e., articles AND\n",
  265. "# sub-categories of this category (but not the articles in the sub-categories).\n",
  266. "\n\n";
  267. my $cat;
  268. foreach $cat ( sort { $statCategories{$b} <=> $statCategories{$a} }
  269. keys(%statCategories) ) {
  270. print STAT_CATS "$cat\t$statCategories{$cat}\n";
  271. }
  272. close(STAT_CATS);
  273. open(STAT_INLINKS, "> $statIncomingLinksFile") or die "Cannot open $statIncomingLinksFile";
  274. print STAT_INLINKS "# Line format: <Target page id> <Number of links to it from other pages>\n\n\n";
  275. my $destination;
  276. foreach $destination ( sort { $statIncomingLinks{$b} <=> $statIncomingLinks{$a} }
  277. keys(%statIncomingLinks) ) {
  278. print STAT_INLINKS "$destination\t$statIncomingLinks{$destination}\n";
  279. }
  280. close(STAT_INLINKS);
  281. }
  282. sub writeCategoryHierarchy() {
  283. my $catHierarchyFile = "$filePath/$fileBasename.cat_hier";
  284. open(CAT_HIER, "> $catHierarchyFile") or die "Cannot open $catHierarchyFile";
  285. print CAT_HIER "# Line format: <Category id> <List of ids of immediate descendants>\n\n\n";
  286. my $cat;
  287. foreach $cat ( sort { $catHierarchy{$a} <=> $catHierarchy{$b} }
  288. keys(%catHierarchy) ) {
  289. print CAT_HIER "$cat\t", join(" ", @{$catHierarchy{$cat}}), "\n";
  290. }
  291. close(CAT_HIER);
  292. }
  293. sub loadNamespaces() {
  294. # re-open the input XML file
  295. my $pages = Parse::MediaWikiDump::Pages->new($file);
  296. # load namespaces
  297. my $refNamespaces = $pages->namespaces;
  298. # namespace names are case-insensitive, so we force them
  299. # to canonical form to facilitate future comparisons
  300. my $ns;
  301. foreach $ns (@$refNamespaces) {
  302. my @namespaceData = @$ns;
  303. my $namespaceId = $namespaceData[0];
  304. my $namespaceName = $namespaceData[1];
  305. &normalizeNamespace(\$namespaceName);
  306. $namespaces{$namespaceName} = $namespaceId;
  307. }
  308. }
  309. # build id <-> title mappings and redirection table,
  310. # as well as load templates
  311. sub prescan() {
  312. # re-open the input XML file
  313. my $pages = Parse::MediaWikiDump::Pages->new($file);
  314. my $counter = 0;
  315. my $page;
  316. while (defined($page = $pages->page)) {
  317. my $id = $page->id;
  318. $counter++;
  319. if ($counter % 1000 == 0) {
  320. my $timeStr = &getTimeAsString();
  321. print LOGF "[$timeStr] Prescanning page id=$id\n";
  322. }
  323. my $title = $page->title;
  324. &normalizeTitle(\$title);
  325. if (length($title) == 0) {
  326. # This is a defense against pages whose title only contains UTF-8 chars that
  327. # are reduced to an empty string. Right now I can think of one such case -
  328. # <C2><A0> which represents the non-breaking space. In this particular case,
  329. # this page is a redirect to [[Non-nreaking space]], but having in the system
  330. # a redirect page with an empty title causes numerous problems, so we'll live
  331. # happier without it.
  332. print LOGF "Skipping page with empty title id=$id\n";
  333. next;
  334. }
  335. my $redirect = &isRedirect($page);
  336. if (defined($redirect)) {
  337. &normalizeTitle(\$redirect);
  338. next if (length($redirect) == 0); # again, same precaution here - see comments above
  339. $redir{$title} = $redirect;
  340. # nothing more to do for redirect pages
  341. next;
  342. }
  343. if ( ! &isNamespaceOkForPrescanning($page) ) {
  344. next; # we're only interested in certain namespaces
  345. }
  346. # if we get here, then either the page belongs to the main namespace OR
  347. # it belongs to one of the namespaces we're interested in
  348. if ( exists($id2title{$id}) ) {
  349. print LOGF "Warning: Page id=$id already encountered before!\n";
  350. next;
  351. }
  352. if ( exists($title2id{$title}) ) {
  353. # A page could have been encountered before with a different spelling.
  354. # Examples: &nbsp; = <C2><A0> (nonbreakable space), &szlig; = <C3><9F> (German Eszett ligature)
  355. print LOGF "Warning: Page title='$title' already encountered before!\n";
  356. next;
  357. }
  358. $id2title{$id} = $title;
  359. $title2id{$title} = $id;
  360. if ($title =~ /^Template:/) {
  361. my $text = ${$page->text};
  362. # We're storing template text for future inclusion, therefore,
  363. # remove all <noinclude> text and keep all <includeonly> text
  364. # (but eliminate <includeonly> tags per se).
  365. # However, if <onlyinclude> ... </onlyinclude> parts are present,
  366. # then only keep them and discard the rest of the template body.
  367. # This is because using <onlyinclude> on a text fragment is
  368. # equivalent to enclosing it in <includeonly> tags **AND**
  369. # enclosing all the rest of the template body in <noinclude> tags.
  370. # These definitions can easily span several lines, hence the "/s" modifiers.
  371. my $onlyincludeAccumulator;
  372. while ($text =~ /<onlyinclude>(.*?)<\/onlyinclude>/sg) {
  373. my $onlyincludeFragment = $1;
  374. $onlyincludeAccumulator .= "$onlyincludeFragment\n";
  375. }
  376. if ( defined($onlyincludeAccumulator)) {
  377. $text = $onlyincludeAccumulator;
  378. } else {
  379. # If there are no <onlyinclude> fragments, simply eliminate
  380. # <noinclude> fragments and keep <includeonly> ones.
  381. $text =~ s/<noinclude>(?:.*?)<\/noinclude>/\n/sg;
  382. $text =~ s/<includeonly>(.*?)<\/includeonly>/$1/sg;
  383. }
  384. $templates{$id} = $text;
  385. }
  386. }
  387. my $timeStr = &getTimeAsString();
  388. print LOGF "[$timeStr] Prescanning complete - prescanned $counter pages\n";
  389. }
  390. sub transform() {
  391. # re-open the input XML file
  392. my $pages = Parse::MediaWikiDump::Pages->new($file);
  393. my $page;
  394. while (defined($page = $pages->page)) {
  395. my $id = $page->id;
  396. my $timeStr = &getTimeAsString();
  397. print LOGF "[$timeStr] Transforming page id=$id\n";
  398. if ( defined( &isRedirect($page) ) ) {
  399. next; # we've already loaded all redirects in the prescanning phase
  400. }
  401. if ( ! &isNamespaceOkForTransforming($page) ) {
  402. next; # we're only interested in pages from certain namespaces
  403. }
  404. my $title = $page->title;
  405. &normalizeTitle(\$title);
  406. # see the comment about empty titles in function 'prescan'
  407. if (length($title) == 0) {
  408. print LOGF "Skipping page with empty title id=$id\n";
  409. next;
  410. }
  411. my $text = ${$page->text};
  412. my $orgLength = length($text); # text length BEFORE any transformations
  413. # The check for stub must be done BEFORE any further processing,
  414. # because stubs indicators are templates, and templates are substituted.
  415. my $isStub = 0;
  416. if ( $text =~ m/stub}}/i ) {
  417. $isStub = 1;
  418. }
  419. my @categories;
  420. my @internalLinks;
  421. my @urls;
  422. &includeTemplates(\$text);
  423. my @relatedArticles;
  424. # This function only examines the contents of '$text', but doesn't change it.
  425. &identifyRelatedArticles(\$text, \@relatedArticles, $id);
  426. # We process categories directly, because '$page->categories' ignores
  427. # categories inherited from included templates
  428. &extractCategories(\$text, \@categories, $id);
  429. # Categories are listed at the end of articles, and therefore may mistakenly
  430. # be added to the list of related articles (which often appear in the last
  431. # section such as "See also"). To avoid this, we explicitly remove all categories
  432. # from the list of related links, and only then record the list of related links
  433. # to the file.
  434. &removeElements(\@relatedArticles, \@categories);
  435. &recordRelatedArticles($id, \@relatedArticles);
  436. &extractInternalLinks(\$text, \@internalLinks, $id, 1, 1);
  437. &extractUrls(\$text, \@urls);
  438. &postprocessText(\$text, 1);
  439. my $newLength = length($text); # text length AFTER all transformations
  440. &writePage($id, \$title, \$text, $orgLength, $newLength, $isStub, \@categories, \@internalLinks, \@urls);
  441. &updateStatistics(\@categories, \@internalLinks);
  442. if ($title =~ /^Category:/) {
  443. &updateCategoryHierarchy($id, \@categories);
  444. }
  445. }
  446. }
  447. sub updateStatistics(\@\@) {
  448. my ($refToCategories, $refToInternalLinks) = @_;
  449. my $cat;
  450. foreach $cat (@$refToCategories) {
  451. $statCategories{$cat}++;
  452. }
  453. my $link;
  454. foreach $link (@$refToInternalLinks) {
  455. $statIncomingLinks{$link}++;
  456. }
  457. }
  458. sub updateCategoryHierarchy($\@) {
  459. # The list of categories passed as a parameter is actually the list of parent categories
  460. # for the current category
  461. my ($childId, $refToParentCategories) = @_;
  462. my $parentCat;
  463. foreach $parentCat (@$refToParentCategories) {
  464. if ( exists($catHierarchy{$parentCat}) ) {
  465. push(@{$catHierarchy{$parentCat}}, $childId);
  466. } else {
  467. # create a new array with '$childId' as the only child (for now) of '$parentCat'
  468. my @arr;
  469. push(@arr, $childId);
  470. $catHierarchy{$parentCat} = [ @arr ];
  471. }
  472. }
  473. }
  474. sub writePage($\$\$$$$\@\@\@) {
  475. my ($id, $refToTitle, $refToText, $orgLength, $newLength, $isStub,
  476. $refToCategories, $refToInternalLinks, $refToUrls) = @_;
  477. my $numCategories = scalar(@$refToCategories);
  478. my $numLinks = scalar(@$refToInternalLinks);
  479. my $numUrls = scalar(@$refToUrls);
  480. print OUTF "<page id=\"$id\" orglength=\"$orgLength\" newlength=\"$newLength\" stub=\"$isStub\" " .
  481. "categories=\"$numCategories\" outlinks=\"$numLinks\" urls=\"$numUrls\">\n";
  482. my $encodedTitle = $$refToTitle;
  483. &encodeXmlChars(\$encodedTitle);
  484. print OUTF "<title>$encodedTitle</title>\n";
  485. print OUTF "<categories>";
  486. print OUTF join(" ", @$refToCategories);
  487. print OUTF "</categories>\n";
  488. print OUTF "<links>";
  489. print OUTF join(" ", @$refToInternalLinks);
  490. print OUTF "</links>\n";
  491. print OUTF "<urls>\n";
  492. my $url;
  493. foreach $url (@$refToUrls) {
  494. &encodeXmlChars(\$url);
  495. print OUTF "$url\n";
  496. }
  497. print OUTF "</urls>\n";
  498. # text has already undergone 'encodeXmlChars' in function 'postprocessText'
  499. print OUTF "<text>\n$$refToText\n</text>\n";
  500. print OUTF "</page>\n";
  501. }
  502. # Maps a title into the id, and performs redirection if necessary.
  503. # Assumption: the argument was already normalized using 'normalizeTitle'
  504. sub resolveLink(\$) {
  505. my ($refToTitle) = @_;
  506. # safety precaution
  507. return undef if (length($$refToTitle) == 0);
  508. my $targetId; # result
  509. my $targetTitle = $$refToTitle;
  510. if ( exists($redir{$$refToTitle}) ) { # this link is a redirect
  511. $targetTitle = $redir{$$refToTitle};
  512. # check if this is a double redirect
  513. if ( exists($redir{$targetTitle}) ) {
  514. $targetTitle = undef; # double redirects are not allowed and are ignored
  515. print LOGF "Warning: link '$$refToTitle' caused double redirection and was ignored\n";
  516. } else {
  517. print LOGF "Link '$$refToTitle' was redirected to '$targetTitle'\n";
  518. }
  519. }
  520. if ( defined($targetTitle) ) {
  521. if ( exists($title2id{$targetTitle}) ) {
  522. $targetId = $title2id{$targetTitle};
  523. } else {
  524. # target not found
  525. print LOGF "Warning: link '$$refToTitle' cannot be matched to an id\n";
  526. $targetId = undef;
  527. }
  528. } else {
  529. $targetId = undef;
  530. }
  531. $targetId; # return value
  532. }
  533. sub includeTemplates(\$) {
  534. my ($refToText) = @_;
  535. # Using the while loop forces templates to be included recursively
  536. # (i.e., includes the body of templates that themselves were included
  537. # on the previous iteration ).
  538. # Template definitions can easily span several lines, hence the "/s" modifier.
  539. # Templates are frequently nested. Occasionally, parsing mistakes may cause template insertion
  540. # to enter an infinite loop, for instance when trying to instantiate Template:Country
  541. # {{country_{{{1}}}|{{{2}}}|{{{2}}}|size={{{size|}}}|name={{{name|}}}}}
  542. # which is repeatedly trying to insert template "country_", which is again resolved to
  543. # Template:Country. The straightforward solution of keeping track of templates that were
  544. # already inserted for the current article would not work, because the same template
  545. # may legally be used more than once, with different parameters in different parts of
  546. # the article. Therefore, we simply limit the number of iterations of nested template
  547. # inclusion.
  548. my $templateRecursionLevels = 0;
  549. # We also require that the body of a template does not contain the template opening sequence
  550. # (two successive opening braces - "\{\{"). We use negative lookahead to achieve this.
  551. while ( ($templateRecursionLevels < $maxTemplateRecursionLevels) &&
  552. $$refToText =~ s/\{\{
  553. (?:\s*) # optional whitespace before the template name is ignored
  554. (
  555. (?:
  556. (?!
  557. \{\{
  558. )
  559. .
  560. )*?
  561. )
  562. # OLD code and comments
  563. # (?:\s*) # optional whitespace before the template name is ignored
  564. # ([^\{]*?) # Occasionally, templates are nested,
  565. # # e.g., {{localurl:{{NAMESPACE}}:{{PAGENAME}}}}
  566. # # In order to prevent incorrect parsing, e.g.,
  567. # # "{{localurl:{{NAMESPACE}}", we require that the
  568. # # template name does not include opening braces,
  569. # # hence "[^\{]" (any char except opening brace).
  570. # END OF OLD code and comments
  571. \}\}
  572. /&instantiateTemplate($1)/segx
  573. ) {
  574. $templateRecursionLevels++;
  575. }
  576. # Since we limit the number of levels of template recursion, we might end up with several
  577. # un-instantiated templates. In this case we simply eliminate them - however, we do so
  578. # later, in function 'postprocessText()', after extracting categories, links and URLs.
  579. }
  580. BEGIN {
  581. # Making variables static for the function to avoid recompilation of regular expressions
  582. # every time the function is called.
  583. my $specialSeparator = "\.pAr\.";
  584. my $specialSeparatorRegex = qr/$specialSeparator/;
  585. sub parseTemplateInvocation(\$\$\%) {
  586. my ($refToTemplateInvocation, $refToTemplateTitle, $refToParameterHash) = @_;
  587. # Template definitions (especially those with parameters) can easily span several lines,
  588. # hence the "/s" modifier. The template name extends up to the first pipeline symbol (if any).
  589. # Template parameters go after the "|" symbol.
  590. if ($$refToTemplateInvocation =~ /^([^|]*)\|(.*)$/sx) {
  591. $$refToTemplateTitle = $1; # single out the template name itself
  592. my $paramsList = $2;
  593. # Template parameters often contain URLs, internal links, or just other useful text,
  594. # whereas the template serves for presenting it in some nice way.
  595. # Parameters are separated by "|" symbols. However, we cannot simply split the string
  596. # on "|" symbols, since these frequently appear inside internal links. Therefore, we split
  597. # on those "|" symbols that are not inside [[...]]. It's obviously sufficient to check that
  598. # brackets are not improperly nested on one side of "|", so we use lookahead.
  599. # We first replace all "|" symbols that are not inside [[...]] with a special separator that
  600. # we invented, which will hopefully not normally appear in the text (.pAr.).
  601. # Next, we use 'split' to break the string on this new separator.
  602. $paramsList =~ s/\| # split on pipeline symbol, such that
  603. (?: # non-capturing grouper that encloses 2 options
  604. (?= # zero-width lookahead - option #1
  605. [^\]]*$ # there are no closing brackets up to the end
  606. # of the string (i.e., all the characters up to
  607. # the end of the string are not closing brackets)
  608. ) # end of first lookahead (= end of option #1)
  609. | # or
  610. (?= # another zero-width lookahead - option #2
  611. [^\]]* \[ # the nearest opening bracket on the right is not preceded
  612. # by a closing bracket (i.e., all the characters that
  613. # precede it are not closing brackets
  614. ) # end of second lookahead (= end of option #2)
  615. ) # end of the outer grouper
  616. /$specialSeparator/sxg; # replace matching symbols with a special separator
  617. # /s means string can contain newline chars
  618. my @parameters = split(/$specialSeparatorRegex/, $paramsList);
  619. # Parameters can be either named or unnamed. In the latter case, their name is defined by their
  620. # ordinal position (1, 2, 3, ...).
  621. my $unnamedParameterCounter = 0;
  622. # It's legal for unnamed parameters to be skipped, in which case they will get default
  623. # values (if available) during actual instantiation. That is {{template_name|a||c}} means
  624. # parameter 1 gets the value 'a', parameter 2 value is not defined, and parameter 3 gets the value 'c'.
  625. # This case is correctly handled by function 'split', and does not require any special handling.
  626. my $param;
  627. foreach $param (@parameters) {
  628. # Spaces before or after a parameter value are normally ignored, UNLESS the parameter contains
  629. # a link (to prevent possible gluing the link to the following text after template substitution)
  630. # Parameter values may contain "=" symbols, hence the parameter name extends up to
  631. # the first such symbol.
  632. # It is legal for a parameter to be specified several times, in which case the last assignment
  633. # takes precedence. Example: "{{t|a|b|c|2=B}}" is equivalent to "{{t|a|B|c}}".
  634. # Therefore, we don't check if the parameter has been assigned a value before, because
  635. # anyway the last assignment should override any previous ones.
  636. if ($param =~ /^([^=]*)=(.*)$/s) {
  637. # This is a named parameter.
  638. # This case also handles parameter assignments like "2=xxx", where the number of an unnamed
  639. # parameter ("2") is specified explicitly - this is handled transparently.
  640. my $parameterName = $1;
  641. my $parameterValue = $2;
  642. &trimWhitespaceBothSides(\$parameterName);
  643. if ($parameterValue !~ /\]\]/) { # if the value does not contain a link, trim whitespace
  644. &trimWhitespaceBothSides(\$parameterValue);
  645. }
  646. $$refToParameterHash{$parameterName} = $parameterValue;
  647. } else {
  648. # this is an unnamed parameter
  649. $unnamedParameterCounter++;
  650. if ($param !~ /\]\]/) { # if the value does not contain a link, trim whitespace
  651. &trimWhitespaceBothSides(\$param);
  652. }
  653. $$refToParameterHash{$unnamedParameterCounter} = $param;
  654. }
  655. }
  656. } else {
  657. # Template invocation does not contain a pipeline symbol, hence take the entire
  658. # invocation text as the template title.
  659. $$refToTemplateTitle = $$refToTemplateInvocation;
  660. }
  661. }
  662. } # end of BEGIN block
  663. sub instantiateTemplate($) {
  664. my ($templateInvocation) = @_;
  665. my $result = "";
  666. print LOGF "Instantiating template=$templateInvocation\n";
  667. my $templateTitle;
  668. my %templateParams;
  669. &parseTemplateInvocation(\$templateInvocation, \$templateTitle, \%templateParams);
  670. &computeFullyQualifiedTemplateTitle(\$templateTitle);
  671. &includeTemplateText(\$templateTitle, \%templateParams, \$result);
  672. $result; # return value
  673. }
  674. sub includeTemplateText(\$\%\$) {
  675. my ($refToTemplateTitle, $refToParameterHash, $refToResult) = @_;
  676. &normalizeTitle($refToTemplateTitle);
  677. my $includedPageId = &resolveLink($refToTemplateTitle);
  678. if ( defined($includedPageId) && exists($templates{$includedPageId}) ) {
  679. # OK, perform the actual inclusion with parameter substitution
  680. $$refToResult = $templates{$includedPageId};
  681. # Perform parameter substitution
  682. # A parameter call ( {{{...}}} ) may span over a newline, hence the /s modifier
  683. # Parameters may be nested (see comments below), hence we do the substitution iteratively
  684. # in a while loop. We also limit the maximum number of iterations to avoid too long or
  685. # even endless loops (in case of malformed input).
  686. my $parameterRecursionLevels = 0;
  687. # We also require that the body of a parameter does not contain the parameter opening sequence
  688. # (three successive opening braces - "\{\{\{"). We use negative lookahead to achieve this.
  689. while ( ($parameterRecursionLevels < $maxParameterRecursionLevels) &&
  690. $$refToResult =~ s/\{\{\{
  691. (
  692. (?:
  693. (?!
  694. \{\{\{
  695. )
  696. .
  697. )*?
  698. )
  699. # OLD code and comments
  700. # ([^\{]*?) # Occasionally, parameters are nested because
  701. # # they are dependent on other parameters,
  702. # # e.g., {{{Author|{{{PublishYear|}}}}}}
  703. # # (here, the default value for 'Author' is
  704. # # dependent on 'PublishYear').
  705. # # In order to prevent incorrect parsing, e.g.,
  706. # # "{{{Author|{{{PublishYear|}}}", we require that the
  707. # # parameter name does not include opening braces,
  708. # # hence "[^\{]" (any char except opening brace).
  709. # END OF OLD code and comments
  710. \}\}\}
  711. /&substituteParameter($1, $refToParameterHash)/segx
  712. ) {
  713. $parameterRecursionLevels++;
  714. }
  715. } else {
  716. # The page being included cannot be identified - perhaps we skipped it (because currently
  717. # we only allow for inclusion of pages in the Template namespace), or perhaps it's
  718. # a variable name like {{NUMBEROFARTICLES}}. Just remove this inclusion directive and
  719. # replace it with a space
  720. print LOGF "Template '$$refToTemplateTitle' is not available for inclusion\n";
  721. $$refToResult = " ";
  722. }
  723. }
  724. sub substituteParameter($\%) {
  725. my ($parameter, $refToParameterHash) = @_;
  726. my $result;
  727. if ($parameter =~ /^([^|]*)\|(.*)$/) {
  728. # This parameter has a default value
  729. my $paramName = $1;
  730. my $defaultValue = $2;
  731. if ( defined($$refToParameterHash{$paramName}) ) {
  732. $result = $$refToParameterHash{$paramName}; # use parameter value specified in template invocation
  733. } else { # use the default value
  734. $result = $defaultValue;
  735. }
  736. } else {
  737. # parameter without a default value
  738. if ( defined($$refToParameterHash{$parameter}) ) {
  739. $result = $$refToParameterHash{$parameter}; # use parameter value specified in template invocation
  740. } else {
  741. # Parameter not specified in template invocation and does not have a default value -
  742. # do not perform substitution and keep the parameter in 3 braces
  743. # (these are Wiki rules for templates, see http://meta.wikimedia.org/wiki/Help:Template ).
  744. $result = "{{{$parameter}}}";
  745. }
  746. }
  747. # Surplus parameters - i.e., those assigned values in template invocation but not used
  748. # in the template body - are simply ignored.
  749. $result; # return value
  750. }
  751. sub computeFullyQualifiedTemplateTitle(\$) {
  752. my ($refToTemplateTitle) = @_;
  753. # Determine the namespace of the page being included through the template mechanism
  754. my $namespaceSpecified = 0;
  755. if ($$refToTemplateTitle =~ /^:(.*)$/) {
  756. # Leading colon by itself implies main namespace, so strip this colon
  757. $$refToTemplateTitle = $1;
  758. $namespaceSpecified = 1;
  759. } elsif ($$refToTemplateTitle =~ /^([^:]*):/) {
  760. # colon found but not in the first position - check if it designates a known namespace
  761. my $prefix = $1;
  762. &normalizeNamespace(\$prefix);
  763. $namespaceSpecified = &isKnownNamespace(\$prefix);
  764. }
  765. # The case when the page title does not contain a colon at all also falls here.
  766. if ($namespaceSpecified) {
  767. # OK, the title of the page being included is fully qualified with a namespace
  768. } else {
  769. # The title of the page being included is NOT in the main namespace and lacks
  770. # any other explicit designation of the namespace - therefore, it is resolved
  771. # to the Template namespace (that's the default for the template inclusion mechanism).
  772. $$refToTemplateTitle = "Template:$$refToTemplateTitle";
  773. }
  774. }
  775. sub extractCategories(\$\@$) {
  776. my ($refToText, $refToCategoriesArray, $id) = @_;
  777. # Remember that namespace names are case-insensitive, hence we're matching with "/i".
  778. # The first parameter to 'collectCategory' is passed by value rather than by reference,
  779. # because it might be dangerous to pass a reference to $1 in case it might get modified
  780. # (with unclear consequences).
  781. $$refToText =~ s/\[\[(?:\s*)(Category:.*?)\]\]/&collectCategory($1, $refToCategoriesArray)/ieg;
  782. # We don't accumulate categories directly in a hash table, since this would not preserve
  783. # their original order of appearance.
  784. &removeDuplicatesAndSelf($refToCategoriesArray, $id);
  785. }
  786. sub collectCategory($\@) {
  787. my ($catName, $refToCategoriesArray) = @_;
  788. if ($catName =~ /^(.*)\|/) {
  789. # Some categories contain a sort key, e.g., [[Category:Whatever|*]] or [[Category:Whatever| ]]
  790. # In such a case, take only the category name itself.
  791. $catName = $1;
  792. }
  793. &normalizeTitle(\$catName);
  794. my $catId = &resolveLink(\$catName);
  795. if ( defined($catId) ) {
  796. push(@$refToCategoriesArray, $catId);
  797. } else {
  798. print LOGF "Warning: unknown category '$catName'\n";
  799. }
  800. # The return value is just a space, because we remove categories from the text
  801. # after we collected them
  802. " ";
  803. }
  804. sub extractInternalLinks(\$\@$$$) {
  805. my ($refToText, $refToInternalLinksArray, $id,
  806. $whetherToLogAnchorText, $whetherToRemoveDuplicates) = @_;
  807. # For each internal link outgoing form the current article, this hash table maps
  808. # the target id into the anchor text associated with it. Naturally, we only
  809. # collect anchor text for links that can be resolved to a page id.
  810. my %anchorTexts;
  811. # Link definitions may span over adjacent lines and therefore contain line breaks,
  812. # hence we use the /s modifier.
  813. # Occasionally, links are nested, e.g.,
  814. # [[Image:kanner_kl2.jpg|frame|right|Dr. [[Leo Kanner]] introduced the label ''early infantile autism'' in [[1943]].]]
  815. # In order to prevent incorrect parsing, e.g., "[[Image:kanner_kl2.jpg|frame|right|Dr. [[Leo Kanner]]",
  816. # we extract links in several iterations of the while loop, while the link definition requires that
  817. # each pair [[...]] does not contain any opening braces.
  818. 1 while ( $$refToText =~ s/
  819. (\w*) # words may be glued to the beginning of the link,
  820. # in which case they become part of the link
  821. # e.g., "ex-[[Giuseppe Mazzini|Mazzinian]] "
  822. \[\[
  823. ([^\[]*?) # the link text can be any chars except an opening bracket,
  824. # this ensures we correctly parse nested links (see comments above)
  825. \]\]
  826. (\w*) # words may be glued to the end of the link,
  827. # in which case they become part of the link
  828. # e.g., "[[public transport]]ation"
  829. /&collectInternalLink($1, $2, $3, $refToInternalLinksArray, \%anchorTexts)/segx
  830. );
  831. if ($whetherToRemoveDuplicates) {
  832. &removeDuplicatesAndSelf($refToInternalLinksArray, $id);
  833. }
  834. if ($whetherToLogAnchorText) {
  835. &logAnchorText(\%anchorTexts, $id);
  836. }
  837. }
  838. sub logAnchorText(\%$) {
  839. my ($refToAnchorTextsHash, $curPageId) = @_;
  840. # Remember that we use a hash table to associate anchor text with target page ids.
  841. # Therefore, if the current page has several links to another page (it happens), then we only
  842. # keep the anchor text of the last one (and override the previous ones) - we can live with it.
  843. # Consequently, we do not need to remove duplicates as there are none.
  844. # However, we still remove the links that point from the page to itself.
  845. my $targetId;
  846. my $anchorText;
  847. while ( ($targetId, $anchorText) = each(%$refToAnchorTextsHash) ) {
  848. if ($targetId != $curPageId) {
  849. &postprocessText(\$anchorText, 0); # anchor text doesn't need escaping of XML characters,
  850. # hence the second function parameter is 0
  851. $anchorText =~ s/\n/ /g; # replace all newlines with spaces
  852. # make sure that something is left of anchor text after postprocessing
  853. if (length($anchorText) > 0) {
  854. print ANCHORF "$targetId\t$curPageId\t$anchorText\n";
  855. }
  856. }
  857. }
  858. }
  859. sub collectInternalLink($$$\@\%) {
  860. my ($prefix, $link, $suffix, $refToInternalLinksArray, $refToAnchorTextHash) = @_;
  861. my $originalLink = $link;
  862. my $result = "";
  863. # strip leading whitespace, if any
  864. $link =~ s/^\s*//;
  865. # Link definitions may span over adjacent lines and therefore contain line breaks,
  866. # hence we use the /s modifier on most matchings.
  867. # There are some special cases when the link may be preceded with a colon.
  868. # Known cases:
  869. # - Linking to a category (as opposed to actually assigning the current article
  870. # to a category) is performed using special syntax [[:Category:...]]
  871. # - Linking to other languages, e.g., [[:fr:Wikipedia:Aide]]
  872. # (without the leading colon, the link will go to the side menu
  873. # - Linking directly to the description page of an image, e.g., [[:Image:wiki.png]]
  874. # In all such cases, we strip the leading colon.
  875. if ($link =~ /^
  876. : # colon at the beginnning of the link name
  877. (.*) # the rest of the link text
  878. $
  879. /sx) {
  880. # just strip this initial colon (as well as any whitespace preceding it)
  881. $link = $1;
  882. }
  883. # Alternative text may be available after the pipeline symbol.
  884. # If the pipeline symbol is only used for masking parts of
  885. # the link name for presentation, we still consider that the author of the page
  886. # deemed the resulting text important, hence we always set this variable when
  887. # the pipeline symbol is present.
  888. my $alternativeTextAvailable = 0;
  889. # Some links contain several pipeline symbols, e.g.,
  890. # [[Image:Zerzan.jpeg|thumb|right|[[John Zerzan]]]]
  891. # It seems that the extra pipeline symbols are parameters, so we just eliminate them.
  892. if ($link =~ /^(.*)\|([^|]*)$/s) { # first, extract the link up to the last pipeline symbol
  893. $link = $1; # the part before the last pipeline
  894. $result = $2; # the part after the last pipeline, this is usually an alternative text for this link
  895. $alternativeTextAvailable = 1; # pipeline found, see comment above
  896. # Now check if there are pipeline symbols remaining.
  897. # Note that this time we're looking for the shortest match,
  898. # to take the part of the text up to the first pipeline symbol.
  899. if ($link =~ /^([^|]*)\|(.*)$/s) {
  900. $link = $1;
  901. # $2 contains the parameters, which we don't really need
  902. }
  903. if (length($result) == 0) {
  904. if ($link !~ /\#/) {
  905. # If the "|" symbol is not followed by some text, then it masks the namespace
  906. # as well as any text in parentheses at the end of the link title.
  907. # However, pipeline masking is only invoked if the link does not contain an anchor,
  908. # hence the additional condition in the 'if' statement.
  909. &performPipelineMasking(\$link, \$result);
  910. } else {
  911. # If the link contains an anchor, then masking is not invoked, and we take the entire link
  912. $result = $link;
  913. }
  914. }
  915. } else {
  916. # the link text does not contain the pipeline, so take it as-is
  917. $result = $link;
  918. }
  919. if ($link =~ /^(.*)\#(.*)$/s) {
  920. # The link contains an anchor, so adjust the link to point to the page as a whole.
  921. $link = $1;
  922. my $anchor = $2;
  923. # Check if the link points to an anchor on the current page, and if so - ignore it.
  924. if (length($link) == 0 && ! $alternativeTextAvailable) {
  925. # This is indeed a link pointing to an anchor on the current page.
  926. # The link is thus cleared, so that it will not be resolved and collected later.
  927. # For anchors to the same page, discard the leading '#' symbol, and take
  928. # the rest as the text - but only if no alternative text was provided for this link.
  929. $result = $anchor;
  930. }
  931. }
  932. # Now collect the link, or links if the original link is in the date format
  933. # and specifies both day and year. In the latter case, the function for date
  934. # normalization may also modify the link text ($result), and may collect more
  935. # than one link (one for the day, another one for the year).
  936. my $dateRecognized = 0;
  937. # Alternative text (specified after pipeline) blocks normalization of dates.
  938. # We also perform a quick check - if the link does not start with a digit,
  939. # then it surely does not contain a date
  940. if ( ($link =~ /^\d/) && (! $alternativeTextAvailable)) {
  941. $dateRecognized = &normalizeDates(\$link, \$result, $refToInternalLinksArray, $refToAnchorTextHash);
  942. }
  943. # If a date (either day or day + year) was recognized, then no further processing is necessary
  944. if (! $dateRecognized) {
  945. &normalizeTitle(\$link);
  946. my $targetId = &resolveAndCollectInternalLink(\$link, $refToInternalLinksArray);
  947. # Wikipedia pages contain many links to other Wiki projects (especially Wikipedia in
  948. # other languages). While these links are not resolved to valid pages, we also want
  949. # to ignore their text. However, simply discarding the text of all links that cannot
  950. # be resolved would be overly aggressive, as authors frequently define phrases as links
  951. # to articles that don't yet exist, in the hope that they will be added later.
  952. # Therefore, we formulate the following conditions that must hold simultaneously
  953. # for discarding the text of a link:
  954. # 1) the link was not resolved to a valid id
  955. # 2) the link does not contain alternative text (if it did, then the text is probably
  956. # important enough to be retained)
  957. # 3) the link contains a colon - this is a very simple heuristics for identifying links to
  958. # other Wiki projects, other languages, or simply other namespaces within current Wikipedia.
  959. # While this method is not fool-proof (there are regular pages in the main namespace
  960. # that contain a colon in their title), we believe this is a reasonable tradeoff.
  961. if ( !defined($targetId) && ! $alternativeTextAvailable && $link =~ /:/ ) {
  962. $result = "";
  963. print LOGF "Discarding text for link '$originalLink'\n";
  964. } else {
  965. # finally, add the text originally attached to the left and/or to the right of the link
  966. # (if the link represents a date, then it has not text glued to it, so it's OK to only
  967. # use the prefix and suffix here)
  968. $result = $prefix . $result . $suffix;
  969. }
  970. if ( defined($targetId) ) {
  971. # If the current page has several links to another page, then we only take the anchor
  972. # of the last one (and override the previous ones) - we can live with it.
  973. $$refToAnchorTextHash{$targetId} = $result;
  974. }
  975. }
  976. $result; #return value
  977. }
  978. sub performPipelineMasking(\$\$) {
  979. my ($refToLink, $refToResult) = @_;
  980. # First check for presence of a namespace
  981. if ($$refToLink =~ /^([^:]*):(.*)$/) {
  982. my $namespaceCandidate = $1;
  983. my $rest = $2;
  984. &normalizeNamespace(\$namespaceCandidate);
  985. if ( &isKnownNamespace(\$namespaceCandidate) ) {
  986. $$refToResult = $rest; # take the link text without the namespace
  987. } else {
  988. $$refToResult = $$refToLink; # otherwise, take the entire link text (for now)
  989. }
  990. } else {
  991. $$refToResult = $$refToLink; # otherwise, take the entire link text (for now)
  992. }
  993. # Now check if there are parentheses at the end of the link text
  994. # (we now operate on $$refToResult, because we might have stripped the leading
  995. # namespace in the previous test).
  996. if ($$refToResult =~ /^ # the beginning of the string
  997. (.*) # the text up to the last pair of parentheses
  998. \( # opening parenthesis
  999. (?:[^()]*) # the text in the parentheses
  1000. \) # closing parenthesis
  1001. (?:\s*) # optional trailing whitespace, just in case
  1002. $ # end of string
  1003. /x) {
  1004. $$refToResult = $1; # discard the text in parentheses at the end of the string
  1005. }
  1006. }
  1007. sub resolveAndCollectInternalLink(\$\@) {
  1008. my ($refToLink, $refToInternalLinksArray) = @_;
  1009. my $targetId = &resolveLink($refToLink);
  1010. if ( defined($targetId) ) {
  1011. push(@$refToInternalLinksArray, $targetId);
  1012. } else {
  1013. # Some cases in this category that obviously won't be resolved to legal ids:
  1014. # - Links to namespaces that we don't currently handle
  1015. # (other than those for which 'isNamespaceOK' returns true);
  1016. # media and sound files fall in this category
  1017. # - Links to other languages, e.g., [[de:...]]
  1018. # - Links to other Wiki projects, e.g., [[Wiktionary:...]]
  1019. print LOGF "Warning: unknown link '$$refToLink'\n";
  1020. }
  1021. $targetId; # return value
  1022. }
  1023. # Dates can appear in several formats
  1024. # 1) [[July 20]], [[1969]]
  1025. # 2) [[20 July]] [[1969]]
  1026. # 3) [[1969]]-[[07-20]]
  1027. # 4) [[1969-07-20]]
  1028. # The first one is handled correctly without any special treatment,
  1029. # so we don't even check for it here.
  1030. # In (2) and (3), we only normalize the day, because it will be parsed separately from the year.
  1031. # This function is only invoked if the link has no alternative text available, therefore,
  1032. # we're free to override the result text.
  1033. sub normalizeDates(\$\$\@\%) {
  1034. my ($refToLink, $refToResultText, $refToInternalLinksArray, $refToAnchorTextHash) = @_;
  1035. my $dateRecognized = 0;
  1036. if ($$refToLink =~ /^(\d\d)\s*([A-Za-z]+)$/) {
  1037. my $day = $1;
  1038. my $month = ucfirst(lc($2));
  1039. if ( defined($monthToNumDays{$month}) &&
  1040. 1 <= $day && $day <= $monthToNumDays{$month} ) {
  1041. $dateRecognized = 1;
  1042. $$refToLink = "$month $day";
  1043. $$refToResultText = "$month $day";
  1044. my $targetId = &resolveAndCollectInternalLink($refToLink, $refToInternalLinksArray);
  1045. if ( defined($targetId) ) {
  1046. $$refToAnchorTextHash{$targetId} = $$refToResultText;
  1047. }
  1048. } else {
  1049. # this doesn't look like a valid date, leave as-is
  1050. }
  1051. } elsif ($$refToLink =~ /^(\d\d)\-(\d\d)$/) {
  1052. my $monthNum = int($1);
  1053. my $day = $2;
  1054. if ( defined($numberToMonth{$monthNum}) ) {
  1055. my $month = $numberToMonth{$monthNum};
  1056. if (1 <= $day && $day <= $monthToNumDays{$month}) {
  1057. $dateRecognized = 1;
  1058. $$refToLink = "$month $day";
  1059. # we add a leading space, to separate the preceding year ("[[1969]]-" in the example")
  1060. # from the day that we're creating
  1061. $$refToResultText = " $month $day";
  1062. my $targetId = &resolveAndCollectInternalLink($refToLink, $refToInternalLinksArray);
  1063. if ( defined($targetId) ) {
  1064. $$refToAnchorTextHash{$targetId} = $$refToResultText;
  1065. }
  1066. } else {
  1067. # this doesn't look like a valid date, leave as-is
  1068. }
  1069. } else {
  1070. # this doesn't look like a valid date, leave as-is
  1071. }
  1072. } elsif ($$refToLink =~ /^(\d\d\d\d)\-(\d\d)\-(\d\d)$/) {
  1073. my $year = $1;
  1074. my $monthNum = int($2);
  1075. my $day = $3;
  1076. if ( defined($numberToMonth{$monthNum}) ) {
  1077. my $month = $numberToMonth{$monthNum};
  1078. if (1 <= $day && $day <= $monthToNumDays{$month}) {
  1079. $dateRecognized = 1;
  1080. $$refToLink = "$month $day";
  1081. # the link text is combined from the day and the year
  1082. $$refToResultText = "$month $day, $year";
  1083. my $targetId;
  1084. # collect the link for the day
  1085. $targetId = &resolveAndCollectInternalLink($refToLink, $refToInternalLinksArray);
  1086. if ( defined($targetId) ) {
  1087. $$refToAnchorTextHash{$targetId} = $$refToLink;
  1088. }
  1089. # collect the link for the year
  1090. $targetId = &resolveAndCollectInternalLink(\$year, $refToInternalLinksArray);
  1091. if ( defined($targetId) ) {
  1092. $$refToAnchorTextHash{$targetId} = $year;
  1093. }
  1094. } else {
  1095. # this doesn't look like a valid date, leave as-is
  1096. }
  1097. } else {
  1098. # this doesn't look like a valid date, leave as-is
  1099. }
  1100. }
  1101. $dateRecognized; # return value
  1102. }
  1103. sub extractUrls(\$\@) {
  1104. my ($refToText, $refToUrlsArray) = @_;
  1105. # First we handle the case of URLs enclosed in single brackets, with or without the description,
  1106. # and with optional leading and/or trailing whitespace
  1107. # Examples: [http://www.cnn.com], [ http://www.cnn.com ], [http://www.cnn.com CNN Web site]
  1108. $$refToText =~ s/\[(?:\s*)($urlProtocols(?:[^\[\]]*))\]/&collectUrlFromBrackets($1, $refToUrlsArray)/eg;
  1109. # Now we handle standalone URLs (those not enclosed in brackets)
  1110. # The $urlTemrinator is matched via positive lookahead (?=...) in order not to remove
  1111. # the terminator symbol itself, but rather only the URL.
  1112. $$refToText =~ s/($urlProtocols(?:.*?))$urlTerminator/&collectStandaloneUrl($1, $refToUrlsArray)/eg;
  1113. &removeDuplicatesAndSelf($refToUrlsArray, undef);
  1114. }
  1115. sub collectUrlFromBrackets($\@) {
  1116. my ($url, $refToUrlsArray) = @_;
  1117. my $text;
  1118. # Assumption: leading whitespace has already been stripped
  1119. if ( $url =~ /^($urlProtocols(?:.*?))($urlTerminator(?:.*))$/ ) { # description available
  1120. push(@$refToUrlsArray, $1);
  1121. $text = $2;
  1122. } else { # no description
  1123. push(@$refToUrlsArray, $url);
  1124. $text = " ";
  1125. }
  1126. $text; # return value
  1127. }
  1128. sub collectStandaloneUrl($\@) {
  1129. my ($url, $refToUrlsArray) = @_;
  1130. push(@$refToUrlsArray, $url); # collect the URL as-is
  1131. " "; # return value - replace the URL with a space
  1132. }
  1133. sub postprocessText(\$$) {
  1134. my ($refToText, $whetherToEncodeXmlChars) = @_;
  1135. # Eliminate all <includeonly> and <onlyinclude> fragments, because this text
  1136. # will not be included anywhere, as we already handled all inclusion directives
  1137. # in function 'includeTemplates'.
  1138. # This block can easily span several lines, hence the "/s" modifier.
  1139. $$refToText =~ s/<includeonly>(.*?)<\/includeonly>/ /sg;
  1140. $$refToText =~ s/<onlyinclude>(.*?)<\/onlyinclude>/ /sg;
  1141. # <noinclude> fragments remain, but remove the tags per se
  1142. # We block the code below, as <noinclude> tags will anyway be thrown away later,
  1143. # when we eliminate all remaining tags.
  1144. ### This block can easily span several lines, hence the "/s" modifier
  1145. ### $$refToText =~ s/<noinclude>(.*?)<\/noinclude>/$1/sg;
  1146. # replace <br> and <br /> directives with new paragraph
  1147. $$refToText =~ s/<br(?:\s*)(?:[\/]?)>/\n\n/g;
  1148. # Remove tables, as they often carry a lot of noise
  1149. &eliminateTables($refToText);
  1150. # Since we limit the number of levels of template recursion, we might end up with several
  1151. # un-instantiated templates. In this case we simply eliminate them now.
  1152. # Because templates may be nested, we eliminate them iteratively by starting from the most
  1153. # nested one (hence the 'while' loop).
  1154. # OLD comments and code:
  1155. # For the same reason, we also require that the body of a template does not contain
  1156. # opening braces (hence "[^\{]", any char except opening brace).
  1157. # 1 while ($$refToText =~ s/\{\{(?:[^\{]*?)\}\}/ /sg);
  1158. # END OF old comments and code
  1159. # We also require that the body of a template does not contain the template opening sequence
  1160. # (two successive opening braces - "\{\{"). We use negative lookahead to achieve this.
  1161. 1 while ($$refToText =~ s/\{\{
  1162. (?:
  1163. (?:
  1164. (?!
  1165. \{\{
  1166. )
  1167. .
  1168. )*?
  1169. )
  1170. \}\}
  1171. / /sgx);
  1172. # Remove any other <...> tags - but keep the text they enclose
  1173. # (the tags are replaced with spaces to prevent adjacent pieces of text
  1174. # from being glued together).
  1175. # Comments (<!-- ... -->) also fall into this category, and since they can easily span several lines,
  1176. # we use the "/s" modifier.
  1177. $$refToText =~ s/<(?:.*?)>/ /sg;
  1178. # Change markup on bold/italics emphasis. We probably don't need to distinguish
  1179. # these 3 types of emphasis, so we just replace all of them with a generic <em> tag.
  1180. # IMPORTANT: If 'encodeXmlChars' has beeen called before this line, then remember that
  1181. # the apostrophes were already quoted to "&apos;"
  1182. $$refToText =~ s/'''''(.*?)'''''/$1/g;
  1183. $$refToText =~ s/'''(.*?)'''/$1/g;
  1184. $$refToText =~ s/''(.*?)''/$1/g;
  1185. # Eliminate long sequences of newlines and whitespace.
  1186. # Note that we don't want to replace sequences of spaces only, as this might make the text
  1187. # less readable. Instead, we only eliminate sequences of whitespace that contain at least
  1188. # two newlines.
  1189. $$refToText =~ s/(?:\s*)\n(?:\s*)\n(?:\s*)/\n\n/g;
  1190. # Eliminate XML entities such as "&nbsp;" , "&times;" etc. - otherwise,
  1191. # in C++ code they will give rise to spurious words "nbsp", "times" etc.
  1192. # Note that the standard entities - &amp; , &quot; , &apos; , &lt; and &gt;
  1193. # are handled by the XML parser. All other entities, such as &nbsp; are passed
  1194. # by the XML parser to the upper level (in case of Wikipedia pages,
  1195. # to the rendering engine).
  1196. # Note that in the raw XML text, these entities look like "&amp;nbsp;"
  1197. # (i.e., with leading "&amp;"). XML parser replaces "&amp;" with "&",
  1198. # so here in the code we see the entities as "&nbsp;".
  1199. $$refToText =~ s{& # the entity starts with "&"
  1200. ((?:\#?)(?:\w+)) # optional '#' sign (as in &#945;), followed by
  1201. # an uninterrupted sequence of letters and/or digits
  1202. ; # the entity ends with a semicolon
  1203. }{&logReplacedXmlEntity($1)}egx; # entities are replaced with a space
  1204. if ($whetherToEncodeXmlChars) {
  1205. # encode text for XML
  1206. &encodeXmlChars($refToText);
  1207. }
  1208. # NOTE that the following operations introduce XML tags, so they must appear
  1209. # after the original text underwent character encoding with 'encodeXmlChars' !!
  1210. # Change markup for section headers.
  1211. # Note that section headers may only begin at the very first position in the line
  1212. # (not even after a space). Therefore, each header markup in the following commands
  1213. # is prefixed with "^" to make sure it begins at the beginning of the line.
  1214. # Since the text (e.g., article body) may contains multiple lines, we use
  1215. # the "/m" modifier to allow matching "^" at embedded "\n" positions.
  1216. $$refToText =~ s/^=====(.*?)=====/<h4>$1<\/h4>/mg;
  1217. $$refToText =~ s/^====(.*?)====/<h3>$1<\/h3>/mg;
  1218. $$refToText =~ s/^===(.*?)===/<h2>$1<\/h2>/mg;
  1219. $$refToText =~ s/^==(.*?)==/<h1>$1<\/h1>/mg;
  1220. }
  1221. sub logReplacedXmlEntity($) {
  1222. my ($xmlEntity) = @_;
  1223. print LOGF "ENTITY: &$xmlEntity;\n";
  1224. " "; # return value - entities are replaced with a space
  1225. }
  1226. BEGIN {
  1227. # Making variables static for the function to avoid recompilation of regular expressions
  1228. # every time the function is called.
  1229. # Table definitions can easily span several lines, hence the "/s" modifier
  1230. my $tableOpeningSequence1 = qr{<table> # either just <table>
  1231. | # or
  1232. <table(?:\s+)(?:[^<>]*)>}ix; # "<table" followed by at least one space
  1233. # (to prevent "<tablexxx"), followed by
  1234. # some optional text, e.g., table parameters
  1235. # as in "<table border=0>"
  1236. # In the above definition, prohibiting '<' and '>' chars ([^<>]) ensures
  1237. # that we do not consume more than necessary, so that in the example
  1238. # "<table border=0> aaa <table> bbb </table> ccc </table>"
  1239. # $1 is NOT extended to be "> aaa <table"
  1240. my $tableClosingSequence1 = qr/<\/table>/i;
  1241. # my $nonNestedTableRegex1 =
  1242. # qr{$tableOpeningSequence1 # opening sequence
  1243. # (
  1244. # (?: # non-capturing grouper
  1245. # (?! # lookahead negation
  1246. # $tableOpeningSequence1 # that's what we don't want to find inside a table definition
  1247. # )
  1248. # . # any character (such that there is no table opening sequence
  1249. # # after it because of the lookahead condition)
  1250. # )*? # shortest match of such characters, up to the closing of a table
  1251. # )
  1252. # $tableClosingSequence1}sx; # closing sequence
  1253. my $tableOpeningSequence2 = qr/\{\|/;
  1254. my $tableClosingSequence2 = qr/\|\}/;
  1255. # my $nonNestedTableRegex2 =
  1256. # qr{$tableOpeningSequence2 # opening sequence
  1257. # (
  1258. # (?: # non-capturing grouper
  1259. # (?! # lookahead negation
  1260. # $tableOpeningSequence2 # that's what we don't want to find inside a table definition
  1261. # )
  1262. # . # any character (such that there is no table opening sequence
  1263. # # after it because of the lookahead condition)
  1264. # )*? # shortest match of such characters, up to the closing of a table
  1265. # )
  1266. # $tableClosingSequence2}sx; # closing sequence
  1267. sub eliminateTables(\$) {
  1268. my ($refToText) = @_;
  1269. # Sadly, these patterns became too complex and cause segmentation fault,
  1270. # hence we fall back to only handling non-nested tables :(
  1271. # # Sometimes, tables are nested, therefore we use a while loop to eliminate them
  1272. # # recursively, while requiring that any table we eliminate does not contain nested tables.
  1273. # # For simplicity, we assume that tables of the two kinds (e.g., <table> ... </table> and {| ... |})
  1274. # # are not nested in one another.
  1275. $$refToText =~ s/$tableOpeningSequence1(.*?)$tableClosingSequence1/\n/sg;
  1276. $$refToText =~ s/$tableOpeningSequence2(.*?)$tableClosingSequence2/\n/sg;
  1277. }
  1278. } # end of BEGIN block
  1279. # If specified, 'elementToRemove' contains an element that needs to be removed as well.
  1280. # For links, this ensures that a page does not link to itself. For categories, this
  1281. # ensures that a page is not categorized to itself. This parameter is obviously
  1282. # irrelevant for filtering URLs.
  1283. # 'elementToRemove' must be a numeric value (not string), since we're testing it with '==' (not 'eq')
  1284. sub removeDuplicatesAndSelf(\@$) {
  1285. my ($refToArray, $elementToRemove) = @_;
  1286. my %seen = ();
  1287. my @uniq;
  1288. my $item;
  1289. foreach $item (@$refToArray) {
  1290. if ( defined($elementToRemove) && ($item == $elementToRemove) ) {
  1291. printf LOGF "Warning: current page links or categorizes to itself - " .
  1292. "link discarded ($elementToRemove)\n";
  1293. next;
  1294. }
  1295. push(@uniq, $item) unless $seen{$item}++;
  1296. }
  1297. # overwrite the original array with the new one that does not contain duplicates
  1298. @$refToArray = @uniq;
  1299. }
  1300. # Removes elements of the second list from the first list.
  1301. # For efficiency purposes, the second list is converted into a hash.
  1302. sub removeElements(\@\@) {
  1303. my ($refToArray, $refToElementsToRemove) = @_;
  1304. my %elementsToRemove = ();
  1305. my @result;
  1306. # Construct the hash table for fast lookups
  1307. my $item;
  1308. foreach $item (@$refToElementsToRemove) {
  1309. $elementsToRemove{$item} = 1;
  1310. }
  1311. foreach $item (@$refToArray) {
  1312. if ( ! defined($elementsToRemove{$item}) ) {
  1313. push(@result, $item);
  1314. }
  1315. }
  1316. # overwrite the original array with the new one
  1317. @$refToArray = @result;
  1318. }
  1319. sub getTimeAsString() {
  1320. my $tm = localtime();
  1321. my $result = sprintf("%02d:%02d:%02d", $tm->hour, $tm->min, $tm->sec);
  1322. }
  1323. sub trimWhitespaceBothSides(\$) {
  1324. my ($stringRef) = @_;
  1325. # remove leading whitespace
  1326. $$stringRef =~ s/^\s*//;
  1327. # remove trailing whitespace
  1328. $$stringRef =~ s/\s*$//;
  1329. }
  1330. # There are 3 kinds of related links that we look for:
  1331. # 1) Standalone (usually, at the beginning of the article or a section of it)
  1332. # Ex: Main articles: ...
  1333. # 2) Inlined - text in parentheses inside the body of the article
  1334. # Ex: medicine (see also: [[Health]])
  1335. # 3) Dedicated section
  1336. # Ex: == See also ==
  1337. #
  1338. # In all calls to 'extractInternalLinks':
  1339. # - The penultimate argument is 0, since we don't need to log anchor text here.
  1340. # Anchor text will be handled when we analyze all the internal links in
  1341. # the entire article (and not just look for related links).
  1342. # - The last argument is 0 in order not to remove duplicates on every invocation
  1343. # of 'extractInternalLinks'. This is because duplicates in related links are
  1344. # not very common, but performing duplicate removal each time is expensive.
  1345. # Instead, we remove duplicates once at the very end.
  1346. sub identifyRelatedArticles(\$\@$) {
  1347. my ($refToText, $refToRelatedArticles, $id) = @_;
  1348. # We split the text into a set of lines. This also creates a copy of the original text -
  1349. # this is important, since the function 'extractInternalLinks' modifies its argument,
  1350. # so we'd better use it on a copy of the real article body.
  1351. my @text = split("\n", $$refToText);
  1352. my $line;
  1353. # Standalone
  1354. foreach $line (@text) {
  1355. # We require that stanalone designators occur at the beginning of the line
  1356. # (after at most a few characters, such as a whitespace or a colon),
  1357. # and not just anywhere in the line. Otherwise, we would collect as related
  1358. # those links that just happen to occur in the same line with an unrelated
  1359. # string that represents a standalone designator.
  1360. if ($line =~ /^(?:.{0,5})(${relatedWording_Standalone}.*)$/) {
  1361. my $str = $1; # We extract links from the rest of the line
  1362. print LOGF "Related(S): $id => $str\n";
  1363. &extractInternalLinks(\$str, $refToRelatedArticles, $id, 0, 0);
  1364. print LOGF "Related(S): $id ==> @$refToRelatedArticles\n";
  1365. }
  1366. }
  1367. # Inlined (in parentheses)
  1368. foreach $line (@text) {
  1369. while ($line =~ /\((?:\s*)(${relatedWording_Inline}.*?)\)/g) {
  1370. my $str = $1;
  1371. print LOGF "Related(I): $id => $str\n";
  1372. &extractInternalLinks(\$str, $refToRelatedArticles, $id, 0, 0);
  1373. print LOGF "Related(I): $id ==> @$refToRelatedArticles\n";
  1374. }
  1375. }
  1376. # Section
  1377. # Sections can be at any level - "==", "===", "====" - it doesn't matter,
  1378. # so it suffices to look for two consecutive "=" signs
  1379. my $relatedSectionFound = 0;
  1380. foreach $line (@text) {
  1381. if ($relatedSectionFound) { # we're in the related section now
  1382. if ($line =~ /==(?:.*?)==/) { # we just encountered the next section - exit the loop
  1383. last;
  1384. } else { # collect the links from the current line
  1385. print LOGF "Related(N): $id => $line\n";
  1386. # 'extractInternalLinks' may mofidy its argument ('$line'), but it's OK
  1387. # as we do not do any further processing to '$line' or '@text'
  1388. &extractInternalLinks(\$line, $refToRelatedArticles, $id, 0, 0);
  1389. print LOGF "Related(N): $id ==> @$refToRelatedArticles\n";
  1390. }
  1391. } else { # we haven't yet found the related section
  1392. if ($line =~ /==(.*?)==/) { # found some section header - let's check it
  1393. my $sectionHeader = $1;
  1394. if ($sectionHeader =~ /$relatedWording_Section/) {
  1395. $relatedSectionFound = 1;
  1396. next; # proceed to the next line
  1397. } else {
  1398. next; # unrelated section - just proceed to the next line
  1399. }
  1400. } else {
  1401. next; # just proceed to the next line - nothing to do
  1402. }
  1403. }
  1404. }
  1405. &removeDuplicatesAndSelf($refToRelatedArticles, $id);
  1406. }
  1407. sub recordRelatedArticles($\@) {
  1408. my ($id, $refToRelatedArticles) = @_;
  1409. my $size = scalar(@$refToRelatedArticles);
  1410. return if ($size == 0);
  1411. print RELATEDF "$id\t", join(" ", @$refToRelatedArticles), "\n";
  1412. }
  1413. ########################################################################
  1414. sub printUsage()
  1415. {
  1416. print "Wikiprep version $version, Copyright (C) 2007 Evgeniy Gabrilovich\n" .
  1417. "Wikiprep comes with ABSOLUTELY NO WARRANTY; for details type '$0 -license'.\n" .
  1418. "This is free software, and you are welcome to redistribute it\n" .
  1419. "under certain conditions; type '$0 -license' for details.\n" .
  1420. "Type '$0 -version' for version information.\n\n" .
  1421. "Usage: $0 -f <XML file with page dump>\n" .
  1422. " e.g., $0 -f pages_articles.xml\n\n";
  1423. }