User:AnomieBOT/source/tasks/OrphanReferenceFixer.pm
Appearance
BRFA approved 2008-09-04 Wikipedia:Bots/Requests for approval/AnomieBOT |
First supplemental BRFA approved 2008-09-11 Wikipedia:Bots/Requests for approval/AnomieBOT 3 |
Second supplemental BRFA approved 2008-09-20 Wikipedia:Bots/Requests for approval/AnomieBOT 6 |
Third supplemental BRFA approved 2009-03-23 Wikipedia:Bots/Requests for approval/AnomieBOT 27 |
package tasks::OrphanReferenceFixer;
=pod
=begin metadata
Bot: AnomieBOT
Task: OrphanReferenceFixer
BRFA: Wikipedia:Bots/Requests for approval/AnomieBOT
Status: Approved 2008-09-04
+BRFA: Wikipedia:Bots/Requests for approval/AnomieBOT 3
+Status: Approved 2008-09-11
+BRFA: Wikipedia:Bots/Requests for approval/AnomieBOT 6
+Status: Approved 2008-09-20
+BRFA: Wikipedia:Bots/Requests for approval/AnomieBOT 27
+Status: Approved 2009-03-23
Created: 2008-08-20
Applies the following corrections to pages in [[:Category:Pages with incorrect
ref formatting]] and/or [[:Category:Pages with broken reference names]]. This
is often enough to get them removed from the category.
<div style="font-size:90%">
* <nowiki><ref name=foo bar> → <ref name="foo bar"></nowiki>
* <nowiki><ref name="foo> → <ref name="foo"></nowiki>
* <nowiki><ref name=bar"> → <ref name="bar"></nowiki>
* <nowiki><ref name "foo"> → <ref name="foo"></nowiki>
* <nowiki><ref name-"foo"> → <ref name="foo"></nowiki>
* <nowiki><ref name+"foo"> → <ref name="foo"></nowiki>
* <nowiki><ref "foo"> → <ref name="foo"></nowiki>
* <nowiki><ref name="foo" name="bar"> → <ref name="bar"></nowiki>
* <nowiki><ref …></ref> → <ref …/></nowiki>
* Remove <nowiki><ref …/></nowiki> without <code>name</code>
* Strip parameters other than <code>name</code> and <code>group</code> from <nowiki><ref> and <references></nowiki>
* Rename refs with numeric names
* Copy content for orphaned named refs from past page revisions
* Copy content for orphaned named refs from linked articles
* Move content for named refs out of infoboxen and other templates
</div>
Actions are periodically logged to [[User:AnomieBOT/OrphanReferenceFixer log]].
=end metadata
=cut
use utf8;
use strict;
use warnings;
use AnomieBOT::API;
use AnomieBOT::Task qw/:time bunchlist/;
use Storable qw/thaw/;
use HTML::Entities ();
use Data::Dumper;

# was: use vars qw/@ISA/ — obsolete since Perl 5.6, replaced with `our`
our @ISA = qw/AnomieBOT::Task/;

# Categories indicating a human is actively editing the page right now.
my %inuse=(
    'Category:Pages actively undergoing a major edit'=>1,
);
my $inuse_delay=7200;  # seconds to leave {{inuse}}-marked pages alone
my $min_delay=600;     # seconds since last edit before the bot will touch a page
# Edit-count thresholds used by the "trusted editor" heuristic in run().
my $arbitrary_untrusted_threshold=1000;
my $arbitrary_trusted_threshold=2000;
my $untrusted_delay=7200;  # how far back (seconds) to scan for untrusted editors
# Revision tags that make the bot skip a revision entirely.
my %skiptags=(
    'possible libel or vandalism' => 1,
);
my $logpage='User:AnomieBOT/OrphanReferenceFixer log';
my $logfrequency=21600;  # flush the accumulated action log every 6 hours
my $loglength=28;        # max sections kept on the on-wiki log page
my $initialized=0;       # set by init() after redirect resolution succeeds
# Templates whose refs should not be moved out into the article body.
my %no_move_refs_out=(
    'Template:Graphic novel list' => 1,
    'Template:Infobox nrhp' => 1,
    'Template:Infobox Lighthouse' => 1,
    'Template:Episode list/sublist' => 1,
);
my $knowngroups = qr/^(?:note|upper-alpha|upper-roman|lower-alpha|lower-greek|lower-roman)$/;
# Templates that wrap a single <ref>; 'content' lists the parameter names
# that may hold the reference text, 'group' is the implied ref group.
my %reftpl=(
    'Template:Refn' => { group => '', groupre => qr/[^\x22\x27]*?/, content => [ 'refn', '1' ] },
    'Template:Efn' => { group => 'lower-alpha', groupre => $knowngroups, content => [ 'reference', '1', 'content', 'text' ] },
    'Template:Efn-ua' => { group => 'upper-alpha', groupre => qr/(?!)/, content => [ 'reference', '1', 'content', 'text' ] },
    'Template:Efn-lr' => { group => 'lower-roman', groupre => qr/(?!)/, content => [ 'reference', '1', 'content', 'text' ] },
    'Template:Efn-ur' => { group => 'upper-roman', groupre => qr/(?!)/, content => [ 'reference', '1', 'content', 'text' ] },
    'Template:Efn-lg' => { group => 'lower-greek', groupre => qr/(?!)/, content => [ 'reference', '1', 'content', 'text' ] },
    'Template:NoteTag' => { group => 'note', groupre => qr/(?!)/, content => [ 'note', '1', 'content', 'text' ] },
);
# Templates that render a <references> list; 'refs' lists the parameter
# names that may hold list-defined references.
my %reflist=(
    'Template:Reflist' => { group => '', groupre => qr/[^\x22\x27]*?/, refs => [ 'refs' ] },
    'Template:Notelist' => { group => 'lower-alpha', groupre => $knowngroups, refs => [ 'refs', 'notes' ] },
    'Template:Notelist-la' => { group => 'lower-alpha', groupre => qr/(?!)/, refs => [ 'refs', 'notes' ] },
    'Template:Notelist-ua' => { group => 'upper-alpha', groupre => qr/(?!)/, refs => [ 'refs', 'notes' ] },
    'Template:Notelist-lr' => { group => 'lower-roman', groupre => qr/(?!)/, refs => [ 'refs', 'notes' ] },
    'Template:Notelist-ur' => { group => 'upper-roman', groupre => qr/(?!)/, refs => [ 'refs', 'notes' ] },
    'Template:Notelist-lg' => { group => 'lower-greek', groupre => qr/(?!)/, refs => [ 'refs', 'notes' ] },
    'Template:NoteFoot' => { group => 'note', groupre => qr/(?!)/, refs => [ 'refs', 'notes' ] },
);
my %alltpl;   # combined template lookup, populated by init()
my $attrRe;   # matches one HTML-style attribute: name, optionally ="value"/'value'/bare
{
    # Fixed: the braces here were accidentally doubled ("{{ ... }}"), which
    # created a pointless nested bare block. A single block scopes $s.
    # \x09\x0a\x0c\x0d\x20 are the HTML5 whitespace characters.
    my $s = "\x09\x0a\x0c\x0d\x20";
    $attrRe = qr/([$s]*([^$s\/>][^$s\/>=]*)(?:[$s]*=[$s]*(?|(")([^"]*)("|$)|(')([^']*)('|$)|()([^$s>]*)()))?)/;
}
# Construct a task instance on top of the AnomieBOT::Task base object.
sub new {
    my ($class) = @_;
    my $self = $class->SUPER::new();

    # The "skip" list keeps one long page from monopolizing the bot's time:
    # a page is checked for at most 10 minutes per run, then skipped on
    # subsequent runs until every other page in the category has had a turn.
    $self->{'skip'} = {};

    # Timestamp of the last datastore sweep for obsolete entries.
    $self->{'lastcleanup'} = 0;

    return bless $self, $class;
}
=pod
=for info
BRFA approved 2008-09-04<br />[[Wikipedia:Bots/Requests for approval/AnomieBOT]]
=for info
First supplemental BRFA approved 2008-09-11<br />[[Wikipedia:Bots/Requests for approval/AnomieBOT 3]]
=for info
Second supplemental BRFA approved 2008-09-20<br />[[Wikipedia:Bots/Requests for approval/AnomieBOT 6]]
=for info
Third supplemental BRFA approved 2009-03-23<br />[[Wikipedia:Bots/Requests for approval/AnomieBOT 27]]
=cut
# Number of BRFAs approved for this task (the fourth being AnomieBOT 27).
sub approved {
    my $brfa_count = 4;
    return $brfa_count;
}
# One-time setup: resolve template redirects and build the combined
# template lookup. Returns undef on success (or if already initialized),
# or a retry delay in seconds on API failure.
sub init {
    my ($self, $api) = @_;
    return undef if $initialized;

    # Resolve redirects to the "don't move refs out of these" templates.
    my %redir = $api->redirects_to_resolved(keys %no_move_refs_out);
    if (exists $redir{''}) {
        $api->warn("Failed to get non-removal redirects: ".$redir{''}{'error'}."\n");
        return 60;
    }
    %no_move_refs_out = %redir;

    # Resolve redirects to the reference-list templates. Every redirect
    # source is treated like the target, and is also exempt from ref moving.
    %redir = $api->redirects_to_resolved(keys %reflist);
    if (exists $redir{''}) {
        $api->warn("Failed to get reflist redirects: ".$redir{''}{'error'}."\n");
        return 60;
    }
    foreach my $from (keys %redir) {
        $reflist{$from} = $reflist{$redir{$from}};
        $no_move_refs_out{$from} = 1;
    }

    # Resolve redirects to the ref-wrapping templates.
    %redir = $api->redirects_to_resolved(keys %reftpl);
    if (exists $redir{''}) {
        $api->warn("Failed to get reftpl redirects: ".$redir{''}{'error'}."\n");
        return 60;
    }
    foreach my $from (keys %redir) {
        $reftpl{$from} = $reftpl{$redir{$from}};
    }

    # Combined lookup of all recognized templates, keyed both with and
    # without the "Template:" prefix. reftpl entries win on collision,
    # matching the original construction order.
    %alltpl = ();
    foreach my $tpl (keys %reflist) {
        (my $bare = $tpl) =~ s/^Template://;
        $alltpl{$bare} = 'references';
        $alltpl{"Template:$bare"} = 'references';
    }
    foreach my $tpl (keys %reftpl) {
        (my $bare = $tpl) =~ s/^Template://;
        $alltpl{$bare} = 'ref';
        $alltpl{"Template:$bare"} = 'ref';
    }

    $initialized = 1;
    return undef;
}
# Main task entry point. Iterates pages in the broken-ref categories,
# fixes what it can via process_page(), and maintains the on-wiki action
# log. Returns a delay in seconds before the next run (0 = immediately).
sub run {
    my ($self, $api)=@_;

    $api->task('OrphanReferenceFixer', 0, 10, qw/d::Talk d::Timestamp d::Templates d::Redirects d::IWNS/);
    return 300 unless $api->load_IWNS_maps();

    # One-time datastore format upgrades, versioned by 'did_upgrade'.
    if(!exists($api->store->{'did_upgrade'}) || $api->store->{'did_upgrade'}<1){
        while(my ($k,$v)=each %{$api->store}){
            if($k=~/^\d+$/){
                foreach (@{$v->{"unfound"}}){
                    my ($g,$n)=@{thaw($_)};
                    $_="$g>$n";
                }
                $api->store->{$k}=$v;
            } elsif($k=~/^p\d+$/){
                # Renamed $x to $val: it previously shadowed confusingly
                # against the %x hash below.
                my %x=();
                while(my ($blob,$val) = each %$v){
                    my ($g,$n)=@{thaw($blob)};
                    $x{"$g>$n"}=$val;
                }
                $api->store->{$k}=\%x;
            }
        }
        $api->store->{'did_upgrade'} = 1;
    }
    if($api->store->{'did_upgrade'}<2){
        # Changed the code to handle cases that were previously broken, so recheck all pages.
        while(my ($k,$v)=each %{$api->store}){
            delete $api->store->{$k} if $k=~/^\d+$/;
        }
        $api->store->{'did_upgrade'} = 2;
    }
    if($api->store->{'did_upgrade'}<4){
        # Added ignoring of old refs. Add the prop for that.
        while(my ($k,$v)=each %{$api->store}){
            next unless $k=~/^\d+$/;
            if(!defined($v->{'ignored'})){
                $v->{'ignored'} = [];
                $api->store->{$k}=$v;
            }
        }
        $api->store->{'did_upgrade'} = 4;
    }

    if($self->{'lastcleanup'}+86400<time()){
        # Cleanup obsolete entries in the data store
        my $exp=time()-86400*30;
        while(my ($k,$v)=each %{$api->store}){
            next unless $k=~/^\d+$/;
            delete $api->store->{$k} if $v->{'touched'}<$exp;
        }
        # Bug fix: lastcleanup was never updated, so the once-a-day guard
        # above was dead and the sweep ran on every invocation.
        $self->{'lastcleanup'}=time();
    }

    my $r=$self->init($api);
    return $r if defined($r);

    # Periodically flush the accumulated action log to the on-wiki log page.
    $api->store->{'log'}='' unless exists($api->store->{'log'});
    $api->store->{'lastlog'}=0 unless exists($api->store->{'lastlog'});
    my $log=$api->store->{'log'};
    my $lastlog=$api->store->{'lastlog'};
    if($log ne '' && $lastlog!=-1 && $lastlog+$logfrequency<time()){
        my $tok=$api->edittoken($logpage);
        if($tok->{'code'} eq 'shutoff'){
            $api->warn("Task disabled: ".$tok->{'content'}."\n");
            return 300;
        }
        if($tok->{'code'} ne 'success'){
            $api->warn("Failed to get edit token for $logpage: ".$tok->{'error'}."\n");
        } else {
            my @txt;
            my $nowiki={};
            my $txt;
            if(exists($tok->{'revisions'}[0]{'slots'}{'main'}{'*'})){
                # Split the existing log page into lead + one chunk per
                # level-2 section heading.
                ($txt,$nowiki)=$api->strip_nowiki($tok->{'revisions'}[0]{'slots'}{'main'}{'*'});
                @txt=split /(?=(?:^|\n)==[^=])/, $txt;
            } else {
                @txt=(
                    "<!-- Please do not edit the lead section. -->\n".
                    "<!-- You may edit any of the below sections as you wish. -->\n".
                    "This is a log of AnomieBOT's actions over the past few days.\n\n"
                );
            }
            my $h="\n== AnomieBOT Log";
            if($lastlog!=0){
                $h.=' for '.strftime('%F %T Z', gmtime($lastlog));
            }
            $h.=' to '.strftime('%F %T Z', gmtime($api->ISO2timestamp($tok->{'curtimestamp'})))."==\n";
            splice @txt, 1, 0, $h.$log;
            # Keep the lead section plus at most $loglength log sections.
            @txt=@txt[0..$loglength] if @txt>$loglength;
            # Fixed: removed a dead "$txt=join('',@txt);" whose result was
            # immediately overwritten by the line below.
            $txt=$api->replace_nowiki(join('',@txt), $nowiki);
            my $r=$api->edit($tok, $txt, "Log recent actions", 0, 0);
            if($r->{'code'} eq 'httperror'){
                # Could well be that MediaWiki saved the edit, but timed out
                # when trying to respond. So wait a short time and then check
                # the timestamp on the most recent edit by the bot.
                sleep(10);
                my $r2=$api->query(
                    titles => $logpage,
                    prop => 'revisions',
                    rvuser => $api->user,
                    rvprop => 'timestamp',
                    rvlimit => 1 # Only need the last rev
                );
                if($r2->{'code'} eq 'success'){
                    $r2=[values(%{$r2->{'query'}{'pages'}})];
                    if(exists($r2->[0]{'lastrevid'})){
                        # Bug fix: the old code did "$r=$r2", assigning an
                        # ARRAY ref that would make the $r->{'code'} check
                        # below die. If the page's revision changed, assume
                        # our edit actually saved and treat it as success.
                        $r={'code'=>'success'} if $r2->[0]{'lastrevid'} != $tok->{'lastrevid'};
                    }
                }
            }
            if($r->{'code'} ne 'success'){
                $api->warn("Write failed on $logpage: ".$r->{'error'}."\n");
            } else {
                $log='';
                $lastlog=$api->ISO2timestamp($tok->{'curtimestamp'});
                $api->store->{'log'}=$log;
                $api->store->{'lastlog'}=$lastlog;
            }
        }
    }

    # Spend a max of 5 minutes on this task before restarting
    my $endtime=time()+300;
    while(1){
        # Get an iterator for the list of pages to check
        my $iter=$api->iterator(
            generator => 'categorymembers',
            gcmtitle => [
                'Category:Pages with broken reference names',
                'Category:Pages with incorrect ref formatting',
            ],
            gcmnamespace => '0',
            gcmtype => 'page',
            gcmlimit => 'max',
            gcmsort => 'timestamp',
            gcmdir => 'desc',
            prop => 'info|categories',
            cllimit => 'max',
            clcategories => join('|', keys %inuse)
        );
        # Robustness fix: the page object was previously held in global $_
        # across ~280 lines including many sub calls that could clobber it;
        # use a lexical instead.
        PAGE: while(my $page=$iter->next){
            return 0 if $api->halting;
            if(!$page->{'_ok_'}){
                $api->warn("Failed to retrieve category list: ".$page->{'error'}."\n");
                return 60;
            }
            my $title=$page->{'title'};
            # WTF?
            if(exists($page->{'missing'})){
                $api->warn("$title is missing? WTF?\n");
                next;
            }
            # Don't try fixing any page touched too recently, to give the real
            # editor a chance to fix it.
            my $lastmod=$api->ISO2timestamp($page->{'touched'});
            if(time()-$lastmod<$min_delay){
                $api->log("$title touched too recently, leave it for later");
                next;
            }
            # Any page marked with {{inuse}} should be left for longer.
            if(time()-$lastmod<$inuse_delay &&
                grep { exists($inuse{$_->{'title'}}) } @{$page->{'categories'}}){
                $api->log("$title marked {{inuse}} and last touched less than $inuse_delay seconds ago, leave it for later");
                next;
            }
            # In the skip list?
            if(exists($self->{'skip'}{$page->{'pageid'}}) && $self->{'skip'}{$page->{'pageid'}} eq $page->{'lastrevid'}){
                $api->log("Skipping $title for now to let other pages get a chance");
                next;
            }
            # Did we check this revision already?
            my $checked;
            if(!exists($api->store->{$page->{'pageid'}})){
                # No, never saw it before
                $checked={
                    revid=>$page->{'lastrevid'},
                    continue=>'<beginning>',
                    touched=>0,
                    did_summary_links=>0,
                    did_page_links=>0,
                    prev_ts=>$lastmod,
                    prev_info=>[0,'','',-1],
                    unfound=>[],
                    ignored=>[],
                    afwarn => 0,
                };
                $api->store->{$page->{'pageid'}} = $checked;
            } else {
                $checked=$api->store->{$page->{'pageid'}};
                if($checked->{'revid'} ne $page->{'lastrevid'}){
                    # Saw an old revision, rescan this new one
                    $checked->{'revid'}=$page->{'lastrevid'};
                    $checked->{'continue'}='<beginning>';
                    $checked->{'prev_ts'}=$lastmod;
                    $checked->{'prev_info'}=[0,'','',-1];
                    $checked->{'did_summary_links'}=0;
                    $checked->{'did_page_links'}=0;
                    $checked->{'afwarn'}=0;
                    delete $checked->{'skipuntil'};
                } elsif($checked->{'continue'} ne '' || !$checked->{'did_summary_links'} || !$checked->{'did_page_links'}){
                    # In the middle of checking this revision
                } else {
                    # Yes, we (supposedly) completed this one
                    $checked->{'touched'}=time();
                    $api->store->{$page->{'pageid'}}=$checked;
                    next;
                }
            }
            # Was this flagged for additional delay (e.g. because of a
            # spamblacklist hit)?
            if ( time() < ($checked->{'skipuntil'} // 0) ) {
                $api->log("$title being skipped until " . strftime('%Y-%m-%d %H:%M:%S (UTC)', gmtime($checked->{'skipuntil'})));
                next;
            }
            # To try to avoid "fixing" vandalism, we choose some arbitrary
            # groups and edit count limits to trust and wait longer if the page
            # hasn't been edited by someone "trusted" since someone "untrusted"
            # edited.
            my $res=$api->query([],
                titles => $title,
                prop => 'revisions',
                rvprop => 'user',
                rvlimit => 'max',
                rvend => $api->timestamp2ISO(time()-$untrusted_delay)
            );
            if($res->{'code'} ne 'success'){
                $api->warn("Failed to retrieve revisions for $title: ".$res->{'error'}."\n");
                return 60;
            }
            my @users=grep { defined($_) } map $_->{'user'}, @{(values %{$res->{'query'}{'pages'}})[0]{'revisions'}};
            my %u; @u{@users}=();
            $res=$api->query([],
                list => 'users',
                usprop => 'editcount|groups',
                ususers => join("|", keys %u)
            );
            if($res->{'code'} ne 'success'){
                $api->warn("Failed to retrieve edit counts for editors of $title: ".$res->{'error'}."\n");
                return 60;
            }
            %u=map { my $n=$_->{'name'}; "$n#g" => ($_->{'groups'} // []), "$n#e" => ($_->{'editcount'} // 0) } @{$res->{'query'}{'users'}};
            my $ok=1;
            foreach my $u (@users) {
                next if grep(/^(?:bot)$/, @{$u{"$u#g"}}); # Skip bots
                last if grep(/^(?:sysop|reviewer)$/, @{$u{"$u#g"}}); # Trust these
                last if $u{"$u#e"}>$arbitrary_trusted_threshold; # Trust these too
                next if $u{"$u#e"}>$arbitrary_untrusted_threshold; # Neutral on these
                $ok=0; # Don't trust anyone else
                $api->log("$title touched too recently by untrusted user $u");
                last;
            }
            next unless $ok;
            # Get edit token
            my $tok=$api->edittoken($title);
            if($tok->{'code'} eq 'shutoff'){
                $api->warn("Task disabled: ".$tok->{'content'}."\n");
                return 300;
            }
            if($tok->{'code'} ne 'success'){
                $api->warn("Failed to get edit token for $title: ".$tok->{'error'}."\n");
                next;
            }
            next if exists($tok->{'missing'});
            if($tok->{'lastrevid'} ne $checked->{'revid'}){
                # Someone edited in between loading the cat and getting the
                # token. We'll catch the new revision next time around.
                $api->log("$title was edited since cat list was loaded, abort");
                next;
            }
            # Check if any tags on the topmost revision are in the skip list
            my @tags=@{$tok->{'revisions'}[0]{'tags'} // []};
            for my $tag (@tags) {
                if(exists($skiptags{$tag})){
                    $api->log("Skipping revision ".$tok->{'revisions'}[0]{'revid'}." of $title because of tag '$tag'\n");
                    # Bug fix: this was a bare "next", which only advanced
                    # the inner tag loop — tagged revisions were logged as
                    # skipped but then processed anyway.
                    next PAGE;
                }
            }
            # Ok, check the page
            $api->log("Checking references in $title");
            # Get page text
            my $intxt=$tok->{'revisions'}[0]{'slots'}{'main'}{'*'};
            # Process page
            my $ret=$self->process_page($api,$page->{'pageid'},$page->{'lastrevid'},$title,$intxt,$checked,$endtime);
            next PAGE unless defined $ret;
            return $ret unless ref($ret);
            # Need to edit?
            my $skiplogct = 0;
            if($ret->{'outtxt'} ne $intxt){
                my $post_summary='';
                # Check whether our intended edit would re-revert a human who
                # just reverted the bot's previous fix.
                my $chkrevert=$api->query(
                    titles => $title,
                    prop => 'revisions',
                    rvprop => 'user|ids|content',
                    rvslots => 'main',
                    rvlimit => 3,
                );
                if($chkrevert->{'code'} eq 'shutoff'){
                    $api->warn("Task disabled: ".$chkrevert->{'content'}."\n");
                    return 300;
                }
                if($chkrevert->{'code'} ne 'success'){
                    $api->warn("Failed to get revisions for $title: ".$chkrevert->{'error'}."\n");
                    next PAGE;
                }
                next unless exists($chkrevert->{'query'}{'pages'}{$page->{'pageid'}}); # Deleted at just the wrong time
                my @revs=@{$chkrevert->{'query'}{'pages'}{$page->{'pageid'}}{'revisions'}};
                if(@revs==3 && $revs[1]{'user'} eq $api->user && $ret->{'outtxt'} eq $revs[1]{'slots'}{'main'}{'*'} && $revs[0]{'slots'}{'main'}{'*'} eq $revs[2]{'slots'}{'main'}{'*'}){
                    if($revs[0]{'user'} eq $api->user){
                        # Something is really screwed up, the bot wants to revert itself.
                        $api->whine("Bot confusion at [[:$title]]", "When trying to fix orphaned refs in [[:$title]], it seems that I want to revert myself. That's definitely not right, a human will need to fix the situation.");
                        next PAGE;
                    }
                    my $r=$self->_notify_reverter($api,$revs[0]{'user'},$title,$revs[0]{'revid'});
                    if($r==-1){
                        # Failed, continue next time
                    } elsif($r==-2){
                        # Major fail
                        next PAGE;
                    } elsif($r>0){
                        # Really major fail
                        return $r;
                    } else {
                        # Success!
                    }
                    $post_summary='. [[User:AnomieBOT/OrphanReferenceFixer revert help|Read this before reverting]].';
                }
                # Build the edit summary from what was actually done.
                my @summary=();
                push @summary, 'fixing reference errors' if $ret->{'anyfix'};
                push @summary, 'moving refs out of templates' if $ret->{'moved'};
                push @summary, 'rescuing orphaned refs ('.join('; ', @{$ret->{'found'}}).')' if @{$ret->{'found'}};
                if(!@summary){
                    $api->warn("No summary for $title even though changes were made, WTF?\n");
                    next;
                }
                $summary[-1]='and '.$summary[-1] if @summary>1;
                my $summary=ucfirst(join((@summary>2)?', ':' ', @summary)).$post_summary;
                $api->log("$summary in $title");
                if(length($summary)>500){
                    # Too long for an edit summary; rebuild without the
                    # (potentially huge) list of rescued ref names.
                    @summary=();
                    push @summary, 'fixing reference errors' if $ret->{'anyfix'};
                    push @summary, 'moving refs out of templates' if $ret->{'moved'};
                    push @summary, 'rescuing orphaned refs' if @{$ret->{'found'}};
                    $summary[-1]='and '.$summary[-1] if @summary>1;
                    $summary=ucfirst(join((@summary>2)?', ':' ', @summary)).$post_summary;
                }
                my $r=$api->edit($tok, $ret->{'outtxt'}, $summary, 0, 0);
                if($r->{'code'} eq 'spamblacklist'){
                    my @bl=@{$r->{'spamblacklist'}{'matches'}};
                    # Bug fix: was "@bl[@bl-1]", a one-element slice that
                    # warns under "use warnings"; $bl[-1] is the element.
                    $bl[-1] = 'and ' . $bl[-1] if @bl > 1;
                    my $bl = join( @bl > 2 ? ', ' : ' ', @bl );
                    $api->log("Write failed on $title: Blacklisted link $bl");
                    $api->warn("Write failed on $title: Blacklisted link $bl\n");
                    $api->whine("Blacklisted orphaned reference in [[:$title]]", "When trying to fix orphaned refs in [[:$title]], MediaWiki's [[MediaWiki:Spam-blacklist|spam blacklist]] complained about <nowiki>$bl</nowiki>. This ''probably'' means someone didn't properly clean up after themselves when blacklisting the link and removing existing uses, but a human needs to double-check it. The attempted changes were:\n* [[:$title]] revision [[Special:PermaLink/".$tok->{'lastrevid'}."|".$tok->{'lastrevid'}."]]:\n".join("\n",@{$ret->{'log'}})."\nYou might also use {{tlus|User:Anomie/uw-orphans|1{{=}}rm diff|2{{=}}fix diff|subst=y}} to let the remover know, if their edit summary indicates they were specifically removing the blacklisted ref. ");
                    # Don't check again for a while
                    $checked=$api->store->{$page->{'pageid'}};
                    $checked->{'skipuntil'} = time() + 7200;
                    $api->store->{$page->{'pageid'}} = $checked;
                    next;
                }
                if($r->{'code'} eq 'abusefilter-disallowed' || $r->{'code'} eq 'abusefilter-warning'){
                    my $code = $r->{'code'};
                    my $info = $r->{'abusefilter'}{'description'};
                    $api->log("Write failed on $title: $code hit for $info");
                    $api->warn("Write failed on $title: $code hit for $info\n");
                    if($r->{'code'} ne 'abusefilter-warning' || ++$checked->{'afwarn'} > 2){
                        # T184191: AbuseFilter warnings are currently bypassed the next time the bot tries to edit.
                        # So let's not bother whining unless it fails multiple times in a row.
                        $api->whine("AbuseFilter hit in [[:$title]]", "When trying to fix orphaned refs in [[:$title]], MediaWiki's [[WP:Edit filter|edit filter]] complained about an AbuseFilter hit for <nowiki>$info</nowiki> with code [[MediaWiki:$code|$code]]. This ''probably'' means some anti-spam measure is using AbuseFilter rather than SpamBlacklist and someone didn't properly clean up after themselves, but a human needs to double-check it. The attempted changes were:\n* [[:$title]] revision [[Special:PermaLink/".$tok->{'lastrevid'}."|".$tok->{'lastrevid'}."]]:\n".join("\n",@{$ret->{'log'}})."\nYou might also use {{tlus|User:Anomie/uw-orphans|1{{=}}rm diff|2{{=}}fix diff|subst=y}} to let the remover know, if their edit summary indicates they were specifically removing the blacklisted ref. ");
                    }
                    # Don't check again for a while
                    $checked=$api->store->{$page->{'pageid'}};
                    $checked->{'skipuntil'} = time() + 7200;
                    $api->store->{$page->{'pageid'}} = $checked;
                    next;
                }
                if($r->{'code'} ne 'success'){
                    $api->warn("Write failed on $title: ".$r->{'error'}."\n");
                    next;
                }
                my $i=$r->{'edit'}{'newrevid'};
                unshift @{$ret->{'log'}}, "* Edited [[Special:Diff/$i|$title]]:";
            } else {
                $api->log("Nothing I can fix in $title");
                unshift @{$ret->{'log'}}, "* Processed [[Special:PermaLink/".$page->{'lastrevid'}."|$title]] (no edit):";
                $skiplogct++;
            }
            # If we're not continuing next time, any refs that are still needed
            # are not in the article history at all. Record them so we don't
            # bother searching the whole history again next time someone edits
            # the page.
            if($checked->{'continue'} eq '' && $checked->{'did_summary_links'} && $checked->{'did_page_links'}){
                push @{$ret->{'unfound'}}, @{$ret->{'needed'}};
                $checked->{'unfound'}=$ret->{'unfound'};
                if(@{$ret->{'unfound'}} || @{$checked->{'ignored'}}){
                    my $log = "** Scan complete.";
                    $log .= " The following references could not be found: <nowiki>".join(', ', map { my $x=$_; $x=~s/^>//; $x } @{$ret->{'unfound'}})."</nowiki>" if @{$ret->{'unfound'}};
                    $log .= " The following references were found but ignored: <nowiki>".join(', ', map { my $x=$_; $x=~s/^>//; $x } @{$checked->{'ignored'}})."</nowiki>" if @{$checked->{'ignored'}};
                    push @{$ret->{'log'}}, $log;
                } else {
                    push @{$ret->{'log'}}, "** <small>Scan complete.</small>";
                }
                $api->log("Completed scanning $title revision ".$page->{'lastrevid'});
            } else {
                # If we are continuing, add the page to the "skip" list to let
                # other pages have a chance to be scanned.
                $api->log("$title will be continued later");
                $self->{'skip'}{$page->{'pageid'}}=$page->{'lastrevid'};
                push @{$ret->{'log'}}, "** <small>Scan not complete, will continue later.</small>";
                $skiplogct++;
            }
            $self->_log($api, join("\n", @{$ret->{'log'}})) if @{$ret->{'log'}} > $skiplogct;
            # Ok, we successfully processed the page. Save the persistant data
            # now.
            $checked->{'title'}=$title; # for manual db editing
            $checked->{'touched'}=time();
            $api->store->{$page->{'pageid'}}=$checked;
            # If we've been at it long enough, let another task have a go.
            return 0 if time()>=$endtime;
        }
        last unless %{$self->{'skip'}};
        %{$self->{'skip'}}=();
    }
    # No more pages to check, try again in 10 minutes or so.
    return 600;
}
# Do the actual processing of the input. Returns a hashref full of values.
sub process_page {
my ($self,$api,$pageid,$lastrevid,$title,$intxt,$checked,$endtime)=@_;
# Obvious vandalism?
if(index($intxt,'Cite your sources: <ref></ref>')>=0 ||
$intxt=~m(<blockquote></blockquote>\s+<ref></ref>\s+\{\{Reflist\}\}\s+<references/>)){
$self->_log($api,"* [[:$title]]: Probable vandalism, ignoring revision [[Special:Diff/$lastrevid|$lastrevid]]</span>");
$api->log("Probable vandalism, ignoring revision $lastrevid of $title");
$checked->{'title'}=$title;
$checked->{'touched'}=time();
$checked->{'continue'}='';
$checked->{'did_summary_links'}=1;
$checked->{'did_page_links'}=1;
$api->store->{$pageid}=$checked;
return undef;
}
my ($outtxt,$nowiki)=$api->strip_nowiki($intxt);
my $b0rken='';
my $anyfix=$outtxt;
my @log=();
# First, fix obvious errors.
my $i=1;
$i=($_>=$i?$_+1:$i) foreach ($outtxt=~/autogenerated(\d+)/g);
$outtxt=~s{<ref\s+name\s*(/|(?<!/))>}{'<ref name="autogenerated'.($i++).'"'.$1.'>'}oige;
# Citation bot bugs
$outtxt=~s{<ref(\s+[^>\n]*)name\s*=\s*" "([^ >\n]+)"}{<ref$1name="$2"}oig;
$outtxt=~s{<ref(\s+[^>\n]*)name\s*=\s*"[\x{2018}\x{2019}\x{201c}\x{201d}]([^"\n]*)[\x{2018}\x{2019}\x{201c}\x{201d}]"}{<ref$1name="$2"}oig;
# Other issues
$outtxt=~s{<ref(\s+[^>\n]*)name\s*=\s*(name\s*=)}{<ref$1$2}oig;
$outtxt=~s{<ref\s+name\s*=\s*([^\s\x22\x{201c}\x{201d}\x27<>=]+(?-i:[\s\x27\x{00c0}-\x{02af}]+[^\s\x22\x27<>=]+)+)\s*(/|(?<!/))>}{<ref name="$1"$2>}oig;
$outtxt=~s{<ref\s+name\s*=\s*[\x22\x27]([^\x22\x27<>=\n]*?)\s*(/|(?<!/))>}{<ref name="$1"$2>}oig;
$outtxt=~s{<ref\s+name\s*=\s*([^\x22\x27<>=\n]+)[\x22\x27]\s*(/|(?<!/))>}{<ref name="$1"$2>}oig;
$outtxt=~s{<ref\s+name\s*=\s*\x27\x27([^\x22\x27<>=\n]*)\x27\x27\s*(/|(?<!/))>}{<ref name="$1"$2>}oig;
$outtxt=~s{<ref\s+name\s*=\s*\x22([^\x22\x27<>=\n]*)\x27\s*(/|(?<!/))>}{<ref name="$1"$2>}oig;
$outtxt=~s{<ref\s+name\s*=\s*\x27([^\x22\x27<>=\n]*)\x22\s*(/|(?<!/))>}{<ref name="$1"$2>}oig;
$outtxt=~s{<ref\s+(?:name\s*(?:[+-]\s*)?)?(\x22[^\x22<>=\n]*\x22|\x27[^\x27<>=\n]*\x27)\s*(/|(?<!/))>}{<ref name=$1$2>}oig;
$outtxt=~s{<ref((?:\s+[^>\n]*)?)(?<!/)></ref\s*>}{<ref$1/>}oig;
$outtxt=~s!<ref\s*/>!!oig;
$outtxt=~s{(<ref\s++)([^>\n]+?)(/>|(?<!/)>)}{ $1._uniq_params($2).$3 }oige;
$outtxt=~s{(<references\s++)([^>\n]+?)(/>|(?<!/)>)}{ $1._uniq_params($2,'group').$3 }oige;
my $tmptxt=$api->process_templates($outtxt, sub { return ''; }, undef);
my $re = join( '|', map { my ($f,$r)=split(//, $_, 2); "(?i:\Q$f\E)\Q$r\E" } keys %reflist );
if($tmptxt=~/\{\{\s*(?i:Template\s*:\s*)?($re)\s*\|/){
$b0rken="page contains an unclosed {{$1, which probably means there\'s really an LDR with an unclosed cite template";
$self->_log($api,"* [[:$title]]: Revision [[Special:Diff/$lastrevid|$lastrevid]] is too b0rken to fix (<nowiki>$b0rken</nowiki>), skipping");
$api->log("Revision $lastrevid of $title is too b0rken to fix ($b0rken), skipping");
$checked->{'title'}=$title;
$checked->{'touched'}=time();
$checked->{'continue'}='';
$checked->{'did_summary_links'}=1;
$checked->{'did_page_links'}=1;
$api->store->{$pageid}=$checked;
return undef;
}
$re = $api->get_token_regex();
if ( $outtxt=~/($re)$/s ) {
my $last = $api->replace_stripped( $1, $nowiki );
if ( $last=~/^<!--/s && $last!~/-->$/s && $last=~/<ref\s/i ) {
$b0rken='page ends in an unclosed comment that contains a ref tag, which probably means LDRs are broken';
$self->_log($api,"* [[:$title]]: Revision [[Special:Diff/$lastrevid|$lastrevid]] is too b0rken to fix (<nowiki>$b0rken</nowiki>), skipping");
$api->log("Revision $lastrevid of $title is too b0rken to fix ($b0rken), skipping");
$checked->{'title'}=$title;
$checked->{'touched'}=time();
$checked->{'continue'}='';
$checked->{'did_summary_links'}=1;
$checked->{'did_page_links'}=1;
$api->store->{$pageid}=$checked;
return undef;
}
}
# Find references currently in the article, and build list of
# replacements to be applied.
my @replacements=();
my %refs=$self->_get_refs($api, $outtxt, \@replacements, \$b0rken);
if($b0rken ne ''){
$self->_log($api,"* [[:$title]]: Revision [[Special:Diff/$lastrevid|$lastrevid]] is too b0rken to fix (<nowiki>$b0rken</nowiki>), skipping");
$api->log("Revision $lastrevid of $title is too b0rken to fix ($b0rken), skipping");
$checked->{'title'}=$title;
$checked->{'touched'}=time();
$checked->{'continue'}='';
$checked->{'did_summary_links'}=1;
$checked->{'did_page_links'}=1;
$api->store->{$pageid}=$checked;
return undef;
}
# People like to make these silly little templates to avoid typing "<ref>"
# themselves. To avoid whining, detect that situation.
my $exptxt = $api->process_templates($outtxt, sub {
my ($name, $params, $wikitext, $data, $oname) = @_;
# Replace #tag:ref and #tag:references because expandtemplates does weird things to them.
# Just prefixing with "subst:" is currently enough of a guard to keep it from expanding it.
# Same for our recognized templates, for similar reasons.
if($name=~/^#tag:\s*(ref|references)$/is || exists($alltpl{$name})){
return "{{subst:$oname|" . join('|', @$params) . "}}";
}
return undef;
} );
my $res=$api->query(
action => 'expandtemplates',
title => $title,
text => $api->replace_stripped( $exptxt, $nowiki ),
prop => 'wikitext',
);
if($res->{'code'} ne 'success'){
$api->warn("Failed to expand templates for $title: ".$res->{'error'}."\n");
return undef;
}
$exptxt = $api->process_templates($res->{'expandtemplates'}->{'wikitext'}, sub {
my ($name, $params, $wikitext, $data) = @_;
# Undo the replacements above.
if($name=~m!^subst:(.+)$!is){
my $name2=$1;
if($name2=~/^#tag:\s*(ref|references)$/is || exists($alltpl{$name2})){
return "{{$name2|" . join('|', @$params) . "}}";
}
}
return undef;
} );
my $tmplB0rken = '';
my %tmplrefs=$self->_get_refs($api, $exptxt, undef, \$tmplB0rken);
if($tmplB0rken ne ''){
$self->_log($api,"* [[:$title]]: Revision [[Special:Diff/$lastrevid|$lastrevid]] is transcluding something too b0rken to fix (<nowiki>$tmplB0rken</nowiki>), skipping");
$api->log("Revision $lastrevid of $title is transcluding something too b0rken to fix ($tmplB0rken), skipping");
$checked->{'title'}=$title;
$checked->{'touched'}=time();
$checked->{'continue'}='';
$checked->{'did_summary_links'}=1;
$checked->{'did_page_links'}=1;
$api->store->{$pageid}=$checked;
return undef;
}
# Any orphaned refs?
my @unfound=@{$checked->{'unfound'}};
my @ignored=@{$checked->{'ignored'}};
my @intemplates=();
my %needed=();
while(my ($g,$refs)=each(%refs)){
while(my ($n,$v)=each(%$refs)){
my $x="$g>$n";
if(exists($refs{$g}{$n}{'broken'})){
# Broken ref (contains "<ref"), just completely ignore
# it.
} elsif($v->{'type'} eq ''){
if(($tmplrefs{$g}{$n}{'type'} // '') ne ''){
# The template seems to be defined inside one of those silly templates.
# Don't add it to %needed.
push @intemplates, $x unless grep { $_ eq $x } @intemplates;
} else {
# Orphan found, mark as needed unless known to be
# unfindable/unusable.
$needed{$x}=$v->{'orig'} unless grep { $_ eq $x } ( @unfound, @ignored );
}
} else {
# Check if someone added a previously unfound or ignored ref
@unfound=grep { $_ ne $x } @unfound;
@ignored=grep { $_ ne $x } @ignored;
}
}
}
$anyfix=(@replacements || $anyfix ne $outtxt);
push @log, "** Ignoring missing refs that appear to be in silly templates: <nowiki>".join(', ', map { my $x=$_; $x=~s/^>//; $x } @intemplates)."</nowiki>" if @intemplates;
push @log, "** Fixed broken references" if $anyfix;
# Setup for checking for unfound refs in page history
my %rq=(
pageids => $pageid,
prop => 'revisions',
rvprop => 'ids|timestamp|user|comment|content',
rvslots => 'main',
# Using 1 instead of max because we're downloading the content
# of each revision
rvlimit => 1
);
my %found=();
my $needed=scalar keys %needed;
while($needed>0 && $checked->{'continue'} ne ''){
# We found some orphaned refs. Now we have to start going back
# through the history to try to find the original text...
$rq{'rvstartid'}=$checked->{'revid'};
$rq{'rvcontinue'}=$checked->{'continue'} unless $checked->{'continue'} eq '<beginning>';
my $prevts=$checked->{'continue'} eq '<beginning>' ? time() : $checked->{'prev_ts'};
my $rres=$api->query(%rq);
if($rres->{'code'} eq 'rvbadcontinue'){
# Bad saved continue value?
delete $rq{'rvcontinue'};
$checked->{'continue'} = '<beginning>';
redo;
}
if($rres->{'code'} ne 'success'){
$api->warn("Failed to retrieve revision for $title: ".$rres->{'error'}."\n");
last;
}
if(exists($rres->{'query-continue'})){
$checked->{'continue'}=$rres->{'query-continue'}{'revisions'}{'rvcontinue'};
} else {
$checked->{'continue'}='';
}
my $r=$rres->{'query'}{'pages'}{$pageid}{'revisions'}[0];
if($r->{'revid'} ne $checked->{'revid'}){
# Get refs from this past revision, and see if any of them
# are the ones we need.
next unless exists($r->{'slots'}{'main'}{'*'}); # RevDel
my %rrefs=$self->_get_refs($api, $r->{'slots'}{'main'}{'*'});
foreach (keys %needed){
my ($g,$n)=split />/, $_, 2;
next if !exists($rrefs{$g}{$n});
next if $rrefs{$g}{$n}{'type'} eq '';
# Only rescue generic names from recent-ish revisions
if(_is_generic_ref_name($n,$g) && $prevts < time() - 525600 * 60){
delete $needed{$_};
$needed--;
my $k = $_;
unless ( grep { $_ eq $k } @{$checked->{'ignored'}} ) {
push @log, "** Ignored \"$n\" from rev [[Special:PermaLink/".$r->{'revid'}."|".$r->{'revid'}."]], name is generic and revision is old";
push @{$checked->{'ignored'}}, $k;
}
next;
}
my ($dup,$dupref)=_check_dups($g,$rrefs{$g}{$n},$refs{$g});
my $log;
if(defined($dup)){
foreach my $need (@{$needed{$_}}) {
push @replacements, {
'orig' => $need,
'repl' => $dupref
};
}
$found{$_}="\"$n\" → \"$dup\" from rev ".$r->{'revid'};
$log="** Renamed \"$n\" → \"$dup\" from rev [[Special:PermaLink/".$r->{'revid'}."|".$r->{'revid'}."]]";
} else {
push @replacements, {
'orig' => $needed{$_}[0],
'repl' => $rrefs{$g}{$n}{'repl'}
};
$found{$_}="\"$n\" from rev ".$r->{'revid'};
$log="** Rescued \"$n\" from rev [[Special:PermaLink/".$r->{'revid'}."|".$r->{'revid'}."]]";
}
if(exists($checked->{'prev_info'})){
my @i=@{$checked->{'prev_info'}};
$log.="<br /><small>Removed";
$log.=" in revision [[Special:Diff/$i[0]|$i[0]]]" if $i[0]>0;
$log.=" by [[User:$i[1]|]] ([[User talk:$i[1]|talk]] • [[Special:Contributions/$i[1]|contribs]] • [[Special:Log/$i[1]|logs]])" if $i[1] ne '';
if(defined($i[2]) && $i[2] ne ''){
$i[2]=~s!</nowiki>!</nowiki>!g;
$log.=" with comment \"<nowiki>$i[2]</nowiki>\"";
} else {
$log.=" with no comment";
}
if($i[3]!=-1){
my $l=length($r->{'slots'}{'main'}{'*'});
my $d=$i[3]-$l;
if($d>0){
my $p=int($d/$l*100+.5);
$log.=" (added $d/$l bytes, $p%)";
} elsif($d<0){
$d=-$d;
my $p=int($d/$l*100+.5);
$log.=" (removed $d/$l bytes, $p%)";
}
}
$log.="</small>";
}
push @log, $log;
delete $needed{$_};
$needed--;
}
}
# Update the previous revision time to this revision's time
$checked->{'prev_ts'}=$api->ISO2timestamp($r->{'timestamp'});
$checked->{'prev_info'}=[$r->{'revid'},$r->{'user'},$r->{'comment'},length($r->{'slots'}{'main'}{'*'})];
# If we've been at it long enough, exit the loop to give
# another page a chance.
last if time()>=$endtime;
last if $api->halting;
}
# If we found all orphans, no need to continue next time.
if($needed==0){
$checked->{'continue'}='';
$checked->{'did_summary_links'}=1;
$checked->{'did_page_links'}=1;
}
if($checked->{'continue'} eq '' && !$checked->{'did_summary_links'} && time()<$endtime-60 && !$api->halting){
# Setup for checking for unfound refs in pages linked from edit
# summaries. We do this all at once because it's easier.
$api->log("Checking for content in pages linked from $title edit summaries (this may take a while)");
my %rq=(
pageids => $pageid,
prop => 'revisions',
rvprop => 'comment',
rvslots => 'main',
rvlimit => 'max',
);
my %links=();
my $re='(?:'.$api->interwiki_re().'|'.$api->namespace_re(qw/! 0/).')';
do {
my $rres=$api->query(%rq);
if($rres->{'code'} ne 'success'){
$api->warn("Failed to retrieve edit summaries for $title: ".$rres->{'error'}."\n");
return -1;
}
if(exists($rres->{'query-continue'})){
$rq{'rvcontinue'}=$rres->{'query-continue'}{'revisions'}{'rvcontinue'};
} else {
delete($rq{'rvcontinue'});
}
foreach my $r (@{(values %{$rres->{'query'}{'pages'}})[0]{'revisions'}}){
next unless ref($r) eq 'HASH';
next unless exists($r->{'comment'});
foreach my $l ($r->{'comment'}=~/\[\[(.*?)(?:\|.*?)?\]\]/g){
next if $l=~/^\s*(?::\s*)?$re\s*:/i;
$links{$l}=1;
}
}
} while(exists($rq{'rvcontinue'}));
my $r=%links?_check_linked_pages($api,$self,[keys %links],'summary',$pageid,$title,\%refs,\%needed,\%found,\@replacements,\@log):0;
if($r==-1){
# Failed, continue next time
} elsif($r==-2){
# Major fail
return undef;
} elsif($r>0){
# Really major fail
return $r;
} else {
# Success!
$checked->{'did_summary_links'}=1;
}
}
if(scalar(keys %needed)==0){
$checked->{'did_page_links'}=1;
}
## 2023-07-21: Disable linked pages check pending figuring out a way for it to be less FP-prone. People too often use generic ref names like the name of the newspaper/website.
$checked->{'did_page_links'} = 1;
if($checked->{'continue'} eq '' && $checked->{'did_summary_links'} && !$checked->{'did_page_links'} && time()<$endtime-60 && !$api->halting){{
# Setup for checking for unfound refs in linked pages. We do
# this all at once because we want to take into account that
# different articles could have the same named ref with
# different content.
$api->log("Checking for content in pages linked from or linking to $title (this may take a while)");
my %links=();
my $i=$api->iterator(
pageids => $pageid,
generator => 'links',
gplnamespace => 0,
gpllimit => 'max',
);
my $fail=0;
while(my $p=$i->next){
if(!$p->{'_ok_'}){
$api->warn("Failed to retrieve links for $title: ".$p->{'error'}."\n");
$fail=1;
last;
}
$links{$p->{'title'}}=1;
}
last if $fail; # break from the "if"
my $res=$api->query(
generator => 'backlinks',
gbltitle => $title,
gblnamespace => 0,
gblredirect => 1,
gbllimit => 1000,
);
if($res->{'code'} ne 'success'){
$api->warn("Failed to retrieve backlinks for $title: ".$res->{'error'}."\n");
$fail=1;
last;
} elsif(exists($res->{'query-continue'})){
$api->log("Skipping check for content in pages linking to $title, there are too many");
} else {
for my $p (values %{$res->{'query'}{'pages'}}){
$links{$p->{'title'}}=1;
}
}
my $r=%links?_check_linked_pages($api,$self,[keys %links],'page',$pageid,$title,\%refs,\%needed,\%found,\@replacements,\@log):0;
if($r==-1){
# Failed, continue next time
} elsif($r==-2){
# Major fail
return undef;
} elsif($r>0){
# Really major fail
return $r;
} else {
# Success!
$checked->{'did_page_links'}=1;
}
}}
# Process the list of replacements now.
foreach (@replacements){
my $i=index($outtxt, $_->{'orig'});
substr($outtxt, $i, length($_->{'orig'}))=$_->{'repl'} if $i>=0;
}
# Refs inside of templates have a habit of causing problems if the
# parameter they are inside of is not rendered. So if we find a
# named ref where the body is inside a template and a reference is
# outside, move the body to the outside instance. We do this by
# stripping out all templates, looking for and replacing any
# "orphans" in what is left, and then replacing all the templates.
#
# But don't do it if the page contains transclusion control tags, as that
# probably means someone is doing something stupid with transcluding one
# article into another.
my $moved=0;
unless($outtxt=~/<(?:includeonly|noinclude|onlyinclude)>/){
my $outtmpl={};
$outtxt=$api->strip_templates($outtxt, \&_strip_templates, undef, $outtmpl);
%refs=$self->_get_refs($api, $outtxt);
my %needed2=();
my $fail=0;
while(my ($g,$refs)=each(%refs)){
while(my ($n,$v)=each(%$refs)){
my $x="$g>$n";
# Broken ref (contains "<ref"), completely ignore it.
next if(exists($refs{$g}{$n}{'broken'}));
# Body version has content, ignore it.
next if($v->{'type'} ne '');
# Orphan found, mark as needed unless known to be
# unfindable.
$needed2{$x}=$v->{'orig'}[0];
}
}
$needed=scalar values %needed2;
while($needed>0 && (my ($k,$v)=each %$outtmpl)){
# Skip the template if it contains a reflist, too weird to mess with.
next if $v=~/<references/;
my $found = 0;
$api->strip_templates($v, sub {
my ($name, $params, $wikitext, $data) = @_;
$found = 1 if $name=~/^#tag:\s*references$/is || ($alltpl{$name} // '') eq 'references';
return undef;
} );
next if $found;
my %rrefs=$self->_get_refs($api, $v);
foreach (keys %needed2){
my ($g,$n)=split />/, $_, 2;
next if !exists($rrefs{$g}{$n});
next if $rrefs{$g}{$n}{'type'} eq '';
# Instructed to ignore it
next if($api->replace_nowiki($rrefs{$g}{$n}{'content'}, $nowiki)=~/<!--\s*AnomieBOT:\s*Don\x27t move\s*-->/i);
my $orig=$needed2{$_};
my $repl=$rrefs{$g}{$n}{'repl'};
my $i=index($outtxt, $orig);
my $j=index($v, $repl);
next unless($i>=0 && $j>=0);
# Found a candidate to move! But first, verify it will actually
# have an effect.
my $txt1="$v\n\n$orig\n<references".($g?" group=\"$g\"":"")."/>";
my $res=$api->query(
action => 'parse',
title => $title,
text => $api->replace_nowiki($txt1, $nowiki),
prop => 'text',
disablelimitreport => 1
);
if($res->{'code'} ne 'success'){
$api->warn("Failed to test template for $title: ".$res->{'error'}."\n");
$fail=1;
last;
}
($txt1=$res->{'parse'}{'text'}{'*'})=~s/^\s*|\s*$//g;
my $txt2="$v\n\n$repl\n<references".($g?" group=\"$g\"":"")."/>";
substr($txt2, $j, length($repl))=$orig;
$res=$api->query(
action => 'parse',
title => $title,
text => $api->replace_nowiki($txt2, $nowiki),
prop => 'text',
disablelimitreport => 1
);
if($res->{'code'} ne 'success'){
$api->warn("Failed to test template for $title: ".$res->{'error'}."\n");
$fail=1;
last;
}
($txt2=$res->{'parse'}{'text'}{'*'})=~s/^\s*|\s*$//g;
if($txt1 ne $txt2){
# Something changed in the output, so it's probably a worthwhile move.
substr($outtxt, $i, length($orig))=$repl;
substr($v, $j, length($repl))=$orig;
delete $needed2{$_};
$needed--;
$moved++ unless exists($found{$_});
}
}
last if $fail;
$outtmpl->{$k}=$v;
}
$outtxt=$api->replace_stripped($outtxt, $outtmpl);
push @log, "** Moved refs out of templates" if $moved;
}
# Done processing, put back the <nowiki>s now
$outtxt=$api->replace_nowiki($outtxt, $nowiki);
return {
outtxt => $outtxt,
log => \@log,
anyfix => $anyfix,
moved => $moved,
found => [values %found],
unfound => \@unfound,
needed => [keys %needed],
};
}
# Return just the last of duplicate params, and optionally strip all but those
# specifically allowed.
#
# $in is the raw attribute text from a tag; @allowed (optional) is a list of
# lowercased parameter names to keep. Returns the rebuilt parameter string,
# ordered by each surviving parameter's last occurrence, with the input's
# trailing whitespace preserved and the leading space removed.
sub _uniq_params {
    my ($in, @allowed) = @_;

    # Pull out attribute-like tokens: a name, optionally followed by =value
    # (double-quoted, single-quoted, or bare). Produces (full text, name) pairs.
    my @pairs = ($in=~/(([^\f\t\r\n \x00"'>\/=\p{Control}]+)(?:[\f\t\r\n ]*=[\f\t\r\n ]*(?:"[^<\x22]*"|'[^<\x27]*'|[^ >]+))?)/sgu);

    my %last_index = ();  # lowercased name => index of its last occurrence
    my %text_for   = ();  # lowercased name => full text of its last occurrence
    for (my $idx = 0; $idx < @pairs; $idx += 2) {
        my $key = lc($pairs[$idx+1]);
        next if @allowed && !grep { $_ eq $key } @allowed;
        $last_index{$key} = $idx;
        $text_for{$key}   = $pairs[$idx];
    }

    # Emit survivors in the order of their last occurrence.
    my @ordered = ();
    while (my ($key, $idx) = each %last_index) { $ordered[$idx] = $key; }
    my $result = '';
    foreach my $key (@ordered) {
        $result .= ' ' . $text_for{$key} if defined($key);
    }
    # Keep any trailing whitespace from the input, then drop the leading space.
    $result .= $1 if $in =~ /(\s+)$/;
    $result =~ s/^\s+//;
    return $result;
}
# Check if the found ref is identical to a reference in the current version of
# the article. If so, use the current version.
#
# $g is the ref group, $ref1 the candidate ref info hash (its 'content' is
# compared), and $refs the hash of name => info for refs already in the
# article. Content is compared after stripping whitespace and normalizing
# dash/minus characters and their HTML entities to '-'.
#
# Returns (existing name, self-closing <ref/> tag pointing at it) on a match,
# or (undef, undef) when nothing in the article matches.
#
# BUGFIX(review): the quote/angle-bracket escaping below had degraded into
# no-op substitutions (s/"/"/g etc.) — almost certainly HTML entities lost in
# a copy of the source. Restored to &quot;/&lt;/&gt; so that names and groups
# containing quotes or angle brackets produce a well-formed tag.
sub _check_dups {
    my ($g, $ref1, $refs) = @_;

    # Normalize the candidate's content for comparison.
    my $c1 = $ref1->{'content'};
    $c1 =~ s/\s+//g;
    $c1 =~ s/[\x{2013}\x{2014}]|&([mn]dash|#0*821[12]|#x0*201[34]);/-/g;
    $c1 =~ s/\x{2212}|&(minus|#0*8722|#x0*2212);/-/g;

    while (my ($n2, $v2) = each(%$refs)) {
        my $c2 = $v2->{'content'} // '';
        $c2 =~ s/\s+//g;
        $c2 =~ s/[\x{2013}\x{2014}]|&([mn]dash|#0*821[12]|#x0*201[34]);/-/g;
        $c2 =~ s/\x{2212}|&(minus|#0*8722|#x0*2212);/-/g;
        next unless $c1 eq $c2;

        # Build a self-closing tag referencing the article's existing ref.
        my $ref = '<ref';
        if ($g ne '') {
            my $gx = $g;
            my $q = '"';
            if ($gx =~ /"/) {
                if ($gx =~ /'/) {
                    # Contains both quote styles: entity-encode the doubles.
                    $gx =~ s/"/&quot;/g;
                } else {
                    # Only double quotes: switch to single-quote delimiters.
                    $q = "'";
                }
            }
            $gx =~ s/</&lt;/g;
            $gx =~ s/>/&gt;/g;
            $ref .= " group=$q$gx$q";
        }
        my $nx = $n2;
        my $q = '"';
        if ($nx =~ /"/) {
            if ($nx =~ /'/) {
                $nx =~ s/"/&quot;/g;
            } else {
                $q = "'";
            }
        }
        $nx =~ s/</&lt;/g;
        $nx =~ s/>/&gt;/g;
        $ref .= " name=$q$nx$q";
        $ref .= " />";
        return ($n2, $ref);
    }
    return (undef, undef);
}
# Subroutine to get all the references in some wikitext.
#
# Arguments: $self, $api, $text, and optionally $replacements (arrayref onto
# which {orig=>..., repl=>...} textual cleanups are pushed) and $b0rken
# (scalar ref, set to a message when the markup is too mangled to handle).
# Returns a hash: group => { ref name => info hash } (layout per _get_refs2).
# Handles <references> blocks, reflist-style templates (%reflist, defined
# elsewhere in this file), {{#tag:references}}, and finally the bare page text.
sub _get_refs {
my $self=shift;
my $api=shift;
my $text=shift;
my $replacements=shift; $replacements=[] unless defined($replacements);
my $dummy='';
my $b0rken=shift; $b0rken=\$dummy unless defined($b0rken);
my %refs=();
# The new "list-defined references" have to be handled specially, which
# means we have to manage to pull them out of the wikitext. Fun.
# First, do the XML-style tags.
my $nowiki;
($text,$nowiki)=$api->strip_nowiki($text);
# Capture each <references .../> or <references>...</references>. A capture
# group 4 that matched the empty string (the "$" branch) marks an unclosed tag.
my @matches=($text=~m!(<references((?:\s+[^>]*[^/>])?)(?:/>|>(.*?)(</references>|$)))!oigs);
for(my $i=0; $i<@matches; $i+=4){
# Remove the matched block from $text so the final pass doesn't re-parse it.
$text=~s/\Q$matches[$i]\E//g;
# Last ref in the page broken?
if(defined($matches[$i+3]) && $matches[$i+3] eq ''){
$$b0rken='Last <references> in page is unclosed'; next;
}
# Comments don't work right inside ref names
if($matches[$i+1]=~/\x02/){ $$b0rken='References parameters contain strip marker'; next; }
# Don't delete tons of content if some vandal breaks a ref tag
if($matches[$i+1]=~/<references(?:[\s>]|$)/){ $$b0rken='References parameters contain <references>'; next; }
if($matches[$i+1]=~/\n==/){ $$b0rken='References parameters contain =='; next; }
# I can't believe someone actually used “” quotes in a ref tag, but
# they did. So test for it.
my $x=$matches[$i+1];
$x=~s/(\s+group\s*=\s*)[\x{2018}\x{2019}]([^\x{2018}\x{2019}\x22<]*)\x{2019}/$1"$2"/g;
$x=~s/(\s+group\s*=\s*)[\x22\x{201c}\x{201d}]([^\x{201c}\x{201d}\x22<]*)\x{201d}/$1"$2"/g;
if($x ne $matches[$i+1]){
# Curly quotes were normalized; queue the fix as a replacement in the page.
my $old=$matches[$i];
$matches[$i+1]=$x;
$matches[$i]='<references'.$matches[$i+1].(defined($matches[$i+2])?'>'.$matches[$i+2].'</references>':'/>');
push @$replacements, {
'orig' => $api->replace_nowiki($old,$nowiki),
'repl' => $api->replace_nowiki($matches[$i],$nowiki)
};
}
$matches[$i+1]=~s/\s+$//g;
# Group?
my ($gg,$g);
if($matches[$i+1]=~/(\s+group\s*=\s*"([^\x22<]*)")/oi ||
$matches[$i+1]=~/(\s+group\s*=\s*'([^\x27<]*)')/oi ||
$matches[$i+1]=~/(\s+group\s*=\s*([^\x09\x0a\x0c\x0d\x20]+))/oi){
$gg=$1; $g=$2;
} else {
$gg=''; $g='';
}
# Ok, parse the list-defined refs
if(defined($matches[$i+2])){
%refs=_get_refs2($self,$api,$api->replace_stripped($matches[$i+2],$nowiki),$replacements,$b0rken,$g,'references',%refs);
}
}
$text=$api->replace_nowiki($text,$nowiki);
# Next, do reflist and #tag:references
# (return value of process_templates is discarded; we only want the callback's
# side effects on %refs and @$replacements)
$api->process_templates($text, sub {
my $name=shift;
my @params=@{shift()};
my $orig=shift;
my ($c, $g, $type);
# %reflist (defined elsewhere in this file) maps reflist-style template names
# to group/ref-parameter metadata.
my $prop = $reflist{$name} // $reflist{"Template:$name"} // undef;
if($prop){
($type=$name)=~s/^Template://;
my $groupre=$prop->{'groupre'};
my @refparams=@{$prop->{'refs'}};
$g=$prop->{'group'};
$c='';
foreach my $p ($api->process_paramlist(@params)){
if(grep { $p->{'name'} eq $_ } @refparams) {
$c=$p->{'value'};
} elsif($p->{'name'} eq 'group' && $p->{'value'}=~/^\s*([\x22\x27]?)($groupre)\1\s*$/oi){
$g=$2;
}
}
} elsif($name=~/^#tag:\s*references$/is){
$type=$name;
# First positional parameter is the list-defined refs content.
$c=shift(@params) // '';
my $bad=0;
foreach (@params){
if(/^\s*group\s*=\s*([\x22\x27]?)([^\x22\x27]*?)\1\s*$/oi){
$g=$2;
} else {
$bad=1;
}
}
# If it had unrecognized parameters to the tag, strip them
if($bad){
my $old=$orig;
$orig="\x7b\x7b#tag:references|$c";
$orig.="|group=$g" if $g ne '';
$orig.="\x7d\x7d";
push @$replacements, {
'orig' => $old,
'repl' => $orig
};
}
} else {
return undef;
}
# Ok, parse the list-defined refs
%refs=_get_refs2($self,$api,$c,$replacements,$b0rken,$g,$type,%refs);
return '';
});
# And finally, parse the page text.
return _get_refs2($self,$api,$text,$replacements,$b0rken,'','',%refs);
}
# Worker for _get_refs: scan one chunk of wikitext for <ref> tags,
# {{#tag:ref}} calls, and ref-wrapping templates (per %reftpl, defined
# elsewhere in this file).
#
# Arguments: $self, $api, the text to scan, $replacements (arrayref collecting
# {orig,repl} textual cleanups), $b0rken (scalar ref for "too broken" flags),
# $defaultgroup, $listdefined (the context this text came from, stored on each
# ref), followed by the %refs hash accumulated so far.
# Returns the updated hash: group => { name => { orig => [all occurrences],
# type => ''|'ref'|'tag'|'tpl'|template name, content, repl, listdefined,
# and possibly broken => 1 } }. type eq '' means the ref has no body here
# (i.e. it is an orphan candidate).
sub _get_refs2 {
my $self=shift;
my $api=shift;
my ($text,$nowiki)=$api->strip_nowiki(shift);
my $replacements=shift;
my $b0rken=shift;
my $defaultgroup=shift;
my $listdefined=shift;
# Remaining args are the refs hash accumulated by earlier passes.
my %refs=@_;
# Fix whitespace in default group
$defaultgroup =~ s/[\t\r\n ]+/ /g;
$defaultgroup =~ s/^\s+|\s+$//g;
# Find all ref tags
# Each match yields 4 slots: full text, parameter text, body (undef for
# self-closing), and the closer (empty string when the tag ran to end-of-text).
my @matches=($text=~m!(<ref((?:\s+[^>]*[^/>])?)(?:/>|>(.*?)(</ref\s*>|$)))!oigs);
for(my $i=0; $i<@matches; $i+=4){
# Last ref in the page broken?
if(defined($matches[$i+3]) && $matches[$i+3] eq ''){
$$b0rken='Last <ref> in page is unclosed'; next;
}
# Comments don't work right inside ref names
if($matches[$i+1]=~/\x02/){ $$b0rken='Ref parameters contain strip marker'; next; }
# Don't delete tons of content if some vandal breaks a ref tag
if($matches[$i+1]=~/<ref(?:[\s>]|$)/){ $$b0rken='Ref parameters contain <ref>'; next; }
if($matches[$i+1]=~/\n==/){ $$b0rken='Ref parameters contain =='; next; }
# I can't believe someone actually used “” quotes in a ref tag, but
# they did. So test for it.
my $x=$matches[$i+1];
$x=~s/(\s+(?:name|group)\s*=\s*)[\x{2018}\x{2019}]([^\x{2018}\x{2019}\x22<]*)\x{2019}/$1"$2"/g;
$x=~s/(\s+(?:name|group)\s*=\s*)[\x22\x{201c}\x{201d}]([^\x{201c}\x{201d}\x22<]*)\x{201d}/$1"$2"/g;
if($x ne $matches[$i+1]){
my $old=$matches[$i];
$matches[$i+1]=$x;
$matches[$i]='<ref'.$matches[$i+1].(defined($matches[$i+2])?'>'.$matches[$i+2].'</ref>':'/>');
push @$replacements, {
'orig' => $api->replace_nowiki($old,$nowiki),
'repl' => $api->replace_nowiki($matches[$i],$nowiki)
};
}
$matches[$i+1]=~s/\s+$//g;
# Fix obviously incorrect ref bodies.
if(defined($matches[$i+2]) &&
($matches[$i+2]=~/^\s*$/ ||
$matches[$i+2] eq 'Insert footnote text here')){
# Empty / placeholder body: convert to a self-closing tag.
my $old=$matches[$i];
$matches[$i+2]=undef;
$matches[$i]='<ref'.$matches[$i+1].'/>';
push @$replacements, {
'orig' => $api->replace_nowiki($old,$nowiki),
'repl' => $api->replace_nowiki($matches[$i],$nowiki)
};
}
# Extract params
my ($gg, $g) = ('', $defaultgroup);
my ($nn, $n) = ('', '');
my $params = '';
# NOTE(review): $attrRe is defined elsewhere in this file; usage below implies
# it captures 5 items per attribute (full text, attribute name, opening quote,
# value, closing quote) — verify against its definition.
my @m = $matches[$i+1] =~ /$attrRe/g;
for(my $j=0; $j<@m; $j+=5){
my $a = lc( $m[$j+1] );
if ( $a eq 'group' || $a eq 'name' || $a eq 'dir' || $a eq 'follow' ) {
# Open/close quotes mismatched (e.g. name="foo) — presumably this
# re-appends the opener to repair the parameter text; verify.
$m[$j] .= $m[$j+2] if( ($m[$j+2]//'') ne ($m[$j+4]//'') );
$params .= $m[$j];
($gg, $g) = ($m[$j], $m[$j+3]//'') if $a eq 'group';
($nn, $n) = ($m[$j], $m[$j+3]//'') if $a eq 'name';
}
}
# If it's unnamed and empty, remove it completely.
if($nn eq '' && !defined($matches[$i+2])){
push @$replacements, {
'orig' => $api->replace_nowiki($matches[$i],$nowiki),
'repl' => ''
};
next;
}
# Unknown parameters cause errors, so replace them if found.
if($matches[$i+1] ne $params) {
my $old=$matches[$i];
$matches[$i+1]=$params;
$matches[$i]='<ref'.$matches[$i+1].(defined($matches[$i+2])?'>'.$matches[$i+2].'</ref>':'/>');
push @$replacements, {
'orig' => $api->replace_nowiki($old,$nowiki),
'repl' => $api->replace_nowiki($matches[$i],$nowiki)
};
}
# We're not interested if it's unnamed.
if($nn eq ''){
$$b0rken='Ref contains <ref>' if(defined($matches[$i+2]) && $matches[$i+2]=~/<ref(?:[\s>]|$)/);
$$b0rken='Ref contains ==' if(defined($matches[$i+2]) && $matches[$i+2]=~/\n==/);
next;
}
# Fix whitespace in parameters
my $g2 = $g; $g=~s/[\t\r\n ]+/ /g; $g=~s/^\s+|\s+$//g;
if ( $g ne $g2 ) {
my $old=$matches[$i];
$matches[$i+1]=~s/group\s*=\s*([\x22\x27]?)\Q$g2\E\1/group=$1$g$1/i;
$matches[$i]='<ref'.$matches[$i+1].(defined($matches[$i+2])?'>'.$matches[$i+2].'</ref>':'/>');
push @$replacements, {
'orig' => $api->replace_nowiki($old,$nowiki),
'repl' => $api->replace_nowiki($matches[$i],$nowiki)
};
}
my $n2 = $n; $n=~s/[\t\r\n ]+/ /g; $n=~s/^\s+|\s+$//g;
if ( $n ne $n2 ) {
my $old=$matches[$i];
$matches[$i+1]=~s/name\s*=\s*([\x22\x27]?)\Q$n2\E\1/name=$1$n$1/i;
$matches[$i]='<ref'.$matches[$i+1].(defined($matches[$i+2])?'>'.$matches[$i+2].'</ref>':'/>');
push @$replacements, {
'orig' => $api->replace_nowiki($old,$nowiki),
'repl' => $api->replace_nowiki($matches[$i],$nowiki)
};
}
# Integer names cause errors, so replace them if found.
if($n=~/^\d+$/){
# New name embeds a timestamp; skip if it already occurs in the text
# (avoids generating a colliding rename).
my $x="renamed_from_".$n."_on_".strftime('%Y%m%d%H%M%S', gmtime);
next if index($text, $x)>=0;
my $old=$matches[$i];
$matches[$i+1]=~s/name\s*=\s*([\x22\x27]?)$n\1/name=$1$x$1/i;
$matches[$i]='<ref'.$matches[$i+1].(defined($matches[$i+2])?'>'.$matches[$i+2].'</ref>':'/>');
push @$replacements, {
'orig' => $api->replace_nowiki($old,$nowiki),
'repl' => $api->replace_nowiki($matches[$i],$nowiki)
};
$n=$x;
}
# Decode HTML entities, as MediaWiki does for <ref> tags (but not {{#tag:ref}})
$g = HTML::Entities::decode( $g );
$n = HTML::Entities::decode( $n );
# Save detected reference
$refs{$g}={} unless exists($refs{$g});
if(!exists($refs{$g}{$n})){
$refs{$g}{$n}={
orig => [],
type => '',
content => undef,
listdefined => $listdefined
};
}
push @{$refs{$g}{$n}{'orig'}}, $api->replace_nowiki($matches[$i+0],$nowiki);
if(defined($matches[$i+2]) && $matches[$i+2]=~/<ref(?:[\s>]|$)/){
# Reference contains "<ref", so probably someone forgot a </ref>
# somewhere (and then that's probably how it got "orphaned"). To be
# safe, don't use it.
$matches[$i+2]=undef;
$refs{$g}{$n}{'broken'}=1;
$$b0rken='Ref contains <ref>';
}
if(defined($matches[$i+2]) && $matches[$i+2]=~/\n==/){
# Reference contains "==", so probably someone forgot a </ref>
# somewhere (and then that's probably how it got "orphaned"). To be
# safe, don't use it.
$matches[$i+2]=undef;
$refs{$g}{$n}{'broken'}=1;
$$b0rken='Ref contains ==';
}
# First occurrence with a body wins as the canonical content.
if($refs{$g}{$n}{'type'} eq '' && defined($matches[$i+2])){
$refs{$g}{$n}{'type'}='ref';
$refs{$g}{$n}{'repl'}=$api->replace_nowiki($matches[$i+0],$nowiki);
$refs{$g}{$n}{'content'}=$api->replace_nowiki($matches[$i+2],$nowiki);
}
}
# Darn. Now we have to parse through the page and find all the #tag:refs
# and {{refn}}/{{efn}} too.
$api->process_templates($text, sub {
my $name=shift;
my @params=@{shift()};
my $orig=$api->replace_nowiki(shift,$nowiki);
shift;
my $oname=shift;
my ($type, $groupre, @contentparams);
my $g=$defaultgroup;
my $c;
if($name=~/^#tag:\s*ref$/is){
$oname='#tag:ref';
$type='tag';
$groupre=qr/[^\x22\x27]*/;
@contentparams=();
# First positional parameter of {{#tag:ref}} is the content.
$c=$api->replace_nowiki(shift(@params),$nowiki);
} elsif(exists($reftpl{$name}) || exists($reftpl{"Template:$name"})){
$type='tpl';
my $props=$reftpl{$name} // $reftpl{"Template:$name"};
$g=$props->{'group'} if $props->{'group'} ne '';
$groupre=$props->{'groupre'};
@contentparams=@{$props->{'content'}};
} else {
return undef;
}
my $n=undef;
my @bad=();
foreach my $p ($api->process_paramlist(@params)){
# Whitespace and quotes will be stripped from name and group by #tag, and all the templates use #tag at some level too.
my $v = $p->{'value'};
$v=~s/^\s*([\x22\x27]?)(.*?)\1\s*$/$2/;
if($p->{'name'} eq 'group'){
$g=$api->replace_nowiki($v,$nowiki) if $v =~ /$groupre/;
} elsif($p->{'name'} eq 'name') {
$n=$api->replace_nowiki($v,$nowiki);
} elsif(grep { $p->{'name'} eq $_ } @contentparams) {
$c=$api->replace_nowiki($p->{'value'},$nowiki);
} else {
push @bad, $p->{'text'};
}
}
# If it's a template, no content, and one "bad" param that contains an `=`, let's guess it's
# a case where they should have used an explicit param name and didn't.
if(!defined($c) && $type ne 'tag' && @bad == 1 && $bad[0]=~/=/ && @contentparams){
$c = pop @bad;
my ($cp) = @contentparams;
my $old=$orig;
$orig="\x7b\x7b$oname";
for my $p (@params) {
$p = "$cp=" . $api->replace_nowiki($p,$nowiki) if $p eq $c;
$orig.='|' . $api->replace_nowiki($p,$nowiki);
}
$orig.="\x7d\x7d";
push @$replacements, {
'orig' => $old,
'repl' => $orig
};
$c=$api->replace_nowiki($c,$nowiki);
}
$c='' if !defined($c);
# We're not interested if it's unnamed. But strip it out if
# it's unnamed and empty, because that's an error.
if(!defined($n)){
if($c eq ''){
push @$replacements, {
'orig' => $orig,
'repl' => ''
};
}
return undef;
}
# If it had unrecognized parameters to the tag, strip them
if(@bad && $type eq 'tag'){
my $old=$orig;
$orig="\x7b\x7b$oname";
$orig.="|$c";
$orig.="|name=$n" if defined($n);
$orig.="|group=$g" if $g ne $defaultgroup;
$orig.="\x7d\x7d";
push @$replacements, {
'orig' => $old,
'repl' => $orig
};
}
# Integer names cause errors, so replace them if found.
if($n=~/^\d+$/){
my $x="renamed_from_".$n."_on_".strftime('%Y%m%d%H%M%S', gmtime);
# NOTE(review): this "next" is inside the process_templates callback sub,
# so it exits the sub via "Exiting subroutine via next" (works, but warns);
# intended effect is "skip this template".
next if index($text, $x)>=0;
my $old=$orig;
$orig="\x7b\x7b$oname";
if($type eq 'tag'){
$orig.="|$c";
$orig.="|name=$x";
$orig.="|group=$g" if $g ne $defaultgroup;
} else {
foreach my $p (@params){
$p =~ s/^(\s*name\s*=\s*).*?(\s*)$/$1$x$2/;
$orig.='|' . $api->replace_nowiki($p,$nowiki);
}
}
$orig.="\x7d\x7d";
$n=$x;
push @$replacements, {
'orig' => $old,
'repl' => $orig
};
}
# Save detected reference
$refs{$g}={} unless exists($refs{$g});
if(!exists($refs{$g}{$n})){
$refs{$g}{$n}={
orig => [],
type => '',
content => undef,
listdefined => $listdefined
};
}
if($c=~/^\s*$/){
# Apparently, some people really do this. Don't use empty refs.
$c='';
}
push @{$refs{$g}{$n}{'orig'}}, $orig;
if($refs{$g}{$n}{'type'} eq '' && $c ne ''){
$refs{$g}{$n}{'type'}=$type;
$refs{$g}{$n}{'repl'}=$orig;
$refs{$g}{$n}{'content'}=$c;
}
return undef;
});
return %refs;
}
# process_templates callback to strip templates and store them in the fourth
# parameter hash.
#
# Returns undef (leave the template alone) for ref/references tag-functions,
# known ref templates (%alltpl), and anything in the %no_move_refs_out skip
# list; returns 1 (strip and stash) for everything else.
sub _strip_templates {
    my ($name, $params, $wikitext, $data) = @_;
    return undef
        if $name =~ /^#tag:\s*(ref|references)$/is
        || exists($alltpl{$name})
        || exists($no_move_refs_out{$name})
        || exists($no_move_refs_out{"Template:$name"});
    return 1;
}
# Regexes used below
# $months matches English month names; $sp matches runs of whitespace;
# $dt matches the various date formats used in "Retrieved ..." access dates
# (day-month / month-day, wikilinked or not, plus ISO-style dates).
my $months=qr/(?:January|February|March|April|May|June|July|August|September|October|November|December)/i;
# NOTE(review): the second alternative below appears to be a literal
# non-breaking space (likely "&nbsp;" before HTML-entity decoding mangled this
# copy of the source) — verify against the canonical source.
my $sp=qr/(?:(?:\s| )+)/;
my $dt=qr/(?:(?:\d{1,2}$sp$months|\[\[\d{1,2}[ _]$months\]\]|$months$sp\d{1,2}|\[\[$months[ _]\d{1,2}\]\])$sp?,?$sp?(?:\d{1,4}(?:${sp}BC)?|\[\[\d{1,4}(?:[ _]BC)?\]\])|-?\d{4}-\d{2}-\d{2}|\[\[-?\d{4}-\d{2}-\d{2}\]\]|\[\[-?\d{4}\]\]-\[\[\d{2}-\d{2}\]\])/i;
# Check all the pages in the specified query for needed refs
#
# $pages is an arrayref of page titles linked from $title (via edit summaries
# or wikilinks, per $type). Scans the current revision of each page (plus the
# past 24 hours of revisions for recently-edited ones) for refs matching the
# orphans in %$needed; matches are queued on @$replacements and logged, while
# ambiguous multi-version matches are posted to the article's talk page for a
# human to sort out. Returns 0 on success, -1 on a retryable failure, -2 on a
# failed write, or 300 when the bot shutoff is triggered.
# NOTE(review): bunchlist() and ISO2timestamp() are plain functions here
# (imported/defined elsewhere in this file), unlike $api->ISO2timestamp used
# in other parts of the task — verify both are in scope.
sub _check_linked_pages {
my ($api,$self,$pages,$type,$pageid,$title,$refs,$needed,$found,$replacements,$log)=@_;
my %found_in_links=();
my %dup_in_links=();
# Resolve any redirects in the list
my %r=$api->resolve_redirects(@$pages);
if(exists($r{''})){
$api->warn("Failed to resolve redirects in $type links for $title: ".$r{''}{'error'}."\n");
return -1;
}
# Don't treat the target article as one of its own sources.
delete $r{$title};
# Get revids for the top revision in all pages, and also get a list of
# recently-edited pages in the list
my %revisions=();
my @pages=();
my $iter=$api->iterator(
titles => bunchlist(500, keys %r),
prop => 'revisions',
rvprop => 'ids|timestamp',
);
while(my $r=$iter->next){
if(!$r->{'_ok_'}){
$api->warn("Failed to retrieve revids for $type links for $title: ".$r->{'error'}."\n");
return -1;
}
next unless exists($r->{'revisions'}[0]{'revid'});
$revisions{$r->{'revisions'}[0]{'revid'}}=1;
# Pages edited in the last day get their recent history scanned too.
push @pages, $r->{'title'} if ISO2timestamp($r->{'revisions'}[0]{'timestamp'})>time-86400;
}
return -1 if $api->halting;
# Now get the revids for the past 24 hours for all the recently-edited pages
for my $p (@pages){
my %rq=(
titles => $p,
prop => 'revisions',
rvprop => 'ids|timestamp',
rvlimit => '100',
);
do {
my $res=$api->query(%rq);
if($res->{'code'} ne 'success'){
$api->warn("Failed to retrieve older revids for $type links for $p (for $title): ".$res->{'error'}."\n");
return -1;
}
if(exists($res->{'query-continue'})){
$rq{'rvcontinue'}=$res->{'query-continue'}{'revisions'}{'rvcontinue'};
} else {
delete($rq{'rvcontinue'});
}
foreach my $r (@{(values %{$res->{'query'}{'pages'}})[0]{'revisions'}}){
$revisions{$r->{'revid'}}=1;
# Stop paging once we're past the 24-hour window.
if(ISO2timestamp($r->{'timestamp'})<time-86400){
delete($rq{'rvcontinue'});
last;
}
}
} while(exists($rq{'rvcontinue'}));
}
# Found any revids?
return 0 unless %revisions;
$iter=$api->iterator(
revids => bunchlist(50, keys %revisions),
prop => 'revisions',
rvprop => 'content|timestamp',
rvslots => 'main',
);
while(my $r=$iter->next){
return -1 if $api->halting;
if(!$r->{'_ok_'}){
$api->warn("Failed to retrieve $type revisions for $title: ".$r->{'error'}."\n");
return -1;
}
foreach my $rev (@{$r->{'revisions'}//[]}){
next unless exists($rev->{'slots'}{'main'}{'*'}); # RevDel'd content
my $ts=ISO2timestamp($rev->{'timestamp'});
# Get refs from this linked page, and see if any of
# them are the ones we need.
my %rrefs=$self->_get_refs($api, $rev->{'slots'}{'main'}{'*'});
foreach (keys %$needed){
my ($g,$n)=split />/, $_, 2;
next if !exists($rrefs{$g}{$n});
next if $rrefs{$g}{$n}{'type'} eq '';
next if _is_generic_ref_name($n,$g);
$found_in_links{$_}={} if !exists($found_in_links{$_});
# Did we find a duplicate of a ref already in our target article?
# Slot layout: [score, dup name, replacement tag, source title].
my ($dup,$dupref)=_check_dups($g,$rrefs{$g}{$n},$refs->{$g});
$dup_in_links{$_}=[0,$dup,$dupref,$r->{'title'}] if defined($dup);
# Keep only the most recent version from each article
# (slots [4] and [5] of each entry are source pageid and timestamp).
my $have_newer=0;
while(my ($k,$v)=each %{$found_in_links{$_}}){
if($v->[4]==$r->{'pageid'}){
if($ts>$v->[5]){
delete $found_in_links{$_}{$k} if $ts>$v->[5];
} else {
$have_newer=1;
}
}
}
next if $have_newer;
my $content=$rrefs{$g}{$n}{'content'};
# To help minimize false dups, strip whitespace, manipulate
# dashes, and remove accessdate parameters from the key, and
# sort named template params.
my $k=$content;
$k=~s/Retrieved (?:on )?$dt/Retrieved xxx/ig;
$k=$api->process_templates($k, sub {
my $name=shift;
my @params=@{shift()};
return undef unless @params;
my %p=();
my $i=1;
foreach (@params){
s/\s+//g;
next if(/^access(date|monthday|daymonth|year)=/);
if(/^([^=]+)=/){
$p{$1}=$_;
} else {
# Positional params get explicit indices so sorting is stable.
$p{$i}="$i=$_";
$i++;
}
}
return "{{$name|".join("|",sort values %p)."}}";
});
$k=~s/[\x{2013}\x{2014}]|&([mn]dash|#0*821[12]|#x0*201[34]);/-/g;
$k=~s/\x{2212}|&(minus|#0*8722|#x0*2212);/-/g;
$k=~s/\s+//g;
# Slot layout: [score, replacement tag, source title, content, source pageid, timestamp].
$found_in_links{$_}{$k}=[0,$rrefs{$g}{$n}{'repl'},$r->{'title'},$content,$r->{'pageid'},$ts];
}
}
}
my @talkpost=();
# "p$pageid" store entry remembers which ambiguous refs we already posted about.
$api->store->{"p$pageid"}={} unless exists($api->store->{"p$pageid"});
my $posted=$api->store->{"p$pageid"};
foreach (keys %found_in_links){
my ($g,$n)=split />/, $_, 2;
my @repl=keys %{$found_in_links{$_}};
my ($score,$repl,$from,$content);
my $dup=undef;
if(exists($dup_in_links{$_})){
($score,$dup,$repl,$from)=@{$dup_in_links{$_}};
} elsif(@repl>1){
# Crap, we have multiple versions of the named ref.
next if exists($posted->{$_});
$posted->{$_}=1;
my $x="<b>Reference named \"$n\"";
$x.=" in group \"$g\"" if $g ne '';
$x.=":</b><ul>\n";
foreach my $k (@repl){
($score,$repl,$from,$content)=@{$found_in_links{$_}{$k}};
$x.="<li>From [[$from]]: $content</li>\n";
}
$x.="</ul>\n";
push @talkpost, $x;
next;
} else {
my $k=$repl[0];
($score,$repl,$from)=@{$found_in_links{$_}{$k}};
}
if(defined($dup)){
# Content duplicates an existing ref: point all orphans at that ref.
foreach my $need (@{$needed->{$_}}) {
push @$replacements, {
'orig' => $need,
'repl' => $repl,
};
}
$found->{$_}="\"$n\" → \"$dup\" from [[$from]]";
push @$log, "** Renamed \"$n\" → \"$dup\" from [[:$from]]";
} else {
push @$replacements, {
'orig' => $needed->{$_}[0],
'repl' => $repl,
};
$found->{$_}="\"$n\" from [[$from]]";
push @$log, "** Rescued \"$n\" from [[:$from]]";
}
delete $needed->{$_};
}
if(@talkpost){
my $ttok=$api->edittoken('Talk:'.$title);
if($ttok->{'code'} eq 'shutoff'){
$api->warn("Task disabled: ".$ttok->{'content'}."\n");
return 300;
}
if($ttok->{'code'} ne 'success'){
$api->warn("Failed to get edit token for Talk:$title: ".$ttok->{'error'}."\n");
return -2;
}
my $txt="I check pages listed in ";
$txt.="[[:Category:Pages with incorrect ref formatting]] to ";
$txt.="try to fix reference errors. One of the things I ";
$txt.="do is look for content for ";
$txt.="[[User:".$api->user."/docs/OrphanReferenceFixer|orphaned references]] ";
$txt.="in wikilinked articles. I have found content for ";
$txt.="some of [[:$title]]'s orphans, the problem is that ";
$txt.="I found more than one version. I can't determine ";
$txt.="which (if any) is correct for ''this'' article, so ";
$txt.="I am asking for a sentient editor to look it over ";
$txt.="and copy the correct ref content into this article.\n\n";
$txt.=join("\n", @talkpost);
$txt.="\nI apologize if any of the above are effectively ";
$txt.="identical; I am just a simple computer program, so ";
$txt.="I can't determine whether minor differences are ";
$txt.="significant or not. <small>Feel free to remove this comment after fixing the refs.</small> \x7e\x7e\x7e\x7e";
my $r=$api->edit($ttok, $txt, "Orphaned references in [[:$title]]", 0, 0, section => 'new');
if($r->{'code'} ne 'success'){
$api->warn("Write failed on Talk:$title: ".$r->{'error'}."\n");
return -2;
}
$self->_log($api, "* Posted on [[Talk:$title]] to request assistance");
$api->store->{"p$pageid"}=$posted;
}
return 0;
}
# Return true if a ref name is autogenerated or too generic to be a
# meaningful cross-article match (so it should be skipped when hunting for
# orphaned-ref content). Takes the ref name and its group ('' for none).
sub _is_generic_ref_name {
    my ($name, $group) = @_;

    if ($group eq '') {
        # Autogenerated named refs are unlikely to be useful matches.
        return 1 if $name =~ /^autogenerated\d+$/;
        # These very generic names aren't useful either.
        return 1 if $name =~ /^e\d+$/i;
    }
    # VisualEditor assigns names like ":0", ":1" regardless of group.
    return $name =~ /^:\d+$/;
}
# Append one line (newline-terminated) to the persistent run log kept in
# the API store under the 'log' key.
sub _log {
    my ($self, $api, $line) = @_;
    $api->store->{'log'} .= $line . "\n";
}
# Post a "{{subst:...revert help}}" notice on the talk page of a user who
# reverted one of the bot's reference fixes, unless that revision has
# already been handled. Before substing, verify the help template was last
# edited by an authorized user so we never subst vandalized content onto a
# talk page.
#
# Arguments: $api, $user (reverter's username), $page (reverted page
# title), $revid (revision id of the revert).
# Returns 0 on success or harmless skip, 300 to back off / disable the
# task, -2 on an API failure worth retrying.
sub _notify_reverter {
    my ($self, $api, $user, $page, $revid) = @_;

    # Only notify once per reverted revision. BUGFIX: this was `next`,
    # which is loop control — inside a sub it triggers "Exiting subroutine
    # via next" and returns no defined value; `return 0` is what's meant.
    return 0 if exists($api->store->{"revert$revid"});

    my $template='User:AnomieBOT/OrphanReferenceFixer revert help';

    # Fetch the most recent editor of the help template.
    my $chk=$api->query(
        titles => $template,
        prop => 'revisions',
        rvprop => 'user',
        rvlimit => 1,
    );
    if($chk->{'code'} ne 'success'){
        $api->warn("Could not check $template: ".$chk->{'error'}."\n");
        return -2;
    }
    my $edituser=(values %{$chk->{'query'}{'pages'}})[0]{'revisions'}[0]{'user'};

    # Refuse to use the template until an authorized user touches it again.
    unless(grep $_ eq $edituser, ('Anomie')){
        $api->log("An unauthorized user has edited $template!");
        $api->warn("An unauthorized user has edited $template!\n");
        $api->whine("An unauthorized user has edited [[:$template]]", "An unauthorized user has edited [[:$template]], so I am refusing to use it until an authorized user confirms it has not been vandalized by making any edit to it. No offense to [[User:$edituser|$edituser]], but I don't want to go substing vandalism on innocent people's talk pages.");
        return 300;
    }

    my $title="Help on reversion";
    my $summary="Provide information on correctly fixing reference errors (instead of reverting)";
    my $msg="{{subst:$template|page=$page|revid=$revid|subst=subst:}}";
    my $res=$api->whine($title, $msg, Summary => $summary, Pagename => "User talk:$user", OptOut => 'AnomieBOT-OrphanReferenceFixer', NoSmallPrint => 1, NoSig => 1);
    if($res->{'code'} eq 'shutoff'){
        $api->warn("Task disabled: ".$res->{'content'}."\n");
        return 300;
    }
    if($res->{'code'} eq 'botexcluded'){
        # Excluded from the talk page (presumably opt-out or {{nobots}} —
        # depends on whine()'s semantics); record it and don't retry.
        $self->_log($api, "* Tried to give a revert notice to [[User talk:$user|$user]] about [[Special:Diff/$revid|$revid]] on [[:$page]], but I was excluded: <nowiki>".$res->{'error'}."</nowiki>");
        $api->log("Bot excluded from User talk:$user: ".$res->{'error'});
        $api->store->{"revert$revid"}=1;
        return 0;
    }
    if($res->{'code'} ne 'success'){
        # BUGFIX: the old message claimed an edit-token failure, but this
        # branch handles a failed talk-page post.
        $api->warn("Failed to post revert notice to User talk:$user: ".$res->{'error'}."\n");
        return -2;
    }
    $self->_log($api, "* Issued a revert notice to [[User talk:$user|$user]] about [[Special:Diff/$revid|$revid]] on [[:$page]]");
    $api->log("Issued a revert notice to User talk:$user about $revid on $page");
    $api->store->{"revert$revid"}=1;
    return 0;
}
# This function can be used to run the bot over arbitrary page content.
# Something like:
# perl -we 'use tasks::OrphanReferenceFixer; tasks::OrphanReferenceFixer::unit_test($revid[,$filename]);'
# Run the bot's page-processing logic over a single revision (or over the
# contents of a local file, using the revision only for page metadata).
# Intended for manual testing from the command line; writes the run log to
# STDERR, the result structure (minus the output text) via Data::Dumper to
# STDERR, and the processed wikitext to STDOUT. Dies on any setup failure.
#
# Arguments: $revid (revision id to process), optional $filename (local
# file supplying the page text instead of the revision content).
sub unit_test {
    my ($revid, $filename) = @_;

    $|=1;
    binmode STDOUT, ':utf8';
    binmode STDERR, ':utf8';

    # Bring up the bot environment; 'noedit' redirects any would-be edits
    # to files under /tmp so nothing is written to the wiki.
    my $self=tasks::OrphanReferenceFixer->new();
    my $api=AnomieBOT::API->new('conf.ini', 1);
    $api->{'noedit'}='/tmp/';
    $api->login();
    $api->DEBUG(-1);
    $api->task('OrphanReferenceFixer', 0, 0.1, qw/d::Talk d::Timestamp d::Templates d::Redirects d::IWNS/);
    return undef unless $api->load_IWNS_maps();
    my $r=$self->init($api);
    die "init failed\n" if defined($r);

    # Fetch page metadata; skip fetching revision content when a local
    # file will supply the text instead.
    my $res=$api->query(revids=>$revid,prop=>'info|revisions',rvprop=>$filename?'':'content',rvslots=>'main');
    if($res->{'code'} ne 'success'){
        die "Could not load revision $revid: ".$res->{'error'}."\n";
    }
    die "Invalid revid\n" unless(exists($res->{'query'}{'pages'}) && %{$res->{'query'}{'pages'}});
    $res=(values(%{$res->{'query'}{'pages'}}))[0];
    my $pageid=$res->{'pageid'};
    my $lastrevid=$revid;
    my $title=$res->{'title'};

    my $intxt;
    if($filename){
        # BUGFIX/idiom: use a lexical filehandle instead of the old
        # package-global bareword handle "X".
        open my $fh, '<:utf8', $filename or die "Could not open $filename: $!\n";
        { local $/=undef; $intxt=<$fh>; }
        close $fh;
    } else {
        $intxt=$res->{'revisions'}[0]{'slots'}{'main'}{'*'};
    }

    # Fresh per-page checkpoint state, as process_page expects to find in
    # the store; the deadline is one day out so it never fires here.
    my $checked={revid=>$lastrevid,continue=>'<beginning>',did_summary_links=>0,did_page_links=>0,prev_ts=>-1,prev_info=>[0,'','',-1],unfound=>[],ignored=>[]};
    $api->store->{$pageid}=$checked;
    $api->store->{'log'}='';
    my $ret=$self->process_page($api,$pageid,$lastrevid,$title,$intxt,$checked,time()+86400);
    print STDERR $api->store->{'log'}."\n";
    die "Returned undef\n" unless defined($ret);
    die "Returned $ret\n" unless ref($ret);

    # Print the result structure (sans text) to STDERR, the text to STDOUT.
    my $outtxt=$ret->{'outtxt'};
    delete $ret->{'outtxt'};
    print STDERR scalar Data::Dumper->Dump([$ret],['ret'])."\n";
    print $outtxt;
}
1;