User:AnomieBOT/source/tasks/OrphanReferenceFixer.pm
Appearance
BRFA approved 2008-09-04 Wikipedia:Bots/Requests for approval/AnomieBOT |
First supplemental BRFA approved 2008-09-11 Wikipedia:Bots/Requests for approval/AnomieBOT 3 |
Second supplemental BRFA approved 2008-09-20 Wikipedia:Bots/Requests for approval/AnomieBOT 6 |
Third supplemental BRFA approved 2009-03-23 Wikipedia:Bots/Requests for approval/AnomieBOT 27 |
package tasks::OrphanReferenceFixer;
=pod
=begin metadata
Bot: AnomieBOT
Task: OrphanReferenceFixer
BRFA: Wikipedia:Bots/Requests for approval/AnomieBOT
Status: Approved 2008-09-04
+BRFA: Wikipedia:Bots/Requests for approval/AnomieBOT 3
+Status: Approved 2008-09-11
+BRFA: Wikipedia:Bots/Requests for approval/AnomieBOT 6
+Status: Approved 2008-09-20
+BRFA: Wikipedia:Bots/Requests for approval/AnomieBOT 27
+Status: Approved 2009-03-23
Created: 2008-08-20
Applies the following corrections to pages in [[:Category:Pages with incorrect
ref formatting]] and/or [[:Category:Pages with broken reference names]]. This
is often enough to get them removed from the category.
<div style="font-size:90%">
* <nowiki><ref name=foo bar> → <ref name="foo bar"></nowiki>
* <nowiki><ref name="foo> → <ref name="foo"></nowiki>
* <nowiki><ref name=bar"> → <ref name="bar"></nowiki>
* <nowiki><ref name "foo"> → <ref name="foo"></nowiki>
* <nowiki><ref name-"foo"> → <ref name="foo"></nowiki>
* <nowiki><ref name+"foo"> → <ref name="foo"></nowiki>
* <nowiki><ref "foo"> → <ref name="foo"></nowiki>
* <nowiki><ref name="foo" name="bar"> → <ref name="bar"></nowiki>
* <nowiki><ref …></ref> → <ref …/></nowiki>
* Remove <nowiki><ref …/></nowiki> without <code>name</code>
* Strip parameters other than <code>name</code> and <code>group</code> from <nowiki><ref> and <references></nowiki>
* Rename refs with numeric names
* Copy content for orphaned named refs from past page revisions
* Copy content for orphaned named refs from linked articles
* Move content for named refs out of infoboxen and other templates
</div>
Actions are periodically logged to [[User:AnomieBOT/OrphanReferenceFixer log]].
=end metadata
=cut
use utf8;
use strict;
use warnings;
use AnomieBOT::API;
use AnomieBOT::Task qw/:time bunchlist/;
use Storable qw/thaw/;
use HTML::Entities ();
use Data::Dumper;

# was: use vars qw/@ISA/ — obsolete since Perl 5.6, replaced with `our`
our @ISA = qw/AnomieBOT::Task/;

# Categories indicating a human is actively editing the page right now.
my %inuse=(
    'Category:Pages actively undergoing a major edit'=>1,
);
my $inuse_delay=7200;  # seconds to leave {{inuse}}-marked pages alone
my $min_delay=600;     # seconds since last edit before the bot will touch a page
# Edit-count thresholds used by the "trusted editor" heuristic in run().
my $arbitrary_untrusted_threshold=1000;
my $arbitrary_trusted_threshold=2000;
my $untrusted_delay=7200;  # how far back (seconds) to scan for untrusted editors
# Revision tags that make the bot skip a revision entirely.
my %skiptags=(
    'possible libel or vandalism' => 1,
);
my $logpage='User:AnomieBOT/OrphanReferenceFixer log';
my $logfrequency=21600;  # flush the accumulated action log every 6 hours
my $loglength=28;        # max sections kept on the on-wiki log page
my $initialized=0;       # set by init() after redirect resolution succeeds
# Templates whose refs should not be moved out into the article body.
my %no_move_refs_out=(
    'Template:Graphic novel list' => 1,
    'Template:Infobox nrhp' => 1,
    'Template:Infobox Lighthouse' => 1,
    'Template:Episode list/sublist' => 1,
);
my $knowngroups = qr/^(?:note|upper-alpha|upper-roman|lower-alpha|lower-greek|lower-roman)$/;
# Templates that wrap a single <ref>; 'content' lists the parameter names
# that may hold the reference text, 'group' is the implied ref group.
my %reftpl=(
    'Template:Refn' => { group => '', groupre => qr/[^\x22\x27]*?/, content => [ 'refn', '1' ] },
    'Template:Efn' => { group => 'lower-alpha', groupre => $knowngroups, content => [ 'reference', '1', 'content', 'text' ] },
    'Template:Efn-ua' => { group => 'upper-alpha', groupre => qr/(?!)/, content => [ 'reference', '1', 'content', 'text' ] },
    'Template:Efn-lr' => { group => 'lower-roman', groupre => qr/(?!)/, content => [ 'reference', '1', 'content', 'text' ] },
    'Template:Efn-ur' => { group => 'upper-roman', groupre => qr/(?!)/, content => [ 'reference', '1', 'content', 'text' ] },
    'Template:Efn-lg' => { group => 'lower-greek', groupre => qr/(?!)/, content => [ 'reference', '1', 'content', 'text' ] },
    'Template:NoteTag' => { group => 'note', groupre => qr/(?!)/, content => [ 'note', '1', 'content', 'text' ] },
);
# Templates that render a <references> list; 'refs' lists the parameter
# names that may hold list-defined references.
my %reflist=(
    'Template:Reflist' => { group => '', groupre => qr/[^\x22\x27]*?/, refs => [ 'refs' ] },
    'Template:Notelist' => { group => 'lower-alpha', groupre => $knowngroups, refs => [ 'refs', 'notes' ] },
    'Template:Notelist-la' => { group => 'lower-alpha', groupre => qr/(?!)/, refs => [ 'refs', 'notes' ] },
    'Template:Notelist-ua' => { group => 'upper-alpha', groupre => qr/(?!)/, refs => [ 'refs', 'notes' ] },
    'Template:Notelist-lr' => { group => 'lower-roman', groupre => qr/(?!)/, refs => [ 'refs', 'notes' ] },
    'Template:Notelist-ur' => { group => 'upper-roman', groupre => qr/(?!)/, refs => [ 'refs', 'notes' ] },
    'Template:Notelist-lg' => { group => 'lower-greek', groupre => qr/(?!)/, refs => [ 'refs', 'notes' ] },
    'Template:NoteFoot' => { group => 'note', groupre => qr/(?!)/, refs => [ 'refs', 'notes' ] },
);
my %alltpl;   # combined template lookup, populated by init()
my $attrRe;   # matches one HTML-style attribute: name, optionally ="value"/'value'/bare
{
    # Fixed: the braces here were accidentally doubled ("{{ ... }}"), which
    # created a pointless nested bare block. A single block scopes $s.
    # \x09\x0a\x0c\x0d\x20 are the HTML5 whitespace characters.
    my $s = "\x09\x0a\x0c\x0d\x20";
    $attrRe = qr/([$s]*([^$s\/>][^$s\/>=]*)(?:[$s]*=[$s]*(?|(")([^"]*)("|$)|(')([^']*)('|$)|()([^$s>]*)()))?)/;
}
# Construct a task instance on top of the AnomieBOT::Task base object.
sub new {
    my ($class) = @_;
    my $self = $class->SUPER::new();

    # The "skip" list keeps one long page from monopolizing the bot's time:
    # a page is checked for at most 10 minutes per run, then skipped on
    # subsequent runs until every other page in the category has had a turn.
    $self->{'skip'} = {};

    # Timestamp of the last datastore sweep for obsolete entries.
    $self->{'lastcleanup'} = 0;

    return bless $self, $class;
}
=pod
=for info
BRFA approved 2008-09-04<br />[[Wikipedia:Bots/Requests for approval/AnomieBOT]]
=for info
First supplemental BRFA approved 2008-09-11<br />[[Wikipedia:Bots/Requests for approval/AnomieBOT 3]]
=for info
Second supplemental BRFA approved 2008-09-20<br />[[Wikipedia:Bots/Requests for approval/AnomieBOT 6]]
=for info
Third supplemental BRFA approved 2009-03-23<br />[[Wikipedia:Bots/Requests for approval/AnomieBOT 27]]
=cut
# Number of BRFAs approved for this task (the fourth being AnomieBOT 27).
sub approved {
    my $brfa_count = 4;
    return $brfa_count;
}
# One-time setup: resolve template redirects and build the combined
# template lookup. Returns undef on success (or if already initialized),
# or a retry delay in seconds on API failure.
sub init {
    my ($self, $api) = @_;
    return undef if $initialized;

    # Resolve redirects to the "don't move refs out of these" templates.
    my %redir = $api->redirects_to_resolved(keys %no_move_refs_out);
    if (exists $redir{''}) {
        $api->warn("Failed to get non-removal redirects: ".$redir{''}{'error'}."\n");
        return 60;
    }
    %no_move_refs_out = %redir;

    # Resolve redirects to the reference-list templates. Every redirect
    # source is treated like the target, and is also exempt from ref moving.
    %redir = $api->redirects_to_resolved(keys %reflist);
    if (exists $redir{''}) {
        $api->warn("Failed to get reflist redirects: ".$redir{''}{'error'}."\n");
        return 60;
    }
    foreach my $from (keys %redir) {
        $reflist{$from} = $reflist{$redir{$from}};
        $no_move_refs_out{$from} = 1;
    }

    # Resolve redirects to the ref-wrapping templates.
    %redir = $api->redirects_to_resolved(keys %reftpl);
    if (exists $redir{''}) {
        $api->warn("Failed to get reftpl redirects: ".$redir{''}{'error'}."\n");
        return 60;
    }
    foreach my $from (keys %redir) {
        $reftpl{$from} = $reftpl{$redir{$from}};
    }

    # Combined lookup of all recognized templates, keyed both with and
    # without the "Template:" prefix. reftpl entries win on collision,
    # matching the original construction order.
    %alltpl = ();
    foreach my $tpl (keys %reflist) {
        (my $bare = $tpl) =~ s/^Template://;
        $alltpl{$bare} = 'references';
        $alltpl{"Template:$bare"} = 'references';
    }
    foreach my $tpl (keys %reftpl) {
        (my $bare = $tpl) =~ s/^Template://;
        $alltpl{$bare} = 'ref';
        $alltpl{"Template:$bare"} = 'ref';
    }

    $initialized = 1;
    return undef;
}
# Main task entry point. Iterates pages in the broken-ref categories,
# fixes what it can via process_page(), and maintains the on-wiki action
# log. Returns a delay in seconds before the next run (0 = immediately).
sub run {
    my ($self, $api)=@_;

    $api->task('OrphanReferenceFixer', 0, 10, qw/d::Talk d::Timestamp d::Templates d::Redirects d::IWNS/);
    return 300 unless $api->load_IWNS_maps();

    # One-time datastore format upgrades, versioned by 'did_upgrade'.
    if(!exists($api->store->{'did_upgrade'}) || $api->store->{'did_upgrade'}<1){
        while(my ($k,$v)=each %{$api->store}){
            if($k=~/^\d+$/){
                foreach (@{$v->{"unfound"}}){
                    my ($g,$n)=@{thaw($_)};
                    $_="$g>$n";
                }
                $api->store->{$k}=$v;
            } elsif($k=~/^p\d+$/){
                # Renamed $x to $val: it previously shadowed confusingly
                # against the %x hash below.
                my %x=();
                while(my ($blob,$val) = each %$v){
                    my ($g,$n)=@{thaw($blob)};
                    $x{"$g>$n"}=$val;
                }
                $api->store->{$k}=\%x;
            }
        }
        $api->store->{'did_upgrade'} = 1;
    }
    if($api->store->{'did_upgrade'}<2){
        # Changed the code to handle cases that were previously broken, so recheck all pages.
        while(my ($k,$v)=each %{$api->store}){
            delete $api->store->{$k} if $k=~/^\d+$/;
        }
        $api->store->{'did_upgrade'} = 2;
    }
    if($api->store->{'did_upgrade'}<4){
        # Added ignoring of old refs. Add the prop for that.
        while(my ($k,$v)=each %{$api->store}){
            next unless $k=~/^\d+$/;
            if(!defined($v->{'ignored'})){
                $v->{'ignored'} = [];
                $api->store->{$k}=$v;
            }
        }
        $api->store->{'did_upgrade'} = 4;
    }

    if($self->{'lastcleanup'}+86400<time()){
        # Cleanup obsolete entries in the data store
        my $exp=time()-86400*30;
        while(my ($k,$v)=each %{$api->store}){
            next unless $k=~/^\d+$/;
            delete $api->store->{$k} if $v->{'touched'}<$exp;
        }
        # Bug fix: lastcleanup was never updated, so the once-a-day guard
        # above was dead and the sweep ran on every invocation.
        $self->{'lastcleanup'}=time();
    }

    my $r=$self->init($api);
    return $r if defined($r);

    # Periodically flush the accumulated action log to the on-wiki log page.
    $api->store->{'log'}='' unless exists($api->store->{'log'});
    $api->store->{'lastlog'}=0 unless exists($api->store->{'lastlog'});
    my $log=$api->store->{'log'};
    my $lastlog=$api->store->{'lastlog'};
    if($log ne '' && $lastlog!=-1 && $lastlog+$logfrequency<time()){
        my $tok=$api->edittoken($logpage);
        if($tok->{'code'} eq 'shutoff'){
            $api->warn("Task disabled: ".$tok->{'content'}."\n");
            return 300;
        }
        if($tok->{'code'} ne 'success'){
            $api->warn("Failed to get edit token for $logpage: ".$tok->{'error'}."\n");
        } else {
            my @txt;
            my $nowiki={};
            my $txt;
            if(exists($tok->{'revisions'}[0]{'slots'}{'main'}{'*'})){
                # Split the existing log page into lead + one chunk per
                # level-2 section heading.
                ($txt,$nowiki)=$api->strip_nowiki($tok->{'revisions'}[0]{'slots'}{'main'}{'*'});
                @txt=split /(?=(?:^|\n)==[^=])/, $txt;
            } else {
                @txt=(
                    "<!-- Please do not edit the lead section. -->\n".
                    "<!-- You may edit any of the below sections as you wish. -->\n".
                    "This is a log of AnomieBOT's actions over the past few days.\n\n"
                );
            }
            my $h="\n== AnomieBOT Log";
            if($lastlog!=0){
                $h.=' for '.strftime('%F %T Z', gmtime($lastlog));
            }
            $h.=' to '.strftime('%F %T Z', gmtime($api->ISO2timestamp($tok->{'curtimestamp'})))."==\n";
            splice @txt, 1, 0, $h.$log;
            # Keep the lead section plus at most $loglength log sections.
            @txt=@txt[0..$loglength] if @txt>$loglength;
            # Fixed: removed a dead "$txt=join('',@txt);" whose result was
            # immediately overwritten by the line below.
            $txt=$api->replace_nowiki(join('',@txt), $nowiki);
            my $r=$api->edit($tok, $txt, "Log recent actions", 0, 0);
            if($r->{'code'} eq 'httperror'){
                # Could well be that MediaWiki saved the edit, but timed out
                # when trying to respond. So wait a short time and then check
                # the timestamp on the most recent edit by the bot.
                sleep(10);
                my $r2=$api->query(
                    titles => $logpage,
                    prop => 'revisions',
                    rvuser => $api->user,
                    rvprop => 'timestamp',
                    rvlimit => 1 # Only need the last rev
                );
                if($r2->{'code'} eq 'success'){
                    $r2=[values(%{$r2->{'query'}{'pages'}})];
                    if(exists($r2->[0]{'lastrevid'})){
                        # Bug fix: the old code did "$r=$r2", assigning an
                        # ARRAY ref that would make the $r->{'code'} check
                        # below die. If the page's revision changed, assume
                        # our edit actually saved and treat it as success.
                        $r={'code'=>'success'} if $r2->[0]{'lastrevid'} != $tok->{'lastrevid'};
                    }
                }
            }
            if($r->{'code'} ne 'success'){
                $api->warn("Write failed on $logpage: ".$r->{'error'}."\n");
            } else {
                $log='';
                $lastlog=$api->ISO2timestamp($tok->{'curtimestamp'});
                $api->store->{'log'}=$log;
                $api->store->{'lastlog'}=$lastlog;
            }
        }
    }

    # Spend a max of 5 minutes on this task before restarting
    my $endtime=time()+300;
    while(1){
        # Get an iterator for the list of pages to check
        my $iter=$api->iterator(
            generator => 'categorymembers',
            gcmtitle => [
                'Category:Pages with broken reference names',
                'Category:Pages with incorrect ref formatting',
            ],
            gcmnamespace => '0',
            gcmtype => 'page',
            gcmlimit => 'max',
            gcmsort => 'timestamp',
            gcmdir => 'desc',
            prop => 'info|categories',
            cllimit => 'max',
            clcategories => join('|', keys %inuse)
        );
        # Robustness fix: the page object was previously held in global $_
        # across ~280 lines including many sub calls that could clobber it;
        # use a lexical instead.
        PAGE: while(my $page=$iter->next){
            return 0 if $api->halting;
            if(!$page->{'_ok_'}){
                $api->warn("Failed to retrieve category list: ".$page->{'error'}."\n");
                return 60;
            }
            my $title=$page->{'title'};
            # WTF?
            if(exists($page->{'missing'})){
                $api->warn("$title is missing? WTF?\n");
                next;
            }
            # Don't try fixing any page touched too recently, to give the real
            # editor a chance to fix it.
            my $lastmod=$api->ISO2timestamp($page->{'touched'});
            if(time()-$lastmod<$min_delay){
                $api->log("$title touched too recently, leave it for later");
                next;
            }
            # Any page marked with {{inuse}} should be left for longer.
            if(time()-$lastmod<$inuse_delay &&
                grep { exists($inuse{$_->{'title'}}) } @{$page->{'categories'}}){
                $api->log("$title marked {{inuse}} and last touched less than $inuse_delay seconds ago, leave it for later");
                next;
            }
            # In the skip list?
            if(exists($self->{'skip'}{$page->{'pageid'}}) && $self->{'skip'}{$page->{'pageid'}} eq $page->{'lastrevid'}){
                $api->log("Skipping $title for now to let other pages get a chance");
                next;
            }
            # Did we check this revision already?
            my $checked;
            if(!exists($api->store->{$page->{'pageid'}})){
                # No, never saw it before
                $checked={
                    revid=>$page->{'lastrevid'},
                    continue=>'<beginning>',
                    touched=>0,
                    did_summary_links=>0,
                    did_page_links=>0,
                    prev_ts=>$lastmod,
                    prev_info=>[0,'','',-1],
                    unfound=>[],
                    ignored=>[],
                    afwarn => 0,
                };
                $api->store->{$page->{'pageid'}} = $checked;
            } else {
                $checked=$api->store->{$page->{'pageid'}};
                if($checked->{'revid'} ne $page->{'lastrevid'}){
                    # Saw an old revision, rescan this new one
                    $checked->{'revid'}=$page->{'lastrevid'};
                    $checked->{'continue'}='<beginning>';
                    $checked->{'prev_ts'}=$lastmod;
                    $checked->{'prev_info'}=[0,'','',-1];
                    $checked->{'did_summary_links'}=0;
                    $checked->{'did_page_links'}=0;
                    $checked->{'afwarn'}=0;
                    delete $checked->{'skipuntil'};
                } elsif($checked->{'continue'} ne '' || !$checked->{'did_summary_links'} || !$checked->{'did_page_links'}){
                    # In the middle of checking this revision
                } else {
                    # Yes, we (supposedly) completed this one
                    $checked->{'touched'}=time();
                    $api->store->{$page->{'pageid'}}=$checked;
                    next;
                }
            }
            # Was this flagged for additional delay (e.g. because of a
            # spamblacklist hit)?
            if ( time() < ($checked->{'skipuntil'} // 0) ) {
                $api->log("$title being skipped until " . strftime('%Y-%m-%d %H:%M:%S (UTC)', gmtime($checked->{'skipuntil'})));
                next;
            }
            # To try to avoid "fixing" vandalism, we choose some arbitrary
            # groups and edit count limits to trust and wait longer if the page
            # hasn't been edited by someone "trusted" since someone "untrusted"
            # edited.
            my $res=$api->query([],
                titles => $title,
                prop => 'revisions',
                rvprop => 'user',
                rvlimit => 'max',
                rvend => $api->timestamp2ISO(time()-$untrusted_delay)
            );
            if($res->{'code'} ne 'success'){
                $api->warn("Failed to retrieve revisions for $title: ".$res->{'error'}."\n");
                return 60;
            }
            my @users=grep { defined($_) } map $_->{'user'}, @{(values %{$res->{'query'}{'pages'}})[0]{'revisions'}};
            my %u; @u{@users}=();
            $res=$api->query([],
                list => 'users',
                usprop => 'editcount|groups',
                ususers => join("|", keys %u)
            );
            if($res->{'code'} ne 'success'){
                $api->warn("Failed to retrieve edit counts for editors of $title: ".$res->{'error'}."\n");
                return 60;
            }
            %u=map { my $n=$_->{'name'}; "$n#g" => ($_->{'groups'} // []), "$n#e" => ($_->{'editcount'} // 0) } @{$res->{'query'}{'users'}};
            my $ok=1;
            foreach my $u (@users) {
                next if grep(/^(?:bot)$/, @{$u{"$u#g"}}); # Skip bots
                last if grep(/^(?:sysop|reviewer)$/, @{$u{"$u#g"}}); # Trust these
                last if $u{"$u#e"}>$arbitrary_trusted_threshold; # Trust these too
                next if $u{"$u#e"}>$arbitrary_untrusted_threshold; # Neutral on these
                $ok=0; # Don't trust anyone else
                $api->log("$title touched too recently by untrusted user $u");
                last;
            }
            next unless $ok;
            # Get edit token
            my $tok=$api->edittoken($title);
            if($tok->{'code'} eq 'shutoff'){
                $api->warn("Task disabled: ".$tok->{'content'}."\n");
                return 300;
            }
            if($tok->{'code'} ne 'success'){
                $api->warn("Failed to get edit token for $title: ".$tok->{'error'}."\n");
                next;
            }
            next if exists($tok->{'missing'});
            if($tok->{'lastrevid'} ne $checked->{'revid'}){
                # Someone edited in between loading the cat and getting the
                # token. We'll catch the new revision next time around.
                $api->log("$title was edited since cat list was loaded, abort");
                next;
            }
            # Check if any tags on the topmost revision are in the skip list
            my @tags=@{$tok->{'revisions'}[0]{'tags'} // []};
            for my $tag (@tags) {
                if(exists($skiptags{$tag})){
                    $api->log("Skipping revision ".$tok->{'revisions'}[0]{'revid'}." of $title because of tag '$tag'\n");
                    # Bug fix: this was a bare "next", which only advanced
                    # the inner tag loop — tagged revisions were logged as
                    # skipped but then processed anyway.
                    next PAGE;
                }
            }
            # Ok, check the page
            $api->log("Checking references in $title");
            # Get page text
            my $intxt=$tok->{'revisions'}[0]{'slots'}{'main'}{'*'};
            # Process page
            my $ret=$self->process_page($api,$page->{'pageid'},$page->{'lastrevid'},$title,$intxt,$checked,$endtime);
            next PAGE unless defined $ret;
            return $ret unless ref($ret);
            # Need to edit?
            my $skiplogct = 0;
            if($ret->{'outtxt'} ne $intxt){
                my $post_summary='';
                # Check whether our intended edit would re-revert a human who
                # just reverted the bot's previous fix.
                my $chkrevert=$api->query(
                    titles => $title,
                    prop => 'revisions',
                    rvprop => 'user|ids|content',
                    rvslots => 'main',
                    rvlimit => 3,
                );
                if($chkrevert->{'code'} eq 'shutoff'){
                    $api->warn("Task disabled: ".$chkrevert->{'content'}."\n");
                    return 300;
                }
                if($chkrevert->{'code'} ne 'success'){
                    $api->warn("Failed to get revisions for $title: ".$chkrevert->{'error'}."\n");
                    next PAGE;
                }
                next unless exists($chkrevert->{'query'}{'pages'}{$page->{'pageid'}}); # Deleted at just the wrong time
                my @revs=@{$chkrevert->{'query'}{'pages'}{$page->{'pageid'}}{'revisions'}};
                if(@revs==3 && $revs[1]{'user'} eq $api->user && $ret->{'outtxt'} eq $revs[1]{'slots'}{'main'}{'*'} && $revs[0]{'slots'}{'main'}{'*'} eq $revs[2]{'slots'}{'main'}{'*'}){
                    if($revs[0]{'user'} eq $api->user){
                        # Something is really screwed up, the bot wants to revert itself.
                        $api->whine("Bot confusion at [[:$title]]", "When trying to fix orphaned refs in [[:$title]], it seems that I want to revert myself. That's definitely not right, a human will need to fix the situation.");
                        next PAGE;
                    }
                    my $r=$self->_notify_reverter($api,$revs[0]{'user'},$title,$revs[0]{'revid'});
                    if($r==-1){
                        # Failed, continue next time
                    } elsif($r==-2){
                        # Major fail
                        next PAGE;
                    } elsif($r>0){
                        # Really major fail
                        return $r;
                    } else {
                        # Success!
                    }
                    $post_summary='. [[User:AnomieBOT/OrphanReferenceFixer revert help|Read this before reverting]].';
                }
                # Build the edit summary from what was actually done.
                my @summary=();
                push @summary, 'fixing reference errors' if $ret->{'anyfix'};
                push @summary, 'moving refs out of templates' if $ret->{'moved'};
                push @summary, 'rescuing orphaned refs ('.join('; ', @{$ret->{'found'}}).')' if @{$ret->{'found'}};
                if(!@summary){
                    $api->warn("No summary for $title even though changes were made, WTF?\n");
                    next;
                }
                $summary[-1]='and '.$summary[-1] if @summary>1;
                my $summary=ucfirst(join((@summary>2)?', ':' ', @summary)).$post_summary;
                $api->log("$summary in $title");
                if(length($summary)>500){
                    # Too long for an edit summary; rebuild without the
                    # (potentially huge) list of rescued ref names.
                    @summary=();
                    push @summary, 'fixing reference errors' if $ret->{'anyfix'};
                    push @summary, 'moving refs out of templates' if $ret->{'moved'};
                    push @summary, 'rescuing orphaned refs' if @{$ret->{'found'}};
                    $summary[-1]='and '.$summary[-1] if @summary>1;
                    $summary=ucfirst(join((@summary>2)?', ':' ', @summary)).$post_summary;
                }
                my $r=$api->edit($tok, $ret->{'outtxt'}, $summary, 0, 0);
                if($r->{'code'} eq 'spamblacklist'){
                    my @bl=@{$r->{'spamblacklist'}{'matches'}};
                    # Bug fix: was "@bl[@bl-1]", a one-element slice that
                    # warns under "use warnings"; $bl[-1] is the element.
                    $bl[-1] = 'and ' . $bl[-1] if @bl > 1;
                    my $bl = join( @bl > 2 ? ', ' : ' ', @bl );
                    $api->log("Write failed on $title: Blacklisted link $bl");
                    $api->warn("Write failed on $title: Blacklisted link $bl\n");
                    $api->whine("Blacklisted orphaned reference in [[:$title]]", "When trying to fix orphaned refs in [[:$title]], MediaWiki's [[MediaWiki:Spam-blacklist|spam blacklist]] complained about <nowiki>$bl</nowiki>. This ''probably'' means someone didn't properly clean up after themselves when blacklisting the link and removing existing uses, but a human needs to double-check it. The attempted changes were:\n* [[:$title]] revision [[Special:PermaLink/".$tok->{'lastrevid'}."|".$tok->{'lastrevid'}."]]:\n".join("\n",@{$ret->{'log'}})."\nYou might also use {{tlus|User:Anomie/uw-orphans|1{{=}}rm diff|2{{=}}fix diff|subst=y}} to let the remover know, if their edit summary indicates they were specifically removing the blacklisted ref. ");
                    # Don't check again for a while
                    $checked=$api->store->{$page->{'pageid'}};
                    $checked->{'skipuntil'} = time() + 7200;
                    $api->store->{$page->{'pageid'}} = $checked;
                    next;
                }
                if($r->{'code'} eq 'abusefilter-disallowed' || $r->{'code'} eq 'abusefilter-warning'){
                    my $code = $r->{'code'};
                    my $info = $r->{'abusefilter'}{'description'};
                    $api->log("Write failed on $title: $code hit for $info");
                    $api->warn("Write failed on $title: $code hit for $info\n");
                    if($r->{'code'} ne 'abusefilter-warning' || ++$checked->{'afwarn'} > 2){
                        # T184191: AbuseFilter warnings are currently bypassed the next time the bot tries to edit.
                        # So let's not bother whining unless it fails multiple times in a row.
                        $api->whine("AbuseFilter hit in [[:$title]]", "When trying to fix orphaned refs in [[:$title]], MediaWiki's [[WP:Edit filter|edit filter]] complained about an AbuseFilter hit for <nowiki>$info</nowiki> with code [[MediaWiki:$code|$code]]. This ''probably'' means some anti-spam measure is using AbuseFilter rather than SpamBlacklist and someone didn't properly clean up after themselves, but a human needs to double-check it. The attempted changes were:\n* [[:$title]] revision [[Special:PermaLink/".$tok->{'lastrevid'}."|".$tok->{'lastrevid'}."]]:\n".join("\n",@{$ret->{'log'}})."\nYou might also use {{tlus|User:Anomie/uw-orphans|1{{=}}rm diff|2{{=}}fix diff|subst=y}} to let the remover know, if their edit summary indicates they were specifically removing the blacklisted ref. ");
                    }
                    # Don't check again for a while
                    $checked=$api->store->{$page->{'pageid'}};
                    $checked->{'skipuntil'} = time() + 7200;
                    $api->store->{$page->{'pageid'}} = $checked;
                    next;
                }
                if($r->{'code'} ne 'success'){
                    $api->warn("Write failed on $title: ".$r->{'error'}."\n");
                    next;
                }
                my $i=$r->{'edit'}{'newrevid'};
                unshift @{$ret->{'log'}}, "* Edited [[Special:Diff/$i|$title]]:";
            } else {
                $api->log("Nothing I can fix in $title");
                unshift @{$ret->{'log'}}, "* Processed [[Special:PermaLink/".$page->{'lastrevid'}."|$title]] (no edit):";
                $skiplogct++;
            }
            # If we're not continuing next time, any refs that are still needed
            # are not in the article history at all. Record them so we don't
            # bother searching the whole history again next time someone edits
            # the page.
            if($checked->{'continue'} eq '' && $checked->{'did_summary_links'} && $checked->{'did_page_links'}){
                push @{$ret->{'unfound'}}, @{$ret->{'needed'}};
                $checked->{'unfound'}=$ret->{'unfound'};
                if(@{$ret->{'unfound'}} || @{$checked->{'ignored'}}){
                    my $log = "** Scan complete.";
                    $log .= " The following references could not be found: <nowiki>".join(', ', map { my $x=$_; $x=~s/^>//; $x } @{$ret->{'unfound'}})."</nowiki>" if @{$ret->{'unfound'}};
                    $log .= " The following references were found but ignored: <nowiki>".join(', ', map { my $x=$_; $x=~s/^>//; $x } @{$checked->{'ignored'}})."</nowiki>" if @{$checked->{'ignored'}};
                    push @{$ret->{'log'}}, $log;
                } else {
                    push @{$ret->{'log'}}, "** <small>Scan complete.</small>";
                }
                $api->log("Completed scanning $title revision ".$page->{'lastrevid'});
            } else {
                # If we are continuing, add the page to the "skip" list to let
                # other pages have a chance to be scanned.
                $api->log("$title will be continued later");
                $self->{'skip'}{$page->{'pageid'}}=$page->{'lastrevid'};
                push @{$ret->{'log'}}, "** <small>Scan not complete, will continue later.</small>";
                $skiplogct++;
            }
            $self->_log($api, join("\n", @{$ret->{'log'}})) if @{$ret->{'log'}} > $skiplogct;
            # Ok, we successfully processed the page. Save the persistant data
            # now.
            $checked->{'title'}=$title; # for manual db editing
            $checked->{'touched'}=time();
            $api->store->{$page->{'pageid'}}=$checked;
            # If we've been at it long enough, let another task have a go.
            return 0 if time()>=$endtime;
        }
        last unless %{$self->{'skip'}};
        %{$self->{'skip'}}=();
    }
    # No more pages to check, try again in 10 minutes or so.
    return 600;
}
# Do the actual processing of the input. Returns a hashref full of values.
sub process_page {
my ($self,$api,$pageid,$lastrevid,$title,$intxt,$checked,$endtime)=@_;
# Obvious vandalism?
if(index($intxt,'Cite your sources: <ref></ref>')>=0 ||
$intxt=~m(<blockquote></blockquote>\s+<ref></ref>\s+\{\{Reflist\}\}\s+<references/>)){
$self->_log($api,"* [[:$title]]: Probable vandalism, ignoring revision [[Special:Diff/$lastrevid|$lastrevid]]</span>");
$api->log("Probable vandalism, ignoring revision $lastrevid of $title");
$checked->{'title'}=$title;
$checked->{'touched'}=time();
$checked->{'continue'}='';
$checked->{'did_summary_links'}=1;
$checked->{'did_page_links'}=1;
$api->store->{$pageid}=$checked;
return undef;
}
my ($outtxt,$nowiki)=$api->strip_nowiki($intxt);
my $b0rken='';
my $anyfix=$outtxt;
my @log=();
# First, fix obvious errors.
my $i=1;
$i=($_>=$i?$_+1:$i) foreach ($outtxt=~/autogenerated(\d+)/g);
$outtxt=~s{<ref\s+name\s*(/|(?<!/))>}{'<ref name="autogenerated'.($i++).'"'.$1.'>'}oige;
# Citation bot bugs
$outtxt=~s{<ref(\s+[^>\n]*)name\s*=\s*" "([^ >\n]+)"}{<ref$1name="$2"}oig;
$outtxt=~s{<ref(\s+[^>\n]*)name\s*=\s*"[\x{2018}\x{2019}\x{201c}\x{201d}]([^"\n]*)[\x{2018}\x{2019}\x{201c}\x{201d}]"}{<ref$1name="$2"}oig;
# Other issues
$outtxt=~s{<ref(\s+[^>\n]*)name\s*=\s*(name\s*=)}{<ref$1$2}oig;
$outtxt=~s{<ref\s+name\s*=\s*([^\s\x22\x{201c}\x{201d}\x27<>=]+(?-i:[\s\x27\x{00c0}-\x{02af}]+[^\s\x22\x27<>=]+)+)\s*(/|(?<!/))>}{<ref name="$1"$2>}oig;
$outtxt=~s{<ref\s+name\s*=\s*[\x22\x27]([^\x22\x27<>=\n]*?)\s*(/|(?<!/))>}{<ref name="$1"$2>}oig;
$outtxt=~s{<ref\s+name\s*=\s*([^\x22\x27<>=\n]+)[\x22\x27]\s*(/|(?<!/))>}{<ref name="$1"$2>}oig;
$outtxt=~s{<ref\s+name\s*=\s*\x27\x27([^\x22\x27<>=\n]*)\x27\x27\s*(/|(?<!/))>}{<ref name="$1"$2>}oig;
$outtxt=~s{<ref\s+name\s*=\s*\x22([^\x22\x27<>=\n]*)\x27\s*(/|(?<!/))>}{<ref name="$1"$2>}oig;
$outtxt=~s{<ref\s+name\s*=\s*\x27([^\x22\x27<>=\n]*)\x22\s*(/|(?<!/))>}{<ref name="$1"$2>}oig;
$outtxt=~s{<ref\s+(?:name\s*(?:[+-]\s*)?)?(\x22[^\x22<>=\n]*\x22|\x27[^\x27<>=\n]*\x27)\s*(/|(?<!/))>}{<ref name=$1$2>}oig;
$outtxt=~s{<ref((?:\s+[^>\n]*)?)(?<!/)></ref\s*>}{<ref$1/>}oig;
$outtxt=~s!<ref\s*/>!!oig;
$outtxt=~s{(<ref\s++)([^>\n]+?)(/>|(?<!/)>)}{ $1._uniq_params($2).$3 }oige;
$outtxt=~s{(<references\s++)([^>\n]+?)(/>|(?<!/)>)}{ $1._uniq_params($2,'group').$3 }oige;
my $tmptxt=$api->process_templates($outtxt, sub { return ''; }, undef);
my $re = join( '|', map { my ($f,$r)=split(//, $_, 2); "(?i:\Q$f\E)\Q$r\E" } keys %reflist );
if($tmptxt=~/\{\{\s*(?i:Template\s*:\s*)?($re)\s*\|/){
$b0rken="page contains an unclosed {{$1, which probably means there\'s really an LDR with an unclosed cite template";
$self->_log($api,"* [[:$title]]: Revision [[Special:Diff/$lastrevid|$lastrevid]] is too b0rken to fix (<nowiki>$b0rken</nowiki>), skipping");
$api->log("Revision $lastrevid of $title is too b0rken to fix ($b0rken), skipping");
$checked->{'title'}=$title;
$checked->{'touched'}=time();
$checked->{'continue'}='';
$checked->{'did_summary_links'}=1;
$checked->{'did_page_links'}=1;
$api->store->{$pageid}=$checked;
return undef;
}
$re = $api->get_token_regex();
if ( $outtxt=~/($re)$/s ) {
my $last = $api->replace_stripped( $1, $nowiki );
if ( $last=~/^<!--/s && $last!~/-->$/s && $last=~/<ref\s/i ) {
$b0rken='page ends in an unclosed comment that contains a ref tag, which probably means LDRs are broken';
$self->_log($api,"* [[:$title]]: Revision [[Special:Diff/$lastrevid|$lastrevid]] is too b0rken to fix (<nowiki>$b0rken</nowiki>), skipping");
$api->log("Revision $lastrevid of $title is too b0rken to fix ($b0rken), skipping");
$checked->{'title'}=$title;
$checked->{'touched'}=time();
$checked->{'continue'}='';
$checked->{'did_summary_links'}=1;
$checked->{'did_page_links'}=1;
$api->store->{$pageid}=$checked;
return undef;
}
}
# Find references currently in the article, and build list of
# replacements to be applied.
my @replacements=();
my %refs=$self->_get_refs($api, $outtxt, \@replacements, \$b0rken);
if($b0rken ne ''){
$self->_log($api,"* [[:$title]]: Revision [[Special:Diff/$lastrevid|$lastrevid]] is too b0rken to fix (<nowiki>$b0rken</nowiki>), skipping");
$api->log("Revision $lastrevid of $title is too b0rken to fix ($b0rken), skipping");
$checked->{'title'}=$title;
$checked->{'touched'}=time();
$checked->{'continue'}='';
$checked->{'did_summary_links'}=1;
$checked->{'did_page_links'}=1;
$api->store->{$pageid}=$checked;
return undef;
}
# People like to make these silly little templates to avoid typing "<ref>"
# themselves. To avoid whining, detect that situation.
my $exptxt = $api->process_templates($outtxt, sub {
my ($name, $params, $wikitext, $data, $oname) = @_;
# Replace #tag:ref and #tag:references because expandtemplates does weird things to them.
# Just prefixing with "subst:" is currently enough of a guard to keep it from expanding it.
# Same for our recognized templates, for similar reasons.
if($name=~/^#tag:\s*(ref|references)$/is || exists($alltpl{$name})){
return "{{subst:$oname|" . join('|', @$params) . "}}";
}
return undef;
} );
my $res=$api->query(
action => 'expandtemplates',
title => $title,
text => $api->replace_stripped( $exptxt, $nowiki ),
prop => 'wikitext',
);
if($res->{'code'} ne 'success'){
$api->warn("Failed to expand templates for $title: ".$res->{'error'}."\n");
return undef;
}
$exptxt = $api->process_templates($res->{'expandtemplates'}->{'wikitext'}, sub {
my ($name, $params, $wikitext, $data) = @_;
# Undo the replacements above.
if($name=~m!^subst:(.+)$!is){
my $name2=$1;
if($name2=~/^#tag:\s*(ref|references)$/is || exists($alltpl{$name2})){
return "{{$name2|" . join('|', @$params) . "}}";
}
}
return undef;
} );
my $tmplB0rken = '';
my %tmplrefs=$self->_get_refs($api, $exptxt, undef, \$tmplB0rken);
if($tmplB0rken ne ''){
$self->_log($api,"* [[:$title]]: Revision [[Special:Diff/$lastrevid|$lastrevid]] is transcluding something too b0rken to fix (<nowiki>$tmplB0rken</nowiki>), skipping");
$api->log("Revision $lastrevid of $title is transcluding something too b0rken to fix ($tmplB0rken), skipping");
$checked->{'title'}=$title;
$checked->{'touched'}=time();
$checked->{'continue'}='';
$checked->{'did_summary_links'}=1;
$checked->{'did_page_links'}=1;
$api->store->{$pageid}=$checked;
return undef;
}
# Any orphaned refs?
my @unfound=@{$checked->{'unfound'}};
my @ignored=@{$checked->{'ignored'}};
my @intemplates=();
my %needed=();
while(my ($g,$refs)=each(%refs)){
while(my ($n,$v)=each(%$refs)){
my $x="$g>$n";
if(exists($refs{$g}{$n}{'broken'})){
# Broken ref (contains "<ref"), just completely ignore
# it.
} elsif($v->{'type'} eq ''){
if(($tmplrefs{$g}{$n}{'type'} // '') ne ''){
# The template seems to be defined inside one of those silly templates.
# Don't add it to %needed.
push @intemplates, $x unless grep { $_ eq $x } @intemplates;
} else {
# Orphan found, mark as needed unless known to be
# unfindable/unusable.
$needed{$x}=$v->{'orig'} unless grep { $_ eq $x } ( @unfound, @ignored );
}
} else {
# Check if someone added a previously unfound or ignored ref
@unfound=grep { $_ ne $x } @unfound;
@ignored=grep { $_ ne $x } @ignored;
}
}
}
$anyfix=(@replacements || $anyfix ne $outtxt);
push @log, "** Ignoring missing refs that appear to be in silly templates: <nowiki>".join(', ', map { my $x=$_; $x=~s/^>//; $x } @intemplates)."</nowiki>" if @intemplates;
push @log, "** Fixed broken references" if $anyfix;
# Setup for checking for unfound refs in page history
my %rq=(
pageids => $pageid,
prop => 'revisions',
rvprop => 'ids|timestamp|user|comment|content',
rvslots => 'main',
# Using 1 instead of max because we're downloading the content
# of each revision
rvlimit => 1
);
my %found=();
my $needed=scalar keys %needed;
while($needed>0 && $checked->{'continue'} ne ''){
# We found some orphaned refs. Now we have to start going back
# through the history to try to find the original text...
$rq{'rvstartid'}=$checked->{'revid'};
$rq{'rvcontinue'}=$checked->{'continue'} unless $checked->{'continue'} eq '<beginning>';
my $prevts=$checked->{'continue'} eq '<beginning>' ? time() : $checked->{'prev_ts'};
my $rres=$api->query(%rq);
if($rres->{'code'} eq 'rvbadcontinue'){
# Bad saved continue value?
delete $rq{'rvcontinue'};
$checked->{'continue'} = '<beginning>';
redo;
}
if($rres->{'code'} ne 'success'){
$api->warn("Failed to retrieve revision for $title: ".$rres->{'error'}."\n");
last;
}
if(exists($rres->{'query-continue'})){
$checked->{'continue'}=$rres->{'query-continue'}{'revisions'}{'rvcontinue'};
} else {
$checked->{'continue'}='';
}
my $r=$rres->{'query'}{'pages'}{$pageid}{'revisions'}[0];
if($r->{'revid'} ne $checked->{'revid'}){
# Get refs from this past revision, and see if any of them
# are the ones we need.
next unless exists($r->{'slots'}{'main'}{'*'}); # RevDel
my %rrefs=$self->_get_refs($api, $r->{'slots'}{'main'}{'*'});
foreach (keys %needed){
my ($g,$n)=split />/, $_, 2;
next if !exists($rrefs{$g}{$n});
next if $rrefs{$g}{$n}{'type'} eq '';
# Only rescue generic names from recent-ish revisions
if(_is_generic_ref_name($n,$g) && $prevts < time() - 525600 * 60){
delete $needed{$_};
$needed--;
my $k = $_;
unless ( grep { $_ eq $k } @{$checked->{'ignored'}} ) {
push @log, "** Ignored \"$n\" from rev [[Special:PermaLink/".$r->{'revid'}."|".$r->{'revid'}."]], name is generic and revision is old";
push @{$checked->{'ignored'}}, $k;
}
next;
}
my ($dup,$dupref)=_check_dups($g,$rrefs{$g}{$n},$refs{$g});
my $log;
if(defined($dup)){
foreach my $need (@{$needed{$_}}) {
push @replacements, {
'orig' => $need,
'repl' => $dupref
};
}
$found{$_}="\"$n\" → \"$dup\" from rev ".$r->{'revid'};
$log="** Renamed \"$n\" → \"$dup\" from rev [[Special:PermaLink/".$r->{'revid'}."|".$r->{'revid'}."]]";
} else {
push @replacements, {
'orig' => $needed{$_}[0],
'repl' => $rrefs{$g}{$n}{'repl'}
};
$found{$_}="\"$n\" from rev ".$r->{'revid'};
$log="** Rescued \"$n\" from rev [[Special:PermaLink/".$r->{'revid'}."|".$r->{'revid'}."]]";
}
if(exists($checked->{'prev_info'})){
my @i=@{$checked->{'prev_info'}};
$log.="<br /><small>Removed";
$log.=" in revision [[Special:Diff/$i[0]|$i[0]]]" if $i[0]>0;
$log.=" by [[User:$i[1]|]] ([[User talk:$i[1]|talk]] • [[Special:Contributions/$i[1]|contribs]] • [[Special:Log/$i[1]|logs]])" if $i[1] ne '';
if(defined($i[2]) && $i[2] ne ''){
$i[2]=~s!</nowiki>!</nowiki>!g;
$log.=" with comment \"<nowiki>$i[2]</nowiki>\"";
} else {
$log.=" with no comment";
}
if($i[3]!=-1){
my $l=length($r->{'slots'}{'main'}{'*'});
my $d=$i[3]-$l;
if($d>0){
my $p=int($d/$l*100+.5);
$log.=" (added $d/$l bytes, $p%)";
} elsif($d<0){
$d=-$d;
my $p=int($d/$l*100+.5);
$log.=" (removed $d/$l bytes, $p%)";
}
}
$log.="</small>";
}
push @log, $log;
delete $needed{$_};
$needed--;
}
}
# Update the previous revision time to this revision's time
$checked->{'prev_ts'}=$api->ISO2timestamp($r->{'timestamp'});
$checked->{'prev_info'}=[$r->{'revid'},$r->{'user'},$r->{'comment'},length($r->{'slots'}{'main'}{'*'})];
# If we've been at it long enough, exit the loop to give
# another page a chance.
last if time()>=$endtime;
last if $api->halting;
}
# If we found all orphans, no need to continue next time.
if($needed==0){
$checked->{'continue'}='';
$checked->{'did_summary_links'}=1;
$checked->{'did_page_links'}=1;
}
if($checked->{'continue'} eq '' && !$checked->{'did_summary_links'} && time()<$endtime-60 && !$api->halting){
# Setup for checking for unfound refs in pages linked from edit
# summaries. We do this all at once because it's easier.
$api->log("Checking for content in pages linked from $title edit summaries (this may take a while)");
my %rq=(
pageids => $pageid,
prop => 'revisions',
rvprop => 'comment',
rvslots => 'main',
rvlimit => 'max',
);
my %links=();
my $re='(?:'.$api->interwiki_re().'|'.$api->namespace_re(qw/! 0/).')';
do {
my $rres=$api->query(%rq);
if($rres->{'code'} ne 'success'){
$api->warn("Failed to retrieve edit summaries for $title: ".$rres->{'error'}."\n");
return -1;
}
if(exists($rres->{'query-continue'})){
$rq{'rvcontinue'}=$rres->{'query-continue'}{'revisions'}{'rvcontinue'};
} else {
delete($rq{'rvcontinue'});
}
foreach my $r (@{(values %{$rres->{'query'}{'pages'}})[0]{'revisions'}}){
next unless ref($r) eq 'HASH';
next unless exists($r->{'comment'});
foreach my $l ($r->{'comment'}=~/\[\[(.*?)(?:\|.*?)?\]\]/g){
next if $l=~/^\s*(?::\s*)?$re\s*:/i;
$links{$l}=1;
}
}
} while(exists($rq{'rvcontinue'}));
my $r=%links?_check_linked_pages($api,$self,[keys %links],'summary',$pageid,$title,\%refs,\%needed,\%found,\@replacements,\@log):0;
if($r==-1){
# Failed, continue next time
} elsif($r==-2){
# Major fail
return undef;
} elsif($r>0){
# Really major fail
return $r;
} else {
# Success!
$checked->{'did_summary_links'}=1;
}
}
if(scalar(keys %needed)==0){
$checked->{'did_page_links'}=1;
}
## 2023-07-21: Disable linked pages check pending figuring out a way for it to be less FP-prone. People too often use generic ref names like the name of the newspaper/website.
$checked->{'did_page_links'} = 1;
if($checked->{'continue'} eq '' && $checked->{'did_summary_links'} && !$checked->{'did_page_links'} && time()<$endtime-60 && !$api->halting){{
# Setup for checking for unfound refs in linked pages. We do
# this all at once because we want to take into account that
# different articles could have the same named ref with
# different content.
$api->log("Checking for content in pages linked from or linking to $title (this may take a while)");
my %links=();
my $i=$api->iterator(
pageids => $pageid,
generator => 'links',
gplnamespace => 0,
gpllimit => 'max',
);
my $fail=0;
while(my $p=$i->next){
if(!$p->{'_ok_'}){
$api->warn("Failed to retrieve links for $title: ".$p->{'error'}."\n");
$fail=1;
last;
}
$links{$p->{'title'}}=1;
}
last if $fail; # break from the "if"
my $res=$api->query(
generator => 'backlinks',
gbltitle => $title,
gblnamespace => 0,
gblredirect => 1,
gbllimit => 1000,
);
if($res->{'code'} ne 'success'){
$api->warn("Failed to retrieve backlinks for $title: ".$res->{'error'}."\n");
$fail=1;
last;
} elsif(exists($res->{'query-continue'})){
$api->log("Skipping check for content in pages linking to $title, there are too many");
} else {
for my $p (values %{$res->{'query'}{'pages'}}){
$links{$p->{'title'}}=1;
}
}
my $r=%links?_check_linked_pages($api,$self,[keys %links],'page',$pageid,$title,\%refs,\%needed,\%found,\@replacements,\@log):0;
if($r==-1){
# Failed, continue next time
} elsif($r==-2){
# Major fail
return undef;
} elsif($r>0){
# Really major fail
return $r;
} else {
# Success!
$checked->{'did_page_links'}=1;
}
}}
# Process the list of replacements now.
foreach (@replacements){
my $i=index($outtxt, $_->{'orig'});
substr($outtxt, $i, length($_->{'orig'}))=$_->{'repl'} if $i>=0;
}
# Refs inside of templates have a habit of causing problems if the
# parameter they are inside of is not rendered. So if we find a
# named ref where the body is inside a template and a reference is
# outside, move the body to the outside instance. We do this by
# stripping out all templates, looking for and replacing any
# "orphans" in what is left, and then replacing all the templates.
#
# But don't do it if the page contains transclusion control tags, as that
# probably means someone is doing something stupid with transcluding one
# article into another.
my $moved=0;
unless($outtxt=~/<(?:includeonly|noinclude|onlyinclude)>/){
my $outtmpl={};
$outtxt=$api->strip_templates($outtxt, \&_strip_templates, undef, $outtmpl);
%refs=$self->_get_refs($api, $outtxt);
my %needed2=();
my $fail=0;
while(my ($g,$refs)=each(%refs)){
while(my ($n,$v)=each(%$refs)){
my $x="$g>$n";
# Broken ref (contains "<ref"), completely ignore it.
next if(exists($refs{$g}{$n}{'broken'}));
# Body version has content, ignore it.
next if($v->{'type'} ne '');
# Orphan found, mark as needed unless known to be
# unfindable.
$needed2{$x}=$v->{'orig'}[0];
}
}
$needed=scalar values %needed2;
while($needed>0 && (my ($k,$v)=each %$outtmpl)){
# Skip the template if it contains a reflist, too weird to mess with.
next if $v=~/<references/;
my $found = 0;
$api->strip_templates($v, sub {
my ($name, $params, $wikitext, $data) = @_;
$found = 1 if $name=~/^#tag:\s*references$/is || ($alltpl{$name} // '') eq 'references';
return undef;
} );
next if $found;
my %rrefs=$self->_get_refs($api, $v);
foreach (keys %needed2){
my ($g,$n)=split />/, $_, 2;
next if !exists($rrefs{$g}{$n});
next if $rrefs{$g}{$n}{'type'} eq '';
# Instructed to ignore it
next if($api->replace_nowiki($rrefs{$g}{$n}{'content'}, $nowiki)=~/<!--\s*AnomieBOT:\s*Don\x27t move\s*-->/i);
my $orig=$needed2{$_};
my $repl=$rrefs{$g}{$n}{'repl'};
my $i=index($outtxt, $orig);
my $j=index($v, $repl);
next unless($i>=0 && $j>=0);
# Found a candidate to move! But first, verify it will actually
# have an effect.
my $txt1="$v\n\n$orig\n<references".($g?" group=\"$g\"":"")."/>";
my $res=$api->query(
action => 'parse',
title => $title,
text => $api->replace_nowiki($txt1, $nowiki),
prop => 'text',
disablelimitreport => 1
);
if($res->{'code'} ne 'success'){
$api->warn("Failed to test template for $title: ".$res->{'error'}."\n");
$fail=1;
last;
}
($txt1=$res->{'parse'}{'text'}{'*'})=~s/^\s*|\s*$//g;
my $txt2="$v\n\n$repl\n<references".($g?" group=\"$g\"":"")."/>";
substr($txt2, $j, length($repl))=$orig;
$res=$api->query(
action => 'parse',
title => $title,
text => $api->replace_nowiki($txt2, $nowiki),
prop => 'text',
disablelimitreport => 1
);
if($res->{'code'} ne 'success'){
$api->warn("Failed to test template for $title: ".$res->{'error'}."\n");
$fail=1;
last;
}
($txt2=$res->{'parse'}{'text'}{'*'})=~s/^\s*|\s*$//g;
if($txt1 ne $txt2){
# Something changed in the output, so it's probably a worthwhile move.
substr($outtxt, $i, length($orig))=$repl;
substr($v, $j, length($repl))=$orig;
delete $needed2{$_};
$needed--;
$moved++ unless exists($found{$_});
}
}
last if $fail;
$outtmpl->{$k}=$v;
}
$outtxt=$api->replace_stripped($outtxt, $outtmpl);
push @log, "** Moved refs out of templates" if $moved;
}
# Done processing, put back the <nowiki>s now
$outtxt=$api->replace_nowiki($outtxt, $nowiki);
return {
outtxt => $outtxt,
log => \@log,
anyfix => $anyfix,
moved => $moved,
found => [values %found],
unfound => \@unfound,
needed => [keys %needed],
};
}
# Return just the last of duplicate params, and optionally strip all but those
# specifically allowed.
#
# $in is the raw attribute text from a tag; @allowed (optional) is a list of
# lowercased parameter names to keep. Returns the rebuilt parameter string,
# ordered by each surviving parameter's last occurrence, with the input's
# trailing whitespace preserved and the leading space removed.
sub _uniq_params {
    my ($in, @allowed) = @_;

    # Pull out attribute-like tokens: a name, optionally followed by =value
    # (double-quoted, single-quoted, or bare). Produces (full text, name) pairs.
    my @pairs = ($in=~/(([^\f\t\r\n \x00"'>\/=\p{Control}]+)(?:[\f\t\r\n ]*=[\f\t\r\n ]*(?:"[^<\x22]*"|'[^<\x27]*'|[^ >]+))?)/sgu);

    my %last_index = ();  # lowercased name => index of its last occurrence
    my %text_for   = ();  # lowercased name => full text of its last occurrence
    for (my $idx = 0; $idx < @pairs; $idx += 2) {
        my $key = lc($pairs[$idx+1]);
        next if @allowed && !grep { $_ eq $key } @allowed;
        $last_index{$key} = $idx;
        $text_for{$key}   = $pairs[$idx];
    }

    # Emit survivors in the order of their last occurrence.
    my @ordered = ();
    while (my ($key, $idx) = each %last_index) { $ordered[$idx] = $key; }
    my $result = '';
    foreach my $key (@ordered) {
        $result .= ' ' . $text_for{$key} if defined($key);
    }
    # Keep any trailing whitespace from the input, then drop the leading space.
    $result .= $1 if $in =~ /(\s+)$/;
    $result =~ s/^\s+//;
    return $result;
}
# Check if the found ref is identical to a reference in the current version of
# the article. If so, use the current version.
#
# $g is the ref group, $ref1 the candidate ref info hash (its 'content' is
# compared), and $refs the hash of name => info for refs already in the
# article. Content is compared after stripping whitespace and normalizing
# dash/minus characters and their HTML entities to '-'.
#
# Returns (existing name, self-closing <ref/> tag pointing at it) on a match,
# or (undef, undef) when nothing in the article matches.
#
# BUGFIX(review): the quote/angle-bracket escaping below had degraded into
# no-op substitutions (s/"/"/g etc.) — almost certainly HTML entities lost in
# a copy of the source. Restored to &quot;/&lt;/&gt; so that names and groups
# containing quotes or angle brackets produce a well-formed tag.
sub _check_dups {
    my ($g, $ref1, $refs) = @_;

    # Normalize the candidate's content for comparison.
    my $c1 = $ref1->{'content'};
    $c1 =~ s/\s+//g;
    $c1 =~ s/[\x{2013}\x{2014}]|&([mn]dash|#0*821[12]|#x0*201[34]);/-/g;
    $c1 =~ s/\x{2212}|&(minus|#0*8722|#x0*2212);/-/g;

    while (my ($n2, $v2) = each(%$refs)) {
        my $c2 = $v2->{'content'} // '';
        $c2 =~ s/\s+//g;
        $c2 =~ s/[\x{2013}\x{2014}]|&([mn]dash|#0*821[12]|#x0*201[34]);/-/g;
        $c2 =~ s/\x{2212}|&(minus|#0*8722|#x0*2212);/-/g;
        next unless $c1 eq $c2;

        # Build a self-closing tag referencing the article's existing ref.
        my $ref = '<ref';
        if ($g ne '') {
            my $gx = $g;
            my $q = '"';
            if ($gx =~ /"/) {
                if ($gx =~ /'/) {
                    # Contains both quote styles: entity-encode the doubles.
                    $gx =~ s/"/&quot;/g;
                } else {
                    # Only double quotes: switch to single-quote delimiters.
                    $q = "'";
                }
            }
            $gx =~ s/</&lt;/g;
            $gx =~ s/>/&gt;/g;
            $ref .= " group=$q$gx$q";
        }
        my $nx = $n2;
        my $q = '"';
        if ($nx =~ /"/) {
            if ($nx =~ /'/) {
                $nx =~ s/"/&quot;/g;
            } else {
                $q = "'";
            }
        }
        $nx =~ s/</&lt;/g;
        $nx =~ s/>/&gt;/g;
        $ref .= " name=$q$nx$q";
        $ref .= " />";
        return ($n2, $ref);
    }
    return (undef, undef);
}
# Subroutine to get all the references in some wikitext.
#
# Arguments: $self, $api, $text, and optionally $replacements (arrayref onto
# which {orig=>..., repl=>...} textual cleanups are pushed) and $b0rken
# (scalar ref, set to a message when the markup is too mangled to handle).
# Returns a hash: group => { ref name => info hash } (layout per _get_refs2).
# Handles <references> blocks, reflist-style templates (%reflist, defined
# elsewhere in this file), {{#tag:references}}, and finally the bare page text.
sub _get_refs {
my $self=shift;
my $api=shift;
my $text=shift;
my $replacements=shift; $replacements=[] unless defined($replacements);
my $dummy='';
my $b0rken=shift; $b0rken=\$dummy unless defined($b0rken);
my %refs=();
# The new "list-defined references" have to be handled specially, which
# means we have to manage to pull them out of the wikitext. Fun.
# First, do the XML-style tags.
my $nowiki;
($text,$nowiki)=$api->strip_nowiki($text);
# Capture each <references .../> or <references>...</references>. A capture
# group 4 that matched the empty string (the "$" branch) marks an unclosed tag.
my @matches=($text=~m!(<references((?:\s+[^>]*[^/>])?)(?:/>|>(.*?)(</references>|$)))!oigs);
for(my $i=0; $i<@matches; $i+=4){
# Remove the matched block from $text so the final pass doesn't re-parse it.
$text=~s/\Q$matches[$i]\E//g;
# Last ref in the page broken?
if(defined($matches[$i+3]) && $matches[$i+3] eq ''){
$$b0rken='Last <references> in page is unclosed'; next;
}
# Comments don't work right inside ref names
if($matches[$i+1]=~/\x02/){ $$b0rken='References parameters contain strip marker'; next; }
# Don't delete tons of content if some vandal breaks a ref tag
if($matches[$i+1]=~/<references(?:[\s>]|$)/){ $$b0rken='References parameters contain <references>'; next; }
if($matches[$i+1]=~/\n==/){ $$b0rken='References parameters contain =='; next; }
# I can't believe someone actually used “” quotes in a ref tag, but
# they did. So test for it.
my $x=$matches[$i+1];
$x=~s/(\s+group\s*=\s*)[\x{2018}\x{2019}]([^\x{2018}\x{2019}\x22<]*)\x{2019}/$1"$2"/g;
$x=~s/(\s+group\s*=\s*)[\x22\x{201c}\x{201d}]([^\x{201c}\x{201d}\x22<]*)\x{201d}/$1"$2"/g;
if($x ne $matches[$i+1]){
# Curly quotes were normalized; queue the fix as a replacement in the page.
my $old=$matches[$i];
$matches[$i+1]=$x;
$matches[$i]='<references'.$matches[$i+1].(defined($matches[$i+2])?'>'.$matches[$i+2].'</references>':'/>');
push @$replacements, {
'orig' => $api->replace_nowiki($old,$nowiki),
'repl' => $api->replace_nowiki($matches[$i],$nowiki)
};
}
$matches[$i+1]=~s/\s+$//g;
# Group?
my ($gg,$g);
if($matches[$i+1]=~/(\s+group\s*=\s*"([^\x22<]*)")/oi ||
$matches[$i+1]=~/(\s+group\s*=\s*'([^\x27<]*)')/oi ||
$matches[$i+1]=~/(\s+group\s*=\s*([^\x09\x0a\x0c\x0d\x20]+))/oi){
$gg=$1; $g=$2;
} else {
$gg=''; $g='';
}
# Ok, parse the list-defined refs
if(defined($matches[$i+2])){
%refs=_get_refs2($self,$api,$api->replace_stripped($matches[$i+2],$nowiki),$replacements,$b0rken,$g,'references',%refs);
}
}
$text=$api->replace_nowiki($text,$nowiki);
# Next, do reflist and #tag:references
# (return value of process_templates is discarded; we only want the callback's
# side effects on %refs and @$replacements)
$api->process_templates($text, sub {
my $name=shift;
my @params=@{shift()};
my $orig=shift;
my ($c, $g, $type);
# %reflist (defined elsewhere in this file) maps reflist-style template names
# to group/ref-parameter metadata.
my $prop = $reflist{$name} // $reflist{"Template:$name"} // undef;
if($prop){
($type=$name)=~s/^Template://;
my $groupre=$prop->{'groupre'};
my @refparams=@{$prop->{'refs'}};
$g=$prop->{'group'};
$c='';
foreach my $p ($api->process_paramlist(@params)){
if(grep { $p->{'name'} eq $_ } @refparams) {
$c=$p->{'value'};
} elsif($p->{'name'} eq 'group' && $p->{'value'}=~/^\s*([\x22\x27]?)($groupre)\1\s*$/oi){
$g=$2;
}
}
} elsif($name=~/^#tag:\s*references$/is){
$type=$name;
# First positional parameter is the list-defined refs content.
$c=shift(@params) // '';
my $bad=0;
foreach (@params){
if(/^\s*group\s*=\s*([\x22\x27]?)([^\x22\x27]*?)\1\s*$/oi){
$g=$2;
} else {
$bad=1;
}
}
# If it had unrecognized parameters to the tag, strip them
if($bad){
my $old=$orig;
$orig="\x7b\x7b#tag:references|$c";
$orig.="|group=$g" if $g ne '';
$orig.="\x7d\x7d";
push @$replacements, {
'orig' => $old,
'repl' => $orig
};
}
} else {
return undef;
}
# Ok, parse the list-defined refs
%refs=_get_refs2($self,$api,$c,$replacements,$b0rken,$g,$type,%refs);
return '';
});
# And finally, parse the page text.
return _get_refs2($self,$api,$text,$replacements,$b0rken,'','',%refs);
}
# Worker for _get_refs: scan one chunk of wikitext for <ref> tags,
# {{#tag:ref}} calls, and ref-wrapping templates (per %reftpl, defined
# elsewhere in this file).
#
# Arguments: $self, $api, the text to scan, $replacements (arrayref collecting
# {orig,repl} textual cleanups), $b0rken (scalar ref for "too broken" flags),
# $defaultgroup, $listdefined (the context this text came from, stored on each
# ref), followed by the %refs hash accumulated so far.
# Returns the updated hash: group => { name => { orig => [all occurrences],
# type => ''|'ref'|'tag'|'tpl'|template name, content, repl, listdefined,
# and possibly broken => 1 } }. type eq '' means the ref has no body here
# (i.e. it is an orphan candidate).
sub _get_refs2 {
my $self=shift;
my $api=shift;
my ($text,$nowiki)=$api->strip_nowiki(shift);
my $replacements=shift;
my $b0rken=shift;
my $defaultgroup=shift;
my $listdefined=shift;
# Remaining args are the refs hash accumulated by earlier passes.
my %refs=@_;
# Fix whitespace in default group
$defaultgroup =~ s/[\t\r\n ]+/ /g;
$defaultgroup =~ s/^\s+|\s+$//g;
# Find all ref tags
# Each match yields 4 slots: full text, parameter text, body (undef for
# self-closing), and the closer (empty string when the tag ran to end-of-text).
my @matches=($text=~m!(<ref((?:\s+[^>]*[^/>])?)(?:/>|>(.*?)(</ref\s*>|$)))!oigs);
for(my $i=0; $i<@matches; $i+=4){
# Last ref in the page broken?
if(defined($matches[$i+3]) && $matches[$i+3] eq ''){
$$b0rken='Last <ref> in page is unclosed'; next;
}
# Comments don't work right inside ref names
if($matches[$i+1]=~/\x02/){ $$b0rken='Ref parameters contain strip marker'; next; }
# Don't delete tons of content if some vandal breaks a ref tag
if($matches[$i+1]=~/<ref(?:[\s>]|$)/){ $$b0rken='Ref parameters contain <ref>'; next; }
if($matches[$i+1]=~/\n==/){ $$b0rken='Ref parameters contain =='; next; }
# I can't believe someone actually used “” quotes in a ref tag, but
# they did. So test for it.
my $x=$matches[$i+1];
$x=~s/(\s+(?:name|group)\s*=\s*)[\x{2018}\x{2019}]([^\x{2018}\x{2019}\x22<]*)\x{2019}/$1"$2"/g;
$x=~s/(\s+(?:name|group)\s*=\s*)[\x22\x{201c}\x{201d}]([^\x{201c}\x{201d}\x22<]*)\x{201d}/$1"$2"/g;
if($x ne $matches[$i+1]){
my $old=$matches[$i];
$matches[$i+1]=$x;
$matches[$i]='<ref'.$matches[$i+1].(defined($matches[$i+2])?'>'.$matches[$i+2].'</ref>':'/>');
push @$replacements, {
'orig' => $api->replace_nowiki($old,$nowiki),
'repl' => $api->replace_nowiki($matches[$i],$nowiki)
};
}
$matches[$i+1]=~s/\s+$//g;
# Fix obviously incorrect ref bodies.
if(defined($matches[$i+2]) &&
($matches[$i+2]=~/^\s*$/ ||
$matches[$i+2] eq 'Insert footnote text here')){
# Empty / placeholder body: convert to a self-closing tag.
my $old=$matches[$i];
$matches[$i+2]=undef;
$matches[$i]='<ref'.$matches[$i+1].'/>';
push @$replacements, {
'orig' => $api->replace_nowiki($old,$nowiki),
'repl' => $api->replace_nowiki($matches[$i],$nowiki)
};
}
# Extract params
my ($gg, $g) = ('', $defaultgroup);
my ($nn, $n) = ('', '');
my $params = '';
# NOTE(review): $attrRe is defined elsewhere in this file; usage below implies
# it captures 5 items per attribute (full text, attribute name, opening quote,
# value, closing quote) — verify against its definition.
my @m = $matches[$i+1] =~ /$attrRe/g;
for(my $j=0; $j<@m; $j+=5){
my $a = lc( $m[$j+1] );
if ( $a eq 'group' || $a eq 'name' || $a eq 'dir' || $a eq 'follow' ) {
# Open/close quotes mismatched (e.g. name="foo) — presumably this
# re-appends the opener to repair the parameter text; verify.
$m[$j] .= $m[$j+2] if( ($m[$j+2]//'') ne ($m[$j+4]//'') );
$params .= $m[$j];
($gg, $g) = ($m[$j], $m[$j+3]//'') if $a eq 'group';
($nn, $n) = ($m[$j], $m[$j+3]//'') if $a eq 'name';
}
}
# If it's unnamed and empty, remove it completely.
if($nn eq '' && !defined($matches[$i+2])){
push @$replacements, {
'orig' => $api->replace_nowiki($matches[$i],$nowiki),
'repl' => ''
};
next;
}
# Unknown parameters cause errors, so replace them if found.
if($matches[$i+1] ne $params) {
my $old=$matches[$i];
$matches[$i+1]=$params;
$matches[$i]='<ref'.$matches[$i+1].(defined($matches[$i+2])?'>'.$matches[$i+2].'</ref>':'/>');
push @$replacements, {
'orig' => $api->replace_nowiki($old,$nowiki),
'repl' => $api->replace_nowiki($matches[$i],$nowiki)
};
}
# We're not interested if it's unnamed.
if($nn eq ''){
$$b0rken='Ref contains <ref>' if(defined($matches[$i+2]) && $matches[$i+2]=~/<ref(?:[\s>]|$)/);
$$b0rken='Ref contains ==' if(defined($matches[$i+2]) && $matches[$i+2]=~/\n==/);
next;
}
# Fix whitespace in parameters
my $g2 = $g; $g=~s/[\t\r\n ]+/ /g; $g=~s/^\s+|\s+$//g;
if ( $g ne $g2 ) {
my $old=$matches[$i];
$matches[$i+1]=~s/group\s*=\s*([\x22\x27]?)\Q$g2\E\1/group=$1$g$1/i;
$matches[$i]='<ref'.$matches[$i+1].(defined($matches[$i+2])?'>'.$matches[$i+2].'</ref>':'/>');
push @$replacements, {
'orig' => $api->replace_nowiki($old,$nowiki),
'repl' => $api->replace_nowiki($matches[$i],$nowiki)
};
}
my $n2 = $n; $n=~s/[\t\r\n ]+/ /g; $n=~s/^\s+|\s+$//g;
if ( $n ne $n2 ) {
my $old=$matches[$i];
$matches[$i+1]=~s/name\s*=\s*([\x22\x27]?)\Q$n2\E\1/name=$1$n$1/i;
$matches[$i]='<ref'.$matches[$i+1].(defined($matches[$i+2])?'>'.$matches[$i+2].'</ref>':'/>');
push @$replacements, {
'orig' => $api->replace_nowiki($old,$nowiki),
'repl' => $api->replace_nowiki($matches[$i],$nowiki)
};
}
# Integer names cause errors, so replace them if found.
if($n=~/^\d+$/){
# New name embeds a timestamp; skip if it already occurs in the text
# (avoids generating a colliding rename).
my $x="renamed_from_".$n."_on_".strftime('%Y%m%d%H%M%S', gmtime);
next if index($text, $x)>=0;
my $old=$matches[$i];
$matches[$i+1]=~s/name\s*=\s*([\x22\x27]?)$n\1/name=$1$x$1/i;
$matches[$i]='<ref'.$matches[$i+1].(defined($matches[$i+2])?'>'.$matches[$i+2].'</ref>':'/>');
push @$replacements, {
'orig' => $api->replace_nowiki($old,$nowiki),
'repl' => $api->replace_nowiki($matches[$i],$nowiki)
};
$n=$x;
}
# Decode HTML entities, as MediaWiki does for <ref> tags (but not {{#tag:ref}})
$g = HTML::Entities::decode( $g );
$n = HTML::Entities::decode( $n );
# Save detected reference
$refs{$g}={} unless exists($refs{$g});
if(!exists($refs{$g}{$n})){
$refs{$g}{$n}={
orig => [],
type => '',
content => undef,
listdefined => $listdefined
};
}
push @{$refs{$g}{$n}{'orig'}}, $api->replace_nowiki($matches[$i+0],$nowiki);
if(defined($matches[$i+2]) && $matches[$i+2]=~/<ref(?:[\s>]|$)/){
# Reference contains "<ref", so probably someone forgot a </ref>
# somewhere (and then that's probably how it got "orphaned"). To be
# safe, don't use it.
$matches[$i+2]=undef;
$refs{$g}{$n}{'broken'}=1;
$$b0rken='Ref contains <ref>';
}
if(defined($matches[$i+2]) && $matches[$i+2]=~/\n==/){
# Reference contains "==", so probably someone forgot a </ref>
# somewhere (and then that's probably how it got "orphaned"). To be
# safe, don't use it.
$matches[$i+2]=undef;
$refs{$g}{$n}{'broken'}=1;
$$b0rken='Ref contains ==';
}
# First occurrence with a body wins as the canonical content.
if($refs{$g}{$n}{'type'} eq '' && defined($matches[$i+2])){
$refs{$g}{$n}{'type'}='ref';
$refs{$g}{$n}{'repl'}=$api->replace_nowiki($matches[$i+0],$nowiki);
$refs{$g}{$n}{'content'}=$api->replace_nowiki($matches[$i+2],$nowiki);
}
}
# Darn. Now we have to parse through the page and find all the #tag:refs
# and {{refn}}/{{efn}} too.
$api->process_templates($text, sub {
my $name=shift;
my @params=@{shift()};
my $orig=$api->replace_nowiki(shift,$nowiki);
shift;
my $oname=shift;
my ($type, $groupre, @contentparams);
my $g=$defaultgroup;
my $c;
if($name=~/^#tag:\s*ref$/is){
$oname='#tag:ref';
$type='tag';
$groupre=qr/[^\x22\x27]*/;
@contentparams=();
# First positional parameter of {{#tag:ref}} is the content.
$c=$api->replace_nowiki(shift(@params),$nowiki);
} elsif(exists($reftpl{$name}) || exists($reftpl{"Template:$name"})){
$type='tpl';
my $props=$reftpl{$name} // $reftpl{"Template:$name"};
$g=$props->{'group'} if $props->{'group'} ne '';
$groupre=$props->{'groupre'};
@contentparams=@{$props->{'content'}};
} else {
return undef;
}
my $n=undef;
my @bad=();
foreach my $p ($api->process_paramlist(@params)){
# Whitespace and quotes will be stripped from name and group by #tag, and all the templates use #tag at some level too.
my $v = $p->{'value'};
$v=~s/^\s*([\x22\x27]?)(.*?)\1\s*$/$2/;
if($p->{'name'} eq 'group'){
$g=$api->replace_nowiki($v,$nowiki) if $v =~ /$groupre/;
} elsif($p->{'name'} eq 'name') {
$n=$api->replace_nowiki($v,$nowiki);
} elsif(grep { $p->{'name'} eq $_ } @contentparams) {
$c=$api->replace_nowiki($p->{'value'},$nowiki);
} else {
push @bad, $p->{'text'};
}
}
# If it's a template, no content, and one "bad" param that contains an `=`, let's guess it's
# a case where they should have used an explicit param name and didn't.
if(!defined($c) && $type ne 'tag' && @bad == 1 && $bad[0]=~/=/ && @contentparams){
$c = pop @bad;
my ($cp) = @contentparams;
my $old=$orig;
$orig="\x7b\x7b$oname";
for my $p (@params) {
$p = "$cp=" . $api->replace_nowiki($p,$nowiki) if $p eq $c;
$orig.='|' . $api->replace_nowiki($p,$nowiki);
}
$orig.="\x7d\x7d";
push @$replacements, {
'orig' => $old,
'repl' => $orig
};
$c=$api->replace_nowiki($c,$nowiki);
}
$c='' if !defined($c);
# We're not interested if it's unnamed. But strip it out if
# it's unnamed and empty, because that's an error.
if(!defined($n)){
if($c eq ''){
push @$replacements, {
'orig' => $orig,
'repl' => ''
};
}
return undef;
}
# If it had unrecognized parameters to the tag, strip them
if(@bad && $type eq 'tag'){
my $old=$orig;
$orig="\x7b\x7b$oname";
$orig.="|$c";
$orig.="|name=$n" if defined($n);
$orig.="|group=$g" if $g ne $defaultgroup;
$orig.="\x7d\x7d";
push @$replacements, {
'orig' => $old,
'repl' => $orig
};
}
# Integer names cause errors, so replace them if found.
if($n=~/^\d+$/){
my $x="renamed_from_".$n."_on_".strftime('%Y%m%d%H%M%S', gmtime);
# NOTE(review): this "next" is inside the process_templates callback sub,
# so it exits the sub via "Exiting subroutine via next" (works, but warns);
# intended effect is "skip this template".
next if index($text, $x)>=0;
my $old=$orig;
$orig="\x7b\x7b$oname";
if($type eq 'tag'){
$orig.="|$c";
$orig.="|name=$x";
$orig.="|group=$g" if $g ne $defaultgroup;
} else {
foreach my $p (@params){
$p =~ s/^(\s*name\s*=\s*).*?(\s*)$/$1$x$2/;
$orig.='|' . $api->replace_nowiki($p,$nowiki);
}
}
$orig.="\x7d\x7d";
$n=$x;
push @$replacements, {
'orig' => $old,
'repl' => $orig
};
}
# Save detected reference
$refs{$g}={} unless exists($refs{$g});
if(!exists($refs{$g}{$n})){
$refs{$g}{$n}={
orig => [],
type => '',
content => undef,
listdefined => $listdefined
};
}
if($c=~/^\s*$/){
# Apparently, some people really do this. Don't use empty refs.
$c='';
}
push @{$refs{$g}{$n}{'orig'}}, $orig;
if($refs{$g}{$n}{'type'} eq '' && $c ne ''){
$refs{$g}{$n}{'type'}=$type;
$refs{$g}{$n}{'repl'}=$orig;
$refs{$g}{$n}{'content'}=$c;
}
return undef;
});
return %refs;
}
# process_templates callback to strip templates and store them in the fourth
# parameter hash.
#
# Returns undef (leave the template alone) for ref/references tag-functions,
# known ref templates (%alltpl), and anything in the %no_move_refs_out skip
# list; returns 1 (strip and stash) for everything else.
sub _strip_templates {
    my ($name, $params, $wikitext, $data) = @_;
    return undef
        if $name =~ /^#tag:\s*(ref|references)$/is
        || exists($alltpl{$name})
        || exists($no_move_refs_out{$name})
        || exists($no_move_refs_out{"Template:$name"});
    return 1;
}
# Regexes used below
# $months matches English month names; $sp matches runs of whitespace;
# $dt matches the various date formats used in "Retrieved ..." access dates
# (day-month / month-day, wikilinked or not, plus ISO-style dates).
my $months=qr/(?:January|February|March|April|May|June|July|August|September|October|November|December)/i;
# NOTE(review): the second alternative below appears to be a literal
# non-breaking space (likely "&nbsp;" before HTML-entity decoding mangled this
# copy of the source) — verify against the canonical source.
my $sp=qr/(?:(?:\s| )+)/;
my $dt=qr/(?:(?:\d{1,2}$sp$months|\[\[\d{1,2}[ _]$months\]\]|$months$sp\d{1,2}|\[\[$months[ _]\d{1,2}\]\])$sp?,?$sp?(?:\d{1,4}(?:${sp}BC)?|\[\[\d{1,4}(?:[ _]BC)?\]\])|-?\d{4}-\d{2}-\d{2}|\[\[-?\d{4}-\d{2}-\d{2}\]\]|\[\[-?\d{4}\]\]-\[\[\d{2}-\d{2}\]\])/i;
# Check all the pages in the specified query for needed refs
#
# $pages is an arrayref of page titles linked from $title (via edit summaries
# or wikilinks, per $type). Scans the current revision of each page (plus the
# past 24 hours of revisions for recently-edited ones) for refs matching the
# orphans in %$needed; matches are queued on @$replacements and logged, while
# ambiguous multi-version matches are posted to the article's talk page for a
# human to sort out. Returns 0 on success, -1 on a retryable failure, -2 on a
# failed write, or 300 when the bot shutoff is triggered.
# NOTE(review): bunchlist() and ISO2timestamp() are plain functions here
# (imported/defined elsewhere in this file), unlike $api->ISO2timestamp used
# in other parts of the task — verify both are in scope.
sub _check_linked_pages {
my ($api,$self,$pages,$type,$pageid,$title,$refs,$needed,$found,$replacements,$log)=@_;
my %found_in_links=();
my %dup_in_links=();
# Resolve any redirects in the list
my %r=$api->resolve_redirects(@$pages);
if(exists($r{''})){
$api->warn("Failed to resolve redirects in $type links for $title: ".$r{''}{'error'}."\n");
return -1;
}
# Don't treat the target article as one of its own sources.
delete $r{$title};
# Get revids for the top revision in all pages, and also get a list of
# recently-edited pages in the list
my %revisions=();
my @pages=();
my $iter=$api->iterator(
titles => bunchlist(500, keys %r),
prop => 'revisions',
rvprop => 'ids|timestamp',
);
while(my $r=$iter->next){
if(!$r->{'_ok_'}){
$api->warn("Failed to retrieve revids for $type links for $title: ".$r->{'error'}."\n");
return -1;
}
next unless exists($r->{'revisions'}[0]{'revid'});
$revisions{$r->{'revisions'}[0]{'revid'}}=1;
# Pages edited in the last day get their recent history scanned too.
push @pages, $r->{'title'} if ISO2timestamp($r->{'revisions'}[0]{'timestamp'})>time-86400;
}
return -1 if $api->halting;
# Now get the revids for the past 24 hours for all the recently-edited pages
for my $p (@pages){
my %rq=(
titles => $p,
prop => 'revisions',
rvprop => 'ids|timestamp',
rvlimit => '100',
);
do {
my $res=$api->query(%rq);
if($res->{'code'} ne 'success'){
$api->warn("Failed to retrieve older revids for $type links for $p (for $title): ".$res->{'error'}."\n");
return -1;
}
if(exists($res->{'query-continue'})){
$rq{'rvcontinue'}=$res->{'query-continue'}{'revisions'}{'rvcontinue'};
} else {
delete($rq{'rvcontinue'});
}
foreach my $r (@{(values %{$res->{'query'}{'pages'}})[0]{'revisions'}}){
$revisions{$r->{'revid'}}=1;
# Stop paging once we're past the 24-hour window.
if(ISO2timestamp($r->{'timestamp'})<time-86400){
delete($rq{'rvcontinue'});
last;
}
}
} while(exists($rq{'rvcontinue'}));
}
# Found any revids?
return 0 unless %revisions;
$iter=$api->iterator(
revids => bunchlist(50, keys %revisions),
prop => 'revisions',
rvprop => 'content|timestamp',
rvslots => 'main',
);
while(my $r=$iter->next){
return -1 if $api->halting;
if(!$r->{'_ok_'}){
$api->warn("Failed to retrieve $type revisions for $title: ".$r->{'error'}."\n");
return -1;
}
foreach my $rev (@{$r->{'revisions'}//[]}){
next unless exists($rev->{'slots'}{'main'}{'*'}); # RevDel'd content
my $ts=ISO2timestamp($rev->{'timestamp'});
# Get refs from this linked page, and see if any of
# them are the ones we need.
my %rrefs=$self->_get_refs($api, $rev->{'slots'}{'main'}{'*'});
foreach (keys %$needed){
my ($g,$n)=split />/, $_, 2;
next if !exists($rrefs{$g}{$n});
next if $rrefs{$g}{$n}{'type'} eq '';
next if _is_generic_ref_name($n,$g);
$found_in_links{$_}={} if !exists($found_in_links{$_});
# Did we find a duplicate of a ref already in our target article?
# Slot layout: [score, dup name, replacement tag, source title].
my ($dup,$dupref)=_check_dups($g,$rrefs{$g}{$n},$refs->{$g});
$dup_in_links{$_}=[0,$dup,$dupref,$r->{'title'}] if defined($dup);
# Keep only the most recent version from each article
# (slots [4] and [5] of each entry are source pageid and timestamp).
my $have_newer=0;
while(my ($k,$v)=each %{$found_in_links{$_}}){
if($v->[4]==$r->{'pageid'}){
if($ts>$v->[5]){
delete $found_in_links{$_}{$k} if $ts>$v->[5];
} else {
$have_newer=1;
}
}
}
next if $have_newer;
my $content=$rrefs{$g}{$n}{'content'};
# To help minimize false dups, strip whitespace, manipulate
# dashes, and remove accessdate parameters from the key, and
# sort named template params.
my $k=$content;
$k=~s/Retrieved (?:on )?$dt/Retrieved xxx/ig;
$k=$api->process_templates($k, sub {
my $name=shift;
my @params=@{shift()};
return undef unless @params;
my %p=();
my $i=1;
foreach (@params){
s/\s+//g;
next if(/^access(date|monthday|daymonth|year)=/);
if(/^([^=]+)=/){
$p{$1}=$_;
} else {
# Positional params get explicit indices so sorting is stable.
$p{$i}="$i=$_";
$i++;
}
}
return "{{$name|".join("|",sort values %p)."}}";
});
$k=~s/[\x{2013}\x{2014}]|&([mn]dash|#0*821[12]|#x0*201[34]);/-/g;
$k=~s/\x{2212}|&(minus|#0*8722|#x0*2212);/-/g;
$k=~s/\s+//g;
# Slot layout: [score, replacement tag, source title, content, source pageid, timestamp].
$found_in_links{$_}{$k}=[0,$rrefs{$g}{$n}{'repl'},$r->{'title'},$content,$r->{'pageid'},$ts];
}
}
}
my @talkpost=();
# "p$pageid" store entry remembers which ambiguous refs we already posted about.
$api->store->{"p$pageid"}={} unless exists($api->store->{"p$pageid"});
my $posted=$api->store->{"p$pageid"};
foreach (keys %found_in_links){
my ($g,$n)=split />/, $_, 2;
my @repl=keys %{$found_in_links{$_}};
my ($score,$repl,$from,$content);
my $dup=undef;
if(exists($dup_in_links{$_})){
($score,$dup,$repl,$from)=@{$dup_in_links{$_}};
} elsif(@repl>1){
# Crap, we have multiple versions of the named ref.
next if exists($posted->{$_});
$posted->{$_}=1;
my $x="<b>Reference named \"$n\"";
$x.=" in group \"$g\"" if $g ne '';
$x.=":</b><ul>\n";
foreach my $k (@repl){
($score,$repl,$from,$content)=@{$found_in_links{$_}{$k}};
$x.="<li>From [[$from]]: $content</li>\n";
}
$x.="</ul>\n";
push @talkpost, $x;
next;
} else {
my $k=$repl[0];
($score,$repl,$from)=@{$found_in_links{$_}{$k}};
}
if(defined($dup)){
# Content duplicates an existing ref: point all orphans at that ref.
foreach my $need (@{$needed->{$_}}) {
push @$replacements, {
'orig' => $need,
'repl' => $repl,
};
}
$found->{$_}="\"$n\" → \"$dup\" from [[$from]]";
push @$log, "** Renamed \"$n\" → \"$dup\" from [[:$from]]";
} else {
push @$replacements, {
'orig' => $needed->{$_}[0],
'repl' => $repl,
};
$found->{$_}="\"$n\" from [[$from]]";
push @$log, "** Rescued \"$n\" from [[:$from]]";
}
delete $needed->{$_};
}
if(@talkpost){
my $ttok=$api->edittoken('Talk:'.$title);
if($ttok->{'code'} eq 'shutoff'){
$api->warn("Task disabled: ".$ttok->{'content'}."\n");
return 300;
}
if($ttok->{'code'} ne 'success'){
$api->warn("Failed to get edit token for Talk:$title: ".$ttok->{'error'}."\n");
return -2;
}
my $txt="I check pages listed in ";
$txt.="[[:Category:Pages with incorrect ref formatting]] to ";
$txt.="try to fix reference errors. One of the things I ";
$txt.="do is look for content for ";
$txt.="[[User:".$api->user."/docs/OrphanReferenceFixer|orphaned references]] ";
$txt.="in wikilinked articles. I have found content for ";
$txt.="some of [[:$title]]'s orphans, the problem is that ";
$txt.="I found more than one version. I can't determine ";
$txt.="which (if any) is correct for ''this'' article, so ";
$txt.="I am asking for a sentient editor to look it over ";
$txt.="and copy the correct ref content into this article.\n\n";
$txt.=join("\n", @talkpost);
$txt.="\nI apologize if any of the above are effectively ";
$txt.="identical; I am just a simple computer program, so ";
$txt.="I can't determine whether minor differences are ";
$txt.="significant or not. <small>Feel free to remove this comment after fixing the refs.</small> \x7e\x7e\x7e\x7e";
my $r=$api->edit($ttok, $txt, "Orphaned references in [[:$title]]", 0, 0, section => 'new');
if($r->{'code'} ne 'success'){
$api->warn("Write failed on Talk:$title: ".$r->{'error'}."\n");
return -2;
}
$self->_log($api, "* Posted on [[Talk:$title]] to request assistance");
$api->store->{"p$pageid"}=$posted;
}
return 0;
}
# Return true if a ref name is autogenerated or too generic to be a
# meaningful cross-article match (so it should be skipped when hunting for
# orphaned-ref content). Takes the ref name and its group ('' for none).
sub _is_generic_ref_name {
    my ($name, $group) = @_;

    if ($group eq '') {
        # Autogenerated named refs are unlikely to be useful matches.
        return 1 if $name =~ /^autogenerated\d+$/;
        # These very generic names aren't useful either.
        return 1 if $name =~ /^e\d+$/i;
    }
    # VisualEditor assigns names like ":0", ":1" regardless of group.
    return $name =~ /^:\d+$/;
}
# Append one line (newline-terminated) to the persistent run log kept in
# the API store under the 'log' key.
sub _log {
    my ($self, $api, $line) = @_;
    $api->store->{'log'} .= $line . "\n";
}
# Post a "{{subst:...revert help}}" notice on the talk page of a user who
# reverted one of the bot's reference fixes, unless that revision has
# already been handled. Before substing, verify the help template was last
# edited by an authorized user so we never subst vandalized content onto a
# talk page.
#
# Arguments: $api, $user (reverter's username), $page (reverted page
# title), $revid (revision id of the revert).
# Returns 0 on success or harmless skip, 300 to back off / disable the
# task, -2 on an API failure worth retrying.
sub _notify_reverter {
    my ($self, $api, $user, $page, $revid) = @_;

    # Only notify once per reverted revision. BUGFIX: this was `next`,
    # which is loop control — inside a sub it triggers "Exiting subroutine
    # via next" and returns no defined value; `return 0` is what's meant.
    return 0 if exists($api->store->{"revert$revid"});

    my $template='User:AnomieBOT/OrphanReferenceFixer revert help';

    # Fetch the most recent editor of the help template.
    my $chk=$api->query(
        titles => $template,
        prop => 'revisions',
        rvprop => 'user',
        rvlimit => 1,
    );
    if($chk->{'code'} ne 'success'){
        $api->warn("Could not check $template: ".$chk->{'error'}."\n");
        return -2;
    }
    my $edituser=(values %{$chk->{'query'}{'pages'}})[0]{'revisions'}[0]{'user'};

    # Refuse to use the template until an authorized user touches it again.
    unless(grep $_ eq $edituser, ('Anomie')){
        $api->log("An unauthorized user has edited $template!");
        $api->warn("An unauthorized user has edited $template!\n");
        $api->whine("An unauthorized user has edited [[:$template]]", "An unauthorized user has edited [[:$template]], so I am refusing to use it until an authorized user confirms it has not been vandalized by making any edit to it. No offense to [[User:$edituser|$edituser]], but I don't want to go substing vandalism on innocent people's talk pages.");
        return 300;
    }

    my $title="Help on reversion";
    my $summary="Provide information on correctly fixing reference errors (instead of reverting)";
    my $msg="{{subst:$template|page=$page|revid=$revid|subst=subst:}}";
    my $res=$api->whine($title, $msg, Summary => $summary, Pagename => "User talk:$user", OptOut => 'AnomieBOT-OrphanReferenceFixer', NoSmallPrint => 1, NoSig => 1);
    if($res->{'code'} eq 'shutoff'){
        $api->warn("Task disabled: ".$res->{'content'}."\n");
        return 300;
    }
    if($res->{'code'} eq 'botexcluded'){
        # Excluded from the talk page (presumably opt-out or {{nobots}} —
        # depends on whine()'s semantics); record it and don't retry.
        $self->_log($api, "* Tried to give a revert notice to [[User talk:$user|$user]] about [[Special:Diff/$revid|$revid]] on [[:$page]], but I was excluded: <nowiki>".$res->{'error'}."</nowiki>");
        $api->log("Bot excluded from User talk:$user: ".$res->{'error'});
        $api->store->{"revert$revid"}=1;
        return 0;
    }
    if($res->{'code'} ne 'success'){
        # BUGFIX: the old message claimed an edit-token failure, but this
        # branch handles a failed talk-page post.
        $api->warn("Failed to post revert notice to User talk:$user: ".$res->{'error'}."\n");
        return -2;
    }
    $self->_log($api, "* Issued a revert notice to [[User talk:$user|$user]] about [[Special:Diff/$revid|$revid]] on [[:$page]]");
    $api->log("Issued a revert notice to User talk:$user about $revid on $page");
    $api->store->{"revert$revid"}=1;
    return 0;
}
# This function can be used to run the bot over arbitrary page content.
# Something like:
# perl -we 'use tasks::OrphanReferenceFixer; tasks::OrphanReferenceFixer::unit_test($revid[,$filename]);'
# Run the bot's page-processing logic over a single revision (or over the
# contents of a local file, using the revision only for page metadata).
# Intended for manual testing from the command line; writes the run log to
# STDERR, the result structure (minus the output text) via Data::Dumper to
# STDERR, and the processed wikitext to STDOUT. Dies on any setup failure.
#
# Arguments: $revid (revision id to process), optional $filename (local
# file supplying the page text instead of the revision content).
sub unit_test {
    my ($revid, $filename) = @_;

    $|=1;
    binmode STDOUT, ':utf8';
    binmode STDERR, ':utf8';

    # Bring up the bot environment; 'noedit' redirects any would-be edits
    # to files under /tmp so nothing is written to the wiki.
    my $self=tasks::OrphanReferenceFixer->new();
    my $api=AnomieBOT::API->new('conf.ini', 1);
    $api->{'noedit'}='/tmp/';
    $api->login();
    $api->DEBUG(-1);
    $api->task('OrphanReferenceFixer', 0, 0.1, qw/d::Talk d::Timestamp d::Templates d::Redirects d::IWNS/);
    return undef unless $api->load_IWNS_maps();
    my $r=$self->init($api);
    die "init failed\n" if defined($r);

    # Fetch page metadata; skip fetching revision content when a local
    # file will supply the text instead.
    my $res=$api->query(revids=>$revid,prop=>'info|revisions',rvprop=>$filename?'':'content',rvslots=>'main');
    if($res->{'code'} ne 'success'){
        die "Could not load revision $revid: ".$res->{'error'}."\n";
    }
    die "Invalid revid\n" unless(exists($res->{'query'}{'pages'}) && %{$res->{'query'}{'pages'}});
    $res=(values(%{$res->{'query'}{'pages'}}))[0];
    my $pageid=$res->{'pageid'};
    my $lastrevid=$revid;
    my $title=$res->{'title'};

    my $intxt;
    if($filename){
        # BUGFIX/idiom: use a lexical filehandle instead of the old
        # package-global bareword handle "X".
        open my $fh, '<:utf8', $filename or die "Could not open $filename: $!\n";
        { local $/=undef; $intxt=<$fh>; }
        close $fh;
    } else {
        $intxt=$res->{'revisions'}[0]{'slots'}{'main'}{'*'};
    }

    # Fresh per-page checkpoint state, as process_page expects to find in
    # the store; the deadline is one day out so it never fires here.
    my $checked={revid=>$lastrevid,continue=>'<beginning>',did_summary_links=>0,did_page_links=>0,prev_ts=>-1,prev_info=>[0,'','',-1],unfound=>[],ignored=>[]};
    $api->store->{$pageid}=$checked;
    $api->store->{'log'}='';
    my $ret=$self->process_page($api,$pageid,$lastrevid,$title,$intxt,$checked,time()+86400);
    print STDERR $api->store->{'log'}."\n";
    die "Returned undef\n" unless defined($ret);
    die "Returned $ret\n" unless ref($ret);

    # Print the result structure (sans text) to STDERR, the text to STDOUT.
    my $outtxt=$ret->{'outtxt'};
    delete $ret->{'outtxt'};
    print STDERR scalar Data::Dumper->Dump([$ret],['ret'])."\n";
    print $outtxt;
}
1;