blob: 357bf4101ca72048b5a5b1e55c30bb5aa6b9852f [file] [log] [blame]
#!/usr/local/bin/perl
#
# W3C Link Checker
# by Hugo Haas <hugo@w3.org>
# (c) 1999-2011 World Wide Web Consortium
# based on Renaud Bruyeron's checklink.pl
#
# This program is licensed under the W3C(r) Software License:
# http://www.w3.org/Consortium/Legal/copyright-software
#
# The documentation is at:
# http://validator.w3.org/docs/checklink.html
#
# See the Mercurial interface at:
# http://dvcs.w3.org/hg/link-checker/
#
# An online version is available at:
# http://validator.w3.org/checklink
#
# Comments and suggestions should be sent to the www-validator mailing list:
# www-validator@w3.org (with 'checklink' in the subject)
# http://lists.w3.org/Archives/Public/www-validator/ (archives)
#
# Small modifications in March 2020 by Karl Berry <karl@freefriends.org>
# (contributed under the same license, or public domain if you prefer).
# I started from https://metacpan.org/release/W3C-LinkChecker, version 4.81.
# - (&simple_request) ignore "Argument isn't numeric" warnings.
# - (%Opts, &check_uri) new option --exclude-url-file; see --help message.
# - (&parse_arguments) allow multiple -X options.
# - (&check_uri) missing argument to hprintf.
# - (&hprintf) avoid useless warnings when undef is returned.
# The ideas are (1) to avoid rechecking every url during development,
# and (2) to make the exclude list easier to maintain,
# and (3) to eliminate useless warnings from the code,
#
# For GNU Automake, this program is used by the checklinkx target
# in doc/local.mk to check the (html output of) automake manual.
# (Run make html to create automake.html.)
use warnings;
use strict;
use 5.008;
# Get rid of potentially unsafe and unneeded environment variables.
delete(@ENV{qw(IFS CDPATH ENV BASH_ENV)});
$ENV{PATH} = undef;
# ...but we want PERL5?LIB honored even in taint mode, see perlsec, perl5lib,
# http://www.mail-archive.com/cpan-testers-discuss%40perl.org/msg01064.html
use Config qw(%Config);
use lib map { /(.*)/ }
defined($ENV{PERL5LIB}) ? split(/$Config{path_sep}/, $ENV{PERL5LIB}) :
defined($ENV{PERLLIB}) ? split(/$Config{path_sep}/, $ENV{PERLLIB}) :
();
# -----------------------------------------------------------------------------
package W3C::UserAgent;
use LWP::RobotUA 1.19 qw();
use LWP::UserAgent qw();
use Net::HTTP::Methods 5.833 qw(); # >= 5.833 for 4kB cookies (#6678)
# if 0, ignore robots exclusion (useful for testing)
use constant USE_ROBOT_UA => 1;
if (USE_ROBOT_UA) {
@W3C::UserAgent::ISA = qw(LWP::RobotUA);
}
else {
@W3C::UserAgent::ISA = qw(LWP::UserAgent);
}
sub new
{
my $proto = shift;
my $class = ref($proto) || $proto;
my ($name, $from, $rules) = @_;
# For security/privacy reasons, if $from was not given, do not send it.
# Cheat by defining something for the constructor, and resetting it later.
my $from_ok = $from;
$from ||= 'www-validator@w3.org';
my $self;
if (USE_ROBOT_UA) {
$self = $class->SUPER::new($name, $from, $rules);
}
else {
my %cnf;
@cnf{qw(agent from)} = ($name, $from);
$self = LWP::UserAgent->new(%cnf);
$self = bless $self, $class;
}
$self->from(undef) unless $from_ok;
$self->env_proxy();
$self->allow_private_ips(1);
$self->protocols_forbidden([qw(mailto javascript)]);
return $self;
}
sub allow_private_ips
{
my $self = shift;
if (@_) {
$self->{Checklink_allow_private_ips} = shift;
if (!$self->{Checklink_allow_private_ips}) {
# Pull in dependencies
require Net::IP;
require Socket;
require Net::hostent;
}
}
return $self->{Checklink_allow_private_ips};
}
sub redirect_progress_callback
{
my $self = shift;
$self->{Checklink_redirect_callback} = shift if @_;
return $self->{Checklink_redirect_callback};
}
sub simple_request
{
my $self = shift;
my $response = $self->ip_disallowed($_[0]->uri());
# RFC 2616, section 15.1.3
$_[0]->remove_header("Referer")
if ($_[0]->referer() &&
(!$_[0]->uri()->secure() && URI->new($_[0]->referer())->secure()));
$response ||= do {
local $SIG{__WARN__} =
sub { # Suppress RobotRules warnings, rt.cpan.org #18902
# Suppress "Argument isn't numeric" warnings, see below.
warn($_[0])
if ($_[0]
&& $_[0] !~ /^RobotRules/
&& $_[0] !~ /^Argument .* isn't numeric.*Response\.pm/
);
};
# @@@ Why not just $self->SUPER::simple_request? [--unknown]
# --- Indeed. Further, why use simple_request in the first place?
# It is not part of the UserAgent UI. I believe this can result
# in warnings like:
# Argument "0, 0, 0, 0" isn't numeric in numeric gt (>) at
# /usr/local/lib/perl5/site_perl/5.30.2/HTTP/Response.pm line 261.
# when checking, e.g.,
# https://metacpan.org/pod/distribution/Test-Harness/bin/prove
# For testing, here is a three-line html file to check that url:
# <html><head><title>X</title></head><body>
# <p><a href="https://metacpan.org/pod/release/MSCHWERN/Test-Simple-0.98_05/lib/Test/More.pm">prove</a></p>
# </body></html>
# I have been unable to reproduce the warning with a test program
# checking that url using $ua->request(), or other UserAgent
# functions, even after carefully reproducing all the headers
# that checklink sends in the request. --karl@freefriends.org.
$self->W3C::UserAgent::SUPER::simple_request(@_);
};
if (!defined($self->{FirstResponse})) {
$self->{FirstResponse} = $response->code();
$self->{FirstMessage} = $response->message() || '(no message)';
}
return $response;
}
sub redirect_ok
{
my ($self, $request, $response) = @_;
if (my $callback = $self->redirect_progress_callback()) {
# @@@ TODO: when an LWP internal robots.txt request gets redirected,
# this will a bit confusingly fire for it too. Would need a robust
# way to determine whether the request is such a LWP "internal
# robots.txt" one.
&$callback($request->method(), $request->uri());
}
return 0 unless $self->SUPER::redirect_ok($request, $response);
if (my $res = $self->ip_disallowed($request->uri())) {
$response->previous($response->clone());
$response->request($request);
$response->code($res->code());
$response->message($res->message());
return 0;
}
return 1;
}
#
# Checks whether we're allowed to retrieve the document based on its IP
# address. Takes an URI object and returns a HTTP::Response containing the
# appropriate status and error message if the IP was disallowed, 0
# otherwise. URIs without hostname or IP address are always allowed,
# including schemes where those make no sense (eg. data:, often javascript:).
#
sub ip_disallowed
{
my ($self, $uri) = @_;
return 0 if $self->allow_private_ips(); # Short-circuit
my $hostname = undef;
eval { $hostname = $uri->host() }; # Not all URIs implement host()...
return 0 unless $hostname;
my $addr = my $iptype = my $resp = undef;
if (my $host = Net::hostent::gethostbyname($hostname)) {
$addr = Socket::inet_ntoa($host->addr()) if $host->addr();
if ($addr && (my $ip = Net::IP->new($addr))) {
$iptype = $ip->iptype();
}
}
if ($iptype && $iptype ne 'PUBLIC') {
$resp = HTTP::Response->new(403,
'Checking non-public IP address disallowed by link checker configuration'
);
$resp->header('Client-Warning', 'Internal response');
}
return $resp;
}
# -----------------------------------------------------------------------------
package W3C::LinkChecker;
use vars qw($AGENT $PACKAGE $PROGRAM $VERSION $REVISION
$DocType $Head $Accept $ContentTypes %Cfg $CssUrl);
use CSS::DOM 0.09 qw(); # >= 0.09 for many bugfixes
use CSS::DOM::Constants qw(:rule);
use CSS::DOM::Style qw();
use CSS::DOM::Util qw();
use Encode qw();
use HTML::Entities qw();
use HTML::Parser 3.40 qw(); # >= 3.40 for utf8_mode()
use HTTP::Headers::Util qw();
use HTTP::Message 5.827 qw(); # >= 5.827 for content_charset()
use HTTP::Request 5.814 qw(); # >= 5.814 for accept_decodable()
use HTTP::Response 1.50 qw(); # >= 1.50 for decoded_content()
use Time::HiRes qw();
use URI 1.53 qw(); # >= 1.53 for secure()
use URI::Escape qw();
use URI::Heuristic qw();
# @@@ Needs also W3C::UserAgent but can't use() it here.
use constant RC_ROBOTS_TXT => -1;
use constant RC_DNS_ERROR => -2;
use constant RC_IP_DISALLOWED => -3;
use constant RC_PROTOCOL_DISALLOWED => -4;
use constant LINE_UNKNOWN => -1;
use constant MP2 =>
(exists($ENV{MOD_PERL_API_VERSION}) && $ENV{MOD_PERL_API_VERSION} >= 2);
# Tag=>attribute mapping of things we treat as links.
# Note: meta/@http-equiv gets special treatment, see start() for details.
use constant LINK_ATTRS => {
a => ['href'],
# base/@href intentionally not checked
# http://www.w3.org/mid/200802091439.27764.ville.skytta%40iki.fi
area => ['href'],
audio => ['src'],
blockquote => ['cite'],
body => ['background'],
command => ['icon'],
# button/@formaction not checked (side effects)
del => ['cite'],
# @pluginspage, @pluginurl, @href: pre-HTML5 proprietary
embed => ['href', 'pluginspage', 'pluginurl', 'src'],
# form/@action not checked (side effects)
frame => ['longdesc', 'src'],
html => ['manifest'],
iframe => ['longdesc', 'src'],
img => ['longdesc', 'src'],
# input/@action, input/@formaction not checked (side effects)
input => ['src'],
ins => ['cite'],
link => ['href'],
object => ['data'],
q => ['cite'],
script => ['src'],
source => ['src'],
track => ['src'],
video => ['src', 'poster'],
};
# Tag=>[separator, attributes] mapping of things we treat as lists of links.
use constant LINK_LIST_ATTRS => {
a => [qr/\s+/, ['ping']],
applet => [qr/[\s,]+/, ['archive']],
area => [qr/\s+/, ['ping']],
head => [qr/\s+/, ['profile']],
object => [qr/\s+/, ['archive']],
};
# TBD/TODO:
# - applet/@code?
# - bgsound/@src?
# - object/@classid?
# - isindex/@action?
# - layer/@background,@src?
# - ilayer/@background?
# - table,tr,td,th/@background?
# - xmp/@href?
@W3C::LinkChecker::ISA = qw(HTML::Parser);
BEGIN {
# Version info
$PACKAGE = 'W3C Link Checker (+GNU Automake)';
$PROGRAM = 'W3C-checklink-am';
$VERSION = '4.81-am';
$REVISION = sprintf('version %s (c) 1999-2011 W3C', $VERSION);
$AGENT = sprintf(
'%s/%s %s',
$PROGRAM, $VERSION,
( W3C::UserAgent::USE_ROBOT_UA ? LWP::RobotUA->_agent() :
LWP::UserAgent->_agent()
)
);
# Pull in mod_perl modules if applicable.
eval {
local $SIG{__DIE__} = undef;
require Apache2::RequestUtil;
} if MP2();
my @content_types = qw(
text/html
application/xhtml+xml;q=0.9
application/vnd.wap.xhtml+xml;q=0.6
);
$Accept = join(', ', @content_types, '*/*;q=0.5');
push(@content_types, 'text/css', 'text/html-sandboxed');
my $re = join('|', map { s/;.*//; quotemeta } @content_types);
$ContentTypes = qr{\b(?:$re)\b}io;
# Regexp for matching URL values in CSS.
$CssUrl = qr/(?:\s|^)url\(\s*(['"]?)(.*?)\1\s*\)(?=\s|$)/;
#
# Read configuration. If the W3C_CHECKLINK_CFG environment variable has
# been set or the default contains a non-empty file, read it. Otherwise,
# skip silently.
#
my $defaultconfig = '/etc/w3c/checklink.conf';
if ($ENV{W3C_CHECKLINK_CFG} || -s $defaultconfig) {
require Config::General;
Config::General->require_version(2.06); # Need 2.06 for -SplitPolicy
my $conffile = $ENV{W3C_CHECKLINK_CFG} || $defaultconfig;
eval {
my %config_opts = (
-ConfigFile => $conffile,
-SplitPolicy => 'equalsign',
-AllowMultiOptions => 'no',
);
%Cfg = Config::General->new(%config_opts)->getall();
};
if ($@) {
die <<"EOF";
Failed to read configuration from '$conffile':
$@
EOF
}
}
$Cfg{Markup_Validator_URI} ||= 'http://validator.w3.org/check?uri=%s';
$Cfg{CSS_Validator_URI} ||=
'http://jigsaw.w3.org/css-validator/validator?uri=%s';
$Cfg{Doc_URI} ||= 'http://validator.w3.org/docs/checklink.html';
# Untaint config params that are used as the format argument to (s)printf(),
# Perl 5.10 does not want to see that in taint mode.
($Cfg{Markup_Validator_URI}) = ($Cfg{Markup_Validator_URI} =~ /^(.*)$/);
($Cfg{CSS_Validator_URI}) = ($Cfg{CSS_Validator_URI} =~ /^(.*)$/);
$DocType =
'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">';
my $css_url = URI->new_abs('linkchecker.css', $Cfg{Doc_URI});
my $js_url = URI->new_abs('linkchecker.js', $Cfg{Doc_URI});
$Head =
sprintf(<<'EOF', HTML::Entities::encode($AGENT), $css_url, $js_url);
<meta http-equiv="Content-Script-Type" content="text/javascript" />
<meta name="generator" content="%s" />
<link rel="stylesheet" type="text/css" href="%s" />
<script type="text/javascript" src="%s"></script>
EOF
# Trusted environment variables that need laundering in taint mode.
for (qw(NNTPSERVER NEWSHOST)) {
($ENV{$_}) = ($ENV{$_} =~ /^(.*)$/) if $ENV{$_};
}
# Use passive FTP by default, see Net::FTP(3).
$ENV{FTP_PASSIVE} = 1 unless exists($ENV{FTP_PASSIVE});
}
# Autoflush
$| = 1;
# Different options specified by the user
my $cmdline = !($ENV{GATEWAY_INTERFACE} && $ENV{GATEWAY_INTERFACE} =~ /^CGI/);
my %Opts = (
Command_Line => $cmdline,
Quiet => 0,
Summary_Only => 0,
Verbose => 0,
Progress => 0,
HTML => 0,
Timeout => 30,
Redirects => 1,
Dir_Redirects => 1,
Accept_Language => $cmdline ? undef : $ENV{HTTP_ACCEPT_LANGUAGE},
Cookies => undef,
No_Referer => 0,
Hide_Same_Realm => 0,
Depth => 0, # < 0 means unlimited recursion.
Sleep_Time => 1,
Connection_Cache_Size => 2,
Max_Documents => 150, # For the online version.
User => undef,
Password => undef,
Base_Locations => [],
Exclude => undef,
Exclude_Docs => undef,
Exclude_Url_File => undef,
Suppress_Redirect => [],
Suppress_Redirect_Prefix => [],
Suppress_Redirect_Regexp => [],
Suppress_Temp_Redirects => 1,
Suppress_Broken => [],
Suppress_Fragment => [],
Masquerade => 0,
Masquerade_From => '',
Masquerade_To => '',
Trusted => $Cfg{Trusted},
Allow_Private_IPs => defined($Cfg{Allow_Private_IPs}) ?
$Cfg{Allow_Private_IPs} :
$cmdline,
);
undef $cmdline;
# Global variables
# What URI's did we process? (used for recursive mode)
my %processed;
# Result of the HTTP query
my %results;
# List of redirects
my %redirects;
# Count of the number of documents checked
my $doc_count = 0;
# Time stamp
my $timestamp = &get_timestamp();
# Per-document header; undefined if already printed. See print_doc_header().
my $doc_header;
&parse_arguments() if $Opts{Command_Line};
my $ua = W3C::UserAgent->new($AGENT); # @@@ TODO: admin address
$ua->conn_cache({total_capacity => $Opts{Connection_Cache_Size}});
if ($ua->can('delay')) {
$ua->delay($Opts{Sleep_Time} / 60);
}
$ua->timeout($Opts{Timeout});
# Set up cookie stash if requested
if (defined($Opts{Cookies})) {
require HTTP::Cookies;
my $cookie_file = $Opts{Cookies};
if ($cookie_file eq 'tmp') {
$cookie_file = undef;
}
elsif ($cookie_file =~ /^(.*)$/) {
$cookie_file = $1; # untaint
}
$ua->cookie_jar(HTTP::Cookies->new(file => $cookie_file, autosave => 1));
}
eval { $ua->allow_private_ips($Opts{Allow_Private_IPs}); };
if ($@) {
die <<"EOF";
Allow_Private_IPs is false; this feature requires the Net::IP, Socket, and
Net::hostent modules:
$@
EOF
}
# Add configured forbidden protocols
if ($Cfg{Forbidden_Protocols}) {
my $forbidden = $ua->protocols_forbidden();
push(@$forbidden, split(/[,\s]+/, lc($Cfg{Forbidden_Protocols})));
$ua->protocols_forbidden($forbidden);
}
if ($Opts{Command_Line}) {
require Text::Wrap;
Text::Wrap->import('wrap');
require URI::file;
&usage(1) unless scalar(@ARGV);
$Opts{_Self_URI} = 'http://validator.w3.org/checklink'; # For HTML output
&ask_password() if ($Opts{User} && !$Opts{Password});
if (!$Opts{Summary_Only}) {
printf("%s %s\n", $PACKAGE, $REVISION) unless $Opts{HTML};
}
else {
$Opts{Verbose} = 0;
$Opts{Progress} = 0;
}
# Populate data for print_form()
my %params = (
summary => $Opts{Summary_Only},
hide_redirects => !$Opts{Redirects},
hide_type => $Opts{Dir_Redirects} ? 'dir' : 'all',
no_accept_language => !(
defined($Opts{Accept_Language}) && $Opts{Accept_Language} eq 'auto'
),
no_referer => $Opts{No_Referer},
recursive => ($Opts{Depth} != 0),
depth => $Opts{Depth},
);
my $check_num = 1;
my @bases = @{$Opts{Base_Locations}};
for my $uri (@ARGV) {
# Reset base locations so that previous URI's given on the command line
# won't affect the recursion scope for this URI (see check_uri())
@{$Opts{Base_Locations}} = @bases;
# Transform the parameter into a URI
$uri = &urize($uri);
$params{uri} = $uri;
&check_uri(\%params, $uri, $check_num, $Opts{Depth}, undef, undef, 1);
$check_num++;
}
undef $check_num;
if ($Opts{HTML}) {
&html_footer();
}
elsif ($doc_count > 0 && !$Opts{Summary_Only}) {
printf("\n%s\n", &global_stats());
}
}
else {
require CGI;
require CGI::Carp;
CGI::Carp->import(qw(fatalsToBrowser));
require CGI::Cookie;
# file: URIs are not allowed in CGI mode
my $forbidden = $ua->protocols_forbidden();
push(@$forbidden, 'file');
$ua->protocols_forbidden($forbidden);
my $query = CGI->new();
for my $param ($query->param()) {
my @values = map { Encode::decode_utf8($_) } $query->param($param);
$query->param($param, @values);
}
# Set a few parameters in CGI mode
$Opts{Verbose} = 0;
$Opts{Progress} = 0;
$Opts{HTML} = 1;
$Opts{_Self_URI} = $query->url(-relative => 1);
# Backwards compatibility
my $uri = undef;
if ($uri = $query->param('url')) {
$query->param('uri', $uri) unless $query->param('uri');
$query->delete('url');
}
$uri = $query->param('uri');
if (!$uri) {
&html_header('', undef); # Set cookie only from results page.
my %cookies = CGI::Cookie->fetch();
&print_form(scalar($query->Vars()), $cookies{$PROGRAM}, 1);
&html_footer();
exit;
}
# Backwards compatibility
if ($query->param('hide_dir_redirects')) {
$query->param('hide_redirects', 'on');
$query->param('hide_type', 'dir');
$query->delete('hide_dir_redirects');
}
$Opts{Summary_Only} = 1 if $query->param('summary');
if ($query->param('hide_redirects')) {
$Opts{Dir_Redirects} = 0;
if (my $type = $query->param('hide_type')) {
$Opts{Redirects} = 0 if ($type ne 'dir');
}
else {
$Opts{Redirects} = 0;
}
}
$Opts{Accept_Language} = undef if $query->param('no_accept_language');
$Opts{No_Referer} = $query->param('no_referer');
$Opts{Depth} = -1 if ($query->param('recursive') && $Opts{Depth} == 0);
if (my $depth = $query->param('depth')) {
# @@@ Ignore invalid depth silently for now.
$Opts{Depth} = $1 if ($depth =~ /(-?\d+)/);
}
# Save, clear or leave cookie as is.
my $cookie = undef;
if (my $action = $query->param('cookie')) {
if ($action eq 'clear') {
# Clear the cookie.
$cookie = CGI::Cookie->new(-name => $PROGRAM);
$cookie->value({clear => 1});
$cookie->expires('-1M');
}
elsif ($action eq 'set') {
# Set the options.
$cookie = CGI::Cookie->new(-name => $PROGRAM);
my %options = $query->Vars();
delete($options{$_})
for qw(url uri check cookie); # Non-persistent.
$cookie->value(\%options);
}
}
if (!$cookie) {
my %cookies = CGI::Cookie->fetch();
$cookie = $cookies{$PROGRAM};
}
# Always refresh cookie expiration time.
$cookie->expires('+1M') if ($cookie && !$cookie->expires());
# All Apache configurations don't set HTTP_AUTHORIZATION for CGI scripts.
# If we're under mod_perl, there is a way around it...
eval {
local $SIG{__DIE__} = undef;
my $auth =
Apache2::RequestUtil->request()->headers_in()->{Authorization};
$ENV{HTTP_AUTHORIZATION} = $auth if $auth;
} if (MP2() && !$ENV{HTTP_AUTHORIZATION});
$uri =~ s/^\s+//g;
if ($uri =~ /:/) {
$uri = URI->new($uri);
}
else {
if ($uri =~ m|^//|) {
$uri = URI->new("http:$uri");
}
else {
local $ENV{URL_GUESS_PATTERN} = '';
my $guess = URI::Heuristic::uf_uri($uri);
if ($guess->scheme() && $ua->is_protocol_supported($guess)) {
$uri = $guess;
}
else {
$uri = URI->new("http://$uri");
}
}
}
$uri = $uri->canonical();
$query->param("uri", $uri);
&check_uri(scalar($query->Vars()), $uri, 1, $Opts{Depth}, $cookie);
undef $query; # Not needed any more.
&html_footer();
}
###############################################################################
################################
# Command line and usage stuff #
################################
sub parse_arguments ()
{
require Encode::Locale;
Encode::Locale::decode_argv();
require Getopt::Long;
Getopt::Long->require_version(2.17);
Getopt::Long->import('GetOptions');
Getopt::Long::Configure('bundling', 'no_ignore_case');
my $masq = '';
my @locs = ();
GetOptions(
'help|h|?' => sub { usage(0) },
'q|quiet' => sub {
$Opts{Quiet} = 1;
$Opts{Summary_Only} = 1;
},
's|summary' => \$Opts{Summary_Only},
'b|broken' => sub {
$Opts{Redirects} = 0;
$Opts{Dir_Redirects} = 0;
},
'e|dir-redirects' => sub { $Opts{Dir_Redirects} = 0; },
'v|verbose' => \$Opts{Verbose},
'i|indicator' => \$Opts{Progress},
'H|html' => \$Opts{HTML},
'r|recursive' => sub {
$Opts{Depth} = -1
if $Opts{Depth} == 0;
},
'l|location=s' => \@locs,
'X|exclude=s@' => \@{$Opts{Exclude}},
'exclude-docs=s@' => \@{$Opts{Exclude_Docs}},
'exclude-url-file=s' => \$Opts{Exclude_Url_File},
'suppress-redirect=s@' => \@{$Opts{Suppress_Redirect}},
'suppress-redirect-prefix=s@' => \@{$Opts{Suppress_Redirect_Prefix}},
'suppress-temp-redirects' => \$Opts{Suppress_Temp_Redirects},
'suppress-broken=s@' => \@{$Opts{Suppress_Broken}},
'suppress-fragment=s@' => \@{$Opts{Suppress_Fragment}},
'u|user=s' => \$Opts{User},
'p|password=s' => \$Opts{Password},
't|timeout=i' => \$Opts{Timeout},
'C|connection-cache=i' => \$Opts{Connection_Cache_Size},
'S|sleep=i' => \$Opts{Sleep_Time},
'L|languages=s' => \$Opts{Accept_Language},
'c|cookies=s' => \$Opts{Cookies},
'R|no-referer' => \$Opts{No_Referer},
'D|depth=i' => sub {
$Opts{Depth} = $_[1]
unless $_[1] == 0;
},
'd|domain=s' => \$Opts{Trusted},
'masquerade=s' => \$masq,
'hide-same-realm' => \$Opts{Hide_Same_Realm},
'V|version' => \&version,
) ||
usage(1);
if ($masq) {
$Opts{Masquerade} = 1;
my @masq = split(/\s+/, $masq);
if (scalar(@masq) != 2 ||
!defined($masq[0]) ||
$masq[0] !~ /\S/ ||
!defined($masq[1]) ||
$masq[1] !~ /\S/)
{
usage(1,
"Error: --masquerade takes two whitespace separated URIs.");
}
else {
require URI::file;
$Opts{Masquerade_From} = $masq[0];
my $u = URI->new($masq[1]);
$Opts{Masquerade_To} =
$u->scheme() ? $u : URI::file->new_abs($masq[1]);
}
}
if ($Opts{Accept_Language} && $Opts{Accept_Language} eq 'auto') {
$Opts{Accept_Language} = &guess_language();
}
if (($Opts{Sleep_Time} || 0) < 1) {
warn(
"*** Warning: minimum allowed sleep time is 1 second, resetting.\n"
);
$Opts{Sleep_Time} = 1;
}
push(@{$Opts{Base_Locations}}, map { URI->new($_)->canonical() } @locs);
$Opts{Depth} = -1 if ($Opts{Depth} == 0 && @locs);
for my $i (0 .. $#{$Opts{Exclude_Docs}}) {
eval { $Opts{Exclude_Docs}->[$i] = qr/$Opts{Exclude_Docs}->[$i]/; };
&usage(1, "Error in exclude-docs regexp: $@") if $@;
}
if (defined($Opts{Trusted})) {
eval { $Opts{Trusted} = qr/$Opts{Trusted}/io; };
&usage(1, "Error in trusted domains regexp: $@") if $@;
}
# Sanity-check error-suppression arguments
for my $i (0 .. $#{$Opts{Suppress_Redirect}}) {
${$Opts{Suppress_Redirect}}[$i] =~ s/ /->/;
my $sr_arg = ${$Opts{Suppress_Redirect}}[$i];
if ($sr_arg !~ /.->./) {
&usage(1,
"Bad suppress-redirect argument, should contain \"->\": $sr_arg"
);
}
}
for my $i (0 .. $#{$Opts{Suppress_Redirect_Prefix}}) {
my $srp_arg = ${$Opts{Suppress_Redirect_Prefix}}[$i];
$srp_arg =~ s/ /->/;
if ($srp_arg !~ /^(.*)->(.*)$/) {
&usage(1,
"Bad suppress-redirect-prefix argument, should contain \"->\": $srp_arg"
);
}
# Turn prefixes into a regexp.
${$Opts{Suppress_Redirect_Prefix}}[$i] = qr/^\Q$1\E(.*)->\Q$2\E\1$/ism;
}
for my $i (0 .. $#{$Opts{Suppress_Broken}}) {
${$Opts{Suppress_Broken}}[$i] =~ s/ /:/;
my $sb_arg = ${$Opts{Suppress_Broken}}[$i];
if ($sb_arg !~ /^(-1|[0-9]+):./) {
&usage(1,
"Bad suppress-broken argument, should be prefixed by a numeric response code: $sb_arg"
);
}
}
for my $sf_arg (@{$Opts{Suppress_Fragment}}) {
if ($sf_arg !~ /.#./) {
&usage(1,
"Bad suppress-fragment argument, should contain \"#\": $sf_arg"
);
}
}
if ($#{$Opts{Exclude}} > 0) {
# convert $Opts{Exclude} array into regexp by parenthesizing
# each and inserting alternations between.
my $exclude_rx = join("|", map { "($_)" } @{$Opts{Exclude}});
#
# For the sake of the rest of the program, pretend the option
# was that string all along.
$Opts{Exclude} = $exclude_rx;
}
if ($Opts{Exclude_Url_File}) {
# The idea is that if the specified file exists, we read it and
# treat it as a list of excludes. If the file doesn't exist, we
# write it with all the urls that were successful. That way, we
# can avoid re-checking them on every run, and it can be removed
# externally (from cron) to get re-updated.
#
# We distinguish the cases here, and either add to
# $Opts{Exclude} if reading, or setting Exclude_File_Write in
# %Opts if writing (even though it is not really an option,
# but it's the most convenient place).
if (-s $Opts{Exclude_Url_File}) {
open (my $xf, "$Opts{Exclude_Url_File}")
|| &usage(1, "Could not open $Opts{Exclude_Url_File}"
. " for reading: $!");
my @xf = ();
while (<$xf>) {
chomp;
# the file is urls, not regexps, so quotemeta.
push (@xf, "(" . quotemeta($_) . ")");
}
my $xf_rx = join ("|", @xf);
if ($Opts{Exclude}) {
$Opts{Exclude} .= "|$xf_rx";
} else {
$Opts{Exclude} = $xf_rx;
}
} else {
open ($Opts{Exclude_File_Write}, ">$Opts{Exclude_Url_File}")
|| &usage(1,
"Could not open $Opts{Exclude_Url_File} for writing: $!");
# we write on a successful retrieve, and don't bother closing.
}
}
# Precompile/error-check final list of regular expressions
if (defined($Opts{Exclude})) {
eval { $Opts{Exclude} = qr/$Opts{Exclude}/o; };
&usage(1, "Error in exclude regexp $Opts{Exclude}: $@") if $@;
}
return;
}
sub version ()
{
print "$PACKAGE $REVISION\n";
exit 0;
}
sub usage ()
{
my ($exitval, $msg) = @_;
$exitval = 0 unless defined($exitval);
$msg ||= '';
$msg =~ s/[\r\n]*$/\n\n/ if $msg;
die($msg) unless $Opts{Command_Line};
my $trust = defined($Cfg{Trusted}) ? $Cfg{Trusted} : 'same host only';
select(STDERR) if $exitval;
print "$msg$PACKAGE $REVISION
Usage: checklink <options> <uris>
Options:
-s, --summary Result summary only.
-b, --broken Show only the broken links, not the redirects.
-e, --directory Hide directory redirects, for example
http://www.w3.org/TR -> http://www.w3.org/TR/
-r, --recursive Check the documents linked from the first one.
-D, --depth N Check the documents linked from the first one to
depth N (implies --recursive).
-l, --location URI Scope of the documents checked in recursive mode
(implies --recursive). Can be specified multiple
times. If not specified, the default eg. for
http://www.w3.org/TR/html4/Overview.html
would be http://www.w3.org/TR/html4/
-X, --exclude REGEXP Do not check links whose full, canonical URIs
match REGEXP; also limits recursion the same way
as --exclude-docs with the same regexp would.
This option may be specified multiple times.
--exclude-docs REGEXP In recursive mode, do not check links in documents
whose full, canonical URIs match REGEXP. This
option may be specified multiple times.
--exclude-url-file FILE If FILE exists, treat each line as a string
specifying another exclude; quotemeta is called
to make them regexps. If FILE does not exist,
open it for writing and write each checked url
which gets a 200 response to it.
--suppress-redirect URI->URI Do not report a redirect from the first to the
second URI. This option may be specified multiple
times.
--suppress-redirect-prefix URI->URI Do not report a redirect from a child of
the first URI to the same child of the second URI.
This option may be specified multiple times.
--suppress-temp-redirects Suppress warnings about temporary redirects.
--suppress-broken CODE:URI Do not report a broken link with the given CODE.
CODE is HTTP response, or -1 for robots exclusion.
This option may be specified multiple times.
--suppress-fragment URI Do not report the given broken fragment URI.
A fragment URI contains \"#\". This option may be
specified multiple times.
-L, --languages LANGS Accept-Language header to send. The special value
'auto' causes autodetection from the environment.
-c, --cookies FILE Use cookies, load/save them in FILE. The special
value 'tmp' causes non-persistent use of cookies.
-R, --no-referer Do not send the Referer HTTP header.
-q, --quiet No output if no errors are found (implies -s).
-v, --verbose Verbose mode.
-i, --indicator Show percentage of lines processed while parsing.
-u, --user USERNAME Specify a username for authentication.
-p, --password PASSWORD Specify a password.
--hide-same-realm Hide 401's that are in the same realm as the
document checked.
-S, --sleep SECS Sleep SECS seconds between requests to each server
(default and minimum: 1 second).
-t, --timeout SECS Timeout for requests in seconds (default: 30).
-d, --domain DOMAIN Regular expression describing the domain to which
authentication information will be sent
(default: $trust).
--masquerade \"BASE1 BASE2\" Masquerade base URI BASE1 as BASE2. See the
manual page for more information.
-H, --html HTML output.
-?, -h, --help Show this message and exit.
-V, --version Output version information and exit.
See \"perldoc LWP\" for information about proxy server support,
\"perldoc Net::FTP\" for information about various environment variables
affecting FTP connections and \"perldoc Net::NNTP\" for setting a default
NNTP server for news: URIs.
The W3C_CHECKLINK_CFG environment variable can be used to set the
configuration file to use. See details in the full manual page, it can
be displayed with: perldoc checklink
More documentation at: $Cfg{Doc_URI}
Please send bug reports and comments to the www-validator mailing list:
www-validator\@w3.org (with 'checklink' in the subject)
Archives are at: http://lists.w3.org/Archives/Public/www-validator/
";
exit $exitval;
}
sub ask_password ()
{
eval {
local $SIG{__DIE__} = undef;
require Term::ReadKey;
Term::ReadKey->require_version(2.00);
Term::ReadKey->import(qw(ReadMode));
};
if ($@) {
warn('Warning: Term::ReadKey 2.00 or newer not available, ' .
"password input disabled.\n");
return;
}
printf(STDERR 'Enter the password for user %s: ', $Opts{User});
ReadMode('noecho', *STDIN);
chomp($Opts{Password} = <STDIN>);
ReadMode('restore', *STDIN);
print(STDERR "ok.\n");
return;
}
###############################################################################
###########################################################################
# Guess an Accept-Language header based on the $LANG environment variable #
###########################################################################
sub guess_language ()
{
my $lang = $ENV{LANG} or return;
$lang =~ s/[\.@].*$//; # en_US.UTF-8, fi_FI@euro...
return 'en' if ($lang eq 'C' || $lang eq 'POSIX');
my $res = undef;
eval {
require Locale::Language;
if (my $tmp = Locale::Language::language2code($lang)) {
$lang = $tmp;
}
if (my ($l, $c) = (lc($lang) =~ /^([a-z]+)(?:[-_]([a-z]+))?/)) {
if (Locale::Language::code2language($l)) {
$res = $l;
if ($c) {
require Locale::Country;
$res .= "-$c" if Locale::Country::code2country($c);
}
}
}
};
return $res;
}
############################
# Transform foo into a URI #
############################
sub urize ($)
{
my $arg = shift;
my $uarg = URI::Escape::uri_unescape($arg);
my $uri;
if (-d $uarg) {
# look for an "index" file in dir, return it if found
require File::Spec;
for my $index (map { File::Spec->catfile($uarg, $_) }
qw(index.html index.xhtml index.htm index.xhtm))
{
if (-e $index) {
$uri = URI::file->new_abs($index);
last;
}
}
# return dir itself if an index file was not found
$uri ||= URI::file->new_abs($uarg);
}
elsif ($uarg =~ /^[.\/\\]/ || -e $uarg) {
$uri = URI::file->new_abs($uarg);
}
else {
my $newuri = URI->new($arg);
if ($newuri->scheme()) {
$uri = $newuri;
}
else {
local $ENV{URL_GUESS_PATTERN} = '';
$uri = URI::Heuristic::uf_uri($arg);
$uri = URI::file->new_abs($uri) unless $uri->scheme();
}
}
return $uri->canonical();
}
########################################
# Check for broken links in a resource #
########################################
sub check_uri (\%\$$$$;\$$)
{
my ($params, $uri, $check_num, $depth, $cookie, $referer, $is_start) = @_;
$is_start ||= ($check_num == 1);
my $start = $Opts{Summary_Only} ? 0 : &get_timestamp();
# Get and parse the document
my $response = &get_document(
'GET', $uri, $doc_count, \%redirects, $referer,
$cookie, $params, $check_num, $is_start
);
# Can we check the resource? If not, we exit here...
return if defined($response->{Stop});
if ($Opts{HTML}) {
&html_header($uri, $cookie) if ($check_num == 1);
&print_form($params, $cookie, $check_num) if $is_start;
}
if ($is_start) { # Starting point of a new check, eg. from the command line
# Use the first URI as the recursion base unless specified otherwise.
push(@{$Opts{Base_Locations}}, $response->{absolute_uri}->canonical())
unless @{$Opts{Base_Locations}};
}
else {
# Before fetching the document, we don't know if we'll be within the
# recursion scope or not (think redirects).
if (!&in_recursion_scope($response->{absolute_uri})) {
hprintf("Not in recursion scope: %s\n", $response->{absolute_uri})
if ($Opts{Verbose});
$response->content("");
return;
}
}
# Define the document header, and perhaps print it.
# (It might still be defined if the previous document had no errors;
# just redefine it in that case.)
if ($check_num != 1) {
if ($Opts{HTML}) {
$doc_header = "\n<hr />\n";
}
else {
$doc_header = "\n" . ('-' x 40) . "\n";
}
}
if ($Opts{HTML}) {
$doc_header .=
("<h2>\nProcessing\t" . &show_url($response->{absolute_uri}) .
"\n</h2>\n\n");
}
else {
$doc_header .= "\nProcessing\t$response->{absolute_uri}\n\n";
}
if (!$Opts{Quiet}) {
print_doc_header();
}
# We are checking a new document
$doc_count++;
my $result_anchor = 'results' . $doc_count;
if ($check_num == 1 && !$Opts{HTML} && !$Opts{Summary_Only}) {
my $s = $Opts{Sleep_Time} == 1 ? '' : 's';
my $acclang = $Opts{Accept_Language} || '(not sent)';
my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending';
my $cookies = 'not used';
if (defined($Opts{Cookies})) {
$cookies = 'used, ';
if ($Opts{Cookies} eq 'tmp') {
$cookies .= 'non-persistent';
}
else {
$cookies .= "file $Opts{Cookies}";
}
}
printf(
<<'EOF', $Accept, $acclang, $send_referer, $cookies, $Opts{Sleep_Time}, $s);
Settings used:
- Accept: %s
- Accept-Language: %s
- Referer: %s
- Cookies: %s
- Sleeping %d second%s between requests to each server
EOF
printf("- Excluding links matching %s\n", $Opts{Exclude})
if defined($Opts{Exclude});
printf("- Excluding links in documents whose URIs match %s\n",
join(', ', @{$Opts{Exclude_Docs}}))
if @{$Opts{Exclude_Docs}};
}
if ($Opts{HTML}) {
if (!$Opts{Summary_Only}) {
my $accept = &encode($Accept);
my $acclang = &encode($Opts{Accept_Language} || '(not sent)');
my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending';
my $s = $Opts{Sleep_Time} == 1 ? '' : 's';
printf(
<<'EOF', $accept, $acclang, $send_referer, $Opts{Sleep_Time}, $s);
<div class="settings">
Settings used:
<ul>
<li><tt><a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.1">Accept</a></tt>: %s</li>
<li><tt><a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4">Accept-Language</a></tt>: %s</li>
<li><tt><a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.36">Referer</a></tt>: %s</li>
<li>Sleeping %d second%s between requests to each server</li>
</ul>
</div>
EOF
printf("<p>Go to <a href=\"#%s\">the results</a>.</p>\n",
$result_anchor);
my $esc_uri = URI::Escape::uri_escape($response->{absolute_uri},
"^A-Za-z0-9.");
print "<p>For reliable link checking results, check ";
if (!$response->{IsCss}) {
printf("<a href=\"%s\">HTML validity</a> and ",
&encode(sprintf($Cfg{Markup_Validator_URI}, $esc_uri)));
}
printf(
"<a href=\"%s\">CSS validity</a> first.</p>
<p>Back to the <a accesskey=\"1\" href=\"%s\">link checker</a>.</p>\n",
&encode(sprintf($Cfg{CSS_Validator_URI}, $esc_uri)),
&encode($Opts{_Self_URI})
);
printf(<<'EOF', $result_anchor);
<div class="progress" id="progress%s">
<h3>Status: <span></span></h3>
<div class="progressbar"><div></div></div>
<pre>
EOF
}
}
if ($Opts{Summary_Only} && !$Opts{Quiet}) {
print '<p>' if $Opts{HTML};
print 'This may take some time';
print "... (<a href=\"$Cfg{Doc_URI}#wait\">why?</a>)</p>"
if $Opts{HTML};
print " if the document has many links to check.\n" unless $Opts{HTML};
}
# Record that we have processed this resource
$processed{$response->{absolute_uri}} = 1;
# Parse the document
my $p =
&parse_document($uri, $response->base(), $response, 1, ($depth != 0));
my $base = URI->new($p->{base});
# Check anchors
###############
print "Checking anchors...\n" unless $Opts{Summary_Only};
my %errors;
while (my ($anchor, $lines) = each(%{$p->{Anchors}})) {
if (!length($anchor)) {
# Empty IDREF's are not allowed
$errors{$anchor} = 1;
}
else {
my $times = 0;
$times += $_ for values(%$lines);
# They should appear only once
$errors{$anchor} = 1 if ($times > 1);
}
}
print " done.\n" unless $Opts{Summary_Only};
# Check links
#############
&hprintf("Recording all the links found: %d\n",
scalar(keys %{$p->{Links}}))
if ($Opts{Verbose});
my %links;
my %hostlinks;
# Record all the links found
while (my ($link, $lines) = each(%{$p->{Links}})) {
my $link_uri = URI->new($link);
my $abs_link_uri = URI->new_abs($link_uri, $base);
if ($Opts{Masquerade}) {
if ($abs_link_uri =~ m|^\Q$Opts{Masquerade_From}\E|) {
print_doc_header();
printf("processing %s in base %s\n",
$abs_link_uri, $Opts{Masquerade_To});
my $nlink = $abs_link_uri;
$nlink =~ s|^\Q$Opts{Masquerade_From}\E|$Opts{Masquerade_To}|;
$abs_link_uri = URI->new($nlink);
}
}
my $canon_uri = URI->new($abs_link_uri->canonical());
my $fragment = $canon_uri->fragment(undef);
if (!defined($Opts{Exclude}) || $canon_uri !~ $Opts{Exclude}) {
if (!exists($links{$canon_uri})) {
my $hostport;
$hostport = $canon_uri->host_port()
if $canon_uri->can('host_port');
$hostport = '' unless defined $hostport;
push(@{$hostlinks{$hostport}}, $canon_uri);
}
for my $line_num (keys(%$lines)) {
if (!defined($fragment) || !length($fragment)) {
# Document without fragment
$links{$canon_uri}{location}{$line_num} = 1;
}
else {
# Resource with a fragment
$links{$canon_uri}{fragments}{$fragment}{$line_num} = 1;
}
}
} else {
hprintf("excluded via options: %s\n", $canon_uri)
if ($Opts{Verbose});
}
}
my @order = &distribute_links(\%hostlinks);
undef %hostlinks;
# Build the list of broken URI's
my $nlinks = scalar(@order);
&hprintf("Checking %d links to build list of broken URI's\n", $nlinks)
if ($Opts{Verbose});
my %broken;
my $link_num = 0;
for my $u (@order) {
my $ulinks = $links{$u};
if ($Opts{Summary_Only}) {
# Hack: avoid browser/server timeouts in summary only CGI mode, bug 896
print ' ' if ($Opts{HTML} && !$Opts{Command_Line});
}
else {
&hprintf("\nChecking link %s\n", $u);
my $progress = ($link_num / $nlinks) * 100;
printf(
'<script type="text/javascript">show_progress("%s", "Checking link %s", "%.1f%%");</script>',
$result_anchor, &encode($u), $progress)
if (!$Opts{Command_Line} &&
$Opts{HTML} &&
!$Opts{Summary_Only});
}
$link_num++;
# Check that a link is valid
&check_validity($uri, $u, ($depth != 0 && &in_recursion_scope($u)),
\%links, \%redirects);
&hprintf("\tReturn code: %s\n", $results{$u}{location}{code})
if ($Opts{Verbose});
if ($Opts{Exclude_File_Write} && $results{$u}{location}{code} == 200) {
my $fh = $Opts{Exclude_File_Write};
print $fh ("$u\n");
}
if ($results{$u}{location}{success}) {
# Even though it was not broken, we might want to display it
# on the results page (e.g. because it required authentication)
$broken{$u}{location} = 1
if ($results{$u}{location}{display} >= 400);
# List the broken fragments
while (my ($fragment, $lines) = each(%{$ulinks->{fragments}})) {
my $fragment_ok = $results{$u}{fragments}{$fragment};
if ($Opts{Verbose}) {
my @line_nums = sort { $a <=> $b } keys(%$lines);
&hprintf(
"\t\t%s %s - Line%s: %s\n",
$fragment,
$fragment_ok ? 'OK' : 'Not found',
(scalar(@line_nums) > 1) ? 's' : '',
join(', ', @line_nums)
);
}
# A broken fragment?
$broken{$u}{fragments}{$fragment} += 2 unless $fragment_ok;
}
}
elsif (!($Opts{Quiet} && &informational($results{$u}{location}{code})))
{
# Couldn't find the document
$broken{$u}{location} = 1;
# All the fragments associated are hence broken
for my $fragment (keys %{$ulinks->{fragments}}) {
$broken{$u}{fragments}{$fragment}++;
}
}
}
&hprintf(
"\nProcessed in %s seconds.\n",
&time_diff($start, &get_timestamp())
) unless $Opts{Summary_Only};
printf(
'<script type="text/javascript">show_progress("%s", "Done. Document processed in %s seconds.", "100%%");</script>',
$result_anchor, &time_diff($start, &get_timestamp()))
if ($Opts{HTML} && !$Opts{Summary_Only});
# Display results
if ($Opts{HTML} && !$Opts{Summary_Only}) {
print("</pre>\n</div>\n");
printf("<h2><a name=\"%s\">Results</a></h2>\n", $result_anchor);
}
print "\n" unless $Opts{Quiet};
&links_summary(\%links, \%results, \%broken, \%redirects);
&anchors_summary($p->{Anchors}, \%errors);
# Do we want to process other documents?
if ($depth != 0) {
for my $u (map { URI->new($_) } keys %links) {
next unless $results{$u}{location}{success}; # Broken link?
next unless &in_recursion_scope($u);
# Do we understand its content type?
next unless ($results{$u}{location}{type} =~ $ContentTypes);
# Have we already processed this URI?
next if &already_processed($u, $uri);
# Do the job
print "\n" unless $Opts{Quiet};
if ($Opts{HTML}) {
if (!$Opts{Command_Line}) {
if ($doc_count == $Opts{Max_Documents}) {
print(
"<hr />\n<p><strong>Maximum number of documents ($Opts{Max_Documents}) reached!</strong></p>\n"
);
}
if ($doc_count >= $Opts{Max_Documents}) {
$doc_count++;
print("<p>Not checking <strong>$u</strong></p>\n");
$processed{$u} = 1;
next;
}
}
}
# This is an inherently recursive algorithm, so Perl's warning is not
# helpful. You may wish to comment this out when debugging, though.
no warnings 'recursion';
if ($depth < 0) {
&check_uri($params, $u, 0, -1, $cookie, $uri);
}
else {
&check_uri($params, $u, 0, $depth - 1, $cookie, $uri);
}
}
}
return;
}
###############################################################
# Distribute links based on host:port to avoid RobotUA delays #
###############################################################
sub distribute_links(\%)
{
my $hostlinks = shift;
# Hosts ordered by weight (number of links), descending
my @order =
sort { scalar(@{$hostlinks->{$b}}) <=> scalar(@{$hostlinks->{$a}}) }
keys %$hostlinks;
# All link list flattened into one, in host weight order
my @all;
push(@all, @{$hostlinks->{$_}}) for @order;
return @all if (scalar(@order) < 2);
# Indexes and chunk size for "zipping" the end result list
my $num = scalar(@{$hostlinks->{$order[0]}});
my @indexes = map { $_ * $num } (0 .. $num - 1);
# Distribute them
my @result;
while (my @chunk = splice(@all, 0, $num)) {
@result[@indexes] = @chunk;
@indexes = map { $_ + 1 } @indexes;
}
# Weed out undefs
@result = grep(defined, @result);
return @result;
}
##########################################
# Decode Content-Encodings in a response #
##########################################
sub decode_content ($)
{
my $response = shift;
my $error = undef;
my $docref = $response->decoded_content(ref => 1);
if (defined($docref)) {
utf8::encode($$docref);
$response->content_ref($docref);
# Remove Content-Encoding so it won't be decoded again later.
$response->remove_header('Content-Encoding');
}
else {
my $ce = $response->header('Content-Encoding');
$ce = defined($ce) ? "'$ce'" : 'undefined';
my $ct = $response->header('Content-Type');
$ct = defined($ct) ? "'$ct'" : 'undefined';
my $request_uri = $response->request->url;
my $cs = $response->content_charset();
$cs = defined($cs) ? "'$cs'" : 'unknown';
$error =
"Error decoding document at <$request_uri>, Content-Type $ct, " .
"Content-Encoding $ce, content charset $cs: '$@'";
}
return $error;
}
#######################################
# Get and parse a resource to process #
#######################################
sub get_document ($\$$;\%\$$$$$)
{
my ($method, $uri, $in_recursion, $redirects, $referer,
$cookie, $params, $check_num, $is_start
) = @_;
# $method contains the HTTP method the use (GET or HEAD)
# $uri object contains the identifier of the resource
# $in_recursion is > 0 if we are in recursion mode (i.e. it is at least
# the second resource checked)
# $redirects is a pointer to the hash containing the map of the redirects
# $referer is the URI object of the referring document
# $cookie, $params, $check_num, and $is_start are for printing HTTP headers
# and the form if $in_recursion == 0 and not authenticating
# Get the resource
my $response;
if (defined($results{$uri}{response}) &&
!($method eq 'GET' && $results{$uri}{method} eq 'HEAD'))
{
$response = $results{$uri}{response};
}
else {
$response = &get_uri($method, $uri, $referer);
&record_results($uri, $method, $response, $referer);
&record_redirects($redirects, $response);
}
if (!$response->is_success()) {
if (!$in_recursion) {
# Is it too late to request authentication?
if ($response->code() == 401) {
&authentication($response, $cookie, $params, $check_num,
$is_start);
}
else {
if ($Opts{HTML}) {
&html_header($uri, $cookie) if ($check_num == 1);
&print_form($params, $cookie, $check_num) if $is_start;
print "<p>", &status_icon($response->code());
}
&hprintf("\nError: %d %s\n",
$response->code(), $response->message() || '(no message)');
print "</p>\n" if $Opts{HTML};
}
}
$response->{Stop} = 1;
$response->content("");
return ($response);
}
# What is the URI of the resource that we are processing by the way?
my $base_uri = $response->base();
my $request_uri = URI->new($response->request->url);
$response->{absolute_uri} = $request_uri->abs($base_uri);
# Can we parse the document?
my $failed_reason;
my $ct = $response->header('Content-Type');
if (!$ct || $ct !~ $ContentTypes) {
$failed_reason = "Content-Type for <$request_uri> is " .
(defined($ct) ? "'$ct'" : 'undefined');
}
else {
$failed_reason = decode_content($response);
}
if ($failed_reason) {
# No, there is a problem...
if (!$in_recursion) {
if ($Opts{HTML}) {
&html_header($uri, $cookie) if ($check_num == 1);
&print_form($params, $cookie, $check_num) if $is_start;
print "<p>", &status_icon(406);
}
&hprintf("Can't check links: %s.\n", $failed_reason);
print "</p>\n" if $Opts{HTML};
}
$response->{Stop} = 1;
$response->content("");
}
# Ok, return the information
return ($response);
}
#########################################################
# Check whether a URI is within the scope of recursion. #
#########################################################
sub in_recursion_scope (\$)
{
my ($uri) = @_;
return 0 unless $uri;
my $candidate = $uri->canonical();
return 0 if (defined($Opts{Exclude}) && $candidate =~ $Opts{Exclude});
for my $excluded_doc (@{$Opts{Exclude_Docs}}) {
return 0 if ($candidate =~ $excluded_doc);
}
for my $base (@{$Opts{Base_Locations}}) {
my $rel = $candidate->rel($base);
next if ($candidate eq $rel); # Relative path not possible?
next if ($rel =~ m|^(\.\.)?/|); # Relative path upwards?
return 1;
}
return 0; # We always have at least one base location, but none matched.
}
#################################
# Check for content type match. #
#################################
sub is_content_type ($$)
{
my ($candidate, $type) = @_;
return 0 unless ($candidate && $type);
my @v = HTTP::Headers::Util::split_header_words($candidate);
return scalar(@v) ? $type eq lc($v[0]->[0]) : 0;
}
##################################################
# Check whether a URI has already been processed #
##################################################
sub already_processed (\$\$)
{
my ($uri, $referer) = @_;
# Don't be verbose for that part...
my $summary_value = $Opts{Summary_Only};
$Opts{Summary_Only} = 1;
# Do a GET: if it fails, we stop, if not, the results are cached
my $response = &get_document('GET', $uri, 1, undef, $referer);
# ... but just for that part
$Opts{Summary_Only} = $summary_value;
# Can we process the resource?
return -1 if defined($response->{Stop});
# Have we already processed it?
return 1 if defined($processed{$response->{absolute_uri}->as_string()});
# It's not processed yet and it is processable: return 0
return 0;
}
############################
# Get the content of a URI #
############################
sub get_uri ($\$;\$$\%$$$$)
{
# Here we have a lot of extra parameters in order not to lose information
# if the function is called several times (401's)
my ($method, $uri, $referer, $start, $redirects,
$code, $realm, $message, $auth
) = @_;
# $method contains the method used
# $uri object contains the target of the request
# $referer is the URI object of the referring document
# $start is a timestamp (not defined the first time the function is called)
# $redirects is a map of redirects
# $code is the first HTTP return code
# $realm is the realm of the request
# $message is the HTTP message received
# $auth equals 1 if we want to send out authentication information
# For timing purposes
$start = &get_timestamp() unless defined($start);
# Prepare the query
# Do we want printouts of progress?
my $verbose_progress =
!($Opts{Summary_Only} || (!$doc_count && $Opts{HTML}));
&hprintf("%s %s ", $method, $uri) if $verbose_progress;
my $request = HTTP::Request->new($method, $uri);
$request->header('Accept-Language' => $Opts{Accept_Language})
if $Opts{Accept_Language};
$request->header('Accept', $Accept);
$request->accept_decodable();
# Are we providing authentication info?
if ($auth && $request->url()->host() =~ $Opts{Trusted}) {
if (defined($ENV{HTTP_AUTHORIZATION})) {
$request->header(Authorization => $ENV{HTTP_AUTHORIZATION});
}
elsif (defined($Opts{User}) && defined($Opts{Password})) {
$request->authorization_basic($Opts{User}, $Opts{Password});
}
}
# Tell the user agent if we want progress reports for redirects or not.
$ua->redirect_progress_callback(sub { &hprintf("\n-> %s %s ", @_); })
if $verbose_progress;
# Set referer
$request->referer($referer) if (!$Opts{No_Referer} && $referer);
# Telling caches in the middle we want a fresh copy (Bug 4998)
$request->header(Cache_Control => "max-age=0");
# Do the query
my $response = $ua->request($request);
# Get the results
# Record the very first response
if (!defined($code)) {
($code, $message) = delete(@$ua{qw(FirstResponse FirstMessage)});
}
# Authentication requested?
if ($response->code() == 401 &&
!defined($auth) &&
(defined($ENV{HTTP_AUTHORIZATION}) ||
(defined($Opts{User}) && defined($Opts{Password})))
)
{
# Set host as trusted domain unless we already have one.
if (!$Opts{Trusted}) {
my $re = sprintf('^%s$', quotemeta($response->base()->host()));
$Opts{Trusted} = qr/$re/io;
}
# Deal with authentication and avoid loops
if (!defined($realm) &&
$response->www_authenticate() =~ /Basic realm=\"([^\"]+)\"/)
{
$realm = $1;
}
print "\n" if $verbose_progress;
return &get_uri($method, $response->request()->url(),
$referer, $start, $redirects, $code, $realm, $message, 1);
}
# @@@ subtract robot delay from the "fetched in" time?
&hprintf(" fetched in %s seconds\n", &time_diff($start, &get_timestamp()))
if $verbose_progress;
$response->{IsCss} =
is_content_type($response->content_type(), "text/css");
$response->{Realm} = $realm if defined($realm);
return $response;
}
#########################################
# Record the results of an HTTP request #
#########################################
sub record_results (\$$$$)
{
my ($uri, $method, $response, $referer) = @_;
$results{$uri}{referer} = $referer;
$results{$uri}{response} = $response;
$results{$uri}{method} = $method;
$results{$uri}{location}{code} = $response->code();
$results{$uri}{location}{code} = RC_ROBOTS_TXT()
if ($results{$uri}{location}{code} == 403 &&
$response->message() =~ /Forbidden by robots\.txt/);
$results{$uri}{location}{code} = RC_IP_DISALLOWED()
if ($results{$uri}{location}{code} == 403 &&
$response->message() =~ /non-public IP/);
$results{$uri}{location}{code} = RC_DNS_ERROR()
if ($results{$uri}{location}{code} == 500 &&
$response->message() =~ /Bad hostname '[^\']*'/);
$results{$uri}{location}{code} = RC_PROTOCOL_DISALLOWED()
if ($results{$uri}{location}{code} == 500 &&
$response->message() =~ /Access to '[^\']*' URIs has been disabled/);
$results{$uri}{location}{type} = $response->header('Content-type');
$results{$uri}{location}{display} = $results{$uri}{location}{code};
# Rewind, check for the original code and message.
for (my $tmp = $response->previous(); $tmp; $tmp = $tmp->previous()) {
$results{$uri}{location}{orig} = $tmp->code();
$results{$uri}{location}{orig_message} = $tmp->message() ||
'(no message)';
}
$results{$uri}{location}{success} = $response->is_success();
# If a suppressed broken link, fill the data structure like a typical success.
# print STDERR "success? " . $results{$uri}{location}{success} . ": $uri\n";
if (!$results{$uri}{location}{success}) {
my $code = $results{$uri}{location}{code};
my $match = grep { $_ eq "$code:$uri" } @{$Opts{Suppress_Broken}};
if ($match) {
$results{$uri}{location}{success} = 1;
$results{$uri}{location}{code} = 100;
$results{$uri}{location}{display} = 100;
}
}
# Stores the authentication information
if (defined($response->{Realm})) {
$results{$uri}{location}{realm} = $response->{Realm};
$results{$uri}{location}{display} = 401 unless $Opts{Hide_Same_Realm};
}
# What type of broken link is it? (stored in {record} - the {display}
# information is just for visual use only)
if ($results{$uri}{location}{display} == 401 &&
$results{$uri}{location}{code} == 404)
{
$results{$uri}{location}{record} = 404;
}
else {
$results{$uri}{location}{record} = $results{$uri}{location}{display};
}
# Did it fail?
$results{$uri}{location}{message} = $response->message() || '(no message)';
if (!$results{$uri}{location}{success}) {
&hprintf(
"Error: %d %s\n",
$results{$uri}{location}{code},
$results{$uri}{location}{message}
) if ($Opts{Verbose});
}
return;
}
####################
# Parse a document #
####################
sub parse_document (\$\$$$$)
{
my ($uri, $base_uri, $response, $links, $rec_needs_links) = @_;
print("parse_document($uri, $base_uri, ..., $links, $rec_needs_links)\n")
if $Opts{Verbose};
my $p;
if (defined($results{$uri}{parsing})) {
# We have already done the job. Woohoo!
$p->{base} = $results{$uri}{parsing}{base};
$p->{Anchors} = $results{$uri}{parsing}{Anchors};
$p->{Links} = $results{$uri}{parsing}{Links};
return $p;
}
$p = W3C::LinkChecker->new();
$p->{base} = $base_uri;
my $stype = $response->header("Content-Style-Type");
$p->{style_is_css} = !$stype || is_content_type($stype, "text/css");
my $start;
if (!$Opts{Summary_Only}) {
$start = &get_timestamp();
print("Parsing...\n");
}
# Content-Encoding etc already decoded in get_document().
my $docref = $response->content_ref();
# Count lines beforehand if needed (for progress indicator, or CSS while
# we don't get any line context out of the parser). In case of HTML, the
# actual final number of lines processed shown is populated by our
# end_document handler.
$p->{Total} = ($$docref =~ tr/\n//)
if ($response->{IsCss} || $Opts{Progress});
# We only look for anchors if we are not interested in the links
# obviously, or if we are running a recursive checking because we
# might need this information later
$p->{only_anchors} = !($links || $rec_needs_links);
if ($response->{IsCss}) {
# Parse as CSS
$p->parse_css($$docref, LINE_UNKNOWN());
}
else {
# Parse as HTML
# Transform <?xml:stylesheet ...?> into <xml:stylesheet ...> for parsing
# Processing instructions are not parsed by process, but in this case
# it should be. It's expensive, it's horrible, but it's the easiest way
# for right now.
$$docref =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/
unless $p->{only_anchors};
$p->xml_mode(1) if ($response->content_type() =~ /\+xml$/);
$p->parse($$docref)->eof();
}
$response->content("");
if (!$Opts{Summary_Only}) {
my $stop = &get_timestamp();
print "\r" if $Opts{Progress};
&hprintf(" done (%d lines in %s seconds).\n",
$p->{Total}, &time_diff($start, $stop));
}
# Save the results before exiting
$results{$uri}{parsing}{base} = $p->{base};
$results{$uri}{parsing}{Anchors} = $p->{Anchors};
$results{$uri}{parsing}{Links} = $p->{Links};
return $p;
}
####################################
# Constructor for W3C::LinkChecker #
####################################
sub new
{
my $p = HTML::Parser::new(@_, api_version => 3);
$p->utf8_mode(1);
# Set up handlers
$p->handler(start => 'start', 'self, tagname, attr, line');
$p->handler(end => 'end', 'self, tagname, line');
$p->handler(text => 'text', 'self, dtext, line');
$p->handler(
declaration => sub {
my $self = shift;
$self->declaration(substr($_[0], 2, -1));
},
'self, text, line'
);
$p->handler(end_document => 'end_document', 'self, line');
if ($Opts{Progress}) {
$p->handler(default => 'parse_progress', 'self, line');
$p->{last_percentage} = 0;
}
# Check <a [..] name="...">?
$p->{check_name} = 1;
# Check <[..] id="..">?
$p->{check_id} = 1;
# Don't interpret comment loosely
$p->strict_comment(1);
return $p;
}
#################################################
# Record or return the doctype of the document #
#################################################
sub doctype
{
my ($self, $dc) = @_;
return $self->{doctype} unless $dc;
$_ = $self->{doctype} = $dc;
# What to look for depending on the doctype
# Check for <a name="...">?
$self->{check_name} = 0
if m%^-//(W3C|WAPFORUM)//DTD XHTML (Basic|Mobile) %;
# Check for <* id="...">?
$self->{check_id} = 0
if (m%^-//IETF//DTD HTML [23]\.0//% || m%^-//W3C//DTD HTML 3\.2//%);
# Enable XML mode (XHTML, XHTML Mobile, XHTML-Print, XHTML+RDFa, ...)
$self->xml_mode(1) if (m%^-//(W3C|WAPFORUM)//DTD XHTML[ \-\+]%);
return;
}
###################################
# Print parse progress indication #
###################################
sub parse_progress
{
my ($self, $line) = @_;
return unless defined($line) && $line > 0 && $self->{Total} > 0;
my $percentage = int($line / $self->{Total} * 100);
if ($percentage != $self->{last_percentage}) {
printf("\r%4d%%", $percentage);
$self->{last_percentage} = $percentage;
}
return;
}
#############################
# Extraction of the anchors #
#############################
sub get_anchor
{
my ($self, $tag, $attr) = @_;
my $anchor = $self->{check_id} ? $attr->{id} : undef;
if ($self->{check_name} && ($tag eq 'a')) {
# @@@@ In XHTML, <a name="foo" id="foo"> is mandatory
# Force an error if it's not the case (or if id's and name's values
# are different)
# If id is defined, name if defined must have the same value
$anchor ||= $attr->{name};
}
return $anchor;
}
#############################
# W3C::LinkChecker handlers #
#############################
sub add_link
{
my ($self, $uri, $base, $line) = @_;
if (defined($uri)) {
# Remove repeated slashes after the . or .. in relative links, to avoid
# duplicated checking or infinite recursion.
$uri =~ s|^(\.\.?/)/+|$1|o;
$uri = Encode::decode_utf8($uri);
$uri = URI->new_abs($uri, $base) if defined($base);
$self->{Links}{$uri}{defined($line) ? $line : LINE_UNKNOWN()}++;
}
return;
}
sub start
{
my ($self, $tag, $attr, $line) = @_;
$line = LINE_UNKNOWN() unless defined($line);
# Anchors
my $anchor = $self->get_anchor($tag, $attr);
$self->{Anchors}{$anchor}{$line}++ if defined($anchor);
# Links
if (!$self->{only_anchors}) {
my $tag_local_base = undef;
# Special case: base/@href
# @@@TODO: The reason for handling <base href> ourselves is that LWP's
# head parsing magic fails at least for responses that have
# Content-Encodings: https://rt.cpan.org/Ticket/Display.html?id=54361
if ($tag eq 'base') {
# Ignore <base> with missing/empty href.
$self->{base} = $attr->{href}
if (defined($attr->{href}) && length($attr->{href}));
}
# Special case: meta[@http-equiv=Refresh]/@content
elsif ($tag eq 'meta') {
if ($attr->{'http-equiv'} &&
lc($attr->{'http-equiv'}) eq 'refresh')
{
my $content = $attr->{content};
if ($content && $content =~ /.*?;\s*(?:url=)?(.+)/i) {
$self->add_link($1, undef, $line);
}
}
}
# Special case: tags that have "local base"
elsif ($tag eq 'applet' || $tag eq 'object') {
if (my $codebase = $attr->{codebase}) {
# Applet codebases are directories, append trailing slash
# if it's not there so that new_abs does the right thing.
$codebase .= "/" if ($tag eq 'applet' && $codebase !~ m|/$|);
# TODO: HTML 4 spec says applet/@codebase may only point to
# subdirs of the directory containing the current document.
# Should we do something about that?
$tag_local_base = URI->new_abs($codebase, $self->{base});
}
}
# Link attributes:
if (my $link_attrs = LINK_ATTRS()->{$tag}) {
for my $la (@$link_attrs) {
$self->add_link($attr->{$la}, $tag_local_base, $line);
}
}
# List of links attributes:
if (my $link_attrs = LINK_LIST_ATTRS()->{$tag}) {
my ($sep, $attrs) = @$link_attrs;
for my $la (@$attrs) {
if (defined(my $value = $attr->{$la})) {
for my $link (split($sep, $value)) {
$self->add_link($link, $tag_local_base, $line);
}
}
}
}
# Inline CSS:
delete $self->{csstext};
if ($tag eq 'style') {
$self->{csstext} = ''
if ((!$attr->{type} && $self->{style_is_css}) ||
is_content_type($attr->{type}, "text/css"));
}
elsif ($self->{style_is_css} && (my $style = $attr->{style})) {
$style = CSS::DOM::Style::parse($style);
$self->parse_style($style, $line);
}
}
$self->parse_progress($line) if $Opts{Progress};
return;
}
sub end
{
my ($self, $tagname, $line) = @_;
$self->parse_css($self->{csstext}, $line) if ($tagname eq 'style');
delete $self->{csstext};
$self->parse_progress($line) if $Opts{Progress};
return;
}
sub parse_css
{
my ($self, $css, $line) = @_;
return unless $css;
my $sheet = CSS::DOM::parse($css);
for my $rule (@{$sheet->cssRules()}) {
if ($rule->type() == IMPORT_RULE()) {
$self->add_link($rule->href(), $self->{base}, $line);
}
elsif ($rule->type == STYLE_RULE()) {
$self->parse_style($rule->style(), $line);
}
}
return;
}
sub parse_style
{
my ($self, $style, $line) = @_;
return unless $style;
for (my $i = 0, my $len = $style->length(); $i < $len; $i++) {
my $prop = $style->item($i);
my $val = $style->getPropertyValue($prop);
while ($val =~ /$CssUrl/go) {
my $url = CSS::DOM::Util::unescape($2);
$self->add_link($url, $self->{base}, $line);
}
}
return;
}
sub declaration
{
my ($self, $text, $line) = @_;
# Extract the doctype
my @declaration = split(/\s+/, $text, 4);
if ($#declaration >= 3 &&
$declaration[0] eq 'DOCTYPE' &&
lc($declaration[1]) eq 'html')
{
# Parse the doctype declaration
if ($text =~
m/^DOCTYPE\s+html\s+(?:PUBLIC\s+"([^"]+)"|SYSTEM)(\s+"([^"]+)")?\s*$/i
)
{
# Store the doctype
$self->doctype($1) if $1;
# If there is a link to the DTD, record it
$self->add_link($3, undef, $line)
if (!$self->{only_anchors} && $3);
}
}
$self->text($text) unless $self->{only_anchors};
return;
}
sub text
{
my ($self, $text, $line) = @_;
$self->{csstext} .= $text if defined($self->{csstext});
$self->parse_progress($line) if $Opts{Progress};
return;
}
sub end_document
{
my ($self, $line) = @_;
$self->{Total} = $line;
delete $self->{csstext};
return;
}
################################
# Check the validity of a link #
################################
sub check_validity (\$\$$\%\%)
{
my ($referer, $uri, $want_links, $links, $redirects) = @_;
# $referer is the URI object of the document checked
# $uri is the URI object of the target that we are verifying
# $want_links is true if we're interested in links in the target doc
# $links is a hash of the links in the documents checked
# $redirects is a map of the redirects encountered
# Get the document with the appropriate method: GET if there are
# fragments to check or links are wanted, HEAD is enough otherwise.
my $fragments = $links->{$uri}{fragments} || {};
my $method = ($want_links || %$fragments) ? 'GET' : 'HEAD';
my $response;
my $being_processed = 0;
if (!defined($results{$uri}) ||
($method eq 'GET' && $results{$uri}{method} eq 'HEAD'))
{
$being_processed = 1;
$response = &get_uri($method, $uri, $referer);
# Get the information back from get_uri()
&record_results($uri, $method, $response, $referer);
# Record the redirects
&record_redirects($redirects, $response);
}
elsif (!($Opts{Summary_Only} || (!$doc_count && $Opts{HTML}))) {
my $ref = $results{$uri}{referer};
&hprintf("Already checked%s\n", $ref ? ", referrer $ref" : ".");
}
# We got the response of the HTTP request. Stop here if it was a HEAD.
return if ($method eq 'HEAD');
# There are fragments. Parse the document.
my $p;
if ($being_processed) {
# Can we really parse the document?
if (!defined($results{$uri}{location}{type}) ||
$results{$uri}{location}{type} !~ $ContentTypes)
{
&hprintf("Can't check content: Content-Type for '%s' is '%s'.\n",
$uri, $results{$uri}{location}{type})
if ($Opts{Verbose});
$response->content("");
return;
}
# Do it then
if (my $error = decode_content($response)) {
&hprintf("%s\n.", $error);
}
# @@@TODO: this isn't the best thing to do if a decode error occurred
$p =
&parse_document($uri, $response->base(), $response, 0,
$want_links);
}
else {
# We already had the information
$p->{Anchors} = $results{$uri}{parsing}{Anchors};
}
# Check that the fragments exist
for my $fragment (keys %$fragments) {
if (defined($p->{Anchors}{$fragment}) ||
&escape_match($fragment, $p->{Anchors}) ||
grep { $_ eq "$uri#$fragment" } @{$Opts{Suppress_Fragment}})
{
$results{$uri}{fragments}{$fragment} = 1;
}
else {
$results{$uri}{fragments}{$fragment} = 0;
}
}
return;
}
sub escape_match ($\%)
{
my ($a, $hash) = (URI::Escape::uri_unescape($_[0]), $_[1]);
for my $b (keys %$hash) {
return 1 if ($a eq URI::Escape::uri_unescape($b));
}
return 0;
}
##########################
# Ask for authentication #
##########################
sub authentication ($;$$$$)
{
my ($response, $cookie, $params, $check_num, $is_start) = @_;
my $realm = '';
if ($response->www_authenticate() =~ /Basic realm=\"([^\"]+)\"/) {
$realm = $1;
}
if ($Opts{Command_Line}) {
printf STDERR <<'EOF', $response->request()->url(), $realm;
Authentication is required for %s.
The realm is "%s".
Use the -u and -p options to specify a username and password and the -d option
to specify trusted domains.
EOF
}
else {
printf(
"Status: 401 Authorization Required\nWWW-Authenticate: %s\n%sConnection: close\nContent-Language: en\nContent-Type: text/html; charset=utf-8\n\n",
$response->www_authenticate(),
$cookie ? "Set-Cookie: $cookie\n" : "",
);
printf(
"%s
<html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">
<head>
<title>W3C Link Checker: 401 Authorization Required</title>
%s</head>
<body>", $DocType, $Head
);
&banner(': 401 Authorization Required');
&print_form($params, $cookie, $check_num) if $is_start;
printf(
'<p>
%s
You need "%s" access to <a href="%s">%s</a> to perform link checking.<br />
',
&status_icon(401),
&encode($realm), (&encode($response->request()->url())) x 2
);
my $host = $response->request()->url()->host();
if ($Opts{Trusted} && $host !~ $Opts{Trusted}) {
printf <<'EOF', &encode($Opts{Trusted}), &encode($host);
This service has been configured to send authentication only to hostnames
matching the regular expression <code>%s</code>, but the hostname
<code>%s</code> does not match it.
EOF
}
print "</p>\n";
}
return;
}
##################
# Get statistics #
##################
sub get_timestamp ()
{
return pack('LL', Time::HiRes::gettimeofday());
}
sub time_diff ($$)
{
my @start = unpack('LL', $_[0]);
my @stop = unpack('LL', $_[1]);
for ($start[1], $stop[1]) {
$_ /= 1_000_000;
}
return (sprintf("%.2f", ($stop[0] + $stop[1]) - ($start[0] + $start[1])));
}
########################
# Handle the redirects #
########################
# Record the redirects in a hash
sub record_redirects (\%$)
{
my ($redirects, $response) = @_;
for (my $prev = $response->previous(); $prev; $prev = $prev->previous()) {
# Check for redirect match.
my $from = $prev->request()->url();
my $to = $response->request()->url(); # same on every loop iteration
my $from_to = $from . '->' . $to;
my $match = grep { $_ eq $from_to } @{$Opts{Suppress_Redirect}};
# print STDERR "Result $match of redirect checking $from_to\n";
if ($match) { next; }
$match = grep { $from_to =~ /$_/ } @{$Opts{Suppress_Redirect_Prefix}};
# print STDERR "Result $match of regexp checking $from_to\n";
if ($match) { next; }
my $c = $prev->code();
if ($Opts{Suppress_Temp_Redirects} && ($c == 307 || $c == 302)) {
next;
}
$redirects->{$prev->request()->url()} = $response->request()->url();
}
return;
}
# Determine if a request is redirected
sub is_redirected ($%)
{
my ($uri, %redirects) = @_;
return (defined($redirects{$uri}));
}
# Get a list of redirects for a URI
sub get_redirects ($%)
{
my ($uri, %redirects) = @_;
my @history = ($uri);
my %seen = ($uri => 1); # for tracking redirect loops
my $loop = 0;
while ($redirects{$uri}) {
$uri = $redirects{$uri};
push(@history, $uri);
if ($seen{$uri}) {
$loop = 1;
last;
}
else {
$seen{$uri}++;
}
}
return ($loop, @history);
}
####################################################
# Tool for sorting the unique elements of an array #
####################################################
sub sort_unique (@)
{
my %saw;
@saw{@_} = ();
return (sort { $a <=> $b } keys %saw);
}
#####################
# Print the results #
#####################
sub line_number ($)
{
my $line = shift;
return $line if ($line >= 0);
return "(N/A)";
}
sub http_rc ($)
{
my $rc = shift;
return $rc if ($rc >= 0);
return "(N/A)";
}
# returns true if the given code is informational
sub informational ($)
{
my $rc = shift;
return $rc == RC_ROBOTS_TXT() ||
$rc == RC_IP_DISALLOWED() ||
$rc == RC_PROTOCOL_DISALLOWED();
}
sub anchors_summary (\%\%)
{
my ($anchors, $errors) = @_;
# Number of anchors found.
my $n = scalar(keys(%$anchors));
if (!$Opts{Quiet}) {
if ($Opts{HTML}) {
print("<h3>Anchors</h3>\n<p>");
}
else {
print("Anchors\n\n");
}
&hprintf("Found %d anchor%s.\n", $n, ($n == 1) ? '' : 's');
print("</p>\n") if $Opts{HTML};
}
# List of the duplicates, if any.
my @errors = keys %{$errors};
if (!scalar(@errors)) {
print("<p>Valid anchors!</p>\n")
if (!$Opts{Quiet} && $Opts{HTML} && $n);
return;
}
undef $n;
print_doc_header();
print('<p>') if $Opts{HTML};
print('List of duplicate and empty anchors');
print <<'EOF' if $Opts{HTML};
</p>
<table class="report" border="1" summary="List of duplicate and empty anchors.">
<thead>
<tr>
<th>Anchor</th>
<th>Lines</th>
</tr>
</thead>
<tbody>
EOF
print("\n");
for my $anchor (@errors) {
my $format;
my @unique = &sort_unique(
map { line_number($_) }
keys %{$anchors->{$anchor}}
);
if ($Opts{HTML}) {
$format = "<tr><td class=\"broken\">%s</td><td>%s</td></tr>\n";
}
else {
my $s = (scalar(@unique) > 1) ? 's' : '';
$format = "\t%s\tLine$s: %s\n";
}
printf($format,
&encode(length($anchor) ? $anchor : 'Empty anchor'),
join(', ', @unique));
}
print("</tbody>\n</table>\n") if $Opts{HTML};
return;
}
sub show_link_report (\%\%\%\%\@;$\%)
{
my ($links, $results, $broken, $redirects, $urls, $codes, $todo) = @_;
print("\n<dl class=\"report\">") if $Opts{HTML};
print("\n") if (!$Opts{Quiet});
# Process each URL
my ($c, $previous_c);
for my $u (@$urls) {
my @fragments = keys %{$broken->{$u}{fragments}};
# Did we get a redirect?
my $redirected = &is_redirected($u, %$redirects);
# List of lines
my @total_lines;
push(@total_lines, keys(%{$links->{$u}{location}}));
for my $f (@fragments) {
push(@total_lines, keys(%{$links->{$u}{fragments}{$f}}))
unless ($f eq $u && defined($links->{$u}{$u}{LINE_UNKNOWN()}));
}
my ($redirect_loop, @redirects_urls) = get_redirects($u, %$redirects);
my $currloc = $results->{$u}{location};
# Error type
$c = &code_shown($u, $results);
# What to do
my $whattodo;
my $redirect_too;
if ($todo) {
if ($u =~ m/^javascript:/) {
if ($Opts{HTML}) {
$whattodo =
'You must change this link: people using a browser without JavaScript support
will <em>not</em> be able to follow this link. See the
<a href="http://www.w3.org/TR/WAI-WEBCONTENT/#tech-scripts">Web Content
Accessibility Guidelines on the use of scripting on the Web</a> and the
<a href="http://www.w3.org/TR/WCAG10-HTML-TECHS/#directly-accessible-scripts">techniques
on how to solve this</a>.';
}
else {
$whattodo =
'Change this link: people using a browser without JavaScript support will not be able to follow this link.';
}
}
elsif ($c == RC_ROBOTS_TXT()) {
$whattodo =
'The link was not checked due to robots exclusion ' .
'rules. Check the link manually.';
}
elsif ($redirect_loop) {
$whattodo =
'Retrieving the URI results in a redirect loop, that should be '
. 'fixed. Examine the redirect sequence to see where the loop '
. 'occurs.';
}
else {
$whattodo = $todo->{$c};
}
}
elsif (defined($redirects{$u})) {
# Redirects
if (($u . '/') eq $redirects{$u}) {
$whattodo =
'The link is missing a trailing slash, and caused a redirect. Adding the trailing slash would speed up browsing.';
}
elsif ($c == 307 || $c == 302) {
$whattodo =
'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.';
}
elsif ($c == 301) {
$whattodo =
'This is a permanent redirect. The link should be updated.';
}
}
my @unique = &sort_unique(map { line_number($_) } @total_lines);
my $lines_list = join(', ', @unique);
my $s = (scalar(@unique) > 1) ? 's' : '';
undef @unique;
my @http_codes = ($currloc->{code});
unshift(@http_codes, $currloc->{orig}) if $currloc->{orig};
@http_codes = map { http_rc($_) } @http_codes;
if ($Opts{HTML}) {
# Style stuff
my $idref = '';
if ($codes && (!defined($previous_c) || ($c != $previous_c))) {
$idref = ' id="d' . $doc_count . 'code_' . $c . '"';
$previous_c = $c;
}
# Main info
for (@redirects_urls) {
$_ = &show_url($_);
}
# HTTP message
my $http_message;
if ($currloc->{message}) {
$http_message = &encode($currloc->{message});
if ($c == 404 || $c == 500) {
$http_message =
'<span class="broken">' . $http_message . '</span>';
}
}
my $redirmsg =
$redirect_loop ? ' <em>redirect loop detected</em>' : '';
printf("
<dt%s>%s <span class='msg_loc'>Line%s: %s</span> %s</dt>
<dd class='responsecode'><strong>Status</strong>: %s %s %s</dd>
<dd class='message_explanation'><p>%s %s</p></dd>\n",
# Anchor for return codes
$idref,
# Color
&status_icon($c),
$s,
# List of lines
$lines_list,
# List of redirects
$redirected ?
join(' redirected to ', @redirects_urls) . $redirmsg :
&show_url($u),
# Realm
defined($currloc->{realm}) ?
sprintf('Realm: %s<br />', &encode($currloc->{realm})) :
'',
# HTTP original message
# defined($currloc->{orig_message})
# ? &encode($currloc->{orig_message}).
# ' <span title="redirected to">-&gt;</span> '
# : '',
# Response code chain
join(
' <span class="redirected_to" title="redirected to">-&gt;</span> ',
map { &encode($_) } @http_codes),
# HTTP final message
$http_message,
# What to do
$whattodo,
# Redirect too?
$redirect_too ?
sprintf(' <span %s>%s</span>',
&bgcolor(301), $redirect_too) :
'',
);
if ($#fragments >= 0) {
printf("<dd>Broken fragments: <ul>\n");
}
}
else {
my $redirmsg = $redirect_loop ? ' redirect loop detected' : '';
printf(
"\n%s\t%s\n Code: %s %s\n%s\n",
# List of redirects
$redirected ? join("\n-> ", @redirects_urls) . $redirmsg : $u,
# List of lines
$lines_list ? sprintf("\n%6s: %s", "Line$s", $lines_list) : '',
# Response code chain
join(' -> ', @http_codes),
# HTTP message
$currloc->{message} || '',
# What to do
wrap(' To do: ', ' ', $whattodo)
);
if ($#fragments >= 0) {
if ($currloc->{code} == 200) {
print("The following fragments need to be fixed:\n");
}
else {
print("Fragments:\n");
}
}
}
# Fragments
for my $f (@fragments) {
my @unique_lines =
&sort_unique(keys %{$links->{$u}{fragments}{$f}});
my $plural = (scalar(@unique_lines) > 1) ? 's' : '';
my $unique_lines = join(', ', @unique_lines);
if ($Opts{HTML}) {
printf("<li>%s<em>#%s</em> (line%s %s)</li>\n",
&encode($u), &encode($f), $plural, $unique_lines);
}
else {
printf("\t%-30s\tLine%s: %s\n", $f, $plural, $unique_lines);
}
}
print("</ul></dd>\n") if ($Opts{HTML} && scalar(@fragments));
}
# End of the table
print("</dl>\n") if $Opts{HTML};
return;
}
sub code_shown ($$)
{
my ($u, $results) = @_;
if ($results->{$u}{location}{record} == 200) {
return $results->{$u}{location}{orig} ||
$results->{$u}{location}{record};
}
else {
return $results->{$u}{location}{record};
}
}
sub links_summary (\%\%\%\%)
{
# Advice to fix the problems
my %todo = (
200 =>
'Some of the links to this resource point to broken URI fragments (such as index.html#fragment).',
300 =>
'This often happens when a typo in the link gets corrected automatically by the server. For the sake of performance, the link should be fixed.',
301 =>
'This is a permanent redirect. The link should be updated to point to the more recent URI.',
302 =>
'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.',
303 =>
'This rare status code points to a "See Other" resource. There is generally nothing to be done.',
307 =>
'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.',
400 =>
'This is usually the sign of a malformed URL that cannot be parsed by the server. Check the syntax of the link.',
401 =>
"The link is not public and the actual resource is only available behind authentication. If not already done, you could specify it.",
403 =>
'The link is forbidden! This needs fixing. Usual suspects: a missing index.html or Overview.html, or a missing ACL.',
404 =>
'The link is broken. Double-check that you have not made any typo, or mistake in copy-pasting. If the link points to a resource that no longer exists, you may want to remove or fix the link.',
405 =>
'The server does not allow HTTP HEAD requests, which prevents the Link Checker to check the link automatically. Check the link manually.',
406 =>
"The server isn't capable of responding according to the Accept* headers sent. This is likely to be a server-side issue with negotiation.",
407 => 'The link is a proxy, but requires Authentication.',
408 => 'The request timed out.',
410 => 'The resource is gone. You should remove this link.',
415 => 'The media type is not supported.',
500 => 'This is a server side problem. Check the URI.',
501 =>
'Could not check this link: method not implemented or scheme not supported.',
503 =>
'The server cannot service the request, for some unknown reason.',
# Non-HTTP codes:
RC_ROBOTS_TXT() => sprintf(
'The link was not checked due to %srobots exclusion rules%s. Check the link manually, and see also the link checker %sdocumentation on robots exclusion%s.',
$Opts{HTML} ? (
'<a href="http://www.robotstxt.org/robotstxt.html">', '</a>',
"<a href=\"$Cfg{Doc_URI}#bot\">", '</a>'
) : ('') x 4
),
RC_DNS_ERROR() =>
'The hostname could not be resolved. Check the link for typos.',
RC_IP_DISALLOWED() =>
sprintf(
'The link resolved to a %snon-public IP address%s, and this link checker instance has been configured to not access such addresses. This may be a real error or just a quirk of the name resolver configuration on the server where the link checker runs. Check the link manually, in particular its hostname/IP address.',
$Opts{HTML} ?
('<a href="http://www.ietf.org/rfc/rfc1918.txt">', '</a>') :
('') x 2),
RC_PROTOCOL_DISALLOWED() =>
'Accessing links with this URI scheme has been disabled in link checker.',
);
my %priority = (
410 => 1,
404 => 2,
403 => 5,
200 => 10,
300 => 15,
401 => 20
);
my ($links, $results, $broken, $redirects) = @_;
# List of the broken links
my @urls = keys %{$broken};
my @dir_redirect_urls = ();
if ($Opts{Redirects}) {
# Add the redirected URI's to the report
for my $l (keys %$redirects) {
next
unless (defined($results->{$l}) &&
defined($links->{$l}) &&
!defined($broken->{$l}));
# Check whether we have a "directory redirect"
# e.g. http://www.w3.org/TR -> http://www.w3.org/TR/
my ($redirect_loop, @redirects) = get_redirects($l, %$redirects);
if ($#redirects == 1) {
push(@dir_redirect_urls, $l);
next;
}
push(@urls, $l);
}
}
# Broken links and redirects
if ($#urls < 0) {
if (!$Opts{Quiet}) {
print_doc_header();
if ($Opts{HTML}) {
print "<h3>Links</h3>\n<p>Valid links!</p>\n";
}
else {
print "\nValid links.\n";
}
}
}
else {
print_doc_header();
print('<h3>') if $Opts{HTML};
print("\nList of broken links and other issues");
#print(' and redirects') if $Opts{Redirects};
# Sort the URI's by HTTP Code
my %code_summary;
my @idx;
for my $u (@urls) {
if (defined($results->{$u}{location}{record})) {
my $c = &code_shown($u, $results);
$code_summary{$c}++;
push(@idx, $c);
}
}
my @sorted = @urls[
sort {
defined($priority{$idx[$a]}) ?
defined($priority{$idx[$b]}) ?
$priority{$idx[$a]} <=> $priority{$idx[$b]} :
-1 :
defined($priority{$idx[$b]}) ? 1 :
$idx[$a] <=> $idx[$b]
} 0 .. $#idx
];
@urls = @sorted;
undef(@sorted);
undef(@idx);
if ($Opts{HTML}) {
# Print a summary
print <<'EOF';
</h3>
<p><em>There are issues with the URLs listed below. The table summarizes the
issues and suggested actions by HTTP response status code.</em></p>
<table class="report" border="1" summary="List of issues and suggested actions.">
<thead>
<tr>
<th>Code</th>
<th>Occurrences</th>
<th>What to do</th>
</tr>
</thead>
<tbody>
EOF
for my $code (sort(keys(%code_summary))) {
printf('<tr%s>', &bgcolor($code));
printf('<td><a href="#d%scode_%s">%s</a></td>',
$doc_count, $code, http_rc($code));
printf('<td>%s</td>', $code_summary{$code});
printf('<td>%s</td>', $todo{$code});
print "</tr>\n";
}
print "</tbody>\n</table>\n";
}
else {
print(':');
}
&show_link_report($links, $results, $broken, $redirects, \@urls, 1,
\%todo);
}
# Show directory redirects
if ($Opts{Dir_Redirects} && ($#dir_redirect_urls > -1)) {
print_doc_header();
print('<h3>') if $Opts{HTML};
print("\nList of redirects");
print(
"</h3>\n<p>The links below are not broken, but the document does not use the exact URL, and the links were redirected. It may be a good idea to link to the final location, for the sake of speed.</p>"
) if $Opts{HTML};
&show_link_report($links, $results, $broken, $redirects,
\@dir_redirect_urls);
}
return;
}
###############################################################################
################
# Global stats #
################
sub global_stats ()
{
my $stop = &get_timestamp();
my $n_docs =
($doc_count <= $Opts{Max_Documents}) ? $doc_count :
$Opts{Max_Documents};
return sprintf(
'Checked %d document%s in %s seconds.',
$n_docs,
($n_docs == 1) ? '' : 's',
&time_diff($timestamp, $stop)
);
}
##################
# HTML interface #
##################
sub html_header ($$)
{
my ($uri, $cookie) = @_;
my $title = defined($uri) ? $uri : '';
$title = ': ' . $title if ($title =~ /\S/);
my $headers = '';
if (!$Opts{Command_Line}) {
$headers .= "Cache-Control: no-cache\nPragma: no-cache\n" if $uri;
$headers .= "Content-Type: text/html; charset=utf-8\n";
$headers .= "Set-Cookie: $cookie\n" if $cookie;
# mod_perl 1.99_05 doesn't seem to like it if the "\n\n" isn't in the same
# print() statement as the last header
$headers .= "Content-Language: en\n\n";
}
my $onload = $uri ? '' :
' onload="if(document.getElementById){document.getElementById(\'uri_1\').focus()}"';
print $headers, $DocType, "
<html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">
<head>
<title>W3C Link Checker", &encode($title), "</title>
", $Head, "</head>
<body", $onload, '>';
&banner($title);
return;
}
sub banner ($)
{
my $tagline = "Check links and anchors in Web pages or full Web sites";
printf(
<<'EOF', URI->new_abs("../images/no_w3c.png", $Cfg{Doc_URI}), $tagline);
<div id="banner"><h1 id="title"><a href="http://www.w3.org/" title="W3C"><img alt="W3C" id="logo" src="%s" width="110" height="61" /></a>
<a href="checklink"><span>Link Checker</span></a></h1>
<p id="tagline">%s</p></div>
<div id="main">
EOF
return;
}
sub status_icon($)
{
my ($code) = @_;
my $icon_type;
my $r = HTTP::Response->new($code);
if ($r->is_success()) {
$icon_type = 'error'
; # if is success but reported, it's because of broken frags => error
}
elsif (&informational($code)) {
$icon_type = 'info';
}
elsif ($code == 300) {
$icon_type = 'info';
}
elsif ($code == 401) {
$icon_type = 'error';
}
elsif ($r->is_redirect()) {
$icon_type = 'warning';
}
elsif ($r->is_error()) {
$icon_type = 'error';
}
else {
$icon_type = 'error';
}
return sprintf('<span class="err_type"><img src="%s" alt="%s" /></span>',
URI->new_abs("../images/info_icons/$icon_type.png", $Cfg{Doc_URI}),
$icon_type);
}
sub bgcolor ($)
{
my ($code) = @_;
my $class;
my $r = HTTP::Response->new($code);
if ($r->is_success()) {
return '';
}
elsif ($code == RC_ROBOTS_TXT() || $code == RC_IP_DISALLOWED()) {
$class = 'dubious';
}
elsif ($code == 300) {
$class = 'multiple';
}
elsif ($code == 401) {
$class = 'unauthorized';
}
elsif ($r->is_redirect()) {
$class = 'redirect';
}
elsif ($r->is_error()) {
$class = 'broken';
}
else {
$class = 'broken';
}
return (' class="' . $class . '"');
}
sub show_url ($)
{
my ($url) = @_;
return sprintf('<a href="%s">%s</a>', (&encode($url)) x 2);
}
sub html_footer ()
{
printf("<p>%s</p>\n", &global_stats())
if ($doc_count > 0 && !$Opts{Quiet});
if (!$doc_count) {
print <<'EOF';
<div class="intro">
<p>
This Link Checker looks for issues in links, anchors and referenced objects
in a Web page, CSS style sheet, or recursively on a whole Web site. For
best results, it is recommended to first ensure that the documents checked
use Valid <a href="http://validator.w3.org/">(X)HTML Markup</a> and
<a href="http://jigsaw.w3.org/css-validator/">CSS</a>. The Link Checker is
part of the W3C's <a href="http://www.w3.org/QA/Tools/">validators and
Quality Web tools</a>.
</p>
</div>
EOF
}
printf(<<'EOF', $Cfg{Doc_URI}, $Cfg{Doc_URI}, $PACKAGE, $REVISION);
</div><!-- main -->
<ul class="navbar" id="menu">
<li><a href="%s" accesskey="3" title="Documentation for this Link Checker Service">Docs</a></li>
<li><a href="http://search.cpan.org/dist/W3C-LinkChecker/" accesskey="2" title="Download the source / Install this service">Download</a></li>
<li><a href="%s#csb" title="feedback: comments, suggestions and bugs" accesskey="4">Feedback</a></li>
<li><a href="http://validator.w3.org/" title="Validate your markup with the W3C Markup Validation Service">Validator</a></li>
</ul>
<div>
<address>
%s<br /> %s
</address>
</div>
</body>
</html>
EOF
return;
}
sub print_form (\%$$)
{
my ($params, $cookie, $check_num) = @_;
# Split params on \0, see CGI's docs on Vars()
while (my ($key, $value) = each(%$params)) {
if ($value) {
my @vals = split(/\0/, $value, 2);
$params->{$key} = $vals[0];
}
}
# Override undefined values from the cookie, if we got one.
my $valid_cookie = 0;
if ($cookie) {
my %cookie_values = $cookie->value();
if (!$cookie_values{clear})
{ # XXX no easy way to check if cookie expired?
$valid_cookie = 1;
while (my ($key, $value) = each(%cookie_values)) {
$params->{$key} = $value unless defined($params->{$key});
}
}
}
my $chk = ' checked="checked"';
$params->{hide_type} = 'all' unless $params->{hide_type};
my $requested_uri = &encode($params->{uri} || '');
my $sum = $params->{summary} ? $chk : '';
my $red = $params->{hide_redirects} ? $chk : '';
my $all = ($params->{hide_type} ne 'dir') ? $chk : '';
my $dir = $all ? '' : $chk;
my $acc = $params->{no_accept_language} ? $chk : '';
my $ref = $params->{no_referer} ? $chk : '';
my $rec = $params->{recursive} ? $chk : '';
my $dep = &encode($params->{depth} || '');
my $cookie_options = '';
if ($valid_cookie) {
$cookie_options = "
<label for=\"cookie1_$check_num\"><input type=\"radio\" id=\"cookie1_$check_num\" name=\"cookie\" value=\"nochanges\" checked=\"checked\" /> Don't modify saved options</label>
<label for=\"cookie2_$check_num\"><input type=\"radio\" id=\"cookie2_$check_num\" name=\"cookie\" value=\"set\" /> Save these options</label>
<label for=\"cookie3_$check_num\"><input type=\"radio\" id=\"cookie3_$check_num\" name=\"cookie\" value=\"clear\" /> Clear saved options</label>";
}
else {
$cookie_options = "
<label for=\"cookie_$check_num\"><input type=\"checkbox\" id=\"cookie_$check_num\" name=\"cookie\" value=\"set\" /> Save options in a <a href=\"http://www.w3.org/Protocols/rfc2109/rfc2109\">cookie</a></label>";
}
print "<form action=\"", $Opts{_Self_URI},
"\" method=\"get\" onsubmit=\"return uriOk($check_num)\" accept-charset=\"UTF-8\">
<p><label for=\"uri_$check_num\">Enter the address (<a href=\"http://www.w3.org/Addressing/\">URL</a>)
of a document that you would like to check:</label></p>
<p><input type=\"text\" size=\"50\" id=\"uri_$check_num\" name=\"uri\" value=\"",
$requested_uri, "\" /></p>
<fieldset id=\"extra_opt_uri_$check_num\" class=\"moreoptions\">
<legend class=\"toggletext\">More Options</legend>
<div class=\"options\">
<p>
<label for=\"summary_$check_num\"><input type=\"checkbox\" id=\"summary_$check_num\" name=\"summary\" value=\"on\"",
$sum, " /> Summary only</label>
<br />
<label for=\"hide_redirects_$check_num\"><input type=\"checkbox\" id=\"hide_redirects_$check_num\" name=\"hide_redirects\" value=\"on\"",
$red,
" /> Hide <a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html#sec10.3\">redirects</a>:</label>
<label for=\"hide_type_all_$check_num\"><input type=\"radio\" id=\"hide_type_all_$check_num\" name=\"hide_type\" value=\"all\"",
$all, " /> all</label>
<label for=\"hide_type_dir_$check_num\"><input type=\"radio\" id=\"hide_type_dir_$check_num\" name=\"hide_type\" value=\"dir\"",
$dir, " /> for directories only</label>
<br />
<label for=\"no_accept_language_$check_num\"><input type=\"checkbox\" id=\"no_accept_language_$check_num\" name=\"no_accept_language\" value=\"on\"",
$acc,
" /> Don't send the <tt><a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4\">Accept-Language</a></tt> header</label>
<br />
<label for=\"no_referer_$check_num\"><input type=\"checkbox\" id=\"no_referer_$check_num\" name=\"no_referer\" value=\"on\"",
$ref,
" /> Don't send the <tt><a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.36\">Referer</a></tt> header</label>
<br />
<label title=\"Check linked documents recursively (maximum: ",
$Opts{Max_Documents},
" documents)\" for=\"recursive_$check_num\"><input type=\"checkbox\" id=\"recursive_$check_num\" name=\"recursive\" value=\"on\"",
$rec, " /> Check linked documents recursively</label>,
<label title=\"Depth of the recursion (-1 is the default and means unlimited)\" for=\"depth_$check_num\">recursion depth: <input type=\"text\" size=\"3\" maxlength=\"3\" id=\"depth_$check_num\" name=\"depth\" value=\"",
$dep, "\" /></label>
<br /><br />", $cookie_options, "
</p>
</div>
</fieldset>
<p class=\"submit_button\"><input type=\"submit\" name=\"check\" value=\"Check\" /></p>
</form>
<div class=\"intro\" id=\"don_program\"></div>
<script type=\"text/javascript\" src=\"http://www.w3.org/QA/Tools/don_prog.js\"></script>
";
return;
}
sub encode (@)
{
return $Opts{HTML} ? HTML::Entities::encode(@_) : @_;
}
sub hprintf (@)
{
print_doc_header();
if (!$Opts{HTML}) {
# can have undef values here; avoid useless warning. E.g.,
# Error: -1 Forbidden by robots.txt
# Use of uninitialized value $_[2] in printf at /usr/local/bin/checklink line 3245.
# and
# Error: 404 File `/u/karl/gnu/src/akarl/doc/dejagnu.html' does not exist
# Use of uninitialized value $_[2] in printf at /usr/local/bin/checklink line 3245.
my @args = ();
for my $a (@_) {
push (@args, defined $a ? $a : ""),
}
printf(@args);
}
else {
print HTML::Entities::encode(sprintf($_[0], @_[1 .. @_ - 1]));
}
return;
}
# Print the document header, if it hasn't been printed already.
# This is invoked before most other output operations, in order
# to enable quiet processing that doesn't clutter the output with
# "Processing..." messages when nothing else will be reported.
sub print_doc_header ()
{
if (defined($doc_header)) {
print $doc_header;
undef($doc_header);
}
}
# Local Variables:
# mode: perl
# indent-tabs-mode: nil
# cperl-indent-level: 4
# cperl-continued-statement-offset: 4
# cperl-brace-offset: -4
# perl-indent-level: 4
# End:
# ex: ts=4 sw=4 et