#!/usr/bin/perl
#
# by Thomas A. Alspaugh
#
# Licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 2.5 License
# http://creativecommons.org/licenses/by-nc-sa/2.5/
#
use strict;
use warnings;
sub help {
my $cmdname = $0;
$cmdname =~ s:.*/::;
print STDERR "$cmdname -[/hv] file ...\n";
print STDERR "Check HTML files.\n";
print STDERR " -help Show this help.\n";
print STDERR " -/ Require end tags or empty-element tags.\n";
print STDERR " -H List the absolute destinations linked to.\n";
print STDERR " -h List the relative destinations linked to.\n";
print STDERR " -# List the anchors defined.\n";
print STDERR " -q Quiet for correct files\n";
print STDERR " (otherwise each file name is echoed.)\n";
print STDERR " -v Verbose output.\n";
print STDERR "What is checked:\n";
print STDERR " elements and character entities are valid (HTML4.01)\n";
print STDERR " start/end elements balance\n";
print STDERR " no links to undefined same-file anchors\n";
print STDERR " no links to nonexistent local files or anchors in them\n";
print STDERR " no links to nonexistent local directory index files\n";
print STDERR " or anchors in them\n";
print STDERR " no img elements with nonexistent local sources\n";
print STDERR " no open comments\n";
print STDERR " no closing tags with attributes\n";
print STDERR " no -- in comments\n";
}
# Bad character `Ę' (looks like accented E)
if (! scalar @ARGV) {
&help;
exit 0;
}
my $HOUT = *STDOUT;
my $anchor = "";
my $arg = "";
my $leaveCrumbs = "y";
my $elt = "";
my $endTagsRequired = "";
my $fname = "";
my $fragment = "";
my $fDirSlash = "";
my $href = "";
my %hrefs = ();
my %ids = ();
my $lineno = 0;
my $linenoComment = 0;
my $pathname = "";
my $showAnchors = "";
my $showLocalRefs = "";
my $showNonLocalRefs = "";
my $tag = "";
my $tagName = "";
my $tagRest = "";
my $v = "";
my %charEnts = ();
my %htmlTags = (); # "/" signifies the tag must be closed.
&setup;
foreach $arg (@ARGV) {
if ($arg =~ /^-+help/) {
&help;
exit 0;
}
elsif ($arg =~ /^-+q$/) {
$leaveCrumbs = "";
}
elsif ($arg =~ /^-+v$/) {
$v = "y";
}
elsif ($arg =~ /^-+\/$/) {
$endTagsRequired = "y";
}
elsif ($arg =~ /^-+#$/) {
$showAnchors = "y";
}
elsif ($arg =~ /^-+h$/) {
$showLocalRefs = "y";
}
elsif ($arg =~ /^-+H$/) {
$showNonLocalRefs = "y";
}
elsif ($arg =~ /^-+.*/) {
print STDERR "Unexpected option \"$arg\".\n";
exit -1;
}
else {
$fname = $arg;
if ($fname =~ /\//) {
$fDirSlash = $fname;
$fDirSlash =~ s:/[^/]+$:/:;
}
else {
$fDirSlash = "";
}
#print STDERR "$fname:\n";
if (open HIN, "<$fname") {
if ($leaveCrumbs) { print STDERR "($fname)\n"; }
&htmlFile;
close HIN;
}
else {
print STDERR "Can't open input file \"$arg\".\n";
}
}
}
sub htmlFile {
%ids = ();
$lineno = 0;
my $linenoTag = 0;
#print STDERR "($fname:$lineno):\n";
my @stack = ();
my $stackDepth = 0;
my $top;
my $topTagName;
my $topWhere;
%hrefs = ();
while (&readLineElideComments) {
chomp;
#print STDERR "($fname:$lineno):\n";
if (//) {
# The entire DOCTYPE was on this line
}
else {
# Find the end of the DOCTYPE
while (&readLineElideComments) {
chomp;
if (/) {
print $HOUT "($fname:$lineno) '<' while looking for > to close /) {
$linenoTag = 0;
last;
}
}
if ($linenoTag) {
print $HOUT "($fname:$lineno) EOF while looking for > to close /) {
# Stray >.
if ($tagName eq "style") {
#print $HOUT "($fname:$lineno) '>' in style.\n";
}
else {
print $HOUT "($fname:$lineno) Stray '>'.\n";
}
}
$tag = $_;
if (/>/) {
# The entire tag was on this line
$tag =~ s/(<[^>]*>)(.*)/$1/;
$_ =~ s/(<[^>]*>)(.*)/$2/;
#\/?[_A-Za-z][-_.A-Za-z0-9]*
}
else {
# End of tag wasn't on this line.
$linenoTag = $lineno;
$_ = "";
# Find the end of the tag
while (&readLineElideComments) {
chomp;
$tag = "$tag $_"; # Perhaps should put newline rather than blank.
if (/>/) {
$linenoTag = 0;
last;
}
}
if ($linenoTag) {
print $HOUT "($fname:$lineno) EOF while looking for > to close < from line $linenoTag.\n";
exit;
}
$_ = $tag;
$tag =~ s/(<[^>]*>)(.*)/$1/;
$_ =~ s/(<[^>]*>)(.*)/$2/;
}
$elt = $tag;
$elt =~ s/='[^'>]*'/=./g;
$elt =~ s/="[^">]*"/=../g;
#print STDERR "for quotes ($fname:$lineno) $elt\n";
if ($elt =~ /'/) {
print $HOUT "($fname:$lineno) Runaway single quote.\n";
}
if ($elt =~ /"/) {
print $HOUT "($fname:$lineno) Runaway double quote.\n";
}
$tagName = lc $tag;
$tagRest = $tag;
$tagName =~ s/<\/?([_A-Za-z][-_.A-Za-z0-9]*)([ \n\/>](.|\n)*)/$1/;
$tagRest =~ s/<\/?([_A-Za-z][-_.A-Za-z0-9]*)([ \n\/>](.|\n)*)/$2/;
$tagRest =~ s/([^>]*)\/?>.*/$1/;
if ($v) { print $HOUT "($fname:$lineno) Found tag `$tag'.\n"; }
if ($tagRest =~ /) {
print $HOUT "($fname:$lineno) Found '<' in tag while looking for closing '>'.\n";
}
# What kind of tag is it?
if (!exists $htmlTags{$tagName}) {
# It's not an HTML tag.
print $HOUT "($fname:$lineno) Not HTML: `$tagName' `$tag'.\n";
}
elsif ($tag =~ /^<\//) {
# It's a closing tag.
if ($tagRest =~ /=/) {
print $HOUT "($fname:$lineno) Closing tag can't have attributes: `$tag'.\n";
}
# Figure out which tag it closes.
while ("true") {
if (0 < scalar @stack) {
$top = pop @stack;
$topTagName = $top;
$topTagName =~ s/^([^ ]+) (.*)/$1/;
$top =~ s/^([^ ]+) (.*)/$2/;
$topWhere = $top;
$topWhere =~ s/(\([^:]*:[0-9]*\)).*/$1/;
if ($tagName eq $topTagName) {
$stackDepth = 1 + scalar @stack;
if ($v) { print $HOUT "($fname:$lineno) {$stackDepth} Matched <$topTagName> from $topWhere.\n"; }
last;
}
elsif ($endTagsRequired || "/" eq $htmlTags{$topTagName}) {
if (0 >scalar @stack) {
my $top1 = pop @stack;
my $top1TagName = $top1;
$top1TagName =~ s/^([^ ]+) (.*)/$1/;
$top1 =~ s/^([^ ]+) (.*)/$2/;
my $top1Where = $top;
$top1Where =~ s/(\([^:]*:[0-9]*\)).*/$1/;
if ($tagName eq $top1TagName) {
$stackDepth = 1 + scalar @stack;
if ($v) { print $HOUT "($fname:$lineno) {$stackDepth} Matched <$top1TagName> from $top1Where.\n"; }
print $HOUT "($fname:$lineno) Unmatched start tag <$topTagName> from $topWhere.\n";
}
else {
print $HOUT "($fname:$lineno) Unmatched end tag $tag (tried 2 from stack).\n";
$stackDepth = 1 + scalar @stack;
print $HOUT "($fname:$lineno) {$stackDepth} Unmatched start tag <$topTagName> from $topWhere.\n";
}
last;
} else {
print $HOUT "($fname:$lineno) Unmatched end tag $tag.\n";
$stackDepth = 1 + scalar @stack;
print $HOUT "($fname:$lineno) {$stackDepth} Unmatched start tag <$topTagName> from $topWhere.\n";
last;
}
}
else {
$stackDepth = 1 + scalar @stack;
if ($v) { print $HOUT "$topWhere {$stackDepth} No closing tag expected for <$topTagName>.\n"; }
}
}
else {
print $HOUT "($fname:$lineno) Unmatched end tag $tag (empty stack).\n";
last;
}
}
}
elsif ($tag =~ /\/>/) {
# It's an empty-element tag.
if ($v) { print $HOUT "($fname:$lineno) Found empty-element <$tagName/>.\n"; }
&checkLinks;
}
else {
# It's a start tag.
push @stack, "$tagName ($fname:$lineno) $tag";
if ($v) { print $HOUT "($fname:$lineno) {" . scalar @stack . "} Pushed <$tagName> $tag.\n"; }
&checkLinks;
}
}
if (/>/) {
# Stray >.
if ($tagName eq "style") {
#print $HOUT "($fname:$lineno) '>' in style.\n";
}
else {
print $HOUT "($fname:$lineno) Stray '>'.\n";
}
}
}
while (scalar @stack) {
$stackDepth = scalar @stack;
$top = pop @stack;
$topTagName = $top;
$topTagName =~ s/^([^ ]+) (.*)/$1/;
$top =~ s/^([^ ]+) (.*)/$2/;
$topWhere = $top;
$topWhere =~ s/(\([^:]*:[0-9]*\)).*/$1/;
print $HOUT "($fname:$lineno) {$stackDepth} Unmatched start tag <$topTagName> from $topWhere.\n";
}
my @hrefList = reverse sort keys %hrefs;
while (@hrefList) {
$href = pop @hrefList;
if ($href =~ /^#/) {
# Local id -- can check :)
my $id = $href;
$id =~ s/^#//;
if (exists $ids{$id}) {
if ($v) { print $HOUT "$hrefs{$href} Link to `$id' defined at $ids{$id}.\n"; }
}
else {
print $HOUT "$hrefs{$href} Link to undefined id `$id'.\n";
}
}
else {
die "Unexpected href '$href' -- not local.\n";
}
}
}
# Checks the link attributes of a tag (in $tag)
sub checkLinks {
# IDs
if ($tag =~ /.* id="([^">]+)".*/ ||
$tag =~ /.* id='([^'>]+)'.*/ ||
$tag =~ /^]*name="([^">]+)".*/ ||
$tag =~ /^]*name='([^'>]+)'.*/) {
my $id = $tag;
if ($tag =~ /.* id="([^">]+)"/) {
$id =~ s/.* id="([^">]+)".*/$1/;
}
elsif ($tag =~ /.* id='([^'>]+)'/) {
$id =~ s/.* id='([^'>]+)'.*/$1/;
}
elsif ($tag =~ /^]*name="([^">]+)".*/) {
$id =~ s/^]*name="([^">]+)".*/$1/;
}
elsif ($tag =~ /^]*name='([^'>]+)'.*/) {
$id =~ s/^]*name='([^'>]+)'.*/$1/;
}
else {
print STDERR "Something seriously wrong: $tag\n";
exit
}
if ($showAnchors) { print STDOUT "$id defined ($fname:$lineno)\n"; }
if ($v) { print $HOUT "($fname:$lineno) id `$id' defined.\n"; }
if (exists $ids{$id}) {
print $HOUT "($fname:$lineno) id `$id' already defined $ids{$id}.\n";
}
$ids{$id} = "($fname:$lineno)";
if ($id =~ /^[A-Za-z][-_.:A-Za-z0-9]*$/) {}
else {
print $HOUT "($fname:$lineno) Invalid name not matching [A-Za-z][-_.:A-Za-z0-9]*: \"$id\".\n";
}
}
# hrefs
if ($tag =~ /<(a|link) [^>]*href="([^">]+)"/ ||
$tag =~ /<(a|link) [^>]*href='([^'>]+)'/) {
$href = $tag;
if ($href =~ /<(a|link) [^>]*href="([^">]+)".*/) {
$href =~ s/<(a|link) [^>]*href="([^">]+)".*/$2/;
}
elsif ($href =~ /<(a|link) [^>]*href='([^'>]+)'.*/) {
$href =~ s/<(a|link) [^>]*href='([^'>]+)'.*/$2/;
}
if ($href =~ /^#/) {
$hrefs{$href} = "($fname:$lineno)";
if ($v) { print $HOUT "($fname:$lineno) Saving '$href' to check later.\n"; }
}
elsif ($href =~ /^[^:]+:\/\//) {
# URL -- can't check it. :(
if ($showNonLocalRefs) {
print $HOUT "$href linked to from ($fname:$lineno)\n";
}
}
else {
# Local file or directory :)
if ($showLocalRefs) {
print $HOUT "$href linked to from ($fname:$lineno)\n";
}
$pathname = $href;
$pathname =~ s/%20/ /g;
$pathname =~ s/#.*//;
if ($href =~ /#/) {
$fragment = $href;
$fragment =~ s/.*#//;
}
else {
$fragment = "";
}
if ($pathname =~ /^file:/) {
$pathname =~ s/^file://;
$pathname =~ s/^\/+/\//;
#print STDERR "# file:pathname $pathname\n";
}
if ($pathname =~ /^[^\/]/ && $fDirSlash) {
#print STDERR "# [^/]pathname $pathname\n";
$pathname = "$fDirSlash$pathname";
}
if ($pathname =~ /\/$/) {
my $dirname = $pathname;
$pathname = $pathname . "index.html";
if (-f "$pathname") {
if ($v) { print $HOUT "($fname:$lineno) Link to existing index file `$pathname'.\n"; }
if ($fragment) {
&findAnchor;
}
}
else {
if (-d "$dirname") {
if ($v) { print $HOUT "($fname:$lineno) Link to existing directory `$dirname'.\n"; }
if ($fragment) {
&findAnchor;
}
}
else {
print $HOUT "($fname:$lineno) Link to nonexistent index file `$pathname'.\n";
}
}
}
else {
if (-f $pathname) {
if ($v) { print $HOUT "($fname:$lineno) Link to existing file `$pathname'.\n"; }
if ($fragment) {
&findAnchor;
}
}
else {
print $HOUT "($fname:$lineno) Link to nonexistent file `$pathname'.\n";
}
}
}
}
# srcs
if ($tag =~ /
]*src="([^"]+)"/ ||
$tag =~ /
]*src='([^']+)'/) {
my $src = $tag;
if ($src =~ /
]*src="([^"]+)".*/) {
$src =~ s/
]*src="([^"]+)".*/$1/;
}
elsif ($src =~ /
]*src='([^']+)'.*/) {
$src =~ s/
]*src='([^']+)'.*/$1/;
}
if ($src =~ /^http:\/\//) {
# URL
if ($v) { print $HOUT "($fname:$lineno)
sources external image file `$src'.\n"; }
}
else {
if (-f "$fDirSlash$src") {
if ($v) { print $HOUT "($fname:$lineno)
sources existing image file `$src'.\n"; }
}
else {
print $HOUT "($fname:$lineno)
sources nonexistent image file `$src'.\n";
}
}
}
}
# Reads an HTML file searching for an anchor definition
# Doesn't account for comments.
sub findAnchor {
my $saved = $_;
open HANCHOR, "<$pathname";
while () {
chomp;
#print STDERR "### $_\n";
if (/ (name|id)='$fragment'/ ||
/ (name|id)="$fragment"/) {
if ($v) { print $HOUT "($fname:$lineno) Link to existing anchor '$pathname#$fragment'.\n"; }
close HANCHOR;
$_ = $saved;
return;
}
}
print $HOUT "($fname:$lineno) Link to nonexistent anchor '$fragment' in existing file '$pathname'.\n";
close HANCHOR;
$_ = $saved;
}
# Reads a line into $_ and returns it, eliding any comments
# (including those spanning two or more lines),
# keeps $lineno correct, doesn't chomp the newlines.
sub readLineElideComments {
$_ = ;
++$lineno;
if ($_) {
if ($linenoComment) {
# Looking for --> before any /) {
$comment =~ s/^(.*?)-->(.*)/$1/;
$_ =~ s/^(.*?)-->(.*)/$2/;
$linenoComment = 0;
}
else {
s/.*//;
}
while ($comment =~ /
while (//) {
my $comment = $_;
chomp $comment;
my $pre = $comment;
my $suf = $comment;
$pre =~ s/^(.*?)(.*)/$1/;
$comment =~ s/^(.*?)(.*)/$2/;
$suf =~ s/^(.*?)(.*)/$3/;
if ($comment =~ /^-/) { print $HOUT "($fname:$lineno) Bad comment: ''.\n"; }
while ($comment =~ //) {
print $HOUT "($fname:$lineno) Unmatched '-->'.\n";
s/-->//;
}
if (/