#!/usr/bin/perl # # by Thomas A. Alspaugh # # Licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 2.5 License # http://creativecommons.org/licenses/by-nc-sa/2.5/ # use strict; use warnings; sub help { my $cmdname = $0; $cmdname =~ s:.*/::; print STDERR "$cmdname -[/hv] file ...\n"; print STDERR "Check HTML files.\n"; print STDERR " -help Show this help.\n"; print STDERR " -/ Require end tags or empty-element tags.\n"; print STDERR " -H List the absolute destinations linked to.\n"; print STDERR " -h List the relative destinations linked to.\n"; print STDERR " -# List the anchors defined.\n"; print STDERR " -q Quiet for correct files\n"; print STDERR " (otherwise each file name is echoed.)\n"; print STDERR " -v Verbose output.\n"; print STDERR " -x Check XHTML.\n"; print STDERR "What is checked:\n"; print STDERR " elements and character entities are valid (HTML4.01)\n"; print STDERR " start/end elements balance\n"; print STDERR " no links to undefined same-file anchors\n"; print STDERR " no links to nonexistent local files or anchors in them\n"; print STDERR " no links to nonexistent local directory index files\n"; print STDERR " or anchors in them\n"; print STDERR " no img elements with nonexistent local sources\n"; print STDERR " no open comments\n"; print STDERR " no closing tags with attributes\n"; print STDERR " no -- in comments\n"; } # Bad character `Ę' (looks like accented E) if (! scalar @ARGV) { &help; exit 0; } my $HOUT = *STDOUT; my $anchor = ""; my $arg = ""; my $leaveCrumbs = "y"; my $elt = ""; my $endTagsRequired = ""; my $fname = ""; my $fragment = ""; my $fDirSlash = ""; my $href = ""; my %hrefs = (); my %ids = (); my $lineno = 0; my $linenoComment = 0; my $pathname = ""; my $showAnchors = ""; my $showLocalRefs = ""; my $showNonLocalRefs = ""; my $tag = ""; my $tagName = ""; my $tagRest = ""; my $v = ""; my %charEnts = (); my %htmlTags = (); # "/" signifies the tag must be closed. &setup; foreach $arg (@ARGV) { if ($arg =~ /^-+help/) { &help; exit 0; } elsif ($arg =~ /^-+q$/) { $leaveCrumbs = ""; } elsif ($arg =~ /^-+v$/) { $v = "y"; } elsif ($arg =~ /^-+\/$/ || $arg =~ /^-+x$/) { $endTagsRequired = "y"; } elsif ($arg =~ /^-+#$/) { $showAnchors = "y"; } elsif ($arg =~ /^-+h$/) { $showLocalRefs = "y"; } elsif ($arg =~ /^-+H$/) { $showNonLocalRefs = "y"; } elsif ($arg =~ /^-+.*/) { print STDERR "Unexpected option \"$arg\".\n"; exit -1; } else { $fname = $arg; if ($fname =~ /\//) { $fDirSlash = $fname; $fDirSlash =~ s:/[^/]+$:/:; } else { $fDirSlash = ""; } #print STDERR "$fname:\n"; if (open HIN, "<$fname") { if ($leaveCrumbs) { print STDERR "($fname)\n"; } &htmlFile; close HIN; } else { print STDERR "Can't open input file \"$arg\".\n"; } } } sub htmlFile { %ids = (); $lineno = 0; my $linenoTag = 0; #print STDERR "($fname:$lineno):\n"; my @stack = (); my $stackDepth = 0; my $top; my $topTagName; my $topWhere; %hrefs = (); while (&readLineElideComments) { chomp; #print STDERR "($fname:$lineno):\n"; if (//) { # The entire DOCTYPE was on this line } else { # Find the end of the DOCTYPE while (&readLineElideComments) { chomp; if (/ to close /) { $linenoTag = 0; last; } } if ($linenoTag) { print $HOUT "($fname:$lineno) EOF while looking for > to close /) { # Stray >. if ($tagName eq "style") { #print $HOUT "($fname:$lineno) '>' in style.\n"; } else { print $HOUT "($fname:$lineno) Stray '>'.\n"; } } $tag = $_; if (/>/) { # The entire tag was on this line $tag =~ s/(<[^>]*>)(.*)/$1/; $_ =~ s/(<[^>]*>)(.*)/$2/; #\/?[_A-Za-z][-_.A-Za-z0-9]* } else { # End of tag wasn't on this line. $linenoTag = $lineno; $_ = ""; # Find the end of the tag while (&readLineElideComments) { chomp; $tag = "$tag $_"; # Perhaps should put newline rather than blank. if (/>/) { $linenoTag = 0; last; } } if ($linenoTag) { print $HOUT "($fname:$lineno) EOF while looking for > to close < from line $linenoTag.\n"; exit; } $_ = $tag; $tag =~ s/(<[^>]*>)(.*)/$1/; $_ =~ s/(<[^>]*>)(.*)/$2/; } $elt = $tag; $elt =~ s/='[^'>]*'/=./g; $elt =~ s/="[^">]*"/=../g; #print STDERR "for quotes ($fname:$lineno) $elt\n"; if ($elt =~ /'/) { print $HOUT "($fname:$lineno) Runaway single quote.\n"; } if ($elt =~ /"/) { print $HOUT "($fname:$lineno) Runaway double quote.\n"; } $tagName = lc $tag; $tagRest = $tag; $tagName =~ s/<\/?([_A-Za-z][-_.A-Za-z0-9]*)([ \n\/>](.|\n)*)/$1/; $tagRest =~ s/<\/?([_A-Za-z][-_.A-Za-z0-9]*)([ \n\/>](.|\n)*)/$2/; $tagRest =~ s/([^>]*)\/?>.*/$1/; if ($v) { print $HOUT "($fname:$lineno) Found tag `$tag'.\n"; } if ($tagRest =~ /'.\n"; } # What kind of tag is it? if (!exists $htmlTags{$tagName}) { # It's not an HTML tag. print $HOUT "($fname:$lineno) Not HTML: `$tagName' `$tag'.\n"; } elsif ($tag =~ /^<\//) { # It's a closing tag. if ($tagRest =~ /=/) { print $HOUT "($fname:$lineno) Closing tag can't have attributes: `$tag'.\n"; } # Figure out which tag it closes. while ("true") { if (0 < scalar @stack) { $top = pop @stack; $topTagName = $top; $topTagName =~ s/^([^ ]+) (.*)/$1/; $top =~ s/^([^ ]+) (.*)/$2/; $topWhere = $top; $topWhere =~ s/(\([^:]*:[0-9]*\)).*/$1/; if ($tagName eq $topTagName) { $stackDepth = 1 + scalar @stack; if ($v) { print $HOUT "($fname:$lineno) {$stackDepth} Matched <$topTagName> from $topWhere.\n"; } last; } elsif ($endTagsRequired || "/" eq $htmlTags{$topTagName}) { if (0 >scalar @stack) { my $top1 = pop @stack; my $top1TagName = $top1; $top1TagName =~ s/^([^ ]+) (.*)/$1/; $top1 =~ s/^([^ ]+) (.*)/$2/; my $top1Where = $top; $top1Where =~ s/(\([^:]*:[0-9]*\)).*/$1/; if ($tagName eq $top1TagName) { $stackDepth = 1 + scalar @stack; if ($v) { print $HOUT "($fname:$lineno) {$stackDepth} Matched <$top1TagName> from $top1Where.\n"; } print $HOUT "($fname:$lineno) Unmatched start tag <$topTagName> from $topWhere.\n"; } else { print $HOUT "($fname:$lineno) Unmatched end tag $tag (tried 2 from stack).\n"; $stackDepth = 1 + scalar @stack; print $HOUT "($fname:$lineno) {$stackDepth} Unmatched start tag <$topTagName> from $topWhere.\n"; } last; } else { print $HOUT "($fname:$lineno) Unmatched end tag $tag.\n"; $stackDepth = 1 + scalar @stack; print $HOUT "($fname:$lineno) {$stackDepth} Unmatched start tag <$topTagName> from $topWhere.\n"; last; } } else { $stackDepth = 1 + scalar @stack; if ($v) { print $HOUT "$topWhere {$stackDepth} No closing tag expected for <$topTagName>.\n"; } } } else { print $HOUT "($fname:$lineno) Unmatched end tag $tag (empty stack).\n"; last; } } } elsif ($tag =~ /\/>/) { # It's an empty-element tag. if ($v) { print $HOUT "($fname:$lineno) Found empty-element <$tagName/>.\n"; } &checkLinks; } else { # It's a start tag. push @stack, "$tagName ($fname:$lineno) $tag"; if ($v) { print $HOUT "($fname:$lineno) {" . scalar @stack . "} Pushed <$tagName> $tag.\n"; } &checkLinks; } } if (/>/) { # Stray >. if ($tagName eq "style") { #print $HOUT "($fname:$lineno) '>' in style.\n"; } else { print $HOUT "($fname:$lineno) Stray '>'.\n"; } } } while (scalar @stack) { $stackDepth = scalar @stack; $top = pop @stack; $topTagName = $top; $topTagName =~ s/^([^ ]+) (.*)/$1/; $top =~ s/^([^ ]+) (.*)/$2/; $topWhere = $top; $topWhere =~ s/(\([^:]*:[0-9]*\)).*/$1/; print $HOUT "($fname:$lineno) {$stackDepth} Unmatched start tag <$topTagName> from $topWhere.\n"; } my @hrefList = reverse sort keys %hrefs; while (@hrefList) { $href = pop @hrefList; if ($href =~ /^#/) { # Local id -- can check :) my $id = $href; $id =~ s/^#//; if (exists $ids{$id}) { if ($v) { print $HOUT "$hrefs{$href} Link to `$id' defined at $ids{$id}.\n"; } } elsif ($id eq "") { # do nothing -- the empty ID means top-of-file } else { print $HOUT "$hrefs{$href} Link to undefined id `$id'.\n"; } } else { die "Unexpected href '$href' -- not local.\n"; } } } # Checks the link attributes of a tag (in $tag) sub checkLinks { # IDs if ($tag =~ /.* id="([^">]+)".*/ || $tag =~ /.* id='([^'>]+)'.*/ || $tag =~ /^]*name="([^">]+)".*/ || $tag =~ /^]*name='([^'>]+)'.*/) { my $id = $tag; if ($tag =~ /.* id="([^">]+)"/) { $id =~ s/.* id="([^">]+)".*/$1/; } elsif ($tag =~ /.* id='([^'>]+)'/) { $id =~ s/.* id='([^'>]+)'.*/$1/; } elsif ($tag =~ /^]*name="([^">]+)".*/) { $id =~ s/^]*name="([^">]+)".*/$1/; } elsif ($tag =~ /^]*name='([^'>]+)'.*/) { $id =~ s/^]*name='([^'>]+)'.*/$1/; } else { print STDERR "Something seriously wrong: $tag\n"; exit } if ($showAnchors) { print STDOUT "$id defined ($fname:$lineno)\n"; } if ($v) { print $HOUT "($fname:$lineno) id `$id' defined.\n"; } if (exists $ids{$id}) { print $HOUT "($fname:$lineno) id `$id' already defined $ids{$id}.\n"; } $ids{$id} = "($fname:$lineno)"; if ($id =~ /^[A-Za-z][-_.:A-Za-z0-9]*$/) {} else { print $HOUT "($fname:$lineno) Invalid name not matching [A-Za-z][-_.:A-Za-z0-9]*: \"$id\".\n"; } } # hrefs if ($tag =~ /<(a|link) [^>]*href="([^">]+)"/ || $tag =~ /<(a|link) [^>]*href='([^'>]+)'/) { $href = $tag; if ($href =~ /<(a|link) [^>]*href="([^">]+)".*/) { $href =~ s/<(a|link) [^>]*href="([^">]+)".*/$2/; } elsif ($href =~ /<(a|link) [^>]*href='([^'>]+)'.*/) { $href =~ s/<(a|link) [^>]*href='([^'>]+)'.*/$2/; } if ($href =~ /^#/) { $hrefs{$href} = "($fname:$lineno)"; if ($v) { print $HOUT "($fname:$lineno) Saving '$href' to check later.\n"; } } elsif ($href =~ /^[^:]+:\/\//) { # URL -- can't check it. :( if ($showNonLocalRefs) { print $HOUT "$href linked to from ($fname:$lineno)\n"; } } else { # Local file or directory :) if ($showLocalRefs) { print $HOUT "$href linked to from ($fname:$lineno)\n"; } $pathname = $href; $pathname =~ s/%20/ /g; $pathname =~ s/#.*//; if ($href =~ /#/) { $fragment = $href; $fragment =~ s/.*#//; } else { $fragment = ""; } if ($pathname =~ /^file:/) { $pathname =~ s/^file://; $pathname =~ s/^\/+/\//; #print STDERR "# file:pathname $pathname\n"; } if ($pathname =~ /^[^\/]/ && $fDirSlash) { #print STDERR "# [^/]pathname $pathname\n"; $pathname = "$fDirSlash$pathname"; } if ($pathname =~ /\/$/) { my $dirname = $pathname; $pathname = $pathname . "index.html"; if (-f "$pathname") { if ($v) { print $HOUT "($fname:$lineno) Link to existing index file `$pathname'.\n"; } if ($fragment) { &findAnchor; } } else { if (-d "$dirname") { if ($v) { print $HOUT "($fname:$lineno) Link to existing directory `$dirname'.\n"; } if ($fragment) { &findAnchor; } } else { print $HOUT "($fname:$lineno) Link to nonexistent index file `$pathname'.\n"; } } } else { if (-f $pathname) { if ($v) { print $HOUT "($fname:$lineno) Link to existing file `$pathname'.\n"; } if ($fragment) { &findAnchor; } } else { print $HOUT "($fname:$lineno) Link to nonexistent file `$pathname'.\n"; } } } } # srcs if ($tag =~ /]*src="([^"]+)"/ || $tag =~ /]*src='([^']+)'/) { my $src = $tag; if ($src =~ /]*src="([^"]+)".*/) { $src =~ s/]*src="([^"]+)".*/$1/; } elsif ($src =~ /]*src='([^']+)'.*/) { $src =~ s/]*src='([^']+)'.*/$1/; } if ($src =~ /^http:\/\//) { # URL if ($v) { print $HOUT "($fname:$lineno) sources external image file `$src'.\n"; } } else { if (-f "$fDirSlash$src") { if ($v) { print $HOUT "($fname:$lineno) sources existing image file `$src'.\n"; } } else { print $HOUT "($fname:$lineno) sources nonexistent image file `$src'.\n"; } } } } # Reads an HTML file searching for an anchor definition # Doesn't account for comments. sub findAnchor { my $saved = $_; open HANCHOR, "<$pathname"; while () { chomp; #print STDERR "### $_\n"; if (/ (name|id)='$fragment'/ || / (name|id)="$fragment"/) { if ($v) { print $HOUT "($fname:$lineno) Link to existing anchor '$pathname#$fragment'.\n"; } close HANCHOR; $_ = $saved; return; } } print $HOUT "($fname:$lineno) Link to nonexistent anchor '$fragment' in existing file '$pathname'.\n"; close HANCHOR; $_ = $saved; } # Reads a line into $_ and returns it, eliding any comments # (including those spanning two or more lines), # keeps $lineno correct, doesn't chomp the newlines. sub readLineElideComments { $_ = ; ++$lineno; if ($_) { if ($linenoComment) { # Looking for --> before any /) { $comment =~ s/^(.*?)-->(.*)/$1/; $_ =~ s/^(.*?)-->(.*)/$2/; $linenoComment = 0; } else { s/.*//; } while ($comment =~ / while (//) { my $comment = $_; chomp $comment; my $pre = $comment; my $suf = $comment; $pre =~ s/^(.*?)(.*)/$1/; $comment =~ s/^(.*?)(.*)/$2/; $suf =~ s/^(.*?)(.*)/$3/; if ($comment =~ /^-/) { print $HOUT "($fname:$lineno) Bad comment: ''.\n"; } while ($comment =~ //) { print $HOUT "($fname:$lineno) Unmatched '-->'.\n"; s/-->//; } if (/