#!/usr/bin/perl
# Lists or replaces non-ASCII accented characters and HTML entities in text.
# Run with -h for the option list.
#
# Arguments are handled strictly left to right: an option only affects the
# files named after it, and each file is processed as soon as it is reached.

use strict;
use warnings;

# Handled characters, in reporting order (alphabetical by HTML entity name,
# then punctuation).  Each row holds the Unicode code point, the HTML entity
# name, and the ASCII stand-in used by deaccent() (undef where deaccent()
# leaves the character alone).
#
# Deliberately not handled, as stated in the help text: ETH, THORN, Yacute,
# divide, eth, thorn, times, yacute.
#
# NOTE(review): in the copy under review every literal character had been
# stripped from the match patterns, leaving empty patterns.  The rows below
# were rebuilt from the per-rule comments and entity names.  Confirm them
# against the original file.
# NOTE(review): one more rule followed mdash.  Only its comment survived
# ("Firefox shows it as Eacute"); neither the matched character nor the
# entity it produced can be recovered, so it is NOT reproduced here.
# TODO: restore it from the original file.
my @CHARACTER_ROWS = (
    [ 0x00C6, 'AElig',  'AE' ],    # latin capital letter AE
    [ 0x00C1, 'Aacute', 'A'  ],    # latin capital letter A with acute
    [ 0x00C2, 'Acirc',  'A'  ],    # latin capital letter A with circumflex
    [ 0x00C0, 'Agrave', 'A'  ],    # latin capital letter A with grave
    [ 0x00C5, 'Aring',  'A'  ],    # latin capital letter A with ring above
    [ 0x00C3, 'Atilde', 'A'  ],    # latin capital letter A with tilde
    [ 0x00C4, 'Auml',   'A'  ],    # latin capital letter A with diaeresis
    [ 0x00C7, 'Ccedil', 'C'  ],    # latin capital letter C with cedilla
    [ 0x00C9, 'Eacute', 'E'  ],    # latin capital letter E with acute
    [ 0x00CA, 'Ecirc',  'E'  ],    # latin capital letter E with circumflex
    [ 0x00C8, 'Egrave', 'E'  ],    # latin capital letter E with grave
    [ 0x00CB, 'Euml',   'E'  ],    # latin capital letter E with diaeresis
    [ 0x00CD, 'Iacute', 'I'  ],    # latin capital letter I with acute
    [ 0x00CE, 'Icirc',  'I'  ],    # latin capital letter I with circumflex
    [ 0x00CC, 'Igrave', 'I'  ],    # latin capital letter I with grave
    [ 0x00CF, 'Iuml',   'I'  ],    # latin capital letter I with diaeresis
    [ 0x00D1, 'Ntilde', 'N'  ],    # latin capital letter N with tilde
    [ 0x00D3, 'Oacute', 'O'  ],    # latin capital letter O with acute
    [ 0x00D4, 'Ocirc',  'O'  ],    # latin capital letter O with circumflex
    [ 0x00D2, 'Ograve', 'O'  ],    # latin capital letter O with grave
    [ 0x00D8, 'Oslash', 'O'  ],    # latin capital letter O with stroke
    [ 0x00D5, 'Otilde', 'O'  ],    # latin capital letter O with tilde
    [ 0x00D6, 'Ouml',   'O'  ],    # latin capital letter O with diaeresis
    [ 0x00DA, 'Uacute', 'U'  ],    # latin capital letter U with acute
    [ 0x00DB, 'Ucirc',  'U'  ],    # latin capital letter U with circumflex
    [ 0x00D9, 'Ugrave', 'U'  ],    # latin capital letter U with grave
    [ 0x00DC, 'Uuml',   'U'  ],    # latin capital letter U with diaeresis
    [ 0x00E1, 'aacute', 'a'  ],    # latin small letter a with acute
    [ 0x00E2, 'acirc',  'a'  ],    # latin small letter a with circumflex
    # The original deaccent rule turned the small ligature into "a" although
    # the capital one became "AE"; "ae" keeps the two consistent.
    [ 0x00E6, 'aelig',  'ae' ],    # latin small letter ae
    [ 0x00E0, 'agrave', 'a'  ],    # latin small letter a with grave
    [ 0x00E5, 'aring',  'a'  ],    # latin small letter a with ring above
    [ 0x00E3, 'atilde', 'a'  ],    # latin small letter a with tilde
    [ 0x00E4, 'auml',   'a'  ],    # latin small letter a with diaeresis
    [ 0x00E7, 'ccedil', 'c'  ],    # latin small letter c with cedilla
    [ 0x00E9, 'eacute', 'e'  ],    # latin small letter e with acute
    [ 0x00EA, 'ecirc',  'e'  ],    # latin small letter e with circumflex
    [ 0x00E8, 'egrave', 'e'  ],    # latin small letter e with grave
    [ 0x00EB, 'euml',   'e'  ],    # latin small letter e with diaeresis
    [ 0x00ED, 'iacute', 'i'  ],    # latin small letter i with acute
    [ 0x00EE, 'icirc',  'i'  ],    # latin small letter i with circumflex
    [ 0x00EC, 'igrave', 'i'  ],    # latin small letter i with grave
    [ 0x00EF, 'iuml',   'i'  ],    # latin small letter i with diaeresis
    [ 0x00F1, 'ntilde', 'n'  ],    # latin small letter n with tilde
    [ 0x00F3, 'oacute', 'o'  ],    # latin small letter o with acute
    [ 0x00F4, 'ocirc',  'o'  ],    # latin small letter o with circumflex
    [ 0x00F2, 'ograve', 'o'  ],    # latin small letter o with grave
    [ 0x00F8, 'oslash', 'o'  ],    # latin small letter o with stroke
    [ 0x00F5, 'otilde', 'o'  ],    # latin small letter o with tilde
    [ 0x00F6, 'ouml',   'o'  ],    # latin small letter o with diaeresis
    [ 0x00DF, 'szlig',  's'  ],    # latin small letter sharp s = ess-zed
    [ 0x00FA, 'uacute', 'u'  ],    # latin small letter u with acute
    [ 0x00FB, 'ucirc',  'u'  ],    # latin small letter u with circumflex
    [ 0x00F9, 'ugrave', 'u'  ],    # latin small letter u with grave
    [ 0x00FC, 'uuml',   'u'  ],    # latin small letter u with diaeresis
    [ 0x00FF, 'yuml',   'y'  ],    # latin small letter y with diaeresis
    [ 0x201C, 'ldquo',  undef ],   # ``
    [ 0x201D, 'rdquo',  undef ],   # ''
    [ 0x2018, 'lsquo',  undef ],   # `
    [ 0x2019, 'rsquo',  undef ],   # '
    [ 0x2022, 'bull',   undef ],   # bullet
    [ 0x2014, 'mdash',  undef ],   # m dash
);

# Windows-1252 byte values for the handled characters above U+00FF.  For the
# others the single-byte value equals the code point.
# NOTE(review): assumes single-byte input is Windows-1252/Latin-1; the
# original byte values are not visible in the reviewed copy -- confirm.
my %CP1252_BYTE_FOR = (
    0x2018 => 0x91,
    0x2019 => 0x92,
    0x201C => 0x93,
    0x201D => 0x94,
    0x2022 => 0x95,
    0x2014 => 0x97,
);

# One hash per handled character, in table order; see _build_character().
my @CHARACTERS = map { _build_character( @{$_} ) } @CHARACTER_ROWS;

my $act          = "n";    # e(ntities), n(on-ASCII), r(eplace)
my $showLocation = "";     # true once -w has been seen
my %found        = ();     # names seen in the current file (list modes)
my $changedLines = 0;      # lines altered by replaceAccents(), all files
my $v            = "y";    # progress messages on STDERR; always enabled

# Prints the usage text to standard output.
sub help {
    print <<'END_HELP';
Replaces non-ASCII accented and other characters with HTML entities.
The non-ASCII characters replaced are, generally:
 aeiou with acute, circumflex, grave, ring above, tilde, or diaeresis;
 slashed O and o;
 ae and AE ligatures;
 c and C with cedilla;
 ess;
 n and N with tilde;
 single and double quotes.
Certain non-ASCII characters are not handled:
 ETH, THORN, Y acute, division, eth, thorn, times, y acute.
Input is taken from files named on the command line or from STDIN.
Options:
 -h, -help Show this help.
 -e, -entities List the HTML entities in the input.
 -n, -non-ASCII List the non-ASCII characters (by their entity names)
 in the input [default].
 -r, -replace Print the input with non-ASCII characters replaced by
 entities.
 -w, -where Show where each non-ASCII character or entity appeared.
END_HELP
    return;
}

# Expands one table row into the lookup record used everywhere else:
#   entity - bare entity name, e.g. "eacute"
#   markup - the entity as written in HTML, e.g. "&eacute;"
#   ascii  - ASCII stand-in for deaccent(), or undef
#   utf8   - the character's UTF-8 byte sequence
#   byte   - the character's single Windows-1252/Latin-1 byte
sub _build_character {
    my ( $code_point, $entity, $ascii ) = @_;

    my $utf8 = chr $code_point;
    utf8::encode($utf8);    # characters -> UTF-8 octets, in place

    my $single =
      exists $CP1252_BYTE_FOR{$code_point}
      ? $CP1252_BYTE_FOR{$code_point}
      : $code_point;

    return {
        entity => $entity,
        markup => "&$entity;",
        ascii  => $ascii,
        utf8   => $utf8,
        byte   => chr $single,
    };
}

# Returns 'utf8' when $text is a well-formed UTF-8 octet string (plain ASCII
# included) and 'byte' otherwise.  The result names the record field that
# holds the byte sequence to search for.
sub _encoding_of {
    my ($text) = @_;
    my $probe = $text;    # utf8::decode() works in place, so test a copy
    return utf8::decode($probe) ? 'utf8' : 'byte';
}

# Returns a copy of $text in which every handled character is replaced by
# the record field named $field ('markup' or 'ascii').  Characters whose
# record has no value for that field are left untouched.
sub _replace_characters {
    my ( $text, $field ) = @_;
    my $form = _encoding_of($text);

    for my $char (@CHARACTERS) {
        my $replacement = $char->{$field};
        next if !defined $replacement;
        my $sequence = $char->{$form};
        $text =~ s/\Q$sequence\E/$replacement/g;
    }
    return $text;
}

# Runs the currently selected action on one input.
#   $_[0] - readable filehandle
#   $_[1] - name used in messages ("" for standard input)
# Dies if the action code is not one of e, n, r.
sub perFile {
    my @input = @_;

    if ( $act eq "e" ) {
        %found = ();
        showEntities(@input);
        showFound();
    }
    elsif ( $act eq "n" ) {
        %found = ();
        showAccents(@input);
        showFound();
    }
    elsif ( $act eq "r" ) {
        replaceAccents(@input);
    }
    else {
        die "Unexpected action \"$act\"";
    }
    return;
}

# Copies the input to the selected output handle with every handled
# character replaced by its HTML entity.  Counts altered lines in
# $changedLines and, with -w, names each altered line on STDERR.
sub replaceAccents {
    my ( $IN, $fname ) = @_;
    my $lineNumber = 0;

    print STDERR "Replacing accents in \"$fname\":\n" if $v;

    while ( my $line = <$IN> ) {
        ++$lineNumber;
        my $encoded = _replace_characters( $line, 'markup' );
        if ( $encoded ne $line ) {
            ++$changedLines;
            print STDERR "($fname:$lineNumber)\n" if $showLocation;
        }
        print $encoded;
    }
    return;
}

# Replaces the handled accented letters in $_ with plain ASCII letters, in
# place.  Quotes, bullet and dash are left alone.  Not called anywhere in
# this file.
sub deaccent {
    $_ = _replace_characters( $_, 'ascii' );
    return;
}

# Reports, line by line, which handled characters occur in the input.  A
# character is reported at most once per line, in table order.
sub showAccents {
    my ( $IN, $fname ) = @_;

    print STDERR "Showing accents in \"$fname\":\n" if $v;

    my $lineno = 0;
    while ( my $line = <$IN> ) {
        ++$lineno;
        my $form = _encoding_of($line);
        for my $char (@CHARACTERS) {
            next if index( $line, $char->{$form} ) < 0;
            show( $fname, $lineno, $char->{entity} );
        }
    }
    return;
}

# Reports every "&name;" or "&#number;" style entity in the input.
sub showEntities {
    my ( $IN, $fname ) = @_;

    print STDERR "Showing entities in \"$fname\":\n" if $v;

    my $lineno = 0;
    while ( my $line = <$IN> ) {
        ++$lineno;
        my @entities = $line =~ /(&[A-Za-z#0-9]+;)/g;

        # The original peeled entities off the end of the line, so within a
        # line they are reported right to left; keep that order.
        for my $entity ( reverse @entities ) {
            show( $fname, $lineno, $entity );
        }
    }
    return;
}

# Records one sighting: printed immediately with its location when -w is in
# effect, otherwise remembered in %found for showFound().
sub show {
    my ( $fname, $lineno, $entity ) = @_;

    if ($showLocation) {
        print "($fname:$lineno) '$entity'\n";
    }
    else {
        $found{$entity} = "($fname:$lineno)";
    }
    return;
}

# Prints the distinct names collected in %found, sorted, and the count on
# STDERR.  Does nothing with -w, because show() has already printed them.
sub showFound {
    return if $showLocation;

    my @names = sort keys %found;
    print "$_\n" for @names;
    print STDERR scalar(@names), " found\n";
    return;
}

# Command-line driver.  Shows the help when there are no arguments at all;
# reads standard input when options were given but no file was named.
# Exits with status -1 on an unknown option or an unreadable file.
sub main {
    my @args = @_;

    if ( !@args ) {
        help();
        exit 0;
    }

    my $fileCount = 0;
    for my $arg (@args) {
        if ( $arg =~ /^-/ ) {
            if ( $arg =~ /^-+h(elp)?$/ ) {
                help();
                exit 0;
            }
            elsif ( $arg =~ /^-+e(ntities)?$/ ) {
                $act = "e";
            }
            elsif ( $arg =~ /^-+n(on-ASCII)?$/ ) {
                $act = "n";
            }
            elsif ( $arg =~ /^-+r(eplace)?$/ ) {
                $act = "r";
            }
            elsif ( $arg =~ /^-+w(here)?$/ ) {
                $showLocation = "y";
            }
            else {
                # Diagnostics go to STDERR so they cannot end up inside the
                # converted text that -r writes to STDOUT.
                print STDERR "Unexpected option \"$arg\".\n";
                exit -1;
            }
            next;
        }

        # Three-argument open: the file name can never be read as a mode or
        # a pipe command.
        my $in;
        if ( !open $in, '<', $arg ) {
            print STDERR "Unable to open input file \"$arg\".\n";
            exit -1;
        }
        perFile( $in, $arg );
        close $in;
        ++$fileCount;
    }

    perFile( \*STDIN, "" ) if $fileCount == 0;

    # if (0 < $changedLines) {
    #     print STDERR "Changed lines: $changedLines.\n";
    # }
    return;
}

# Run only when executed as a script, so the subs can be loaded on their own.
main(@ARGV) unless caller;