#!/usr/bin/env perl # html-toc # Michael Ernst # Thanks to C. Scott Ananian # Time-stamp: <2020-12-24 11:03:08 mernst> # Generate (to standard out) table of contents for an HTML document. # The table of contents is based on the header tags (

,

, etc.). # See comments in html-update-toc for documentation. # (Typically, you will run html-update-toc rather than this program.) use strict; use English; $WARNING = 1; my $level = 0; my $num_named_1 = 0; # number of named items in

...

my $num_unnamed_1 = 0; # number of unnamed items in

...

my $output = ""; my $errors = ""; my $eol; # end-of-line character in input file my $debug = 0; # $debug = 1; sub enter_level { # called with cursor at start of line. $output .= ((" " x ($level * 4)) . "

$level--; $output .= "

"); # returns with an open

} sub found_header ( $$$$ ) { my ($thishlevel, $anchorname, $secname1, $secname2) = @_; # Remove leading and trailing whitespace in section title. $secname1 =~ s/^[ \t]+//; $secname1 =~ s/[ \t]+$//; $secname2 =~ s/^[ \t]+//; $secname2 =~ s/[ \t]+$//; my $sectitle; if ($secname1 eq "") { $sectitle = $secname2; } elsif ($secname2 eq "") { $sectitle = $secname1; } else { # $errors .= "Merging two section titles (\"$secname1\" and \"$secname2\") found on one line: $_"; # next; $sectitle = "$secname1 $secname2"; } # if $level >= $thishlevel, we have an open

while ($level > $thishlevel) { exit_level(); } # close this

, unless we're going deeper $output .= "

" if ($level == $thishlevel); $output .= "$eol" unless $level == 0; while ($level < $thishlevel) { enter_level(); $output .= (" " x ($level * 4 - 2)) . "

$eol" if $level < $thishlevel; } $output .= ((" " x ($level * 4 - 2)) . "

$sectitle"); if ($level == 1) { $num_named_1++; } } # Main loop while (<>) { if (!defined($eol)) { if (/\r\n$/) { $eol = "\r\n"; } elsif (/\n$/) { $eol = "\n"; } elsif (/\r$/) { $eol = "\r"; } else { die "Bad line ending: $_"; } } if (/^|/) { # Check because sometimes, a HTML comment is not appropriate. next; } elsif ( /]*)?> (?:)? # permit a comment # close the a start tag (.*?) # the text of the anchor <\/a> # the a end tag (.*) # sometimes the anchor text is found here instead <\/h([1-9])> /ix ) { if ($debug) { print STDERR "match 3: <$1><$2><$3><$4><$5>\n"; } if ($1 ne $5) { $errors .= "Non-matching heading tags $1 and $5: $_"; } my $thishlevel = $1; my $anchorname = $2; my $secname1 = $3; my $secname2 = $4; found_header($thishlevel, $anchorname, $secname1, $secname2); } elsif ( / # close the h start tag (.*?) # the text of the anchor <\/h([1-9])> /ix ) { if ($debug) { print STDERR "match 3: <$1><$2><$3><$4>\n"; } if ($1 ne $4) { $errors .= "Non-matching heading tags $1 and $4: $_"; } my $thishlevel = $1; my $anchorname = $2; my $secname1 = $3; my $secname2 = ""; found_header($thishlevel, $anchorname, $secname1, $secname2); } elsif (/(.*)<\/h([1-9])>/) { if ($1 == 1) { $num_unnamed_1++; } $errors .= "$ARGV:$.: No tag: $_"; } elsif (//) { $errors .= "$ARGV:$.: No tag: $_"; } } continue { # Reset line numbers; example from "eof" text in "man perlfunc". close ARGV if eof; # Not eof()! } while ($level > 0) { exit_level(); } $output .= "$eol"; if ($num_named_1 + $num_unnamed_1 < 2) { # There is only one top-level (

) item. # Assume

is the page title, and the h2, h3, h4 etc are the ones # we should be making a toc for. $output =~ s/^
${eol}//; $output =~ s/<\/ul>${eol}$//; # remove top-level item. $output =~ s/^
.${eol}//; $output =~ s/<\/li>${eol}$/${eol}/; if ($num_unnamed_1 == 1) { # Remove error from output $errors =~ s/^.: No tag:
[^${eol}]*${eol}//; } } print STDERR $errors; print $output; exit(0);