#!/usr/bin/env perl
# html-toc
# Michael Ernst
# Thanks to C. Scott Ananian
# Time-stamp: <2020-12-24 11:03:08 mernst>
# Generate (to standard out) table of contents for an HTML document.
# The table of contents is based on the header tags (, , etc.).
# See comments in html-update-toc for documentation.
# (Typically, you will run html-update-toc rather than this program.)
use strict;
use English;
$WARNING = 1;
my $level = 0;
my $num_named_1 = 0; # number of named items in ...
my $num_unnamed_1 = 0; # number of unnamed items in ...
my $output = "";
my $errors = "";
my $eol; # end-of-line character in input file
my $debug = 0;
# $debug = 1;
sub enter_level
{
# called with cursor at start of line.
$output .= ((" " x ($level * 4)) . "$eol");
$level++;
# returns with cursor at start of line.
}
sub exit_level
{
# always called with an open -
$level--;
$output .= "
$eol";
$output .= ((" " x ($level * 4)) . "
");
# returns with an open
}
sub found_header ( $$$$ )
{
my ($thishlevel, $anchorname, $secname1, $secname2) = @_;
# Remove leading and trailing whitespace in section title.
$secname1 =~ s/^[ \t]+//;
$secname1 =~ s/[ \t]+$//;
$secname2 =~ s/^[ \t]+//;
$secname2 =~ s/[ \t]+$//;
my $sectitle;
if ($secname1 eq "")
{
$sectitle = $secname2;
}
elsif ($secname2 eq "")
{
$sectitle = $secname1;
}
else
{
# $errors .= "Merging two section titles (\"$secname1\" and \"$secname2\") found on one line: $_";
# next;
$sectitle = "$secname1 $secname2";
}
# if $level >= $thishlevel, we have an open
while ($level > $thishlevel)
{
exit_level();
}
# close this , unless we're going deeper
$output .= "" if ($level == $thishlevel);
$output .= "$eol" unless $level == 0;
while ($level < $thishlevel)
{
enter_level();
$output .= (" " x ($level * 4 - 2)) . "$eol"
if $level < $thishlevel;
}
$output .=
((" " x ($level * 4 - 2)) . "$sectitle");
if ($level == 1) { $num_named_1++; }
}
# Main loop
while (<>)
{
if (!defined($eol))
{
if (/\r\n$/)
{
$eol = "\r\n";
}
elsif (/\n$/)
{
$eol = "\n";
}
elsif (/\r$/)
{
$eol = "\r";
}
else
{
die "Bad line ending: $_";
}
}
if (/^|/)
{
# Check because sometimes, a HTML comment is not appropriate.
next;
}
elsif (
/]*)?>
(?:)? # permit a comment
# close the a start tag
(.*?) # the text of the anchor
<\/a> # the a end tag
(.*) # sometimes the anchor text is found here instead
<\/h([1-9])>
/ix
)
{
if ($debug) { print STDERR "match 3: <$1><$2><$3><$4><$5>\n"; }
if ($1 ne $5)
{
$errors .= "Non-matching heading tags $1 and $5: $_";
}
my $thishlevel = $1;
my $anchorname = $2;
my $secname1 = $3;
my $secname2 = $4;
found_header($thishlevel, $anchorname, $secname1, $secname2);
}
elsif (
/ # close the h start tag
(.*?) # the text of the anchor
<\/h([1-9])>
/ix
)
{
if ($debug) { print STDERR "match 3: <$1><$2><$3><$4>\n"; }
if ($1 ne $4)
{
$errors .= "Non-matching heading tags $1 and $4: $_";
}
my $thishlevel = $1;
my $anchorname = $2;
my $secname1 = $3;
my $secname2 = "";
found_header($thishlevel, $anchorname, $secname1, $secname2);
}
elsif (/(.*)<\/h([1-9])>/)
{
if ($1 == 1)
{
$num_unnamed_1++;
}
$errors .= "$ARGV:$.: No tag: $_";
}
elsif (//)
{
$errors .= "$ARGV:$.: No tag: $_";
}
}
continue
{
# Reset line numbers; example from "eof" text in "man perlfunc".
close ARGV if eof; # Not eof()!
}
while ($level > 0)
{
exit_level();
}
$output .= "$eol";
if ($num_named_1 + $num_unnamed_1 < 2)
{
# There is only one top-level () item.
# Assume is the page title, and the h2, h3, h4 etc are the ones
# we should be making a toc for.
$output =~ s/^