#!/usr/bin/env perl
# html-toc
# Michael Ernst
# Thanks to C. Scott Ananian
# Time-stamp: <2018-02-06 20:49:41 mernst>
### NOTE: This script is deprecated. It has moved to
### https://github.com/plume-lib/html-tools
# Generate (to standard out) table of contents for an HTML document.
# The table of contents is based on the header tags (, , etc.).
# See html-update-toc for documentation.
# (Typically, you will run html-update-toc rather than this program.)
use strict;
use English;
$WARNING = 1;
my $level = 0;
my $num_named_1 = 0; # number of named items in ...
my $num_unnamed_1 = 0; # number of unnamed items in ...
my $output = "";
my $errors = "";
my $eol; # end-of-line character in input file
my $debug = 0;
# $debug = 1;
sub enter_level {
# called with cursor at start of line.
$output .= ((" " x ($level * 4)) . "$eol");
$level++;
# returns with cursor at start of line.
}
sub exit_level {
# always called with an open -
$level--;
$output .= "
$eol";
$output .= ((" " x ($level * 4)) . "
");
# returns with an open
}
sub found_header ( $$$$ ) {
my ($thishlevel, $anchorname, $secname1, $secname2) = @_;
# Remove leading and trailing whitespace in section title.
$secname1 =~ s/^[ \t]+//;
$secname1 =~ s/[ \t]+$//;
$secname2 =~ s/^[ \t]+//;
$secname2 =~ s/[ \t]+$//;
my $sectitle;
if ($secname1 eq "") {
$sectitle = $secname2;
} elsif ($secname2 eq "") {
$sectitle = $secname1;
} else {
# $errors .= "Merging two section titles (\"$secname1\" and \"$secname2\") found on one line: $_";
# next;
$sectitle = "$secname1 $secname2";
}
# if $level >= $thishlevel, we have an open
while ($level > $thishlevel) {
exit_level();
}
# close this , unless we're going deeper
$output .= "" if ($level == $thishlevel);
$output .= "$eol" unless $level == 0;
while ($level < $thishlevel) {
enter_level();
$output .= (" " x ($level * 4 - 2)) . "$eol" if $level < $thishlevel;
}
$output .= ((" " x ($level * 4 - 2)) . "$sectitle");
if ($level == 1) { $num_named_1++; }
}
# Main loop
while (<>) {
if (! defined($eol)) {
if (/\r\n$/) {
$eol = "\r\n";
} elsif (/\n$/) {
$eol = "\n";
} elsif (/\r$/) {
$eol = "\r";
} else {
die "Bad line ending: $_";
}
}
if (/^|/) {
# Check because sometimes, a HTML comment is not appropriate.
next;
} elsif (/]*)?>
(?:)? # permit a comment
# close the a start tag
(.*?) # the text of the anchor
<\/a> # the a end tag
(.*) # sometimes the anchor text is found here instead
<\/h([1-9])>
/ix) {
if ($debug) { print STDERR "match 3: <$1><$2><$3><$4><$5>\n"; }
if ($1 ne $5) {
$errors .= "Non-matching heading tags $1 and $5: $_";
}
my $thishlevel = $1;
my $anchorname = $2;
my $secname1 = $3;
my $secname2 = $4;
found_header($thishlevel, $anchorname, $secname1, $secname2);
} elsif (/ # close the h start tag
(.*?) # the text of the anchor
<\/h([1-9])>
/ix) {
if ($debug) { print STDERR "match 3: <$1><$2><$3><$4>\n"; }
if ($1 ne $4) {
$errors .= "Non-matching heading tags $1 and $4: $_";
}
my $thishlevel = $1;
my $anchorname = $2;
my $secname1 = $3;
my $secname2 = "";
found_header($thishlevel, $anchorname, $secname1, $secname2);
} elsif (/(.*)<\/h([1-9])>/) {
if ($1 == 1) {
$num_unnamed_1++;
}
$errors .= "$ARGV:$.: No tag: $_";
} elsif (//) {
$errors .= "$ARGV:$.: No tag: $_";
}
} continue {
# Reset line numbers; example from "eof" text in "man perlfunc".
close ARGV if eof; # Not eof()!
}
while ($level > 0) {
exit_level();
}
$output .= "$eol";
if ($num_named_1 + $num_unnamed_1 < 2) {
# There is only one top-level () item.
# Assume is the page title, and the h2, h3, h4 etc are the ones
# we should be making a toc for.
$output =~ s/^