#!/usr/bin/env perl
# html-update-toc
# Update (in place) table of contents for an HTML document.
# Michael Ernst
# Time-stamp: <2020-12-24 11:03:07 mernst>
# This script edits a HTML file in place to modify an existing table of
# contents. The contents start with
#
# and end with
#
# There may be extra text in the comments; for example,
# Contents:
#
#
# To get started, just add those three lines near the top of the file.
# The table of contents contains links to header lines of these forms:
# Subheading
#
# Header lines without "id=..." or "" anchor tags are ignored
# (and a warning is generated). The header opening and closing tags (e.g.,
# and
) should appear on the same line.
# Header lines are omitted if they contain
#
# For instance, this would not appear:
# The title of the document
# This script does not fully parse the HTML document; instead, it
# pattern-matches against specific lines in the document. That means that
# it will generate table-of-contents lines for commented-out sections of a
# document. To ensure that a header line doesn't appear in the table of
# contents, start a comment on that line, such as:
# (?:\r?\n)?|(?:Table of )?Contents:[ \t]*(?:|)(?:\r?\n)?";
# Second alternative used to be "\r?\n$" to avoid grabbing too much DOS text.
my $contents_end =
"|?p>([ \t]*\r?\n|\r?\n)";
my $contents_start_quoted = $contents_start;
$contents_start_quoted =~ s/\n/\\n/g;
$contents_start_quoted =~ s/\r/\\r/g;
my $contents_end_quoted = $contents_end;
$contents_end_quoted =~ s/\n/\\n/g;
$contents_end_quoted =~ s/\r/\\r/g;
FILELOOP: for my $file (@ARGV)
{
if ($debug) { print STDERR "Examining $file\n"; }
if (($file eq "-q") || ($file eq "--quiet") || ($file eq "-quiet"))
{
$quiet = 1;
next;
}
if ($file !~ /\.html?$/)
{
if ($debug)
{
print STDERR "Skipping $file: does not end in .htm or .html\n";
}
next;
}
if (!open(FILE, $file))
{
print STDERR "Skipping $file: can't open\n";
next;
}
# Only check the first 10 paragraphs (see above for caveat).
for (my $i = 0 ; $i < 10 ; $i++)
{
my $line = ;
if (!defined($line))
{
# past end of file
next;
}
if ($debug) { print STDERR "line: $line"; }
# ".*?" means a minimal match
if ($debug)
{
print "matches start? " . ($line =~ /$contents_start/) . "\n";
}
if ($debug)
{
print "matches end? " . ($line =~ /$contents_end/) . "\n";
}
if ($line =~ /($contents_start)(.*?)($contents_end)/is)
{
my $oldcontent = $2;
my $newcontent = `$script_dir/html-toc $file`;
if ($newcontent eq "\n")
{
$newcontent = "";
}
if ($oldcontent eq $newcontent)
{
if (!$quiet)
{
print STDERR "$file has up-to-date table of contents\n";
}
close(FILE);
next FILELOOP;
}
close(FILE);
if (!$quiet)
{
print STDERR "html-update-toc: updating $file\n";
}
# I would like to just call Perl with
# "perl -pi.bak -e '\$/ = \"\\n\\n\"; s/($contents_start_quoted)(.*?)($contents_end_quoted)/\$1$newcontent_quoted\$3/s'"
# but that doesn't work: the shell says "word too long".
rename($file, "$file.bak") or die "Can't make backup of $file";
open(OLDFILE, "$file.bak")
or die "Can't read backup I just made: $file.bak";
open(FILE, ">$file")
or die "Can't write $file; old version in $file.bak";
while (defined($line = ))
{
$line =~
s/($contents_start)(.*?)($contents_end)/$1$newcontent$3/si;
print FILE $line;
}
close(OLDFILE);
close(FILE);
unlink("$file.bak");
# I could now reset the write date, but the file *has* changed,
# so don't bother.
next FILELOOP;
}
}
# It's not clear whether this is a helpful warning or is annoying.
print STDERR "No table of contents found in first 10 paragraphs of $file\n";
}