#!/usr/bin/env perl
# html-update-toc
# Update (in place) table of contents for an HTML document.
# Michael Ernst
# Time-stamp: <2018-02-06 20:49:41 mernst>
### NOTE: This script is deprecated. It has moved to
### https://github.com/plume-lib/html-tools
# This script edits a HTML file in place to modify an existing table of
# contents. The contents start (and end, respectively) with "" and "". There may be extra text in the
# comments; for example,
# Contents:
#
#
# To get started, just add those three lines near the top of the file.
# The table of contents contains links to header lines of these forms:
# Subheading
#
# Header lines without "id=..." or "" anchor tags are ignored
# (and a warning is generated). The header opening and closing tags should
# appear on the same line.
# Header lines are omitted if they contain
#
# For instance, this would not appear:
# The title of the document
# This script does not fully parse the HTML document; instead, it
# pattern-matches against specific lines in the document. That means that
# it will generate table-of-contents lines for commented-out sections of a
# document. To ensure that a header line doesn't appear in the table of
# contents, start a comment on that line, such as:
# (?:\r?\n)?|(?:Table of )?Contents:[ \t]*(?:|)(?:\r?\n)?";
# Second alternative used to be "\r?\n$" to avoid grabbing too much DOS text.
my $contents_end = "|?p>([ \t]*\r?\n|\r?\n)";
my $contents_start_quoted = $contents_start;
$contents_start_quoted =~ s/\n/\\n/g;
$contents_start_quoted =~ s/\r/\\r/g;
my $contents_end_quoted = $contents_end;
$contents_end_quoted =~ s/\n/\\n/g;
$contents_end_quoted =~ s/\r/\\r/g;
FILELOOP: for my $file (@ARGV) {
if ($debug) { print STDERR "Examining $file\n"; }
if (($file eq "-q") || ($file eq "--quiet") || ($file eq "-quiet")) {
$quiet = 1;
next;
}
if ($file !~ /\.html?$/) {
if ($debug) { print STDERR "Skipping $file: does not end in .htm or .html\n"; }
next;
}
if (! open(FILE, $file)) {
print STDERR "Skipping $file: can't open\n";
next;
}
# Only check the first 10 paragraphs (see above for caveat).
for (my $i=0; $i<10; $i++) {
my $line = ;
if (! defined($line)) {
# past end of file
next;
}
if ($debug) { print STDERR "line: $line"; }
# ".*?" means a minimal match
if ($debug) { print "matches start? " . ($line =~ /$contents_start/) . "\n"; }
if ($debug) { print "matches end? " . ($line =~ /$contents_end/) . "\n"; }
if ($line =~ /($contents_start)(.*?)($contents_end)/is) {
my $oldcontent = $2;
my $newcontent = `html-toc $file`;
if ($newcontent eq "\n") {
$newcontent = "";
}
if ($oldcontent eq $newcontent) {
if (! $quiet) {
print STDERR "$file has up-to-date table of contents\n";
}
close(FILE);
next FILELOOP;
}
close(FILE);
if (! $quiet) {
print STDERR "html-update-toc: updating $file\n";
}
# I would like to just call Perl with
# "perl -pi.bak -e '\$/ = \"\\n\\n\"; s/($contents_start_quoted)(.*?)($contents_end_quoted)/\$1$newcontent_quoted\$3/s'"
# but that doesn't work: the shell says "word too long".
rename($file,"$file.bak") or die "Can't make backup of $file";
open(OLDFILE, "$file.bak") or die "Can't read backup I just made: $file.bak";
open(FILE, ">$file") or die "Can't write $file; old version in $file.bak";
while (defined($line = )) {
$line =~ s/($contents_start)(.*?)($contents_end)/$1$newcontent$3/si;
print FILE $line;
}
close(OLDFILE);
close(FILE);
unlink("$file.bak");
# I could now reset the write date, but the file *has* changed,
# so don't bother.
next FILELOOP;
}
}
# It's not clear whether this is a helpful warning or is annoying.
print STDERR "No table of contents found in first 10 paragraphs of $file\n";
}