#!/usr/bin/env perl # html-update-toc # Update (in place) table of contents for an HTML document. # Michael Ernst # Time-stamp: <2020-12-24 11:03:07 mernst> # This script edits a HTML file in place to modify an existing table of # contents. The contents start with # # and end with # # There may be extra text in the comments; for example, #

Contents:

# # # To get started, just add those three lines near the top of the file. # The table of contents contains links to header lines of these forms: #

Subheading

# Header lines without "id=..." or "" anchor tags are ignored # (and a warning is generated). The header opening and closing tags (e.g., #

and

) should appear on the same line. # Header lines are omitted if they contain # # For instance, this would not appear: #

The title of the document

# This script does not fully parse the HTML document; instead, it # pattern-matches against specific lines in the document. That means that # it will generate table-of-contents lines for commented-out sections of a # document. To ensure that a header line doesn't appear in the table of # contents, start a comment on that line, such as: # (?:\r?\n)?|(?:Table of )?Contents:[ \t]*(?:|)(?:\r?\n)?"; # Second alternative used to be "\r?\n$" to avoid grabbing too much DOS text. my $contents_end = "|([ \t]*\r?\n|\r?\n)"; my $contents_start_quoted = $contents_start; $contents_start_quoted =~ s/\n/\\n/g; $contents_start_quoted =~ s/\r/\\r/g; my $contents_end_quoted = $contents_end; $contents_end_quoted =~ s/\n/\\n/g; $contents_end_quoted =~ s/\r/\\r/g; FILELOOP: for my $file (@ARGV) { if ($debug) { print STDERR "Examining $file\n"; } if (($file eq "-q") || ($file eq "--quiet") || ($file eq "-quiet")) { $quiet = 1; next; } if ($file !~ /\.html?$/) { if ($debug) { print STDERR "Skipping $file: does not end in .htm or .html\n"; } next; } if (! open(FILE, $file)) { print STDERR "Skipping $file: can't open\n"; next; } # Only check the first 10 paragraphs (see above for caveat). for (my $i=0; $i<10; $i++) { my $line = ; if (! defined($line)) { # past end of file next; } if ($debug) { print STDERR "line: $line"; } # ".*?" means a minimal match if ($debug) { print "matches start? " . ($line =~ /$contents_start/) . "\n"; } if ($debug) { print "matches end? " . ($line =~ /$contents_end/) . "\n"; } if ($line =~ /($contents_start)(.*?)($contents_end)/is) { my $oldcontent = $2; my $newcontent = `$script_dir/html-toc $file`; if ($newcontent eq "\n") { $newcontent = ""; } if ($oldcontent eq $newcontent) { if (! $quiet) { print STDERR "$file has up-to-date table of contents\n"; } close(FILE); next FILELOOP; } close(FILE); if (! $quiet) { print STDERR "html-update-toc: updating $file\n"; } # I would like to just call Perl with # "perl -pi.bak -e '\$/ = \"\\n\\n\"; s/($contents_start_quoted)(.*?)($contents_end_quoted)/\$1$newcontent_quoted\$3/s'" # but that doesn't work: the shell says "word too long". rename($file,"$file.bak") or die "Can't make backup of $file"; open(OLDFILE, "$file.bak") or die "Can't read backup I just made: $file.bak"; open(FILE, ">$file") or die "Can't write $file; old version in $file.bak"; while (defined($line = )) { $line =~ s/($contents_start)(.*?)($contents_end)/$1$newcontent$3/si; print FILE $line; } close(OLDFILE); close(FILE); unlink("$file.bak"); # I could now reset the write date, but the file *has* changed, # so don't bother. next FILELOOP; } } # It's not clear whether this is a helpful warning or is annoying. print STDERR "No table of contents found in first 10 paragraphs of $file\n"; }