#!/usr/bin/env perl

# Reading the table of contents of a conference or journal is a useful way
# to keep up-to-date with the latest research.  ACM even provides a "TOC
# service" that emails the table of contents of conferences and journals
# that you specify.  A problem is that the tables of contents contain the
# title of each article, but not its abstract, so you have to click on
# every abstract link.
#
# This script takes as input a filename or URL for an ACM digital library
# proceedings table of contents.  It writes an HTML file that augments the
# table of contents with abstracts for each paper.  This makes it possible
# to read all the abstracts on one HTML page, without clicking on any links.
#
# You can use it on:
#  * a URL from the ACM digital library.  Example:
#      acm-dl-abstracts 'http://portal.acm.org/toc.cfm?id=964001'
#  * email from the ACM TOC Service.  Example (after saving the contents,
#    not the whole email with headers, to a local file 'acm.html'):
#      acm-dl-abstracts acm.html
# The output goes to a new file in the current directory whose name has the
# form acm-abstracts-XXXX.html; the chosen name is reported on standard
# error when the run finishes.
# Then, browse it or print it.  If printing, run
#   for i in `ls acm-abstracts-*.html`; do htmldoc --webpage -t ps --outfile $i.ps $i; done
# because both html2ps and the Firefox print functionality tend to cut off
# content.  (Maybe the ACM HTML is malformed, or maybe this script should be
# enhanced.)
#
# This script takes a long time to run (multiple minutes per table of
# contents, because of built-in delay to avoid being locked out by ACM web
# server); be patient!

use strict;
use warnings;
use English qw( -no_match_vars );
use LWP::Simple;
use File::Temp qw/ :mktemp /;

# Time between get requests, to avoid being locked out by ACM web server.
my $sleep_seconds = 5;

# Site against which relative paper links found in the table of contents
# are resolved.
my $acm_base_url = "http://portal.acm.org/";

# Matches an anchor whose link text is "abstract" and captures its href.
# NOTE(review): the exact markup of ACM table-of-contents pages is assumed
# to be of the form <A HREF="...">abstract</A>; confirm against a real page.
my $abstract_link_re = qr{
    <a \b [^>]*?
    \b href \s* = \s*
    ["'] ([^"'>]+) ["']
    [^>]* >
    \s* (?-i:abstract) \s*
    </a>
}xi;

# Turn a link taken from the table of contents into an absolute URL.
# Links that are already absolute http(s) URLs are returned unchanged;
# anything else is treated as a path on the ACM portal.
sub absolute_paper_url {
    my ($paper_url) = @_;
    return $paper_url if $paper_url =~ m{^https?://}i;
    # Drop leading slashes so the result never contains "//" after the host.
    (my $path = $paper_url) =~ s{^/+}{};
    return $acm_base_url . $path;
}

# Return the first "</div>"-delimited section of a paper page that contains
# the ABSTRACT anchor, or undef when the page has no such section.
sub find_abstract_section {
    my ($paper_html) = @_;
    # Parens around regexp put the delimiters in the list, too.
    for my $paper_section (split(/(<\/div>)/, $paper_html)) {
        return $paper_section if $paper_section =~ /ABSTRACT<\/A>/;
    }
    return;
}

if (scalar(@ARGV) != 1) {
    die "Expected exactly 1 argument, got " . scalar(@ARGV);
}

my $toc_url = $ARGV[0];
# A local file is fetched through LWP as well, by way of a "file:" URL.
if (-e $toc_url) {
    $toc_url = "file:" . $toc_url;
}

my $toc_html = get($toc_url);
if (! defined $toc_html) {
    die "Couldn't get TOC: $toc_url";
}

my ($fh, $file) = mkstemps("acm-abstracts-XXXX", ".html");

# Parens around regexp put the delimiters in the list, too.
my @toc_sections = split(/(<\/div>)/, $toc_html);
for my $toc_section (@toc_sections) {
    # A list-context match returns the captured href, or the empty list when
    # the section has no "abstract" link; such sections are copied through.
    my ($paper_url) = $toc_section =~ $abstract_link_re;
    if (! defined $paper_url) {
        print $fh "$toc_section\n";
        next;
    }

    sleep($sleep_seconds);

    my $full_paper_url = absolute_paper_url($paper_url);
    my $paper_html = get($full_paper_url);
    if (! defined $paper_html) {
        die "Couldn't get paper: $full_paper_url";
    }

    my $abstract_section = find_abstract_section($paper_html);
    if (defined $abstract_section) {
        # NOTE(review): the earlier code appended an empty string after the
        # abstract and prints a bare newline below; both look like
        # placeholders for markup (perhaps a closing </div> and a
        # separator) -- confirm the intended output.
        $toc_section .= $abstract_section;
    }
    else {
        print $fh "DID NOT FIND ABSTRACT!\n";
    }
    print $fh "\n";
    print $fh "$toc_section\n";
}

# Buffered write errors only surface at close, so check it.
close($fh) or die "Couldn't close $file: $OS_ERROR";
print STDERR "Wrote $file\n";