#!/usr/bin/env perl
# java-cpp -- C preprocessor specialized for Java
# Michael Ernst
# Time-stamp: <2014-01-30 14:23:04 mernst>

# This acts like the C preprocessor (cpp), but
#  * it does not remove comments, and
#  * it cleans up spacing in the processed file.
# Its name comes from the fact that it is useful for running on a source
# file with cpp macros, to create Java source code.

# If last argument is a file, it is used as input.  Otherwise, input comes
# from standard in.  Output goes to standard out.

# (Implementation note: I do want substitution to occur in comments.
# Therefore, I do not use the -C (leave comments in) flag to cpp, or make
# JAVACPP_DOUBLESLASHCOMMENT put the rest of the line in a string, both
# of which would also avoid the problem but would prevent substitution.)

# I'm not calling this jpp because someone else has probably already taken
# that name.

# This script started out as a 16-line csh script by Josh Kataoka.

use English;
use strict;
use filetest 'access';          # needed for afs!
$WARNING = 1;                   # "-w" command-line switch

my $debug = 0;
# $debug = 1;
my $cleanup_tmpfiles = 1;
# $cleanup_tmpfiles = 0;

my $system_temp_dir = -d '/tmp' ? '/tmp' : $ENV{TMP} || $ENV{TEMP} ||
  die "Cannot determine system temporary directory, stopped";
# Input to cpp (comments escaped), output of cpp, and cpp's standard error.
my $tmpfile_in = "$system_temp_dir/java-cpp-$$-in";
my $tmpfile_out = "$system_temp_dir/java-cpp-$$-out";
my $tmpfile_err = "$system_temp_dir/java-cpp-$$-err";

if ($debug) {
  print STDERR "// tmpfile_in = $tmpfile_in\n";
  print STDERR "// tmpfile_out = $tmpfile_out\n";
  print STDERR "// tmpfile_err = $tmpfile_err\n";
}

# Main program: escape comments into $tmpfile_in, run cpp on it, then
# unescape and tidy cpp's output onto standard out.
{
  my $filename;
  if ((scalar(@ARGV) > 0) && (-r $ARGV[$#ARGV])) {
    $filename = pop @ARGV;      # remove last (filename) element
  } else {
    # Guard against an empty @ARGV, for which there is no last argument.
    my $last_arg = (scalar(@ARGV) > 0) ? $ARGV[$#ARGV] : undef;
    if (defined($last_arg) && $last_arg !~ /^-/ && $last_arg =~ /\.jpp$/) {
      # Hmm, looks like a filename, even though we can't read it.
      die "Can't read $last_arg, stopped";
    }
    if ($debug) {
      print STDERR "// Last arg not a filename; reading from standard in.\n";
    }
    $filename = "-";
  }

  # TMPFILE stays a package filehandle because escape_comments prints to it.
  open(TMPFILE, '>', $tmpfile_in) or die "Cannot open $tmpfile_in: $!\n";
  escape_comments($filename);
  # Buffered write errors only surface at close time.
  close(TMPFILE) or die "Cannot close $tmpfile_in: $!\n";

  my $argv = join(' ', @ARGV);
  # cpp's standard error is captured in $tmpfile_err so that, on failure,
  # rewrite_errors can report it in terms of the original file name.
  my $cpp_command = "cpp -ffreestanding -traditional $argv $tmpfile_in > $tmpfile_out 2> $tmpfile_err";
  if ($debug) {
    print STDERR "// running command: $cpp_command\n";
  }
  my $system_result = system($cpp_command);
  if ($system_result != 0) {
    rewrite_errors($tmpfile_err, $filename);
    if (! $debug && $cleanup_tmpfiles) {
      # unlink($tmpfile_in);
      unlink($tmpfile_out);
      # unlink($tmpfile_err);
    }
    die "java-cpp: cpp $argv $filename failed: $!\n Actual command that failed: $cpp_command";
  }
  unescape_comments($tmpfile_out);

  if ($cleanup_tmpfiles) {
    unlink($tmpfile_in);
    unlink($tmpfile_out);
    unlink($tmpfile_err);
  }
}

exit();


###########################################################################
### Subroutines
###

# Copy the file named by the argument ("-" means standard input) to
# TMPFILE, replacing comment starters and single quotes by JAVACPP_*
# placeholder tokens and protecting runs of internal whitespace, so that
# cpp leaves them alone.  unescape_comments undoes the placeholders.
#
# Also processes #include directives.
# Perhaps I should instead do this; the advantage would be correct
# "#line" information.
#   ## Use of cpp for #include only:
#   cpp -C -nostdinc -undef
sub escape_comments ( $ ) {
  my ($filename) = @_;

  if ($debug) {
    print STDERR "// escape_comments: opening $filename\n";
  }

  # A lexical filehandle per invocation keeps this routine re-entrant
  # (it calls itself for #include).  Three-argument open prevents
  # characters in the file name from being read as an open mode, so "-"
  # has to be mapped to standard input explicitly.
  my $inhandle;
  my $reading_stdin = ($filename eq "-");
  if ($reading_stdin) {
    $inhandle = \*STDIN;
  } else {
    open($inhandle, '<', $filename) or die "escape_comments: cannot open $filename: $!\n";
  }

  # my $JAVACPP_WHITESPACE_SEPARATOR = "JAVACPP_WHITESPACE_SEPARATOR";

  while (<$inhandle>) {
    # print STDERR "// top of escape_comments loop: $_";
    # Move an indented "#" to the start of the line.
    s/^[ \t]+\#/\#/;
    if (/^\#include "(.*)"/) {
      # The recursive call clobbers $_, which is fine: "next" reads a new line.
      escape_comments($1);
      next;
    }
    s|//|JAVACPP_DOUBLESLASHCOMMENT|g;
    s|/\*|JAVACPP_SLASHSTARCOMMENT|g;
    s/\'/JAVACPP_SINGLEQUOTE/g;
    # cpp 2.96 (Red Hat) compresses all multiple internal whitespace into one
    # space, which loses space both preceding and within comments.  Don't do
    # this inside cpp directives, however, and don't change leading space.
    if (! /^[ \t]*\#/) {
      my($leading_ws, $rest) = /^(\s*)(.*?)\z/s;
      $rest =~ s/(\s{2,})/separate_spaces($1)/eg;
      $_ = $leading_ws . $rest;
    }
    print TMPFILE;
    # print STDERR "// bottom of escape_comments loop: $_";
  }

  if (! $reading_stdin) {
    close($inhandle);
  }
}

# Insert "JAVACPP_WHITESPACE_SEPARATOR" in between each pair of
# characters in the argument (which presumably consists only of
# whitespace characters) and return the resulting string.
sub separate_spaces {
  my ($spaces) = @_;
  my @spaces = split(//, $spaces);
  return join "JAVACPP_WHITESPACE_SEPARATOR", @spaces;
}


# Read cpp's output from the file named by the argument, restore the text
# hidden by escape_comments, apply the token-pasting/stringization and
# spacing cleanups below, and print the result to standard out.
sub unescape_comments ( $ ) {
  my ($filename) = @_;

  # Read by paragraphs, eliminating empty paragraphs.
  # This causes records read from file to potentially have many trailing blanks.
  # "local" confines paragraph mode to this routine.
  local $INPUT_RECORD_SEPARATOR = "";

  open(my $cppfile, '<', $filename) or die "unescape_comments: cannot open $filename: $!\n";

  # Vertical space that is held back after a "return ...;" or "} else"
  # paragraph and emitted (or dropped) when the next paragraph is printed.
  my $post_return_space = "";
  my $next_post_return_space = "";
  my $post_else_space = "";
  my $next_post_else_space = "";

  # An empty "while ()" condition is always true and reads nothing, so
  # the read from the handle must be spelled out.
  while (<$cppfile>) {
    # print STDERR "// top of unescape_comments loop: <<<$_>>>";
    s|JAVACPP_DOUBLESLASHCOMMENT|//|g;
    s|JAVACPP_SLASHSTARCOMMENT|/\*|g;
    s/JAVACPP_SINGLEQUOTE/\'/g;
    # Change multiple contiguous spaces into one space, but not at
    # beginning of line (cpp 2.96 (Red Hat) does this).
    s/([^ \t\n] ) +/$1/g;
    s/JAVACPP_WHITESPACE_SEPARATOR//og;

    # Perform string concatenation required by use of -traditional cpp flag.
    s/(\w) \#\# (\w)/$1$2/g;
    # Perform stringization, but not just after "@link" or "@see".
    s/(?!\@link)(.(?!\@see).... )\#(\w+)/$1"$2"/g;
    # Undocumented workaround to enable a literal pound sign to be placed
    # in the output, in case the stringization goes poorly.
    s/JAVACPP_POUNDSIGN_NOSPACE */\#/g;
    s/JAVACPP_POUNDSIGN/\#/g;
    # Convert string concatenation ("a" + "b") into single string ("ab").
    while (s/(".*)" ?\+ "(.*")/$1$2/g) { }

    # Remove "# 22" lines.  (Turn them into blank lines that are removed later.)
    # The while loop is because there may be many such lines adjacent.
    while (s/(^|\n)\# [0-9]+ ".*"[1-4, ]*($|\n)/$1$2/g) { }

    ## This is no longer necessary, because I use "-traditional" cpp flag.
    # ## Remove extra horizontal space
    # ## (Some of these are cosmetic; others are necessary to get identical
    # ## output under all versions of cpp.)
    # # print STDERR "pre-horizontal: $_";
    # # Remove all trailing space
    # s/[ \t]+\n/\n/g;
    # # Remove space before trailing semicolon
    # s/ ;$/;/gm;
    # # Remove space after package name
    # s/^(package .*\.) ([^ ]*;)/$1$2/gm;
    # # Remove all extra spaces in import list
    # while (s/^(import [^ \n]*) (.*;)$/$1$2/m) { }
    # # convert " );" to ");"; requires "=" somewhere earlier in line
    # s/(=.*[^ \t\n\}]) (\);\n)/$1$2/g;
    # # convert "(Foo )" to "(Foo)"
    # s/\((\b[A-Za-z]\w*) \)/($1)/g;
    # # convert "a .b" to "a.b".
    # s/(\b[A-Za-z]\w*) \.([A-Za-z]\w*\b)/$1.$2/g;
    # # convert "a. foo (" to "a.foo("
    # # (Note single spaces, lowercase first letter.)
    # # also: "a. FOO)" becomes "a.FOO)"
    # s/(\b[A-Za-z]\w*|\))\. ([a-z]\w*) ?\(/$1.$2\(/g;
    # s/(\b[A-Za-z]\w*)\. (\w+) ?(\)|;|\.[a-z])/$1.$2$3/g;
    # # convert "new Foo. Bar(...)" to "new Foo.Bar(...)"
    # s/(\bnew [A-Za-z]\w*)\. ([A-Za-z]\w*)\(/$1.$2\(/g;
    # # convert " instanceof long [])" to " instanceof long[])"
    # s/( instanceof \w+) ((\[\])*\))/$1$2/g;
    # # convert "long []" to "long[]" (for cast, prototype, or declaration).
    # # also "long[] " to "long[]".
    # s/((?:\(|^ *|(?:(?:,|public|private|protected)(?: static)? ))\w+(?:\[\])*) ([\[\)])/$1$2/gm;
    # # convert "new int[2 ]" to "new int[2]"; also "new int [", "new Foo ("
    # s/(\bnew \w+) (\()/$1$2/g;
    # s/(\bnew \w+) (\[)/$1$2/g;
    # s/(\bnew \w+\[\w+) *(\])/$1$2/g;
    # # convert "public PptSlice1 (" to "public PptSlice1("
    # s/(^ *(?:public|private|protected)(?: static)? \w+) (\()/$1$2/gm;
    # # convert "double [] val1_array =" to "double[] val1_array ="
    # s/(^ *\w+) (\[\] \w+(;| *=))/$1$2/gm;
    # # remove space before close paren, if there is an open w/o a space after it
    # # and this is not a "for" loop.
    # # old version: s/(\(\S.*) (\)(;|\)| {|$))/$1$2/gm;
    # s/(^ *(?!(?:\/\/ *)?for\b)[^ ].*\(\S.*) (\)(;|\)| {|$))/$1$2/gm;
    # # print STDERR "post-horizontal: $_";

    # print STDERR "pre-vertical: $_";
    ## Remove extra vertical space
    # compress out duplicate blank lines
    s/\n\n\n+/\n\n/g;
    # Also remove blank lines before the start of the file.
    # This substitution can only succeed for the very first record that is
    # read, or for paragraphs that are completely eliminated.
    s/^\n+//;
    # This does not work: it matches to *every* paragraph.
    # # compress out blank lines at end (due to the above, this can be simpler)
    # s/\n\n\z/\n/;
    # Remove newline after "if" statement
    # if no open curly brace or semicolon but 2 newlines.
    if (/^[ \t]*if[ \t]*\([^\n\{;]*\n\n\z/) {
      # not "chomp": it removes all of the trailing newlines rather than one
      s/\n\z//;
    }
    # print STDERR "post-vertical: $_";

    # Remove newline after "return" statement if followed by 2 newlines and
    # open curly brace.  But I have no way of knowing that open curly follows.
    # Thus, the post_return_space hack.
    if (/\breturn [^\n]*;\n\n\z/) {
      s/\n\z//;
      $next_post_return_space = "\n";
    }
    # Same for closing up space in "} else\n\n{"
    if (/\} else\n\n\z/) {
      s/\n\n\z//;
      $next_post_else_space = "\n\n";
      # Is the below for debugging?
      # $next_post_else_space = "POST_ELSE_SPACE for <<<$_>>>\n";
    }

    # Skip if nothing to print (e.g., this paragraph was just a "# 22" line)
    if (! /[^\n]/) { next; }

    # A paragraph that starts with a close brace drops the newline that
    # was held back after the preceding "return".
    if (/^[ \t]*\}/) {
      $post_return_space = "";
    }
    print $post_return_space;
    $post_return_space = $next_post_return_space;
    $next_post_return_space = "";

    # After "} else", pull an opening brace up onto the same line.
    if (($post_else_space ne "") && /^[ \t]*\{/) {
      s/^[ \t]*\{/ \{/;
      $post_else_space = "";
    }
    print $post_else_space;
    $post_else_space = $next_post_else_space;
    $next_post_else_space = "";

    print;
  }

  close($cppfile);
}


# Copy cpp's captured error output (first argument: name of the file
# holding it) to standard error, replacing the first mention on each line
# of the temporary input file by the original file name (second argument).
sub rewrite_errors ( $$ ) {
  my ($tmp_filename, $orig_filename) = @_;

  open(my $errfile, '<', $tmp_filename) or die "Cannot open $tmp_filename: $!\n";
  # An empty "while ()" condition never reads, so read the handle explicitly.
  while (<$errfile>) {
    # \Q...\E: match the path literally; it contains "." and may contain
    # other regex metacharacters.
    s/\Q$tmpfile_in\E/$orig_filename/;
    print STDERR;
  }
  close($errfile);
}