#!/usr/bin/env perl
#
# This file is part of moses.  Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
#
# Punctuation normalizer: reads text from STDIN line by line, rewrites
# typographic punctuation to plain ASCII forms and tidies spacing around
# punctuation, then prints each line to STDOUT.
#
# Options:
#   -b          unbuffered output (flush after every line)
#   -l LANG     language code (default "en"); a bare non-dash argument is
#               also taken as the language code
#   -penn       leave ASCII backticks and '' pairs alone in the first quote
#               pass (the later ''-to-" rule still applies)
#
# The script works on raw bytes (no "use utf8", no I/O layers), so the
# non-ASCII literals below match the UTF-8 encoded input byte-for-byte.

use warnings;
use strict;

# Rewrites one line of text and returns the normalized copy.
#   $text     - the input line (including its trailing newline, if any)
#   $language - language code selecting the quote/comma and digit rules
#   $penn     - when 0, backticks and '' pairs are rewritten up front
sub normalize_punctuation {
    my ($text, $language, $penn) = @_;

    # Alias $_ to $text so the substitutions below stay terse.
    for ($text) {
        s/\r//g;

        # remove extra spaces
        s/\(/ \(/g;
        s/\)/\) /g;
        s/ +/ /g;
        s/\) ([\.\!\:\?\;\,])/\)$1/g;
        s/\( /\(/g;
        s/ \)/\)/g;
        s/(\d) \%/$1\%/g;
        s/ :/:/g;
        s/ ;/;/g;

        # normalize unicode punctuation
        if ($penn == 0) {
            s/\`/\'/g;
            s/\'\'/ \" /g;
        }

        s/„/\"/g;
        s/“/\"/g;
        s/”/\"/g;
        s/–/-/g;
        s/—/ - /g;
        s/ +/ /g;
        s/´/\'/g;
        # A curly single quote between two letters is an apostrophe;
        # the remaining curly single quotes are treated as quotation marks.
        s/([a-z])‘([a-z])/$1\'$2/gi;
        s/([a-z])’([a-z])/$1\'$2/gi;
        s/‘/\"/g;
        s/‚/\"/g;
        s/’/\"/g;
        s/''/\"/g;
        s/´´/\"/g;
        s/…/.../g;

        # French quotes
        s/\xC2\xA0«\xC2\xA0/ \"/g;
        s/«\xC2\xA0/\"/g;
        s/«/\"/g;
        s/\xC2\xA0»\xC2\xA0/\" /g;
        s/\xC2\xA0»/\"/g;
        s/»/\"/g;

        # handle pseudo-spaces
        # \xC2\xA0 is the UTF-8 encoding of U+00A0 NO-BREAK SPACE.  It is
        # spelled out as an escape because a literal no-break space is
        # indistinguishable from a plain space in most editors, and with a
        # plain space several of these rules would rewrite text to itself.
        # NOTE(review): restored on the assumption that the original rules
        # targeted no-break spaces - confirm against the upstream script.
        s/\xC2\xA0\%/\%/g;
        s/nº\xC2\xA0/nº /g;
        s/\xC2\xA0:/:/g;
        s/\xC2\xA0ºC/ ºC/g;
        s/\xC2\xA0cm/ cm/g;
        s/\xC2\xA0\?/\?/g;
        s/\xC2\xA0\!/\!/g;
        s/\xC2\xA0;/;/g;
        s/,\xC2\xA0/, /g;
        s/ +/ /g;

        # English "quotation," followed by comma, style
        if ($language eq "en") {
            s/\"([,\.]+)/$1\"/g;
        }
        # Czech is confused
        elsif ($language eq "cs" || $language eq "cz") {
            # deliberately no quote/comma reordering for Czech
        }
        # German/Spanish/French "quotation", followed by comma, style
        else {
            s/,\"/\",/g;
            s/(\.+)\"(\s*[^<])/\"$1$2/g;    # don't fix period at end of sentence
        }

        # Digits separated by a no-break space form one number: join them
        # with the separator used for that language group.  An ordinary
        # space is left alone so independent numbers are never merged.
        if (   $language eq "de"
            || $language eq "es"
            || $language eq "cz"
            || $language eq "cs"
            || $language eq "fr")
        {
            s/(\d)\xC2\xA0(\d)/$1,$2/g;
        }
        else {
            s/(\d)\xC2\xA0(\d)/$1.$2/g;
        }
    }

    return $text;
}

my $language = "en";
my $PENN     = 0;

# Command-line parsing; unrecognised dash options are silently ignored.
while (@ARGV) {
    my $arg = shift @ARGV;
    if ($arg =~ /^-b$/) {
        $| = 1;    # not buffered (flush each line)
    }
    elsif ($arg =~ /^-l$/) {
        # Keep the default if -l is the last argument, rather than leaving
        # $language undefined and warning on every later comparison.
        my $value = shift @ARGV;
        $language = $value if defined $value;
    }
    elsif ($arg =~ /^[^\-]/) {
        $language = $arg;
    }
    elsif ($arg =~ /^-penn$/) {
        $PENN = 1;
    }
}

# Filter STDIN to STDOUT one line at a time.
while (my $line = <STDIN>) {
    print normalize_punctuation($line, $language, $PENN);
}