#!/usr/bin/env bash

# --- STANDARD SCRIPT-GLOBAL CONSTANTS

kTHIS_NAME=${BASH_SOURCE##*/}
kTHIS_HOMEPAGE='https://github.com/mklement0/nws-cli'
kTHIS_VERSION='v0.3.4' # NOTE: This assignment is automatically updated by `make version VER=` - DO keep the 'v' prefix.

unset CDPATH  # To prevent unexpected `cd` behavior.

# --- Begin: STANDARD HELPER FUNCTIONS

# die [errMsg [exitCode]]
#   Prints the (default) error message, prefixed with the script name, to
#   stderr and exits with the given code (default: 1).
die() { echo "$kTHIS_NAME: ERROR: ${1:-"ABORTING due to unexpected error."}" 1>&2; exit ${2:-1}; }

# dieSyntax [errMsg]
#   Prints the (default) argument-error message to stderr and exits with code 2.
dieSyntax() { echo "$kTHIS_NAME: ARGUMENT ERROR: ${1:-"Invalid argument(s) specified."} Use -h for help." 1>&2; exit 2; }

# Set up an EXIT handler that cleans up on success
# or reports where a copy of the input file can be found on error, if in-place updating was attempted.
# Note: $tmpFile, $file, $inFile and $inPlaceUpdating are evaluated when the
#       handler runs (the handler text is single-quoted), not when it is set up.
tmpFile= file= inFile=
trap '
  if (( $? )); then # an error occurred
    if (( inPlaceUpdating )); then
      if [[ -e $inFile ]]; then
        echo "Input file $file may have gotten corrupted. A copy of the input file can be found at: $inFile" >&2
      fi
    fi
  else # success: remove any temp. file
    [[ -e $tmpFile ]] && rm -- "$tmpFile"
  fi
' EXIT

# SYNOPSIS
#   openUrl <url>
# DESCRIPTION
#   Opens the specified URL in the system's default browser.
#   The launcher command is chosen based on the output of `uname`.
#   Returns 1 (after printing a hint to stderr) if the launcher command
#   cannot be found or fails.
openUrl() {
  local url=$1 platform cmd=()
  platform=$(uname)
  case $platform in
    'Darwin') # OSX
      cmd=( open "$url" )
      ;;
    'CYGWIN_'*) # Cygwin on Windows; must call cmd.exe with its `start` builtin
      cmd=( cmd.exe /c start '' "$url " )  # !! Note the required trailing space.
      ;;
    'MINGW'*|'MSYS_'*) # MSYS/MSYS2 or Git Bash on Windows (MINGW32_NT-..., MINGW64_NT-..., MSYS_NT-...); they come with a Unix `start` binary
      cmd=( start '' "$url" )
      ;;
    *) # Otherwise, assume a Freedesktop-compliant OS, which includes many Linux distros, PC-BSD, OpenSolaris, ...
      cmd=( xdg-open "$url" )
      ;;
  esac
  "${cmd[@]}" || { echo "Cannot locate or failed to open default browser; please go to '$url' manually." >&2; return 1; }
}

# Prints the embedded Markdown-formatted man-page source to stdout.
printManPageSource() {
  # Emit the lines enclosed by the `: <<'EOF_MAN_PAGE'` ... `EOF_MAN_PAGE`
  # here-document of this script. Inside the range, the empty-regex `s///`
  # re-applies the most recently used regex, so it only succeeds on the two
  # delimiter lines; `t` then skips the `p` for them.
  sed -n -e $'/^: <<\'EOF_MAN_PAGE\'/,/^EOF_MAN_PAGE/ { s///; t\np;}' "$BASH_SOURCE"
}

# Opens the man page, if installed; otherwise, tries to display the embedded Markdown-formatted man-page source; if all else fails: tries to display the man page online.
openManPage() {
  local pager embeddedText

  # 1st attempt: the installed man page.
  man 1 "$kTHIS_NAME" 2>/dev/null && return

  # 2nd attempt: if present, display the embedded Markdown-formatted man-page source
  embeddedText=$(printManPageSource)
  if [[ -z $embeddedText ]]; then
    # 3rd attempt: open the man page on the utility's website
    openUrl "${kTHIS_HOMEPAGE}/doc/${kTHIS_NAME}.md"
    return
  fi

  # See if the non-standard `less` is available, because it's preferable to the POSIX utility `more`
  if command -v less &>/dev/null; then
    pager='less'
  else
    pager='more'
  fi
  printf '%s\n' "$embeddedText" | "$pager"
}

# Prints the contents of the synopsis chapter of the embedded Markdown-formatted man-page source for quick reference.
printUsage() {
  local embeddedText

  # Extract usage information from the SYNOPSIS chapter of the embedded Markdown-formatted man-page source:
  # everything outside the here-document is deleted; within it, the lines between
  # the '## SYNOPSIS' heading and the next line starting with '#' are printed.
  embeddedText=$(sed -n -e $'/^: <<\'EOF_MAN_PAGE\'/,/^EOF_MAN_PAGE/!d; /^## SYNOPSIS$/,/^#/{ s///; t\np; }' "$BASH_SOURCE")

  if [[ -z $embeddedText ]]; then
    # No SYNOPSIS chapter found; fall back to displaying the man page.
    echo "WARNING: usage information not found; opening man page instead." >&2
    openManPage
    return
  fi

  # Print extracted synopsis chapter - remove backticks for uncluttered display.
  printf '%s\n\n' "$embeddedText" | tr -d '`'
}

# --- End: STANDARD HELPER FUNCTIONS

# --- PROCESS STANDARD, OUTPUT-INFO-THEN-EXIT OPTIONS.
case "$1" in
  --version)
    # Output version number and exit, if requested.
    printf '%s\n' "$kTHIS_NAME $kTHIS_VERSION" 'For license information and more, visit '"$kTHIS_HOMEPAGE"
    exit 0
    ;;
  -h|--help)
    # Print usage information and exit.
    printUsage
    exit
    ;;
  --man)
    # Display the manual page and exit, falling back to printing the embedded man-page source.
    openManPage
    exit
    ;;
  --man-source) # private option, used by `make update-man`
    # Print raw, embedded Markdown-formatted man-page source and exit
    printManPageSource
    exit
    ;;
  --home)
    # Open the home page and exit.
    openUrl "$kTHIS_HOMEPAGE"
    exit
    ;;
esac

# --- BEGIN: functions

# Helper function for exiting with error message due to runtime error.
#   die [errMsg [exitCode]]
# Default error message states context and indicates that execution is aborted. Default exit code is 1.
# Prefix for context is always prepended.
# Note: An error message is *always* printed; if you just want to exit with a specific code silently, use `exit n` directly.
die() {
  local msg=${1:-"ABORTING due to unexpected error."}
  echo "$kTHIS_NAME: ERROR: $msg" 1>&2
  exit ${2:-1} # Note: If the argument is non-numeric, the shell prints a warning and uses exit code 255.
}

# Helper function for exiting with error message due to invalid arguments.
#   dieSyntax [errMsg]
# Default error message is provided, as is prefix and suffix; exit code is always 2.
dieSyntax() {
  local msg=${1:-"Invalid argument(s) specified."}
  echo "$kTHIS_NAME: ARGUMENT ERROR: $msg Use -h for help." 1>&2
  exit 2
}

# SYNOPSIS
#   normalizeWhiteSpace [mode]
# DESCRIPTION
#   Normalizes whitespace in the input provided via stdin, using one of 4 modes:
#   All modes behave identically with respect to normalizing individual lines:
#     Any run of any mix of spaces and tabs is replaced with a single space each, except leading and trailing runs, which are discarded.
#   Thus, specifying a mode applies only to *multi-line* input (all modes behave the same with single-line input):
#   0 (default) ... runs of blank (all-whitespace or empty) lines are replaced with 1 empty line each, resulting in paragraph-internal newlines getting preserved, with blank lines at the beginning, between paragraphs, and at the end getting normalized to a single empty line each.
#   1 ... runs of blank (all-whitespace or empty) lines are discarded, resulting in a single block of non-blank lines.
#   2 ...
#         like mode 0, except that paragraph-internal newlines are replaced with a single space each, resulting in each paragraph becoming a single line, with 1 empty line between paragraphs.
#   3 ... normalization includes newlines too, so that any run of any mix of spaces, tabs, and newlines is replaced with a single space each, resulting in a single, long output line.
#   Any arguments after the mode are passed to awk as input files; without them, stdin is read.
#   Returns 2 (after printing a message to stderr) if the mode is unknown.
# EXAMPLES
#   normalizeWhiteSpace <<<$' a   b \t \t'  # -> 'a b'
#   normalizeWhiteSpace 0 <<<$'\n \n line\t1\n line 2 \n\nline 3\n\n\n' # -> $'\nline 1\nline 2\n\nline 3' (0 is optional)
#   normalizeWhiteSpace 1 <<<$'\n \n line\t1\n line 2 \n\nline 3\n\n\n' # -> $'line 1\nline 2\nline 3'
#   normalizeWhiteSpace 2 <<<$'\n \n line\t1\n line 2 \n\nline 3\n\n\n' # -> $'\nline 1 line 2\n\nline 3'
#   normalizeWhiteSpace 3 <<<$'\n \n line\t1\n line 2 \n\nline 3\n\n\n' # -> $'line 1 line 2 line 3'
normalizeWhiteSpace() {

  # Honor the documented default: a missing (or empty) mode means mode 0.
  local mode=${1:-0}
  (( $# )) && shift  # only shift if there actually is an argument to consume

  case $mode in
    0) # default: line-interior normalization with leading and trailing runs discarded; runs of blank (all-whitespace or empty) lines replaced with 1 empty line each -> multiple paragraphs with 1 empty line in between
      awk '{$1=$1; print}' "$@" | cat -s # rebuild the line with a space between fields (may result in empty lines), then replace each run of empty lines with a single empty line.
      ;;
    1) # line-interior normalization with leading and trailing runs discarded; runs of blank (all-whitespace or empty) lines discarded -> 1 block of non-empty lines
      awk '{$1=$1} $1!=""' "$@" # rebuild the line with a space between fields; if there's at least 1 field, print the rebuilt line.
      ;;
    2) # paragraph-interior normalization (across non-empty lines), runs of blank (all-whitespace or empty) lines replaced with 1 empty line each -> each paragraph becomes 1 long line with 1 empty line in between
      awk '{$1=$1; nonEmpty=($1!=""); printf ((nonEmpty && prevNonEmpty) ? " " : ((NR > 1) ? "\n" : "")); prevNonEmpty=nonEmpty; printf "%s", $0} END { printf "\n" }' "$@" | cat -s # rebuild the line with a space between fields (may result in empty lines), then replace each run of empty lines with a single empty line.
      ;;
    3) # normalization across *all* types of whitespace, including newlines -> 1 long line
      # Printing of each non-empty line is deferred until the next record (or END),
      # so that separators are only ever emitted *between* lines.
      # `&&` ensures that a failing awk call is not masked by the trailing printf.
      awk 'r!="" { printf "%s%s", sep, r; r=""; sep=" " } { $1=$1; if ($1!="") r=$0 } END { if (r!="") printf "%s%s", sep, r }' "$@" &&
        printf '\n'
      ;;
    *)
      # Report the mode itself ($1 has been shifted away by now).
      echo "Unknown normalization mode: '$mode'." >&2; return 2
      ;;
  esac

}

# Replaces select non-ASCII Unicode whitespace and punctuation characters
# with their closest ASCII equivalents (while leaving other non-ASCII
# characters untouched).
# Any arguments are passed to sed as input files; without them, stdin is read.
toAsciiWhitespaceAndPunctuation() {

  # Specify the Unicode chars. as hex. byte escape sequences representing the
  # bytes in UTF-8 encoding,
  # and then use `printf %b` to convert them into actual characters.
  # Note: printf reuses its format string for every array element, so each
  #       character results in its own `s` command.
  # The characters are a *selection* from the following Unicode categories:
  #  * U+80 Latin-1 Supplement (double-angle quotation marks as used in Spanish, for instance)
  #  * U+2000 General Punctuation
  #  * U+2E00 Supplemental Punctuation
  #  * [OMITTED FOR NOW: U+2700 Dingbats (for ornamental quotes)]

  local sedCmds='' chars=()

  # Chars. to map to a space
  chars=(
    '\xc2\xa0'      # no-break space
    '\xe2\x80\x80'  # en quad
    '\xe2\x80\x81'  # em quad
    '\xe2\x80\x82'  # en space
    '\xe2\x80\x83'  # em space
    '\xe2\x80\x84'  # three-per-em space
    '\xe2\x80\x85'  # four-per-em space
    '\xe2\x80\x86'  # six-per-em space
    '\xe2\x80\x87'  # figure space
    '\xe2\x80\x88'  # punctuation space
    '\xe2\x80\x89'  # thin space
    '\xe2\x80\x8a'  # hair space
    '\xe2\x80\xaf'  # narrow no-break space
    '\xe2\x81\x9f'  # medium mathematical space
  )
  sedCmds+=$(printf 's/[%b]/ /g;' "${chars[@]}")

  # Chars. to map to a LF (\n)
  # Note: the format string produces a backslash followed by an actual newline
  #       in the replacement part of the `s` command.
  chars=(
    '\xc2\x85'      # next line
    '\xe2\x80\xa8'  # line separator
  )
  sedCmds+=$(printf 's/[%b]/\\\n/g;' "${chars[@]}")

  # Chars. to map to 2 LFs (\n\n)
  chars=(
    '\xe2\x80\xa9'  # paragraph separator
  )
  sedCmds+=$(printf 's/[%b]/\\\n\\\n/g;' "${chars[@]}")

  # Chars. to map to nothing (empty string).
  chars=(
    '\xe2\x80\x8b'  # zero-width space
    '\xe2\x80\x8c'  # zero width non-joiner
    '\xe2\x81\xa0'  # word joiner
    '\xe2\x80\xa7'  # hyphenation point
  )
  sedCmds+=$(printf 's/[%b]//g;' "${chars[@]}")

  # Chars. to map to ' (single quote)
  chars=(
    '\xe2\x80\x98'  # left single quotation mark
    '\xe2\x80\x99'  # right single quotation mark
    '\xe2\x80\x9a'  # single low-9 quotation mark
    '\xe2\x80\x9b'  # single high-reversed-9 quotation mark
    '\xe2\x80\xb2'  # prime (U+2032)
    '\xe2\x80\xb5'  # reversed prime (U+2035)
    '\xe2\x80\xb9'  # single left-pointing angle quotation mark
    '\xe2\x80\xba'  # single right-pointing angle quotation mark
    # '\xe2\x9d\xae'  # heavy left-pointing angle quotation mark ornament
    # '\xe2\x9d\xaf'  # heavy right-pointing angle quotation mark ornament
    # '\xe2\x9d\x9b'  # heavy single turned comma quotation mark ornament
    # '\xe2\x9d\x9c'  # heavy single comma quotation mark ornament
    # '\xe2\x9d\x9f'  # heavy low single comma quotation mark ornament
  )
  sedCmds+=$(printf "s/[%b]/'/g;" "${chars[@]}")

  # Chars. to map to ''' (3 single quotes)
  chars=(
    '\xe2\x80\xb4'  # triple prime
    '\xe2\x80\xb7'  # reversed triple prime
  )
  sedCmds+=$(printf "s/[%b]/'''/g;" "${chars[@]}")

  # Chars. to map to '''' (4 single quotes)
  chars=(
    '\xe2\x81\x97'  # quadruple prime
  )
  sedCmds+=$(printf "s/[%b]/''''/g;" "${chars[@]}")

  # Chars. to map to " (double quote)
  chars=(
    '\xe2\x80\x9c'  # left double quotation mark
    '\xe2\x80\x9d'  # right double quotation mark
    '\xe2\x80\x9e'  # double low-9 quotation mark
    '\xe2\x80\x9f'  # double high-reversed-9 quotation mark
    '\xe2\x80\xb3'  # double prime (U+2033)
    '\xe2\x80\xb6'  # reversed double prime (U+2036)
    '\xc2\xab'      # left-pointing double angle quotation mark
    '\xc2\xbb'      # right-pointing double angle quotation mark
    '\xe3\x80\x9d'  # reversed double prime quotation mark
    '\xe3\x80\x9f'  # low double prime quotation mark
    '\xe3\x80\x9e'  # double prime quotation mark
    '\xef\xbc\x82'  # fullwidth quotation mark
    '\xf3\xa0\x80\xa2' # tag quotation mark
    # '\xe2\x9d\x9d'  # heavy double turned comma quotation mark ornament
    # '\xe2\x9d\x9e'  # heavy double comma quotation mark ornament
    # '\xe2\x9d\xa0'  # heavy low double comma quotation mark ornament
    # '\xe2\xb9\x82'  # double low-reversed-9 quotation mark
    # '\xf0\x9f\x99\xb8'  # sans-serif heavy low double comma quotation mark ornament
    # '\xf0\x9f\x99\xb6'  # sans-serif heavy double turned comma quotation mark ornament
    # '\xf0\x9f\x99\xb7'  # sans-serif heavy double comma quotation mark ornament
  )
  sedCmds+=$(printf 's/[%b]/"/g;' "${chars[@]}")

  # Chars. to map to - (hyphen-minus)
  chars=(
    '\xe2\x80\x90'  # hyphen
    '\xe2\x80\x91'  # non-breaking hyphen
    '\xe2\x80\x92'  # figure dash
    '\xe2\x80\x93'  # en dash
    '\xe2\x81\x83'  # hyphen bullet
  )
  sedCmds+=$(printf 's/[%b]/-/g;' "${chars[@]}")

  # Chars. to map to -- (2 hyphen-minus)
  chars=(
    '\xe2\x80\x94'  # em dash
    '\xe2\x80\x95'  # horizontal bar
    '\xe2\xb9\x80'  # double hyphen
    '\xe2\xb8\xba'  # two-em dash
    '\xe2\xb8\xbb'  # three-em dash
  )
  sedCmds+=$(printf 's/[%b]/--/g;' "${chars[@]}")

  # Chars. to map to . (period)
  chars=(
    '\xe2\x80\xa4'  # one dot leader
    '\xe2\xb8\xbc'  # stenographic full stop
  )
  sedCmds+=$(printf 's/[%b]/./g;' "${chars[@]}")

  # Chars. to map to .. (2 periods)
  chars=(
    '\xe2\x80\xa5'  # two dot leader
  )
  sedCmds+=$(printf 's/[%b]/../g;' "${chars[@]}")

  # Chars. to map to ... (3 periods)
  chars=(
    '\xe2\x80\xa6'  # horizontal ellipsis
  )
  sedCmds+=$(printf 's/[%b]/.../g;' "${chars[@]}")

  # Chars. to map to || (2 pipe symbols)
  chars=(
    '\xe2\x80\x96'  # double vertical line (U+2016)
  )
  sedCmds+=$(printf 's/[%b]/||/g;' "${chars[@]}")

  # Misc. other substitutions
  # !! These are true non-ASCII Unicode chars. in the first operands to the
  # !! s function calls, so this file must not ever be saved in an encoding
  # !! other than UTF-8.
  sedCmds+='s/‼/!!/g; s/[‽⁈]/?!/g; s/⁇/??/g; s/⁉/!?/g;'

  # Misc. other one-to-one character substitutions
  # !! These are true non-ASCII Unicode chars. in the first operand to the
  # !! y function call, so this file must not ever be saved in an encoding
  # !! other than UTF-8.
  # NOTE(review): ‹ and › have already been replaced with ' by the
  #               single-quote mappings above, so their <> mappings here
  #               never take effect - confirm which mapping is intended.
  sedCmds+='y#‸⁁‹›⁄⁓⁎⁕⁏⁚#^^<>/~**;:#;'

  # Invoke sed with the list of commands, with stdin input.
  # !! Since the commands rely on processing UTF-8-encoded chars. (multi-byte),
  # !! we mustn't invoke Sed with LC_ALL=C.
  # !! The price we pay is that this only works with properly UTF-8-encoded
  # !! input files.
  # !! The symptom with invalid files depends on the Sed implementation:
  # !!  * GNU Sed silently *ignores* invalid chars.
  # !!  * BSD/macOS Sed *fails* with "invalid byte sequence"
  sed "$sedCmds" "$@"

}

# --- END: functions

# --- MAIN BODY

nwsMode= handled=0 inPlaceUpdating=0 backupExt=

# ----- BEGIN: OPTIONS PARSING: This is MOSTLY generic code, but:
#  - SET allowOptsAfterOperands AFTER THIS COMMENT TO 1 to ALLOW OPTIONS TO BE MIXED WITH OPERANDS rather than requiring all options to come before the 1st operand, as POSIX mandates.
#  - The SPECIFIC OPTIONS MUST BE HANDLED IN A CASE ... ESAC STATEMENT BELOW; look for "BEGIN: CUSTOMIZE HERE ... END: CUSTOMIZE HERE"
#  - Assumes presence of function dieSyntax(); if not present, define as:
#        dieSyntax() { echo "$(basename -- "$BASH_SOURCE"): ARGUMENT ERROR: ${1:-"Invalid argument(s) specified."} Use -h for help."
#          >&2; exit 2; }
#  - After the end of options parsing, $@ only contains the operands (non-option arguments), if any.
allowOptsAfterOperands=1 operands=() i=0 optName= isLong=0 prefix= optArg= haveOptArgAttached=0 haveOptArgAsNextArg=0 acceptOptArg=0 needOptArg=0 optsSpecified=0
while (( $# )); do
  if [[ $1 =~ ^(-)[a-zA-Z0-9]+.*$ || $1 =~ ^(--)[a-zA-Z0-9]+.*$ ]]; then # an option: either a short option / multiple short options in compressed form or a long option
    prefix=${BASH_REMATCH[1]}; [[ $prefix == '--' ]] && isLong=1 || isLong=0
    # A long option is processed in a single pass; a short-option group is processed character by character.
    for (( i = 1; i < (isLong ? 2 : ${#1}); i++ )); do
      acceptOptArg=0 needOptArg=0 haveOptArgAttached=0 haveOptArgAsNextArg=0 optArgAttached= optArgOpt= optArgReq=
      if (( isLong )); then # long option: parse into name and, if present, argument
        optName=${1:2}
        [[ $optName =~ ^([^=]+)=(.*)$ ]] && { optName=${BASH_REMATCH[1]}; optArgAttached=${BASH_REMATCH[2]}; haveOptArgAttached=1; }
      else # short option: *if* it takes an argument, the rest of the string, if any, is by definition the argument.
        optName=${1:i:1}; optArgAttached=${1:i+1}; (( ${#optArgAttached} >= 1 )) && haveOptArgAttached=1
      fi
      # Tentatively determine the option-argument: an attached one takes precedence over the next argument.
      (( haveOptArgAttached )) && optArgOpt=$optArgAttached optArgReq=$optArgAttached || { (( $# > 1 )) && { optArgReq=$2; haveOptArgAsNextArg=1; }; }
      optsSpecified=1
      # ---- BEGIN: CUSTOMIZE HERE
      handled=0
      errMsgSingleModeOnly="Only one mode may be specified."
      case $optName in
        m|mode)
          [[ -z $nwsMode ]] || dieSyntax "$errMsgSingleModeOnly"
          needOptArg=1
          # Report a missing or empty mode right here; otherwise the empty value
          # would skip the mode validation below and end up being reported as
          # a misleading "Unknown option" error.
          (( haveOptArgAttached || haveOptArgAsNextArg )) || dieSyntax "Option ${prefix}${optName} is missing its argument."
          [[ -n $optArgReq ]] || dieSyntax "Unknown mode specified: '$optArgReq'."
          nwsMode=$optArgReq
          ;;
        i|in-place)
          # Note: similar to GNU sed, the only way to specify the optional option-argument is to directly attach it to the option.
          acceptOptArg=1
          inPlaceUpdating=1
          backupExt=$optArgAttached
          handled=1
          ;;
        *)
          # We also accept the mode as direct options, so that --mode fp is identical to --fp
          [[ -z $nwsMode ]] || dieSyntax "$errMsgSingleModeOnly"
          nwsMode=$optName
          ;;
      esac
      if [[ $handled -eq 0 && -n $nwsMode ]]; then
        handled=1
        # Translate the symbolic mode names to the internal numeric mode IDs.
        case $nwsMode in
          # whitespace condensing modes
          0|1|2|3)
            # !! We still accept the numerical modes, but they're deprecated and no longer documented.
            :
            ;;
          mp|multi-para)
            nwsMode=0
            ;;
          sp|single-para)
            nwsMode=1
            ;;
          fp|flat-para)
            nwsMode=2
            ;;
          sl|single-line)
            nwsMode=3
            ;;
          # whitespace transliteration modes
          crlf|windows-newlines) # convert CRLF line endings
            nwsMode=4
            ;;
          lf|unix-newlines) # convert to LF-only line endings
            nwsMode=5
            ;;
          ascii|ascii-punctuation) # convert non-ASCII Unicode whitespace and punctuation to closest ASCII equivalents
            nwsMode=6
            ;;
          *)
            (( needOptArg )) && dieSyntax "Unknown mode specified: '$nwsMode'."
            handled=0
            ;;
        esac
      fi
      if (( ! handled )); then
        case $optName in
          *)
            dieSyntax "Unknown option: ${prefix}${optName}."
            ;;
        esac
      fi
      # ---- END: CUSTOMIZE HERE
      (( needOptArg )) && { (( ! haveOptArgAttached && ! haveOptArgAsNextArg )) && dieSyntax "Option ${prefix}${optName} is missing its argument." || (( haveOptArgAsNextArg )) && shift; }
      # An option that takes an argument consumes the rest of a short-option group.
      (( acceptOptArg || needOptArg )) && break
    done
  else # an operand
    if [[ $1 == '--' ]]; then
      shift; operands+=( "$@" ); break
    elif (( allowOptsAfterOperands )); then
      operands+=( "$1" ) # continue
    else
      operands=( "$@" )
      break
    fi
  fi
  shift
done
(( "${#operands[@]}" > 0 )) && set -- "${operands[@]}"; unset allowOptsAfterOperands operands i optName isLong prefix optArgAttached haveOptArgAttached haveOptArgAsNextArg acceptOptArg needOptArg
# ----- END: OPTIONS PARSING: "$@" now contains all operands (non-option arguments).

# Set default mode
[[ -n $nwsMode ]] || nwsMode=0

if (( $# )); then # filename operands given

  # Make sure all files exist and are readable and - if in-place updating
  # was requested - also writable.
  errMsgSuffix=
  (( inPlaceUpdating )) && errMsgSuffix=$'\nNone of the '$#' input file(s) were modified.' # !! With xargs and a large input set that necessitates multiple `nws` calls, this error message may be misleading.
  for f; do
    [[ -L $f && ! -e $f ]] && die "File is a broken symlink: $f""$errMsgSuffix"
    [[ ! -e $f ]] && die "No such file: $f""$errMsgSuffix"
    [[ ! -f $f ]] && die "Not a file (wrong type of filesystem object): $f""$errMsgSuffix"
    [[ ! -r $f ]] && die "File is not readable (permissions problem?): $f""$errMsgSuffix"
    # Write access is only needed if the file is to be updated in place;
    # with stdout output, read-only input files are fine.
    (( inPlaceUpdating )) && [[ ! -w $f ]] && die "File is not writable (permissions problem?): $f""$errMsgSuffix"
  done

  # Create a temporary file, if in-place updating without a backup was requested.
  if [[ $inPlaceUpdating -eq 1 && -z $backupExt ]]; then
    tmpFile=$(mktemp -t 'XXXX') || die "Failed to create temporary file." # Works on both OSX and Linux; note: file will have random extension on OSX (e.g., '.../XXXX.bJViLcM3') and none on Linux (e.g., '.../vXDA')
    # Note: The trap '...' EXIT handler set up above takes care of removing the temp. file.
  fi

else # no operands -> use stdin input

  # Abort if no filename operands were given, yet in-place updating was requested.
  (( inPlaceUpdating )) && dieSyntax "Please specfiy at least one file to update in place."

fi

# Determine how many iterations we need.
# We only need multiple ones, if more than 1 filename operand was passed and
# in-place updating was requested, because we then need to process files
# individually.
if (( inPlaceUpdating )); then
  numIterations=$#
  files=( "$@" )
else # stdin input or filename operands without in-place updating that can be passed at once
  numIterations=1
fi

# Processing loop
for (( i = 0; i < numIterations; ++i )); do

  # Create the backup file / temp. copy for in-place updating.
  # Note: With in-place updating, we need a temporary *copy* of the input file
  #       so we can write back to the *original* file using output direction
  #       which ensures that the original inode with all its attributes is
  #       preserved.
  if (( inPlaceUpdating )); then

    # Determine the next input file.
    # Note: Variables $file, $inFile, and $tmpFile are used in the
    #       trap '...' EXIT handler defined at the top.
    file=${files[i]}

    # Create the temp. copy of the input file:
    # Determine the file path...
    if [[ -n $backupExt ]]; then # create backup copy
      # We create the requested backup copy of the input file, which also
      # serves as our input source.
      inFile="$file$backupExt"
    else # no backup - use the temp. file created (empty) above.
      inFile=$tmpFile
    fi
    # ... and create the copy.
    cp -p -- "$file" "$inFile" || die "Failed to create temp. copy of input file: $file"

    # Now, set $@ to the input file at hand, and
    # redirect stdout to the original input file.
    set -- "$inFile"
    exec > "$file"

  fi

  case $nwsMode in
    # whitespace compression/unification modes
    0|1|2|3)
      normalizeWhiteSpace "$nwsMode" "$@" || die
      ;;
    # whitespace transliteration modes
    4) # to CRLF line endings - must be idempotent if the input already has the target line endings
      awk -v ORS='\r\n' '{ sub("\r$", ""); print }' "$@" || die
      ;;
    5) # to LF-only line endings - must be idempotent if the input already has the target line endings
      awk '{ sub("\r$", ""); print }' "$@" || die
      ;;
    6) # Unicode whitespace+punctuation to ASCII equivalents
      toAsciiWhitespaceAndPunctuation "$@" || die
      ;;
    *)
      die "DESIGN ERROR: Unanticipated mode ID: $nwsMode"
      ;;
  esac

done

####
# MAN PAGE MARKDOWN SOURCE
#  - Place a Markdown-formatted version of the man page for this script
#    inside the here-document below.
#    The document must be formatted to look good in all 3 viewing scenarios:
#     - as a man page, after conversion to ROFF with marked-man
#     - as plain text (raw Markdown source)
#     - as HTML (rendered Markdown)
#  Markdown formatting tips:
#   - GENERAL
#     To support plain-text rendering in the terminal, limit all lines to 80 chars.,
#     and, for similar rendering as HTML, *end every line with 2 trailing spaces*.
#   - HEADINGS
#     - For better plain-text rendering, leave an empty line after a heading
#       marked-man will remove it from the ROFF version.
#     - The first heading must be a level-1 heading containing the utility
#       name and very brief description; append the manual-section number
#       directly to the CLI name; e.g.:
#         # foo(1) - does bar
#     - The 2nd, level-2 heading must be '## SYNOPSIS' and the chapter's body
#       must render reasonably as plain text, because it is printed to stdout
#       when `-h`, `--help` is specified:
#         Use 4-space indentation without markup for both the syntax line and the
#         block of brief option descriptions; represent option-arguments and operands
#         in angle brackets; e.g., '<foo>'
#     - All other headings should be level-2 headings in ALL-CAPS.
#   - TEXT
#      - Use NO indentation for regular chapter text; if you do, it will
#        be indented further than list items.
#      - Use 4-space indentation, as usual, for code blocks.
#      - Markup character-styling markup translates to ROFF rendering as follows:
#         `...` and **...** render as bolded (red) text
#         _..._ and *...* render as word-individually underlined text
#   - LISTS
#      - Indent list items by 2 spaces for better plain-text viewing, but note
#        that the ROFF generated by marked-man still renders them unindented.
#      - End every list item (bullet point) itself with 2 trailing spaces too so
#        that it renders on its own line.
#      - Avoid associating more than 1 paragraph with a list item, if possible,
#        because it requires the following trick, which hampers plain-text readability:
#        Use '&nbsp;<space><space>' in lieu of an empty line.
####
: <<'EOF_MAN_PAGE'
# nws(1) - normalize whitespace

## SYNOPSIS

Normalizes whitespace in one of several modes.

    nws [-m <mode>] [[-i[<ext>]] file...]

    Condensing <mode>s:

      All these modes normalize runs of tabs and spaces to a single space
      each and trim leading and trailing runs; they only differ with respect
      to how multi-line input is processed.

      mp (default)  multi-paragraph: folds multiple blank lines into one
      fp            flattened multi-paragraph: normalizes each paragraph to
                    single line
      sp            single-paragraph: removes all blank lines.
      sl            single-line: normalizes to single output line

    Transliteration <mode>s:

      lf            translates line endings to LF-only (\n)
      crlf          translates line endings to CRLF (\r\n)
      ascii         translates Unicode whitespace and punctuation to ASCII

    Alternatively, specify mode values directly as options; e.g.,
    `--sp` in lieu of `-m sp`

Standard options: `--help`, `--man`, `--version`, `--home`

## DESCRIPTION

`nws` (*n*ormalize *w*hite*s*pace) performs whitespace normalization,  
offering several modes in two categories:  

 * whitespace-condensing modes:  
   Trims leading and trailing runs of any mix of tabs and spaces and  
   replaces interior runs with a single space each.  
   The individual modes differ only with respect to how multi-line input  
   is treated.  

 * whitespace-transliteration modes:  
   Line endings can be changed to be Windows- or Unix-specific, and select  
   Unicode whitespace and punctuation can be replaced with their closest  
   ASCII equivalents.  

Input is provided either from the specified files or via stdin.  
Output is sent to stdout by default.  

To update files in-place, use the `-i` option (in which case there will be  
no stdout output).  

## OPTIONS

 * CONDENSING modes:  
   `-m <mode>` or `--mode <mode>` or `--<mode>`  
   where `<mode>` is one of:  

   * `mp`, `multi-para` (default)  
     Runs of blank (all-whitespace or empty) lines are replaced with 1 empty  
     line each, resulting in paragraph-internal newlines getting preserved,  
     with blank lines at the beginning, between paragraphs, and at the end  
     getting normalized to a single empty line each.  

   * `fp`, `flat-para`  
     Like mode `mp`, except that paragraph-internal newlines are replaced  
     with a single space each, resulting in each paragraph becoming a single  
     line, with 1 empty line between paragraphs.  

   * `sp`, `single-para`  
     Runs of blank (all-whitespace or empty) lines are discarded, resulting  
     in a single paragraph of non-blank lines.  

   * `sl`, `single-line`  
     Normalization includes newlines too, so that any run of any mix of  
     spaces, tabs, and newlines is replaced with a single space each,  
     resulting in a single, long output line.  

 * TRANSLITERATION modes:  
   `-m <mode>` or `--mode <mode>` or `--<mode>`  
   where `<mode>` is one of:  

   * `lf`  
     Translates Windows-style CRLF (\r\n) line endings to Unix-style LF (\n)  
     line endings.  

   * `crlf`  
     Translates Unix-style LF (\n) line endings to Windows-style CRLF (\r\n)  
     line endings.  

   * `ascii`, `ascii-punctuation`  
     Translates non-ASCII Unicode whitespace and punctuation to the closest  
     ASCII equivalents, while leaving other non-ASCII characters untouched.  
     This is helpful for source-code samples that have been formatted for  
     display with typographic quotes, em dashes, and the like, which usually  
     makes the code indigestible to compilers/interpreters.  
     IMPORTANT: This mode only works with PROPERLY ENCODED UTF-8 FILES.  
     On BSD/macOS systems, an improperly encoded input file will result in  
     a 'sed: RE error: illegal byte sequence' error.  

 * `-i[<ext>]`, `--in-place[=<ext>]`  
   Updates the specified files *in place*; that is, the results of the  
   normalization are written back to the input file, and no stdout output  
   is produced.  
   If `<ext>` is specified (recommended), a backup copy of each input file  
   is made first, simply by appending the suffix to the filename.  

## STANDARD OPTIONS

All standard options provide information only.

 * `-h, --help`  
   Prints the contents of the synopsis chapter to stdout for quick reference.  

 * `--man`  
   Displays this manual page, which is a helpful alternative to using `man`,  
   if the manual page isn't installed.  

 * `--version`  
   Prints version information.  

 * `--home`  
   Opens this utility's home page in the system's default web browser.  

## LICENSE

For license information, bug reports, and more, visit this utility's home page  
by running `nws --home`  

## EXAMPLES

The examples use ANSI C-quoted input strings (`$'...'`) for brevity, which  
are supported in Bash, Ksh, and Zsh.  
Empty output lines are represented by `~`.  

## CONDENSING EXAMPLES:

    # Single-line input - no mode needed.
    $ nws <<< $'  one  \t\t two  three   '
    one two three

    # Default: multi-paragraph mode (`-m mp` or `--mode multi-para`)
    $ nws <<<$'\n\n  one\n two  \n\n\n three\n\n'
    ~
    one
    two
    ~
    three
    ~

    # Single-paragraph mode; `-m sp` is the short equivalent of
    # `--mode single-para`.
    $ nws -m sp <<<$'\n\n  one\n two  \n\n\n three\n\n'
    one
    two
    three

    # Flattened-paragraph mode; note use of shortcut option `--fp` for `-m fp`.
    $ nws --fp <<<$'\n\n  one\n two  \n\n\n three\n\n'
    ~
    one two
    ~
    three
    ~

    # Single-line mode
    $ nws --sl <<<$'  one  two\n  three  '
    one two three

## TRANSLITERATION EXAMPLES:

    # Converts a CRLF line-endings file (Windows) to a LF-only file (Unix).
    # No output is produced, because the file is updated in-place; a backup
    # of the original file is created with suffix '.bak'.
    $ nws --mode lf --in-place=.bak from-windows.txt

    # Converts a LF-only file (Unix) to a CRLF line-endings file (Windows).
    # No output is produced, because the file is updated in-place; since no
    # backup suffix is specified, no backup file is created.
    $ nws --crlf -i from-unix.txt

    # Converts select Unicode whitespace and punctuation chars. to their
    # closest ASCII equivalents and sends the output to a different file.
    $ nws --ascii unicode-punct.txt > ascii-punct.txt

EOF_MAN_PAGE