#!/bin/bash export LANG=C.UTF-8 # sed_sentence_chunker.sh # Created: 2017-Jul-20 | Victoria Stuart | "mail"..@t.."VictoriasJourney.com" # Last updated: 2017-Dec-30 # ---------------------------------------------------------------------------- # local: /mnt/Vancouver/Programming/scripts/sed_sentence_chunker/sed_sentence_chunker.sh # GitHub: https://github.com/victoriastuart/biomedical-sentence-splitter # ============================================================================ # USAGE: # ====== # ./sed_sentence_chunker.sh # bash sed_sentence_chunker.sh # This script processes text files in the "input/" directory, and outputs to the # "output/" directory. # ============================================================================ # PYTHON SCRIPT USAGE: # ==================== # To use this "sed_sentence_chunker.sh" bash script in a Python script; run # this script in a directory that contains your text/input files in an "input/" # directory. Note that you must also (manually) create an "output/" directory. # ============================================================================ # APPROACH: # ========= # 1. Preprocessing # 2. Split sentences # 3. Postprocessing # ============================================================================ # TWO VARIATIONS OF THIS SCRIPT: # ============================== # If desired you can edit this script for alternative runtime options, as # summarized here. # ---------------------------------------------------------------------------- # SCRIPT VARIANT 1: specify input, output files on the command line. # ------------------------------------------------------------------ # Usage: # ./sed_sentence_chunker.sh # bash sed_sentence_chunker.sh # Example: # ./sed_sentence_chunker.sh chunk_test_input.txt chunk_test_output.txt # 1. Add these at/near top of script (note: cannot have spaces around " = " sign): # input=$1 # output=$2 # 2. 
Comment out or delete this code section (after the Technical Notes, below): # FILES=$(find input -type f -iname "*") # # for f in $FILES # do # sed -i -e 's/ffi/ffi/g # s/fi/fi/g # ... snip ... # s/x/x/g' $f # 3. Change "$f" in this line to "$input": # sed 's/pp\.\s/Cho4Ph/g' $f > tmp_file # 4. Near the bottom of the script, add these, # sed 's/Dr,/Dr./g' tmp_file > $output # rm tmp_file # and delete these: # sed -i 's/Dr,/Dr./g' tmp_file # mv tmp_file output/$outname # ---------------------------------------------------------------------------- # SCRIPT VARIANT 2: directly pass input text on the command line. # ------------------------------------------------------------------ # Usage: # . sed_sentence_chunker.sh <<< "quoted input text / sentences" ## << note: dot space command # source sed_sentence_chunker.sh <<< "quoted input text / sentences" ## alternative (script sourcing) # ---------------------------------------- # Examples: # . sed_sentence_chunker.sh <<< "This is sentence 1. This is sentence 1." # or: # S="This is sentence 3. This is sentence 4." # . sed_sentence_chunker.sh <<< $S # 1. Add these at/near top of script (note: cannot have spaces around " = " sign): # input=$1 # outfile="" ## output file # OUTPUT="" ## output variable # 2. Comment out or delete this code section (after the Technical Notes, below): # FILES=$(find input -type f -iname "*") # # for f in $FILES # do # sed -i -e 's/ffi/ffi/g # s/fi/fi/g # ... snip ... # s/x/x/g' $f # 3. Change "$f" in this line to "$input": # sed 's/pp\.\s/Cho4Ph/g' $f > tmp_file # 4. 
Near the bottom of the script, add these, # sed 's/Dr,/Dr./g' tmp_file > out_file # OUTPUT=$(printf out_file) # export $OUTPUT # rm -f tmp* # and delete these: # sed -i 's/Dr,/Dr./g' tmp_file # mv tmp_file output/$outname # ============================================================================ # TECHNICAL NOTES: # ================ # ---------------------------------------------------------------------------- # SCRIPT NAME ...: # ---------------- # If the script name is too long for convenient use, just rename it; e.g.: ssc # Run this script on my "chunk_test_input.txt" file to get an idea of it's # capability (or to run your own unit tests). # If needed you can use the Linux "pwgen" command to generate alphanumeric # UID: "pwgen 8 2" will generate two (unique) 8-character alphanumeric strings. # Example: $ pwgen 8 2 >> eej8Ae2p | air4Coo2 # ---------------------------------------------------------------------------- # FIRST SED COMMAND IN THIS SCRIPT: # --------------------------------- # After much (!) experimentation, it appears that the first sed command (below), # outputting to the "tmp_file", MUST involve an "-r" argument (that in turn # expects a regex expression). To achieve this, it is best to use the first # command, as shown below. [Otherwise, you end up with blank output.] # ---------------------------------------------------------------------------- # [a-zA-Z] vs. 
[A-Za-z] : # ----------------------- # [a-zA-Z] **also** matches the ASCII characters between z and A: [ \ ] ^ _ ` # [A-Za-z] will only match the alphabet # https://stackoverflow.com/questions/4923380/difference-between-regex-a-z-and-a-za-z # http://www.asciitable.com/ # https://en.wikipedia.org/wiki/ASCII#/media/File:USASCII_code_chart.png # ---------------------------------------------------------------------------- # REGEX EXPRESSIONS: # ------------------ # I predominantly use two sed expressions -- the second, here, involving regex: # sed -i 's/foo/bar/g' # sed -i -r 's/foo\s?/bar/g' ## \s? : 0 or 1 (?) spaces (\s) # . : any char, including newline (\n)_ # \. : period (literal period) # -i : --in-place # Regex "special" characters, # [\^$.|?*+() # have special meaning / function, and will thus need to be \-escaped. # { and } are literal characters, unless they're part of a valid regular # expression token such as a quantifier, e.g.: {3}. # https://www.regular-expressions.info/refcharacters.html # ---------------------------------------- # HERE IS MY (WORKING) EXPERIENCE RE: SED AND REGEX: # In non-regex sed expressions, those special characters will need to be \-escaped # to indicate that they are a regex special (not a literal) character. # in regex (-r) sed expressions, they will be recognized as regex special # characters, and will not have to be \-escaped. # Exception: as noted, [ is a special character in regex -- denoting (e.g.) the # start of a character class / set (https://www.regular-expressions.info/charclass.html). # HOWEVER, unlike ?{}*^$ etc., in non-regex sed expressions, we need to escape # it, \[ if we want to match a literal "[" in our expressions. [That applies, # also, to regex (-r) sed expressions!]. # To match the start (^) or end ($) of a line. don't ever \-escape the ^ or $. # To match the end of a line (EOL) ending (e.g.) with: ... the end. # # sed 's/the end\.$/the end.\n\Period./g' ## \. 
: literal period; $ EOL # sed -r 's/the end\.$/the end.\n\Period./g' ## \. : literal period; $ EOL # sed 's/the end.$/the end.\n\Period./g' ## . : any single characer; $ EOL # sed -r 's/the end.$/the end.\n\Period./g' ## . : any single characer; $ EOL # To match a literal $, anywhere in a line / sentence, \-escape the $ ( \$ ): # # sed 's/\$/\n/g' # sed -r 's/\$/\n/g' # Likewise (viz-a-viz: ^ $), there is no need to ever escape * if you intend it # to match 0 or more of the preceding expression: # # sed 's/foo\s*bar/Foo.\n(Bar!)/g' ## matches 0 or more spaces between foo and bar # sed -r 's/foo\s*bar/Foo.\n(Bar!)/g' ## ditto # # sed -r 's/foo\*bar/Foo.\n(Bar!)/g' ## matches foo*bar # sed 's/foo*bar/Foo.\n(Bar!)/g' ## matches foobar (0 or more o) # sed -r 's/foo*bar/Foo.\n(Bar!)/g' ## matches foobar (0 or more o) # sed 's/foob*ar/Foo.\n(Bar!)/g' ## matches foobar (0 or more b) # sed -r 's/foob*ar/Foo.\n(Bar!)/g' ## matches foobar (0 or more b) # sed 's/fooz*bar/Foo.\n(Bar!)/g' ## matches foobar (0 or more z) # sed -r 's/fooz*bar/Foo.\n(Bar!)/g' ## matches foobar (0 or more z) # # compare to: # # sed 's/foo?bar/Foo.\n(Bar!)/g' ## does NOT match foobar; MATCHES foo?bar (literal ?) # sed 's/foo\?bar/Foo.\n(Bar!)/g' ## matches foobar (0 or 1 o); does not match foo?bar # sed -r 's/foo?bar/Foo.\n(Bar!)/g' ## matches foobar (0 or 1 o); does not match foo?bar # ---------------------------------------- # MORE EXAMPLES: # model: sed 's/foo/bar/g' # sed 's/foo\s\?bar/Foo.\nBar!/g' # sed -r 's/foo\s?bar/Foo.\nBar!/g' # ## 0 or 1 (?) spaces (\s) ## matches: foobar | foo bar ## does not match: foo bar | foo bar | ... # sed 's/foo\s\{0,3\}bar/Foo.\nBar!/g' # sed -r 's/foo\s{0,3}bar/Foo.\nBar!/g' # ## {0,3} : 0, 1, 2 or 3 of preceding sequence (here: space, \s) ## matches: foobar | foo bar | foo bar | foo bar ## does not match: foo bar | foo bar | ... 
# Regarding [ : # sed 's/foo\s\?\[bar]/Foo.\n(Bar!)/g' # sed -r 's/foo\s?\[bar]/Foo.\n(Bar!)/g' # ## \[: match literal [ ## matches: foo[bar] | foo [bar] ## does not match: foo [bar] | foo [bar] | foo [bar] | foo [bar] | ... ## does not match: foobar | foo bar | foo bar | ... # sed 's/foo\s\?[bar]/Foo.\n(Bar!)/g' # sed -r 's/foo\s?[bar]/Foo.\n(Bar!)/g' # ## matches: foobar | foo bar ## replacing foo with Foo. and [bar] with (Bar!)ar ## (with a line break, \n, between them)! ## does not match: foo[bar] | foo [bar | ... # ## Here, even in a non-regex sed expression, [bar] is being processed as a ## character class (like [A-Za-z0-9]), and so will match the b in foobar, but ## not the b in foo[bar]. To match the literal [ in that non-regex sed expression, ## \-escape the [, \[ , as shown further above / here: # sed 's/foo\s\{0,3\}\[bar]/Foo.\n(Bar!)/g' # sed -r 's/foo\s{0,3}\[bar]/Foo.\n(Bar!)/g' # ## matches: foo[bar] | foo [bar] | foo [bar] | foo [bar] ## does not match: foo [bar] | foo [bar] | ... ## does not match: foobar | foo bar | foo bar | ... # ---------------------------------------- # sed -r 's/\.([A-Z])\.$/.\1Shah7a/g' # ## \. : literal period; ([A-Z]) : ASCII capitals in character class (); ## $ : end of line, non-escaped; . : period (do not need to escape in ## replace portion of the sed expression; \1 : replace with captured ## characters (class); Shah7a : an alphanumeric "tag" / substitution / ## UID (that I will replace later with the text it represents: .) # sed -r 's/([[({\s])pp\.\s?([ivx0-9])/\1Cho4Ph\2/g' $f > tmp_file # ## NOTE: that "[" MUST appear FIRST in the "[...]" character expression); ## i.e., [[...]. Also, if used, escape ] (i.e., \]). Lastly, as this is ## a -r regex expression, the ? is not \?-escaped; ... # ---------------------------------------- # SED REGEX SUMMARY: # ================== # 1. 
No need to \-escape: ^ (start of line) # $ (EOL) # [] (character class / set) ## sed 's/foo[b]ar/foo\nbar/g' # * (0 or more instances of matches for preceding expression) # # in: sed 's///g' # or: sed -r 's///g' # 2. \-escape: ? (0 or 1 of preceding expression) ## \? # * (0 or more of preceding expression) ## \* # { and } in {i,j} expressions ## \{0,3\} # # in: sed 's///g' # not in: sed -r 's///g' # ---------------------------------------- # In the script below, I tried to minimize the use of "lookaheads" () in # my sed ( -r ) expressions, as I found these to increase the runtime. # That is, where possible / practical, I tended to prefer the simpler # sed -i 's///g' expressions. # Expressions of the sort .{1,15}\.s\s* look complicated, but they are pretty # simple! Basically it says: match any character ( . ), appearing 1-15 # times ( {1,15}, that is followed by a period ( \.) and any space ( \s\s* ) ... # Likewise: ^[A-Z].{1,5}\. says match any 1..5 preceding characters that are # not capitals, followed by a period ... # # sed -i -r "s/[.](.[^0-9]{1,15})[.]/Shah7a\1./g" tmp_file # # likewise translates to: match, in place, a period [.] that is followed by # any span of 1-15 characters {1,15}, that are not 0 through 9 [^0-9], # followed by another period [.]. All of that is this bit: .[^0-9]{1,15})[.] # # The second ("replace")_half of that regex expression states: replace replace # THOSE periods (matched as described) with the unique alphanumeric string, # Shah7a, followed by a period. # # sed -i -r "s/.[^.]\{1,15\}.\s\s*/\n\n/g" tmp_file # # Match any character ( . ), appearing 1-15 times ( {1,15} that is NOT a # period ( !. ), but is followed by a period ( \.) and any space ( \s\s* ), # and split ( \n\n ) at that position. 
# https://www.gnu.org/software/sed/manual/html_node/Regular-Expressions.html # http://www.rexegg.com/regex-quickstart.html # ---------------------------------------------------------------------------- # ABBREVIATIONS -- JOURNAL TITLES; AUTHORS ... # -------------------------------------------- # Journal author name initials and journal title abbreviations are a huge # programmatic, i.e. technical, difficulty. While my approach, below, minimizes # the disruption they cause to bona fide sentence chunking, some issues # will inevitably remain. E.g., some very short sentences may not get split # from the others. C'est la vie! # ---------------------------------------------------------------------------- # THESE COMMENTS: # --------------- # As a test, I deleted all of these comments from this script, leaving only the # commands. The runtimes (time ./sed_sentence_chunker.sh) were essentially identical. # ---------------------------------------------------------------------------- # OLDER NOTES / REFERENCE ... # --------------------------- # These notes are no longer relevant vis-a-vis this script, but are useful # re: my earlier versions -- and general knowledge (preserved here!). # ---------------------------------------- # ESCAPING SINGLE QUOTES WITHIN SINGLE-QUOTED EXPRESSIONS: # -------------------------------------------------------- # To escape a single quote within a single-quoted sed expression, you need to # terminate / chain the single quotes. E.g., to escape an internal ', terminate # the sed single-quoted expression with another (internal) ', then escape the # internal single quote inside the sed expression: "'", then add back (chain) # another single quote ' to "continue / chain" the sed expression. Similarly, # to escape (e.g.) a bracket [ ] inside the optional match [] pattern within a # sed expression, chain the sed command, quoting the bracket term: ['"]"'] ... 
# https://stackoverflow.com/questions/18370536/sed-or-operator-in-set-of-regex # https://stackoverflow.com/questions/14813145/boolean-or-in-sed-regex # https://serverfault.com/questions/466118/using-sed-to-remove-both-an-opening-and-closing-square-bracket-around-a-string # ... all members of a character class lose special meaning (with a few # exceptions). And ] loses its meaning if it is placed first. # That observation is important re: the "([])}])" pattern below (that searches # for characters "]", "}" and ")"). You MUST list the "]" closing bracket # (within the "([ ])" character class), with the "]" square bracket listed FIRST: # "([])}])". # The following should capture all permutations of two contiguous sentences, # where the inter-sentence boundary may contain any permutation of terminal # punctuation (".", "!", "?"), parentheses and brackets ("(", "{", "[", ")", "}", # "]"), and any combination of quotation marks -- and split those sentences! # sed -i -r 's/([A-Z]\.)\s\s*([A-Z])/\1\n\n\2/g' tmp_file # To "follow" these, focus on the second part (after the \n break): # '"'"' = escaped single quotation, used internally in single-quoted sed expression # Since multiple spaces were converted (above) to single spaces, sentences will # be separated by 0 or 1 spaces. Hence, the ".?" expression, below, will match # 0 or 1 characters, between the two parts of these sed regex expressions # [sentences will be split (\n) at those places]. 
# Replace -- again -- multiple spaces with single space: # sed -i 's/ */ /g' tmp_file # ---------------------------------------- # MORE REGEX EXAMPLES -- QUOTATION MARKS AND BRACKETS: # ---------------------------------------------------- # bn="ant bat, cat; dog; (eel), [fish]: 'horse - jackal \"kangaroo\" {lemur} / moose | possum \ quail" # echo $bn # ant bat, cat; dog; (eel), [fish]: 'horse - jackal "kangaroo" {lemur} / moose | possum \ quail # echo $bn; echo $bn | sed 's/[][(){} -,;:\x27"\|/]/./g' ## \x27 : single quote # ant bat, cat; dog; (eel), [fish]: 'horse - jackal "kangaroo" {lemur} / moose | possum \ quail # ant.bat..cat..dog...eel....fish....horse...jackal..kangaroo...lemur....moose...possum...quail # echo $bn; echo $bn | sed 's/[][(){} -,;:\x27"\|/]/./g ; s/\.\{1,\}/./g' ## \x27 : single quote # ant bat, cat; dog; (eel), [fish]: 'horse - jackal "kangaroo" {lemur} / moose | possum \ quail # ant.bat.cat.dog.eel.fish.horse.jackal.kangaroo.lemur.moose.possum.quail # NOTES: # * not a regex (-r) sed expression, so need to escape the {} in {1,} --> \{1,\} # * to easily escape a single quote ' in a 'single-quoted string', substitute it with: \x27 # * to include literal [] brackets inside a [] character class, they must appear in this order immediately after the leading (character class) [: # [][...] # echo 'donkey [horse]' | sed 's/[[]//g' # donkey horse] # echo 'donkey [horse]' | sed 's/[]]//g' # donkey [horse # echo 'donkey [horse]' | sed 's/[[]]//g' ## << does not work! [[]] ... # donkey [horse] # echo 'donkey [horse]' | sed 's/[][]//g' ## << ... use THIS! [][] # donkey horse # ---------------------------------------- # UPDATED [2017-11-24]: # --------------------- # With my substitution of ' " ( ) [ ] { } I no longer have to worry about # those when splitting sentences -- this HUGELY simplifies things!! :-D # [E.g., look at the "main processing loops" in my older # "sed_sentence_chunker{1|2|3}.sh" scripts!] 
# As well, I took the approach that since they will not be especially relevant # for my BioNLP work, tokenized sentences, etc. of deleting all double quotation # marks: ". As well, I delete all single quotes around sentences (keeping # internal single quotes / apostrophes, with the exception that I expand most # common contractions; e.g. it's --> it is ...). This (also) greatly simplifies # the processing, i.e. sentence chunking / splitting! :-D # ============================================================================ # ============================================================================ # PRELIMINARIES: # ============== # https://stackoverflow.com/questions/4638874/how-to-loop-through-a-directory-recursively-to-delete-files-with-certain-extensi # FILES=$(find ./input-z -type f -iname "*") # ... As a number of people have commented, this will fail if there are spaces in filenames. # You can work around this by temporarily setting the IFS (internal field separator) to the newline character. ... 
IFS=$'\n'; set -f # be sure to include "unset IFS; set +f" when done, near the bottom of the script FILES=$(find ./input -type f -iname "*") ## ALL files, recursively # can also use this, in for loop a few lines below: # for f in $(find ./input-z -type f -iname "*") # echo '------------------------------------------------------------------------------' # echo '$FILES:' ## single-quoted, prints: $FILES: # echo "$FILES" ## double-quoted, prints path/, filename (one per line) # echo '------------------------------------------------------------------------------' for f in $FILES do cp "$f" "tmp_file" ## work on a copy so that input file $f is not modified # ---------------------------------------------------------------------- # Preprocessing step -- replace various annoyances (different types of quotation marks; ligatures; ...): # https://stackoverflow.com/questions/26568952/how-to-replace-multiple-patterns-at-once-with-sed # https://stackoverflow.com/questions/24509214/how-to-escape-single-quote-in-sed # Escape ' within single-quoted sed '...' 
expressions by substituting those ' with \x27; e.g.: # s/'/'/g --> s/'/\x27/g sed -i -e 's/ffi/ffi/g s/fi/fi/g s/ff/ff/g s/fl/fl/g s/ffl/ffl/g s/…/.../g s/�/μ/g s/␮/μ/g s/௡/®/g s/␣/α/g s/␤/β/g s/␦/δ/g s/5Ј-/5\x27-/g s/-3Ј/-3\x27/g s/þ/+/g s/¼/=/g s/ϭ/=/g s/Ɛ/=/g s/Ͻ//g s/␥/γ/g s/␧/ε/g s/␨/ζ/g s/Ϫ/-/g s/À/-/g s/# OLD:/=/g s/ ‫؍‬ ./=/g s/␹/X/g s/Ն/≥/g s/Ն/≤/g s/Յ/+/g s/Ã/*/g s/Â/x/g s/¥/x/g s///g s/™//g s/®//g s/→/>/g s/–/-/g s/Ϯ/±/g s/؉/+/g s/ϫ/x/g s/ϳ/~/g s/ʽ/\x27/g s/ʻ/\x27/g s/“/"/g s/ˮ/"/g s/”/"/g s/״/"/g s/ʺ/"/g s/′′/"/g s/〃/"/g s/’/\x27/g s/ʼ/\x27/g s/‘/\x27/g s/′/\x27/g s/`/\x27/g s/׳/\x27/g s/ʹ/\x27/g s/ꞌ/\x27/g s/ˊ/\x27/g s/ˋ/\x27/g s/ˌ/\x27/g s/—/-/g s/؊/-/g s/ϩ/+/g s/ϫ/x/g' tmp_file # ============================================================================ # SPECIAL CASES -- COMMON ABBREVIATIONS: # -------------------------------------- # ---------------------------------------- # PAGE NUMBER ABBREVIATIONS: # Approach: substitute a unique alphanumeric string for "pp." (we will restore # it later). Generated via the Linux command: pwgen 6 1 # Page number abbreviation "pp.", followed by a space; unlikely to appear' # at EOL, so we can do a simple substitution: sed -i 's/pp\.\s/Cho4Ph/g' tmp_file # [ in character expression [] must appear first: [[]; -r regex, therefore # [I will process the "p." abbreviation after I strip the document of # extraneous whitespace.] # ============================================================================ # REMOVE URLs # Here is the approach that I used to remove URLs, etc. from my files # not sed -r .... therefore \-escape the ? 
: # sed -i -e 's/http[s]\?:\/\/\S*//g ; s/www\.\S*//g ; s/ftp:\S*//g ; s/doi:\S*//g' tmp_file sed -i -e 's/http[s]\?:\/\/\S*//g ; s/www\.\S*//g ; s/ftp:\S*//g ; s/[dD][oO][iI]:\s\?\S*//g' tmp_file # However, that expression leaves "blank" lines, that this perl expression removes: perl -i -pe 's/^'`echo "\012"`'${2,}//g' tmp_file ## 012 is the octal form of \n # Posted to / explained at: # https://stackoverflow.com/questions/4283344/sed-to-remove-urls-from-a-file/47821796#47821796 # ... includes an alternative to using "branch labels" to deal with newlines, \n, with sed ... # ============================================================================ # REMOVE (SOME) REFERENCES: perl -i -pe 's/^Reference:.*$//g;s/^Ref:.*$//g;s/^Citation:.*$//g; s/^'`echo "\012"`'${2,}//g' tmp_file # The last bit removes the non-printing newlines (\n) that are left behind. # Test: # # Ongoing work in the Black lab seeks to uncover biomarkers of response and toxicity to new immunotherapeutic agents used in the fight against lung cancer. # Reference: Madeline Krentz Gober, James P. Collard, Katherine Thompson, Esther P. Black.A microRNA signature of response to erlotinib is descriptive of TGFβ behaviour in NSCLC. # Ref: Madeline Krentz Gober, James P. Collard, Katherine Thompson, Esther P. Black.A microRNA signature of response to erlotinib is descriptive of TGFβ behaviour in NSCLC. # Citation: Madeline Krentz Gober, James P. Collard, Katherine Thompson, Esther P. Black.A microRNA signature of response to erlotinib is descriptive of TGFβ behaviour in NSCLC. # Our previous work identified a 13-gene miRNA signature predictive of response to the epidermal growth factor receptor (EGFR) inhibitor, erlotinib, in Non-Small Cell Lung Cancer cell lines. 
#
#   perl -pe 's/^Reference:.*$//g;s/^Ref:.*$//g;s/^Citation:.*$//g; s/^'`echo "\012"`'${2,}//g'

# ============================================================================
# WHITESPACE, TABS:

# Remove leading, trailing whitespace and multiple spaces from sentences:
# https://www.cyberciti.biz/tips/delete-leading-spaces-from-front-of-each-word.html

sed -i 's/^[ \t]*//; s/[ \t]*$//' tmp_file    ## two (chained) sed expressions

# Replace multiple spaces with single space.
# The run is written as an explicit interval ( \{2,\} : two or more) rather
# than as literal spaces followed by "*": a pattern such as " *" can match the
# empty string, and sed would then insert a space between every character.
squeeze_spaces_rule='s/ \{2,\}/ /g'
sed -i "$squeeze_spaces_rule" tmp_file

# ============================================================================
# REMAINING PAGE NUMBER ABBREVIATIONS:

# The page number abbreviation "p." is more complicated than "pp.".  We
# needed to process "pp." (above) BEFORE "p.", otherwise substitution
# of the "p." in "pp." will incorrectly get substituted with "Cho4Ph".
#
# "p." is only tokenised (as Eiph2T) when it follows whitespace or an opening
# bracket and is followed (after 0 or 1 whitespace) by a digit or a roman
# numeral letter (i, v, x).
#   * [ in a character class [] must appear first: [[...]
#   * whitespace is written as [:space:], because inside a bracket expression
#     "\s" is NOT a whitespace class: it is a literal backslash and a literal
#     "s" (which would tokenise "sp. 1" and miss " p. 5").
page_abbrev_rule='s/([[({[:space:]])p\.\s?([ivx0-9])/\1Eiph2T\2/g'
sed -i -r "$page_abbrev_rule" tmp_file

# ============================================================================
# BIOCHEMICAL TEXT -- AMINO ACIDS:

# Need to do these before processing periods, as (e.g.) the p. ("protein")
# in p.Arg62His (an amino acid substitution / variant) will be processed
# as an abbreviation, and/or split into a sentence at that period ...
# Tokenise the "p." + three-letter amino acid prefix of protein variants
# (e.g. p.Arg62His) so that the period is not later treated as an
# abbreviation or a sentence boundary.
#   * The period is escaped ( \. ): an unescaped "." matches ANY character, so
#     unrelated text such as "pXArg" would be rewritten as well.
#   * All rules run in a single sed invocation (one rewrite of tmp_file
#     instead of twenty); the patterns do not overlap, and no replacement
#     token contains "p.", so the result does not depend on their order.
aa_variant_rules=(
  -e 's/p\.Ala/HieN7uuP/g'    ## Ala    Alanine (A)
  -e 's/p\.Arg/Nae0RaeZ/g'    ## Arg    Arginine (R)
  -e 's/p\.Asn/see7AuK6/g'    ## Asn    Asparagine (N)
  -e 's/p\.Asp/chaeJeu1/g'    ## Asp    Aspartic Acid (D)
  -e 's/p\.Cys/EiV6Gaix/g'    ## Cys    Cysteine (C)
  -e 's/p\.Gln/Ufaiph2b/g'    ## Gln    Glutamine (Q)
  -e 's/p\.Glu/Goh8eish/g'    ## Glu    Glutamic Acid (E)
  -e 's/p\.Gly/xei1Phei/g'    ## Gly    Glycine (G)
  -e 's/p\.His/aak0eVei/g'    ## His    Histidine (H)
  -e 's/p\.Ile/vai9aeS3/g'    ## Ile    Isoleucine (I)
  -e 's/p\.Leu/ohzah5Ei/g'    ## Leu    Leucine (L)
  -e 's/p\.Lys/Oa4Aequo/g'    ## Lys    Lysine (K)
  -e 's/p\.Met/TheeWie7/g'    ## Met    Methionine (M)
  -e 's/p\.Phe/ohNa9pe0/g'    ## Phe    Phenylalanine (F)
  -e 's/p\.Pro/Eetaib7k/g'    ## Pro    Proline (P)
  -e 's/p\.Trp/ga3yeeGh/g'    ## Trp    Tryptophan (W)
  -e 's/p\.Tyr/DuY2Gub7/g'    ## Tyr    Tyrosine (Y)
  -e 's/p\.Ser/oezoo9Ca/g'    ## Ser    Serine (S)
  -e 's/p\.Thr/wahRoo7E/g'    ## Thr    Threonine (T)
  -e 's/p\.Val/ieKai4oo/g'    ## Val    Valine (V)
)
sed -i "${aa_variant_rules[@]}" tmp_file

# ----------------------------------------------------------------------------
# GENOMIC VARIANTS:

# ... a letter prefix should be used to indicate the type of reference sequence used.
# Accepted prefixes are;
#   "g." for a genomic reference sequence
#   "c." for a coding DNA reference sequence
#   "n." for a non-coding DNA reference sequence
#   "r." for an RNA reference sequence (transcript)
#   "p." for a protein reference sequence

# ============================================================================
# PERIODS:

# To better deal with the many complications associated with periods,
# first delete all spaces preceding and proceeding periods.  This will
# take care of, e.g.:  U. S. A.  |  The end .  |  V. A. Stuart  |
# J. Am. Soc. Chem. ...
# Strip any whitespace directly before, then directly after, every period.
sed -i 's/\s*\././g' tmp_file
sed -i 's/\.\s*/./g' tmp_file

# ----------------------------------------
# Ellipses (ellipsis: ...) -- convert 3 or more periods (.) to an ellipsis:
sed -i 's/\.\{3,\}/.../g' tmp_file

# .. then store those ellipses as a UID:
sed -i 's/\.\.\./Iet1auki/g' tmp_file

# ... and finally convert remaining tandem periods (..) to a single period:
sed -i 's/\.\././g' tmp_file

# ----------------------------------------
# version (v.) abbreviation (v. + 0 or 1 whitespace + a digit):
sed -i -r 's/v\.\s?([0-9])/Eegh5eel\1/g' tmp_file

# ----------------------------------------
# versus (vs.) abbreviation:
sed -i 's/vs\./Air5ah/g' tmp_file

# ----------------------------------------
# "E.g.", "e.g.", "I.e." or "i.e.":
sed -i 's/[eE]\.g\./Va1Eed/g' tmp_file
sed -i 's/[iI]\.e\./Uchee4/g' tmp_file

# ----------------------------------------
# "cc.", "CC." or "cf.":

# This also captures "Hcc" (hepatocellular carcinoma) at the end of a sentence: Hcc.
#   sed -i 's/[cC]\.\?[cC]\./Ri9Ohk/g' tmp_file
# Workaround: require that the preceding character is not H / h.  That
# character is captured and written back ( \1 ); without the capture it would
# be consumed by the match and silently deleted from the text.  The "^"
# alternative also lets the abbreviation match at the very start of a line.
cc_abbrev_rule='s/\(^\|[^Hh]\)[cC]\.\?[cC]\./\1Ri9Ohk/g'
sed -i "$cc_abbrev_rule" tmp_file

# " cc " or " CC ":
sed -i 's/\s[cC][cC]\s/ Ri9Ohk /g' tmp_file

sed -i 's/c\.\?f\./Tig8shei/g' tmp_file
sed -i 's/\scf\s/ Tig8shei /g' tmp_file

# ----------------------------------------
# "et al." abbreviation (will restore, with period, later):
sed -i 's/et al\./et al/g' tmp_file

# ----------------------------------------
# "Fig.", "fig.", "Figs.", "figs.":

# As I don't otherwise process commas, I can simply use them as a facile
# substitution for periods (later swapping , for . in post-processing):
# NOTE(review): "[s]" makes the "s" mandatory, so only "Figs." / "figs." are
# protected here, not the singular "Fig." / "fig." named in the heading.
# Relaxing it to "s?" looks right, but the post-processing rule that swaps the
# comma back must accept the singular form too -- confirm before changing.
sed -i -r 's/([fF]ig[s])\./\1,/g' tmp_file

# ----------------------------------------
# Personal titles (again, temporarily replace '.' with ','):
sed -i \
  -e 's/Dr\./Dr,/g' \
  -e 's/Drs\./Drs,/g' \
  -e 's/Mr\./Mr,/g' \
  -e 's/Mrs\./Mrs,/g' \
  -e 's/Ms\./Ms,/g' \
  -e 's/St\./St,/g' \
  tmp_file

# ============================================================================
# OTHER BIOCHEMICAL TEXT:

# ----------------------------------------
# SINGLE QUOTATIONS:

# Note that some single quotes (i.e. apostrophes), e.g., 5'-, 3'-, ...
# are important in biochemistry / chemistry.  To be safe, we'll proactively
# capture / protect these:
sed -i "s/3'/tho6Si2o/g" tmp_file    ## e.g.: 3'-end
sed -i "s/5'/oochie8P/g" tmp_file    ## e.g.: 5'-ATGGCTCGATCTTA...
sed -i "s/A's/ohph5AN6/g" tmp_file   ## e.g.: (multiple adenines) multiple A's precede
sed -i "s/C's/Ji4oopow/g" tmp_file   ## e.g.: (multiple cytosines) multiple C's precede
sed -i "s/G's/Aeyahk4A/g" tmp_file   ## e.g.: (multiple guanines) multiple G's precede
sed -i "s/T's/oogeel3W/g" tmp_file   ## e.g.: (multiple thymines) multiple T's precede

# ----------------------------------------
# BIOCHEMICAL, CHEMICAL PRIMES:
# (The 3' and 5' rules below cannot match any more: every 3' / 5' was already
# tokenised just above.  They are kept so the list of primes stays complete.)
prime_rules=(
  -e "s/1'/hooPhil4/g"
  -e "s/2'/He5EiS1Z/g"
  -e "s/3'/IeghuP3V/g"
  -e "s/4'/Loh4aeri/g"
  -e "s/5'/Aht9Vohs/g"
  -e "s/6'/ReiR5zee/g"
  -e "s/7'/eiTei4ri/g"
  -e "s/8'/ay0ePicu/g"
  -e "s/9'/seeHush2/g"
)
sed -i "${prime_rules[@]}" tmp_file

# ============================================================================
# REMAINING SINGLE, DOUBLE QUOTATIONS:

# Delete all double quotations: not particularly needed in NLP, e.g. tokenized text:
sed -i 's/"//g' tmp_file

# ----------------------------------------------------------------------------
# CONTRACTIONS:

# Deal with common contractions, before dealing with single quotes / apostrophes.
# ----------------------------------------
# First, expand common contractions:

sed -i -r "s/([a-z])'d/\1 did/g" tmp_file    ## otherwise, 'd* becomes did*
sed -i -r "s/([a-z])'m/\1 am/g" tmp_file     ## otherwise, 'm* becomes am*; e.g. to 'mess' with >> to amess' with

# Negations.  The irregular forms must be rewritten BEFORE the generic n't
# rule, which would otherwise produce "wo not" (won't) or "ca not" (can't).
negation_rules=(
  -e "s/won't/will not/g"
  -e "s/Won't/Will not/g"
  -e "s/can't/cannot/g"
  -e "s/Can't/Cannot/g"
  -e "s/n't/ not/g"                          ## isn't | shouldn't | wouldn't | ...
)
sed -i "${negation_rules[@]}" tmp_file

sed -i "s/'ll/ will/g" tmp_file
sed -i "s/'re/ are/g" tmp_file
sed -i "s/'ve/ have/g" tmp_file
sed -i "s/here's/here is/g" tmp_file         ## here's | Here's | there's | There's | where's | Where's ...
sed -i "s/I'd/I would/g" tmp_file
sed -i "s/It's/It is/g" tmp_file
sed -i "s/\sit's/ it is/g" tmp_file
sed -i "s/That's/That is/g" tmp_file
sed -i "s/that's/that is/g" tmp_file
sed -i "s/What's/What is/g" tmp_file
sed -i "s/\swhat's/ what is/g" tmp_file

# ----------------------------------------
# Next, substitute remaining contractions with UID (restore in post-processing):

sed -i -r "s/([a-zI])'d/\1chaSaib7/g" tmp_file     ## e.g.: I'd | how'd | who'd | why'd | ...
sed -i -r "s/([a-zI])'ll/\1UivahJ5e/g" tmp_file    ## e.g.: I'll
sed -i -r "s/([a-zI])'m/\1chahei1O/g" tmp_file     ## e.g.: I'm
sed -i -r "s/([a-z])'t/\1Zeep7Auy/g" tmp_file
sed -i -r "s/([a-z])'nt/\1Zeep7Auy/g" tmp_file     ## e.g.: is'nt [grammatical (spelling) error]
sed -i -r "s/([a-z])'re/\1Phoh5eil/g" tmp_file     ## e.g.: you're | We're responsible ...
# ------------------
sed -i "s/'six/eKu6eech/g" tmp_file                ## e.g.: escape 'six
sed -i "s/'seven/pahl8Avu/g" tmp_file              ## e.g.: escape 'seven
sed -i -r "s/([a-z])'s/\1zaoGii5p/g" tmp_file      ## e.g.: there's | various possessives: Victoria's | women's | ...
# ------------------
# UPDATE: the following expression left (when apostrophes restored) artefacts like this:
#       'mess'[orig text] >> [processing: this script] >> mess' [output]:
#
# sed -i -r "s/([a-z])'\s/\1ueKek3oh/g" tmp_file   ## e.g.: plural noun possessives ending in "s": girls' dresses | Wilsons' house | ...
#
# It is not needed, with the inclusion of the "final" rule, below:
#       sed -i "s/'//g" tmp_file
# (quoted above for reference only: the apostrophe-stripping rule itself runs
# once, after the less common contractions below have been tokenised)
# ------------------
# NOTE(review): the 't rule looks redundant with the earlier  's/([a-z])'t/\1Zeep7Auy/g'
# substitution, and 've has already been expanded to " have" -- confirm.
sed -i -r "s/([a-z])'t/\1iCuRahb6/g" tmp_file       ## e.g.: isn't
sed -i -r "s/([a-zI])'ve/\1Roopes5f/g" tmp_file     ## e.g.: I've' | (+)'ve

# less common / archaic:
sed -i "s/ma'am/Quei2Eex/g" tmp_file
sed -i "s/ne'er/IeDae7Lu/g" tmp_file                ## e.g.: ne'er-do-well
sed -i -r "s/o'([a-z])/Xahc3Iel\1/g" tmp_file       ## e.g.: o'clock
sed -i "s/'twas/uph4aida/g" tmp_file                ## e.g. 'twas the night; escapes: 'two | 'twenty ...

# Finally, delete all remaining single quotations, apostrophes:
sed -i "s/'//g" tmp_file

# WITH THE EXPRESSION ABOVE, THIS SHOULD **NOT** BE NEEDED:
# Delete single quotations, apostrophes at end of words:
# sed -i "s/'\s/ /g" tmp_file       ## e.g.: missed' that
# sed -i "s/'\././g" tmp_file       ## e.g.: missed.' That
# sed -i "s/\.'/./g" tmp_file       ## e.g.: missed'. That

# ============================================================================
# PREPROCESSING MISCELLANY:

# ----------------------------------------
# Delete tandem commas, semicolons (a run of any length becomes one):
tandem_separator_rules=(
  -e 's/,\{2,\}/,/g'
  -e 's/;\{2,\}/;/g'
)
sed -i "${tandem_separator_rules[@]}" tmp_file

# ----------------------------------------
# Clean up improperly-terminated sentences (e.g. ?!!?!?!??!):

# A whole run of terminal punctuation [.!?] is reduced in one pass:
#   * a run that contains a "?" becomes a single "?"
#   * any remaining run that contains a "!" becomes a single "!"
# This covers tandem marks (??, !!) and every mixed permutation (!? ?! ?. !. .?
# .!) regardless of run length.  Collapsing pairs first and permutations
# afterwards is order-sensitive and leaves e.g. "????" for the example above.
# Not regex (-r), so the bare ? below is a literal character.
terminal_punct_rules=(
  -e 's/[.!?]*?[.!?]*/?/g'
  -e 's/[.!]*![.!]*/!/g'
)
sed -i "${terminal_punct_rules[@]}" tmp_file

# ============================================================================
# BRACKETS:

# These can be annoying, especially re: processing.  They are important in
# chemistry / biochemistry, however (e.g. chemical / biochemical names), so
# for now just do the usual substitute / replace later approach.
# The order of these steps is important: do ( [ {, then ) ] } associated
# with periods (to split at those), then do left-over ( ) [ ] { }.

# ----------------------------------------
# Simplify [{ as ( ; simplify ]} as ) :

sed -i 's/\[/(/g' tmp_file                       ## \-escape the [ : \[
sed -i 's/]/)/g' tmp_file
sed -i 's/{/(/g' tmp_file
sed -i 's/}/)/g' tmp_file

# ----------------------------------------
# Angle brackets { < | > }:

# Deal with these first: (angle brackets used as mathematical inequalities);
# include "p" to capture (e.g.) "p < 0.001" or "p > 0.001" :
sed -i -r 's/([0-9p])\s?<\s?([0-9])/\1Woxoh4ph\2/g' tmp_file
sed -i -r 's/([0-9p])\s?>\s?([0-9])/\1aeja8ohM\2/g' tmp_file

# not "sed -r", therefore \-escape "?" (regex 0 or 1 modifier) -- \? :
sed -i 's/\s\?<\s\?=\s\?/aev3Shoo/g' tmp_file
sed -i 's/\s\?>\s\?=\s\?/iez7ieVi/g' tmp_file

# ... then remove all other angle brackets (treated like the other bracket
# types: "<" becomes "(" and ">" becomes ")").
# NOTE(review): these two rules had been garbled into a single "s//)/g"
# (empty regex, which GNU sed rejects with "no previous regular expression");
# the "<" half is reconstructed by symmetry -- confirm against the upstream script.
sed -i 's/</(/g' tmp_file
sed -i 's/>/)/g' tmp_file

# ----------------------------------------
# Delete spaces following leading parentheses; delete spaces preceding lagging parentheses:

sed -i 's/(\s\?/(/g' tmp_file
sed -i 's/\s\?)/)/g' tmp_file

# ----------------------------------------
# Delete empty and multiple parentheses:

sed -i 's/(\s\?)//g' tmp_file
sed -i 's/(\{2,\}/(/g' tmp_file
sed -i 's/)\{2,\}/)/g' tmp_file

# ----------------------------------------
# Split parentheses associated with punctuation (.?!) at the ends of sentences:

sed -i 's/\.)\s\?/.)\n/g' tmp_file
sed -i 's/\.\s\?(/.\n(/g' tmp_file
sed -i 's/?)\s\?/?)\n/g' tmp_file
sed -i 's/?\s\?(/?\n(/g' tmp_file
sed -i 's/!)\s\?/!)\n/g' tmp_file
sed -i 's/!\s\?(/!\n(/g' tmp_file

# ----------------------------------------
# Split lines on ") (", only if first parenthesized expression is at the end of a sentence:

sed -i 's/[.!?]\s\?)\s\?(/.)\n(/g' tmp_file

# ----------------------------------------
# Clean up: remove parentheses at start or end of lines:

# First, (again) remove all leading and trailing whitespace from sentences, as well as multiple spaces:
sed -i 's/^[ \t]*//; s/[ \t]*$//' tmp_file       ## two (chained) sed expressions

sed -i 's/^(//g' tmp_file
sed -i 's/)$//g' tmp_file

# ============================================================================
# AUTHOR INITIALS; JOURNAL TITLE ABBREVIATIONS:
# =============================================
# Periods that belong to abbreviations are replaced by the placeholder
# "Shah7a" so that the sentence splitter does not break on them.

sed -i -r 's/(\.[A-Z][a-z]{0,13})\./\1Shah7a/g' tmp_file
    # (Proc.NatlShah7aAcad.SciShah7aUShah7aS.AShah7a104, 9346

sed -i -r 's/(Shah7a[A-Z][a-z]{0,13})\./\1Shah7a/g' tmp_file
    # (Proc.NatlShah7aAcadShah7aSciShah7aUShah7aSShah7aAShah7a104, 9346

# ----------------------------------------------------------------------------
# Match abbreviations at the start of a line.

sed -i -r 's/(^[A-Z][a-z]{0,13})\./\1Shah7a/g' tmp_file

# ----------------------------------------------------------------------------
# Capture the first abbreviation inside a parenthesis ( ( ):

sed -i -r 's/(\([A-Z][a-z]{0,13})\./\1Shah7a/g' tmp_file   ## \-escaped, literal ( inside () character substitution

# ----------------------------------------------------------------------------
# Authors' names -- additional processing:

# ------------------
# Match hyphenated names abbreviations (e.g. Chen A.-B.
# Jiang): hyphenated author initials "A.-B." become "AShah7a-BShah7a":
sed -i -r 's/\.-([A-Z])\./Shah7a-\1Shah7a/g' tmp_file

# ------------------
# Clean up { space Cap(range 1:4 Caps) dot Cap | space Cap dash Cap dot } patterns:
# William F.JShah7aMcLeod | A.BCD.Smith | ABCD.Smith | Chen A-B.JiangShah7a | ...

sed -i -r 's/\s([A-Z]{1,4})\.([A-Z])/ \1Shah7a\2/g' tmp_file
    # Rule above prevents " HCC. More", etc., from being split (HCC: hepatocellular carcinoma).
    # I deal with it via custom splits, in "post-processing."

sed -i -r 's/\s([A-Z]-[A-Z])\./ \1Shah7a/g' tmp_file

# ============================================================================
# SPLIT SENTENCES ONTO SEPARATE LINES:
# ------------------------------------

# Here we want to process the remaining periods to split sentences onto
# separate lines, with the caveats (i) that we do not want to split decimal
# numbers (3.1; ... i.e. [0-9].[0-9]), and (ii) we do not want to (as much
# as practically possible) split abbreviations (journal titles; authors; ...).

# sed -i -r 's/([\.?!])\s?([A-Z][A-Za-z0-9 ,-]{4,})/\1\n\2/g' tmp_file   ## literal space, [ ] inside that [A-Za-z ,-] character class
# Expression above failed to split: .TGFβ -- corrected here:
sed -i -r 's/([\.?!])\s?([A-Z][A-Za-z0-9αβγδεζηθικλμνξοπρςστυφχψω ,-]{3,})/\1\n\2/g' tmp_file

# ============================================================================
# RESTORATIONS:
# =============

# ----------------------------------------
# (Re-)delete leading and trailing whitespace from sentences, as well as
# multiple spaces (if present / inadvertently reintroduced):

sed -i 's/^[ \t]*//;s/[ \t]*$//' tmp_file

# Replace multiple spaces with single space.  The interval \{2,\} requires at
# least two spaces; the pattern " *" also matches the empty string and would
# insert a space between every pair of characters:
sed -i 's/ \{2,\}/ /g' tmp_file

# ----------------------------------------------------------------------------
# Restorations -- amino acids (e.g.: p.Arg in p.Arg62His):

sed -i 's/HieN7uuP/p.Ala/g' tmp_file
sed -i 's/Nae0RaeZ/p.Arg/g' tmp_file
sed -i 's/see7AuK6/p.Asn/g' tmp_file
sed -i 's/chaeJeu1/p.Asp/g' tmp_file
sed -i 's/EiV6Gaix/p.Cys/g' tmp_file
sed -i 's/Ufaiph2b/p.Gln/g' tmp_file
sed -i 's/Goh8eish/p.Glu/g' tmp_file
sed -i 's/xei1Phei/p.Gly/g' tmp_file
sed -i 's/aak0eVei/p.His/g' tmp_file
sed -i 's/vai9aeS3/p.Ile/g' tmp_file
sed -i 's/ohzah5Ei/p.Leu/g' tmp_file
sed -i 's/Oa4Aequo/p.Lys/g' tmp_file
sed -i 's/TheeWie7/p.Met/g' tmp_file
sed -i 's/ohNa9pe0/p.Phe/g' tmp_file
sed -i 's/Eetaib7k/p.Pro/g' tmp_file
sed -i 's/ga3yeeGh/p.Trp/g' tmp_file
sed -i 's/DuY2Gub7/p.Tyr/g' tmp_file
sed -i 's/oezoo9Ca/p.Ser/g' tmp_file
sed -i 's/wahRoo7E/p.Thr/g' tmp_file
sed -i 's/ieKai4oo/p.Val/g' tmp_file

# ----------------------------------------
# Restore single quotations:

sed -i "s/tho6Si2o/3'/g" tmp_file                ## "g" flag added: restore every occurrence on a line, like the rules below
sed -i "s/oochie8P/5'/g" tmp_file
sed -i "s/ohph5AN6/A's/g" tmp_file
sed -i "s/Ji4oopow/C's/g" tmp_file
sed -i "s/Aeyahk4A/G's/g" tmp_file
sed -i "s/oogeel3W/T's/g" tmp_file

# Restore angle brackets used as mathematical inequalities:

sed -i 's/Woxoh4ph/ < /g' tmp_file
sed -i 's/aeja8ohM/ > /g' tmp_file
sed -i 's/aev3Shoo/ <= /g' tmp_file
sed -i 's/iez7ieVi/ >= /g' tmp_file

# ----------------------------------------
# Restore common contractions:

sed -i "s/chaSaib7/'d/g" tmp_file
sed -i "s/UivahJ5e/'ll/g" tmp_file
sed -i "s/chahei1O/'m/g" tmp_file
sed -i "s/Zeep7Auy/'t/g" tmp_file
# (A second rule, "s/Zeep7Auy/'nt/g", used to follow here; it could never
#  match because the line above has already consumed every Zeep7Auy.)
sed -i "s/Phoh5eil/'re/g" tmp_file
# ------------------
sed -i "s/eKu6eech/six/g" tmp_file
sed -i "s/pahl8Avu/seven/g" tmp_file
sed -i "s/zaoGii5p/'s/g" tmp_file
sed -i "s/ueKek3oh/' /g" tmp_file
# ------------------
sed -i "s/iCuRahb6/'t/g" tmp_file
sed -i "s/Roopes5f/'ve/g" tmp_file

# less common / archaic:
sed -i "s/Quei2Eex/ma'am/g" tmp_file
sed -i "s/IeDae7Lu/ne'er/g" tmp_file
sed -i "s/Xahc3Iel/o'/g" tmp_file
sed -i "s/uph4aida/'twas/g" tmp_file

# ----------------------------------------
# Restore biochemical, chemical primes:

sed -i "s/hooPhil4/1'/g" tmp_file
sed -i "s/He5EiS1Z/2'/g" tmp_file
sed -i "s/IeghuP3V/3'/g" tmp_file
sed -i "s/Loh4aeri/4'/g" tmp_file
sed -i "s/Aht9Vohs/5'/g" tmp_file
sed -i "s/ReiR5zee/6'/g" tmp_file
sed -i "s/eiTei4ri/7'/g" tmp_file
sed -i "s/ay0ePicu/8'/g" tmp_file
sed -i "s/seeHush2/9'/g" tmp_file

# ----------------------------------------
# Restore version (v.):

sed -i -r 's/Eegh5eel/v./g' tmp_file

# ----------------------------------------
# Restore versus ("vs."):

sed -i 's/Air5ah/vs. /g' tmp_file

# ----------------------------------------
# Restore "e.g." and "i.e.":

sed -i 's/Va1Eed/e.g. /g' tmp_file
sed -i 's/Uchee4/i.e. /g' tmp_file

# Capitalize restored "e.g.", "i.e.", "c.f." present at the start of a sentence:
sed -i 's/^c\.f\./Cf. /g' tmp_file
sed -i 's/^e\.g\./E.g. /g' tmp_file
sed -i 's/^i\.e\./I.e. /g' tmp_file

# ----------------------------------------
# Restore "c.c." and "cf.":

sed -i 's/Ri9Ohk/ cc. /g' tmp_file
sed -i 's/Tig8shei/cf. /g' tmp_file

# ----------------------------------------
# Restore page number abbreviations {pp. | p.}:

sed -i 's/Cho4Ph/pp./g' tmp_file
sed -i 's/Eiph2T/ p./g' tmp_file

# ----------------------------------------
# Restore ellipses (...):

sed -i 's/Iet1auki/ ... /g' tmp_file             ## add space before ...

# ... and split line if following character is a Capital letter:
sed -i -r 's/\.\.\.(\s?[A-Z])/...\n\1/g' tmp_file

# ... and delete line if it consists solely of an ellipsis ("...") [optionally with spaces]:
sed -i -r 's/^\s{0,}\.\.\.\s{0,}$//g' tmp_file

# ----------------------------------------
# Restore et al. :

sed -i 's/et al/et al. /g' tmp_file

# ----------------------------------------
# Restore "Fig.", "fig.", "Figs.", "figs." -- the plural "s" is optional
# ("[s]" required it, so singular "Fig," / "fig," was never restored):

sed -i -r 's/([fF]igs?),/\1\. /g' tmp_file

# ----------------------------------------
# Restore personal titles (replace ',' with '.'):

sed -i 's/St,/St. /g' tmp_file
sed -i 's/Ms,/Ms. /g' tmp_file
sed -i 's/Mrs,/Mrs. /g' tmp_file
sed -i 's/Mr,/Mr. /g' tmp_file
sed -i 's/Drs,/Drs. /g' tmp_file

# ----------------------------------------------------------------------------
# Miscellany: split St.
at end of sentence: sed -i -r 's/\sSt.\s?([A-Z])/ St.\n\1/g' tmp_file # ---------------------------------------------------------------------------- # Lastly , restore author initials, journal title abbreviations: sed -i 's/Shah7a/./g' tmp_file # ============================================================================ # POSTPROCESSING: # =============== # ---------------------------------------------------------------------------- # Delete { ---------- | ========== }-type lines: # [I often use these to delimiter sections of text.] sed -i '/^[-=]*$/d' tmp_file # Deletes all of these: # --------------------- # ===================== # --------=====-------- # =====----------===== # ---------------------------------------- # Remove unterminated lines (no terminal ".!?", often due to citations): # sed -i -r 's/^.*[^.!?]$//g' tmp_file # Ack! Expression above ** appears** to delete lines ending in ellipsis (...) # (closer inspection: those ellipses were followed by spaces). # Workaround -- first remove spaces at EOL: sed -i 's/\s\s*$//g' tmp_file # .. THEN remove unterminated lines (excluding also those ending in an ellipsis): sed -i -r 's/^.*[^.{1,3}?!]$//g' tmp_file # Converts: # # DIR: Cellular Signaling - TGFβ Signaling Pathway. # SUBJ: Cell signaling interaction may prevent key step in lung cancer progression. # Date: November 9, 2017 # Source: University of Kentucky # Ongoing work seeks to uncover biomarkers of response used in the fight against lung cancer! # Ongoing work seeks to uncover biomarkers of response used in the fight against lung cancer? # A microRNA signature of response to erlotinib is descriptive of TGFβ behaviour in NSCLC # Madeline Krentz Gober, James P. Collard, Katherine Thompson & Esther P. Black # Scientific Reports 7, Article number: 4202 (2017 # ABSTRACT # Our previous work identified a miRNA signature in Non-Small Cell Lung Cancer cell lines. # # to # # DIR: Cellular Signaling - TGFβ Signaling Pathway. 
# SUBJ: Cell signaling interaction may prevent key step in lung cancer progression. # Ongoing work seeks to uncover biomarkers of response used in the fight against lung cancer. # Ongoing work seeks to uncover biomarkers of response used in the fight against lung cancer! # Ongoing work seeks to uncover biomarkers of response used in the fight against lung cancer? # Our previous work identified a miRNA signature in Non-Small Cell Lung Cancer cell lines. # ---------------------------------------- # "unsplit" sentences: # As a consequence of processing abbreviations, some existing abbreviations # get captured; e.g. [original text] "... in PD.Overall ...". That PD # (Parkinson's Disease) abbreviation gets preprocessed by his script as ah # abbreviation [PDShah7a], and so it is not present during the sentence # splitting step; the period (hence unsplit sentence) is added when "Shah7a" # is replaced with a period. sed -i -r 's/\s([A-Z]{2,4})\.([A-Z])/ \1.\n\2/g' tmp_file # Also, some citations at the ends of sentences do not get split: sed -i -r 's/([ a-z])([0-9]{1,3})\.([A-Z])/\1\2.\n\3/g' tmp_file # Matches a space or lowercase letter followed by 1-3 numbers followed by a # period followed by a capital letter; e.g. "response17.TGFβ" or "response 17.TGFβ". # ---------------------------------------- # "Run-on" names, e.g. "Esther P.Black" (will get tokenized as "p.black" ...: # sed -i -r 's/([A-Z])\.([A-Z])/\1. \2/g' tmp_file # Problem -- expression above also (e.g.) converts "Sci.U.S.A." to "Sci.U. S.A." # Workaround: sed -i -r 's/([a-z]\s?[A-Z])\.([A-Z])/\1. \2/g' tmp_file # Authors initials with spaces (remove spaces) -- e.g. " A. B. Charles" >> " A.B.Charles" : sed -i -r 's/\s([A-Z])\.\s([A-Z])\./ \1.\2./g' tmp_file # ---------------------------------------- # Dot space: # sed -i 's/\.\s\?/./g' tmp_file # ---------------------------------------- # Miscellaneous unsplit: # As mentioned above, some abbreviations (e.g. 
hepatocellular carcinoma: Hcc | Hcc) # must be processed via "one-of" rules: sed -i 's/\s\?[Hh][Cc][Cc]\./HCC.\n/g' tmp_file # ------------------ sed -i -r 's/([a-z]{2,})\.\s?([0-9])([A-Z])/\1.\n\2\3/g' tmp_file ## e.g.: lines.2OH-BNPP1 # Oops: this splits "invasion. km23" but also splits "pp. iii"; "vs. that"; ... # sed -i -r 's/([a-z]{2,})\.\s?([a-z]{2,})/\1.\n\2/g' tmp_file ## e.g.: invasion. km23-1 # Facile solution -- extend match length: sed -i -r 's/([a-z]{4,})\.\s?([a-z]{2,})/\1.\n\2/g' tmp_file sed -i -r 's/([A-Za-z0-9])\.\s?([αβγδεζηθικλμνξοπρςστυφχψω])/\1.\n\2/g' tmp_file ## e.g.: CBX7. β3 | manner. β3 sed -i -r 's/([a-z]{2,})\.\s?([0-9])([A-Z])/\1.\n\2\3/g' tmp_file ## e.g.: lines.2OH-BNPP1 # ---------------------------------------- # Delete any remaining empty / blank lines (if they exist): sed -i '/^\s*$/d' tmp_file ## * : 0 or more instances (here, of spaces: \s) perl -i -pe 's/^'`echo "\012"`'${2,}//g' tmp_file ## 012 is the octal form of \n # ---------------------------------------- # Delete any remaining multiple spaces: sed -i 's/\s\s\?/ /g' tmp_file # ---------------------------------------- # Delete space comma: sed -i 's/\s,/,/g' tmp_file # ---------------------------------------- # Delete spaces at beginning of lines: sed -i 's/^\s\s\?//g' tmp_file # ============================================================================ # FINAL SED OPERATION: # ==================== # ---------------------------------------------------------------------------- # Final sed operation; output to file: sed -i 's/Dr,/Dr. 
/g' tmp_file # ---------------------------------------------------------------------------- # Create output files, into PREEXISTING ./output directory: # http://pubs.opengroup.org/onlinepubs/007908799/xcu/basename.html # https://stackoverflow.com/questions/15803227/getting-permission-denied-on-dirname-and-basename # https://stackoverflow.com/questions/7194192/basename-with-spaces-in-a-bash-script outname=$(basename "$f") mv "tmp_file" "output/$outname" done # https://stackoverflow.com/questions/4638874/how-to-loop-through-a-directory-recursively-to-delete-files-with-certain-extensi # At top of script: IFS=$'\n'; set -f # Unset here: unset IFS; set +f # ---------------------------------------------------------------------------- # SIGNAL END OF SCRIPT EXECUTION: # for i in 1 2 3 4 5 for i in 1 2 3 do { #aplay alarm-frenzy.mp3 ## << aplay cannot play MP3 files; use WAV #aplay beep.wav #aplay ding.wav aplay /mnt/Vancouver/Programming/scripts/PHASER.WAV #aplay /mnt/Vancouver/Programming/scripts/KenbeepLoud.wav sleep 0.25 #echo "Welcome $i times" } &> /dev/null ## re: above - suppresses aplay echo in terminal, per: ## http://stackoverflow.com/questions/18062778/how-to-hide-command-output-in-bash done # ============================================================================ # Q.E.D.! :-D # ============================================================================