#!/bin/bash
##################################################
### Assignment 2
### Seminar: The ABC of Computational Text Analysis
### University of Lucerne
##################################################

### parsing and removing meta information from a document similar to the following:
# Von BRUNO VANONI, BERN.
# 591 words
# 26 February 2004
# Tages Anzeiger
# TANZ
# German
# (c) 2004 Tages Anzeiger Homepage Address: http://www.tages-anzeiger.ch


### Task 1: Parse Information with RegEx

### general hints
# As meta information always starts at the beginning of a new line, use ^ to be more specific and avoid false positive matches.
# Similarly, you may use the $ symbol to match the end of the line.

# Task 1.1: parse number of words
# include the comma as there are numbers like 1,529
egrep -o "^[0-9,]+ words$" newspaper_articles.txt > words.txt

# Task 1.2: parse dates
# assuming dates of the format: X MONTH XXXX or XX MONTH XXXX
egrep -o "^[0-9]{1,2} \w+ [0-9]{4}$" newspaper_articles.txt > dates.txt

# Task 1.3: parse author names
# author names are not provided for all articles
# some examples are really hard to match, like:
# Von DAS GESPRÄCH FÜHRTEN IWAN STÄDLER UND VERENA ARBURG
# Von MIT ALEX SCHEIWILLER* SPRACH ERWIN HAAS.
# Practically, it's not worth covering all of them unless you really need them. Removing them is easier than extracting them exactly.
# Moreover, it takes more sophisticated patterns (lookarounds, see below) or even a machine-learning approach.
# Both approaches go beyond what you learn in this seminar.
# assuming names consist of uppercase letters, dots (e.g. abbreviated names), hyphens (e.g. double names) and spaces (e.g. firstname secondname lastname).
# thus, locations are excluded as they are preceded by commas.
egrep -o "^[vV]on [A-ZÄÜÖ .-]+" newspaper_articles.txt > authors.txt


### Task 2: Removing Parts of a Document
# Simply reuse the patterns from above and replace the matches with an empty sequence (i.e. nothing). Names are removed completely.
cat newspaper_articles.txt | sed -E "s/^[0-9,]+ words$//g" | sed -E "s/^[0-9]{1,2} \w+ [0-9]{4}$//g" | sed -E "s/^[vV]on [A-ZÄÜÖ .-].*//g" > clean.txt


####################################
# Collective Feedback
####################################
# - make patterns more general
#   - date: `DD* Month DDDD`
# - keep it simple
#   - name of month ~ any word ~ `\w+`
# - avoid false positives with positional information
#   - start of line: `^`
# - names are hard to extract
#   - variation + inconsistency
# - pro-tip: check the count of matches with `wc` and the cleanup with `diff`


####################################
# Further Information
####################################

# count the number of articles; with this information, you know how many matches you should get in Task 1
egrep "^Document" newspaper_articles.txt | wc -l

# the number of documents should equal the number of matches (e.g. word counts)
egrep "^[0-9,]+ words$" newspaper_articles.txt | wc -l

# check the differences between the original file and the clean file
diff -y --suppress-common-lines newspaper_articles.txt clean.txt

# More powerful engines (like the ones in Perl/Python) support negative/positive lookahead and lookbehind operators to make a match context-specific without including the context in the match.
# See https://www.rexegg.com/regex-lookarounds.html
# This may be useful for the name extraction task, e.g. to get rid of the "Von" and the period at the end of the line.
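
# As a sketch (assuming GNU grep with PCRE support via -P, which is not available in BSD/macOS grep):
# a lookbehind drops the leading "Von "/"von " from the match, and requiring the match to end
# in an uppercase letter keeps a trailing period out of it.
grep -oP "(?<=^[vV]on )[A-ZÄÜÖ .-]+[A-ZÄÜÖ]" newspaper_articles.txt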
# Even without using lookarounds, you can simply pipe multiple regex operations to subsequently remove more information and get the same result.
egrep "^[vV]on [A-ZÄÜÖ .-]+" newspaper_articles.txt | sed -E 's/^[vV]on //'

# To remove empty lines, you can use the following command
sed -E '/^\s*$/d' newspaper_articles.txt

# Pattern to extract URLs (there are also more complicated ones to cover edge cases)
egrep -o "(http://)?www\.[a-z0-9.-]+" newspaper_articles.txt
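
# A slightly more general sketch of a URL pattern: it also accepts https:// and scheme-prefixed
# URLs without a leading www. Note that trailing punctuation may still end up in the match.
egrep -o "https?://[^[:space:]]+|www\.[a-z0-9.-]+" newspaper_articles.txt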
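
# Putting the pieces together (a sketch, assuming GNU sed; clean_nonempty.txt is just an example
# file name): chain the Task 2 substitutions in one sed call and drop the lines left empty by the removal.
# Note that this also drops blank lines that were already in the original text.
sed -E -e "s/^[0-9,]+ words$//" \
       -e "s/^[0-9]{1,2} \w+ [0-9]{4}$//" \
       -e "s/^[vV]on [A-ZÄÜÖ .-].*//" newspaper_articles.txt \
  | sed -E '/^\s*$/d' > clean_nonempty.txt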