#!/bin/sh

# Pipeline configuration, exported so that every child process sees it.
# Assignment and export are kept separate: "export VAR=$(cmd)" hides the exit
# status of cmd and is subject to word splitting in some /bin/sh versions.
SCRIPT=$(readlink -f "$0")           # resolved path of this script
SCRIPTPATH=$(dirname "$SCRIPT")      # directory holding this script and its helpers
PROJECTPATH=$(dirname "$SCRIPTPATH") # parent directory of SCRIPTPATH
PYTHONHASHSEED=123                   # fixed hash seed for the Python helper scripts
DATAPATH=/data/wikipedia
STORAGEPATH=/srv/wikipedia/storage
MAXNOFHTREADS=14                     # passed as thread count (-t / -threads) to the tools
export SCRIPT PYTHONHASHSEED SCRIPTPATH PROJECTPATH DATAPATH STORAGEPATH MAXNOFHTREADS

mkdir -p "$DATAPATH"
mkdir -p "$STORAGEPATH"

# Download the English Wikipedia dump into $DATAPATH, unpack it and convert it
# with strusWikimediaToXml into the directory $DATAPATH/xml.
# Globals:   DATAPATH, MAXNOFHTREADS (read)
# Returns:   non-zero as soon as the directory change, the download or the
#            decompression fails; otherwise the status of the final cleanup.
# Note:      leaves the working directory of the calling shell at $DATAPATH.
createData() {
	# Stop here if the data directory is unusable; otherwise the dump would be
	# downloaded into whatever directory the caller happens to be in.
	cd "$DATAPATH" || return
	wget http://dumps.wikimedia.your.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 || return
	bunzip2 enwiki-latest-pages-articles.xml.bz2 || return

	mkdir -p xml
	mkdir -p nlpxml
	mkdir -p storage

	# Two runs over the same dump: the first one is given -R ./redirects.txt, the
	# second one -L ./redirects.txt and the output directory "xml".
	# NOTE(review): presumably run one writes the redirect list that run two
	# loads - confirm against the strusWikimediaToXml usage.
	strusWikimediaToXml -n 0 -P 10000 -R ./redirects.txt enwiki-latest-pages-articles.xml
	strusWikimediaToXml -B -n 0 -P 10000 -t "$MAXNOFHTREADS" -L ./redirects.txt enwiki-latest-pages-articles.xml xml

	# Remove the side files (*.err, *.mis, *.wtf, *.org, *.txt) below xml/.
	# A single find with -exec handles file names containing whitespace and does
	# not call rm without operands when nothing matches.
	find xml \( -name '*.err' -o -name '*.mis' -o -name '*.wtf' -o -name '*.org' -o -name '*.txt' \) -exec rm -f {} +
}

# Run strusMergeMarkup inside $DATAPATH with the relative directories
# nlpxml.old/ and xml/ as inputs, nlpxml/ as output (-o) and errout/ as -F.
# Globals:   DATAPATH, MAXNOFHTREADS (read)
# Returns:   non-zero without running the merge when $DATAPATH cannot be
#            entered, otherwise the status of strusMergeMarkup.
# Note:      leaves the working directory of the calling shell at $DATAPATH.
mergePosTagging()
{
	# All directory arguments are relative, so the merge must not run when the
	# directory change failed.
	cd "$DATAPATH" || return
	strusMergeMarkup -x ".xml" -t "$MAXNOFHTREADS" -k "T,P,C,X,A,V,M,N,E,U,R,W" -o nlpxml/ -F errout/ nlpxml.old/ xml/
}

# Run the three POS tagging stages for one document directory.
# Arguments: $1 - id of the directory below $DATAPATH/xml to process
# Globals:   DATAPATH, SCRIPTPATH (read); DID, NLPCONV, EC, PYTHONHASHSEED (written)
# Outputs:   $DATAPATH/pos/$DID.txt, $DATAPATH/tag/$DID.txt, $DATAPATH/nlpxml/$DID;
#            on failure a one-line message in $DATAPATH/err/$DID.txt
# Returns:   0 when all stages succeeded, otherwise the exit status of the
#            first failing stage. Later stages are skipped after a failure so
#            that they neither consume broken input nor overwrite the message.
processPosTagging() {
    DID=$1
    NLPCONV=$SCRIPTPATH/strusnlp.py
    # Same value as the export at the top of the file.
    PYTHONHASHSEED=123
    # The shell redirections below fail when these directories are missing.
    mkdir -p "$DATAPATH/pos" "$DATAPATH/tag" "$DATAPATH/err"

    # [1] Call a strus program to scan the Strus Wikipedia XML generated in the previous step from the Wikimedia dump.
    #	the program creates a text dump in $DATAPATH/pos/$DID.txt with all the selected contents as input for the
    #	POS tagging script.
    strusPosTagger -I -x xml -C XML -D '; ' -X '//pagelink@id://pagelink//*()' -Y '##' \
        -e '//pagelink()' -e '//weblink()' -e '//text()' -e '//attr()' \
        -e '//char()' -e '//math()' -e '//code()' -e '//bibref()' \
        -E '//mark' -E '//text' -E '//entity' -E '//attr' -E '//attr~' \
        -E '//quot' -E '//quot~' -E '//pagelink' -E '//weblink' -E '//tablink' \
        -E '//citlink' -E '//reflink' -E '//tabtitle' -E '//head' -E '//cell' \
        -E '//bibref' -E '//time' -E '//char' -E '//code' -E '//math' \
        -p '//heading' -p '//table' -p '//citation' -p '//ref' -p '//list' \
        -p '//cell~' -p '//head~' -p '//heading~' -p '//list~' -p '//br' \
        "$DATAPATH/xml/$DID" "$DATAPATH/pos/$DID.txt"
    EC="$?"
    if [ "$EC" != "0" ]; then
        echo "Error creating POS tagger input: $EC" > "$DATAPATH/err/$DID.txt"
        return "$EC"
    fi

    # [2] Call the POS tagging script with the text dumps in $DATAPATH/pos/$DID.txt and write the output to $DATAPATH/tag/$DID.txt
    "$NLPCONV" -S -C 100 < "$DATAPATH/pos/$DID.txt" > "$DATAPATH/tag/$DID.txt"
    EC="$?"
    if [ "$EC" != "0" ]; then
        echo "Error in POS tagger script: $EC" > "$DATAPATH/err/$DID.txt"
        return "$EC"
    fi

    # [3] Merge the output of the POS tagging script with the original XML in $DATAPATH/xml/$DID/
    #	and write a new XML file with the same name into $DATAPATH/nlpxml/$DID/
    strusPosTagger -F "$DATAPATH/err/tag_$DID.err" -x ".xml" -C XML \
        -e '//pagelink()' -e '//weblink()' -e '//text()' -e '//attr()' \
        -e '//char()' -e '//math()' -e '//code()' -e '//bibref()' \
        -o "$DATAPATH/nlpxml/$DID" "$DATAPATH/xml/$DID" "$DATAPATH/tag/$DID.txt"
    EC="$?"
    if [ "$EC" != "0" ]; then
        echo "Error tagging XML with POS tagger output: $EC" > "$DATAPATH/err/$DID.txt"
        return "$EC"
    fi
    return 0
}

# Run processPosTagging for every document id of one slice.
# Ids are the four-digit strings 0000..6999; an id belongs to the slice when
# its decimal value modulo SLICE equals WHAT.
# Arguments: $1 - WHAT,  remainder selecting the slice
#            $2 - SLICE, number of slices
#            $3 - START, first id (default 0000)
#            $4 - END,   last id (default 9999)
# Globals:   DATAPATH (read)
# Outputs:   progress messages on stdout
# Returns:   0. Processing ends as soon as the file
#            $DATAPATH/flags/stop_nlp exists when the next id of the slice
#            is about to be started.
processPosTaggingDumpSlice() {
    WHAT=$1
    SLICE=$2
    START=${3:-0000}
    END=${4:-9999}
    LASTJOB=none
    for aa in 0 1 2 3 4 5 6 ; do
    for bb in 0 1 2 3 4 5 6 7 8 9; do
    for cc in 0 1 2 3 4 5 6 7 8 9; do
    for dd in 0 1 2 3 4 5 6 7 8 9; do
        DID=$aa$bb$cc$dd
        [ "$DID" -ge "$START" ] || continue
        [ "$DID" -le "$END" ] || continue
        # Build the decimal value from the digits: feeding the zero padded id
        # to $(( )) would make the shell read it as an octal number.
        NUM=$((aa * 1000 + bb * 100 + cc * 10 + dd))
        [ "$((NUM % SLICE))" -eq "$WHAT" ] || continue
        if [ -e "$DATAPATH/flags/stop_nlp" ]; then
            echo "stopped POS tagging ($LASTJOB)."
            # Leave all four loops at once; "break" would only end the innermost.
            return 0
        fi
        echo "processing POS tagging of $DID ..."
        processPosTagging "$DID"
        LASTJOB=$DID
    done
    done
    done
    done
    return 0
}

# Call strusTagMarkup for every document directory $DATAPATH/nlpxml/<id> with
# START <= id <= END, using the same directory as input and output argument.
# Ids are the four-digit strings 0000..6999.
# Arguments: $1 - START, first id (default 0000)
#            $2 - END,   last id (default 9999)
# Globals:   DATAPATH, MAXNOFHTREADS (read)
# Outputs:   one progress line per id on stdout
processHeadingTagMarkup() {
    START=${1:-0000}
    END=${2:-9999}
    for aa in 0 1 2 3 4 5 6 ; do
    for bb in 0 1 2 3 4 5 6 7 8 9; do
    for cc in 0 1 2 3 4 5 6 7 8 9; do
    for dd in 0 1 2 3 4 5 6 7 8 9; do
        DID=$aa$bb$cc$dd
        [ "$DID" -ge "$START" ] || continue
        [ "$DID" -le "$END" ] || continue
        echo "processing title/heading tag markup of $DID ..."
        # The -P value is the id followed by "_1".
        strusTagMarkup -t "$MAXNOFHTREADS" -x xml -e '/doc/title' -e '//heading' -d '//br' -P "${DID}_1" "$DATAPATH/nlpxml/$DID" "$DATAPATH/nlpxml/$DID"
    done
    done
    done
    done
}

# Call strusTagMarkup once for the whole directory $DATAPATH/nlpxml (input and
# output argument are the same), selecting '//category' with the markup class
# "map" and the attribute "cid".
# Globals:   DATAPATH (read)
# Outputs:   one progress line on stdout
processCategoryTagMarkup() {
    echo "processing category tag markup ..."
    strusTagMarkup -x xml --markup map --attribute cid -e '//category' -P "1:lc:convdia" "$DATAPATH/nlpxml" "$DATAPATH/nlpxml"
}

# Validate the XML files of every document directory $DATAPATH/nlpxml/<id>
# with START <= id <= END using "xmllint --noout".
# Ids are the four-digit strings 0000..6999; ids without a directory are skipped.
# Arguments: $1 - START, first id (default 0000)
#            $2 - END,   last id (default 9999)
# Globals:   DATAPATH (read)
# Outputs:   progress on stdout; everything xmllint prints for an id is kept
#            in $DATAPATH/err/xmlerr.<id>.xml, which is removed again when empty
processDocumentCheck() {
    START=${1:-0000}
    END=${2:-9999}
    # The report files are created by a shell redirection, which needs the directory.
    mkdir -p "$DATAPATH/err"
    for aa in 0 1 2 3 4 5 6 ; do
    for bb in 0 1 2 3 4 5 6 7 8 9; do
    for cc in 0 1 2 3 4 5 6 7 8 9; do
    for dd in 0 1 2 3 4 5 6 7 8 9; do
        DID=$aa$bb$cc$dd
        [ "$DID" -ge "$START" ] || continue
        [ "$DID" -le "$END" ] || continue
        # Test the directory of this id, not just the common parent directory.
        [ -d "$DATAPATH/nlpxml/$DID" ] || continue
        echo "checking $DID ..."
        REPORT=$DATAPATH/err/xmlerr.$DID.xml
        # Glob instead of parsing ls so that file names with blanks survive.
        for ff in "$DATAPATH/nlpxml/$DID"/*.xml; do
            [ -e "$ff" ] || continue    # pattern without a match stays literal
            xmllint --noout "$ff"
        done > "$REPORT" 2>&1
        if [ -s "$REPORT" ]; then
            echo "$DID has errors, see $REPORT"
        else
            rm -f "$REPORT"             # ... delete empty files
        fi
    done
    done
    done
    done
}


# Dump the analyzer output of one document directory and append it, passed
# through the filter script, to $DATAPATH/vec.txt.
# Arguments: $1 - id of the directory below $DATAPATH/nlpxml
# Globals:   DATAPATH, PROJECTPATH, SCRIPTPATH (read); DID, CFG, FILTER (written)
# Returns:   the exit status of the filter script (last pipeline stage)
dumpVectorInput() {
    DID=$1
    CFG=$PROJECTPATH/config/word2vecInput.ana
    FILTER=$SCRIPTPATH/filtervectok.py
    # Appends (>>): the caller is responsible for removing an old vec.txt.
    strusAnalyze --dump "eod='\n. . . . . . . .\n',punct=' , ',eos=' .\n',refid,word" --unique -C XML "$CFG" "$DATAPATH/nlpxml/$DID/" | "$FILTER" >> "$DATAPATH/vec.txt"
}

# Train word2vec on $DATAPATH/vec.txt; the vectors go to $DATAPATH/vec.bin and
# the vocabulary to $DATAPATH/vocab.txt.
# Globals:   DATAPATH, MAXNOFHTREADS (read)
# Returns:   the exit status of word2vec
calcWord2vec() {
    word2vec -size 256 -window 8 -sample 1e-5 -negative 16 -threads "$MAXNOFHTREADS" \
        -type-prefix-delim '#' \
        -type-min-count 'H=1,V=100,E=4,N=20,A=100,C=100,X=100,M=100,U=2,R=100,W=100,T=100' \
        -min-count 5 -alpha 0.025 -classes 0 -debug 2 -binary 1 -portable 1 \
        -save-vocab "$DATAPATH/vocab.txt" -cbow 0 \
        -train "$DATAPATH/vec.txt" -output "$DATAPATH/vec.bin"
}

# (Re)create the vector storage "vec" below $STORAGEPATH from $DATAPATH/vec.bin.
# An existing storage directory is destroyed first.
# Globals:   STORAGEPATH, DATAPATH (read); STORAGEID (written)
# Returns:   the exit status of strusCreateVectorStorage
insertVectors() {
    STORAGEID=vec
    if [ -d "$STORAGEPATH/$STORAGEID" ]; then
        strusDestroy -s "path=$STORAGEPATH/$STORAGEID"
    fi
    # vecdim=256 is the same number that calcWord2vec passes as -size.
    strusCreateVectorStorage -c 100000 -s "path=$STORAGEPATH/$STORAGEID;vecdim=256;bits=64;variations=32" -P -f "$DATAPATH/vec.bin"
}

# Rebuild $DATAPATH/vec.txt: remove the old file, then call dumpVectorInput
# for every id with START <= id <= END.
# Ids are the four-digit strings 0000..6999.
# Arguments: $1 - START, first id (default 0000)
#            $2 - END,   last id (default 9999)
# Globals:   DATAPATH (read)
# Outputs:   one progress line per id on stdout
dumpVectorInputAll() {
    # -f: a missing file (first run) is not an error.
    rm -f "$DATAPATH/vec.txt"
    START=${1:-0000}
    END=${2:-9999}
    for aa in 0 1 2 3 4 5 6 ; do
    for bb in 0 1 2 3 4 5 6 7 8 9; do
    for cc in 0 1 2 3 4 5 6 7 8 9; do
    for dd in 0 1 2 3 4 5 6 7 8 9; do
        DID=$aa$bb$cc$dd
        [ "$DID" -ge "$START" ] || continue
        [ "$DID" -le "$END" ] || continue
        echo "processing $DID ..."
        dumpVectorInput "$DID"
    done
    done
    done
    done
}

# (Re)create the storage $STORAGEPATH/<id> and add the meta data element
# "doclen UINT32" to it. An existing storage directory is destroyed first.
# Arguments: $1 - storage id (directory name below $STORAGEPATH), required
# Globals:   STORAGEPATH (read); STORAGEID (written)
# Returns:   1 when the id is empty, otherwise the status of strusAlterMetaData
createStorage() {
    STORAGEID=$1
    # With an empty id the path collapses to "$STORAGEPATH/", which always is
    # a directory - the destroy call would then target the storage root.
    if [ -z "$STORAGEID" ]; then
        echo "createStorage: missing storage id" >&2
        return 1
    fi
    if [ -d "$STORAGEPATH/$STORAGEID" ]; then
        strusDestroy -s "path=$STORAGEPATH/$STORAGEID"
    fi
    strusCreate -s "path=$STORAGEPATH/$STORAGEID"
    strusAlterMetaData -s "path=$STORAGEPATH/$STORAGEID" "add doclen UINT32"
}

# Insert the documents of one slice into the storage $STORAGEPATH/<id>.
# Ids are the four-digit strings 0000..6999; an id belongs to the slice when
# its decimal value modulo SLICE equals WHAT. The ids are handed to strusInsert
# in batches, one batch per block of hundred ids (same first two digits).
# Arguments: $1 - STORAGEID, storage directory name below $STORAGEPATH
#            $2 - WHAT,  remainder selecting the slice
#            $3 - SLICE, number of slices
#            $4 - START, first id (default 0000)
#            $5 - END,   last id (default 9999)
# Globals:   DATAPATH, STORAGEPATH, PROJECTPATH (read)
# Outputs:   progress messages on stdout
# Returns:   1 when $DATAPATH/nlpxml does not exist, otherwise 0. Processing
#            ends as soon as the file $DATAPATH/flags/stop_insert exists when
#            the next batch is about to be inserted.
insertDocuments() {
    STORAGEID=$1
    WHAT=$2
    SLICE=$3
    START=${4:-0000}
    END=${5:-9999}
    CFG=$PROJECTPATH/config/doc.ana
    LASTJOB=none

    # The batches are relative to this directory; without it strusInsert would
    # run in whatever directory the caller is in.
    if [ ! -d "$DATAPATH/nlpxml" ]; then
        echo "insertDocuments: missing directory $DATAPATH/nlpxml" >&2
        return 1
    fi

    for aa in 0 1 2 3 4 5 6 ; do
    for bb in 0 1 2 3 4 5 6 7 8 9; do
        PATHLIST=""
        for cc in 0 1 2 3 4 5 6 7 8 9; do
        for dd in 0 1 2 3 4 5 6 7 8 9; do
            DID=$aa$bb$cc$dd
            [ "$DID" -ge "$START" ] || continue
            [ "$DID" -le "$END" ] || continue
            # Build the decimal value from the digits: feeding the zero padded
            # id to $(( )) would make the shell read it as an octal number.
            NUM=$((aa * 1000 + bb * 100 + cc * 10 + dd))
            if [ "$((NUM % SLICE))" -eq "$WHAT" ]; then
                PATHLIST="${PATHLIST:+$PATHLIST }$DID"
            fi
        done
        done
        if [ -e "$DATAPATH/flags/stop_insert" ]; then
            echo "stopped inserting documents ($LASTJOB)."
            # Leave both loops at once; "break" would only end the inner one.
            return 0
        fi
        if [ -n "$PATHLIST" ]; then
            echo "inserting documents of $PATHLIST ..."
            # Subshell: the directory change does not leak into the caller.
            # $PATHLIST is left unquoted on purpose, every id is one argument.
            ( cd "$DATAPATH/nlpxml" && strusInsert -s "path=$STORAGEPATH/$STORAGEID" -x xml -C XML -t 3 -c 5000 "$CFG" $PATHLIST )
            LASTJOB=$PATHLIST
        fi
    done
    done
    return 0
}

# ---- Pipeline driver: the stages below run strictly one after another ----

# Fetch the dump and convert it to XML.
createData

# POS tagging, split into the three slices DID % 3 == 0, 1, 2.
for posSlice in 0 1 2; do
    processPosTaggingDumpSlice "$posSlice" 3 0000 5762
done

# Markup and validation of the tagged documents.
processHeadingTagMarkup 0000 5762

processDocumentCheck 0000 5762

processCategoryTagMarkup

# Word vectors: dump the training text, train, load into the vector storage.
dumpVectorInputAll 0000 5762
calcWord2vec
insertVectors

# Document storages doc1..doc4, each receiving the slice DID % 4 == 0..3.
docShard=0
for storageName in doc1 doc2 doc3 doc4; do
    createStorage "$storageName"
    insertDocuments "$storageName" "$docShard" 4 0000 5762
    docShard=$((docShard + 1))
done