#!/bin/sh
#
# Download the latest MusicBrainz "cdstubs" full-export dump and convert the
# table mbdump/release_raw into a set of XML files under data/doc/, one file
# per block of 100 record ids (file name = int(id / 100)).
#
# Requires: wget, bzip2, tar, sed, awk.
# Side effects: creates ./data (must not exist yet) and fills it with the
# downloaded archive, the unpacked dump, dump.txt and doc/*.xml.

# Abort on the first failing command or unset variable, so that a failed
# mkdir/cd/download does not let later steps run in the wrong directory or
# on missing input.
set -eu

BASE_URL=http://ftp.musicbrainz.org/pub/musicbrainz/data/fullexport

echo "Prepare data directory ..."
mkdir data
cd data

echo "Download and uppack the data dump ..."
wget "$BASE_URL/LATEST"
# The file LATEST holds the directory name of the most recent export.
LATEST=$(cat LATEST)
wget "$BASE_URL/$LATEST/mbdump-cdstubs.tar.bz2"
bzip2 -d mbdump-cdstubs.tar.bz2
tar -xvf mbdump-cdstubs.tar

# Escape the XML special characters; '&' has to be handled first, otherwise
# the ampersands introduced by the '<' and '>' replacements would be escaped
# a second time.
sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g' \
  mbdump/release_raw > dump.txt

# Number of documents: records go to file int(id / 100), so the highest file
# index is int(max / 100) and the loops below need one more than that to
# frame every file.
ndocs=$(awk -F'\t' '$1 + 0 > max { max = $1 + 0 }
  END { print int(max / 100) + 1 }' dump.txt)

# NOTE(review): the markup literals were empty in the reviewed source
# (apparently stripped by a text extraction step). Only the element name
# "item" is confirmed by the final inspection command; the XML declaration,
# the root element and the remaining element names are reconstructed and
# must be confirmed against whatever consumes these files.

echo "Create the documents ..."
mkdir doc
idoc=0
while [ "$idoc" -lt "$ndocs" ]; do
  {
    printf '%s\n' '<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>'
    printf '%s\n' '<list>'
  } > "doc/$idoc.xml"
  idoc=$((idoc + 1))
done

echo "Fill the documents with the content of the dump ..."
# Columns 1, 2, 3, 4, 9 and 10 of the tab separated dump are emitted.
# The previous output file is closed whenever the target changes so the
# number of documents is not limited by the open-file limit; '>>' keeps
# appending if a file is reopened later.
awk -F'\t' '{
  fname = "doc/" int($1 / 100) ".xml"
  if (fname != prev) {
    if (prev != "") close(prev)
    prev = fname
  }
  print "<item><id>" $1 "</id><title>" $2 "</title><artist>" $3 \
        "</artist><date>" $4 "</date><upc>" $9 "</upc><note>" $10 \
        "</note></item>" >> fname
}' dump.txt

# Close the root element of every document.
idoc=0
while [ "$idoc" -lt "$ndocs" ]; do
  printf '%s\n' '</list>' >> "doc/$idoc.xml"
  idoc=$((idoc + 1))
done
cd ..

echo "Leave the data directory and inspect the result ..."
echo "Nof Documents: $(ls -l data/doc/*.xml | wc -l)"
echo "Nof entries (example document) $(grep -c item data/doc/999.xml)"