#this needs to be run inside one of the clusterdock containers
#name resolution is not always reliable in these containers,
#so add the Google DNS server before running this, just in case
#echo "nameserver 8.8.8.8" >> /etc/resolv.conf
#make a temporary directory to store the downloaded csv files
mkdir /tmp/data
#go there
cd /tmp/data
#run the script that downloads and unarchives all the csv files
wget -qO- https://raw.githubusercontent.com/academyofdata/clusterdock/master/getrawdata.sh | bash -s
#if everything went fine we should now have csv files in /tmp/data; put them onto HDFS
#first do a little setup - create a few directories and give all HDFS users write access to them
HADOOP_USER_NAME=hdfs hdfs dfs -mkdir /data
HADOOP_USER_NAME=hdfs hdfs dfs -chmod a+w /data
HADOOP_USER_NAME=hdfs hdfs dfs -mkdir /metadata
HADOOP_USER_NAME=hdfs hdfs dfs -chmod a+w /metadata
HADOOP_USER_NAME=hdfs hdfs dfs -mkdir /data/movies
HADOOP_USER_NAME=hdfs hdfs dfs -mkdir /data/movies_internal_hive
HADOOP_USER_NAME=hdfs hdfs dfs -mkdir /data/users
HADOOP_USER_NAME=hdfs hdfs dfs -mkdir /data/ratings
HADOOP_USER_NAME=hdfs hdfs dfs -mkdir /data/ratings-all
#now put the files in there
HADOOP_USER_NAME=hdfs hdfs dfs -put /tmp/data/movies.csv /data/movies/movies.csv
#note: the movie updates file goes to /tmp on HDFS, not under /data
HADOOP_USER_NAME=hdfs hdfs dfs -put /tmp/data/movies_updates.csv /tmp/movies_updates.csv
HADOOP_USER_NAME=hdfs hdfs dfs -put /tmp/data/movies.csv /data/movies_internal_hive/movies.csv
HADOOP_USER_NAME=hdfs hdfs dfs -put /tmp/data/users.csv /data/users/users.csv
HADOOP_USER_NAME=hdfs hdfs dfs -put /tmp/data/ratings.csv /data/ratings/ratings.csv
HADOOP_USER_NAME=hdfs hdfs dfs -put /tmp/data/ratings-all.csv /data/ratings-all/ratings-all.csv
#open up write access on everything under /data
HADOOP_USER_NAME=hdfs hdfs dfs -chmod -R a+w /data
#run the script that generates Avro and Parquet versions of the data
wget -qO- https://raw.githubusercontent.com/academyofdata/clusterdock/master/gen-avro-parquet.sh | bash -s
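
#optional sanity check (a minimal sketch, not part of the original script):
#list what landed on HDFS and peek at one of the uploaded files, so any failed
#-put is caught before the downstream Avro/Parquet generation runs
HADOOP_USER_NAME=hdfs hdfs dfs -ls -R /data
HADOOP_USER_NAME=hdfs hdfs dfs -cat /data/movies/movies.csv | head -n 5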