#this needs to be run inside one of the clusterdock containers
#name resolution is not always reliable in these containers,
#so add the Google DNS server before running this, just in case
#echo "nameserver 8.8.8.8" >> /etc/resolv.conf
#make a temporary directory to store the downloaded csv files
mkdir /tmp/data
#go there
cd /tmp/data
#run the script that downloads and unarchives all the csv files
wget -qO- https://raw.githubusercontent.com/academyofdata/clusterdock/master/getrawdata.sh | bash -s
#if everything went fine we should now have csv files in /tmp/data; put them onto HDFS
#first do a little setup - create a few directories and give all HDFS users write access to them
HADOOP_USER_NAME=hdfs hdfs dfs -mkdir /data
HADOOP_USER_NAME=hdfs hdfs dfs -chmod a+w /data
HADOOP_USER_NAME=hdfs hdfs dfs -mkdir /metadata
HADOOP_USER_NAME=hdfs hdfs dfs -chmod a+w /metadata
HADOOP_USER_NAME=hdfs hdfs dfs -mkdir /data/movies
HADOOP_USER_NAME=hdfs hdfs dfs -mkdir /data/movies_internal_hive
HADOOP_USER_NAME=hdfs hdfs dfs -mkdir /data/users
HADOOP_USER_NAME=hdfs hdfs dfs -mkdir /data/ratings
HADOOP_USER_NAME=hdfs hdfs dfs -mkdir /data/ratings-all
#now put the files in there
HADOOP_USER_NAME=hdfs hdfs dfs -put /tmp/data/movies.csv /data/movies/movies.csv
#note: the movie updates file goes to /tmp on HDFS, not under /data
HADOOP_USER_NAME=hdfs hdfs dfs -put /tmp/data/movies_updates.csv /tmp/movies_updates.csv
HADOOP_USER_NAME=hdfs hdfs dfs -put /tmp/data/movies.csv /data/movies_internal_hive/movies.csv
HADOOP_USER_NAME=hdfs hdfs dfs -put /tmp/data/users.csv /data/users/users.csv
HADOOP_USER_NAME=hdfs hdfs dfs -put /tmp/data/ratings.csv /data/ratings/ratings.csv
HADOOP_USER_NAME=hdfs hdfs dfs -put /tmp/data/ratings-all.csv /data/ratings-all/ratings-all.csv
#open up write access on everything under /data
HADOOP_USER_NAME=hdfs hdfs dfs -chmod -R a+w /data
#run the script that generates Avro and Parquet versions of the data
wget -qO- https://raw.githubusercontent.com/academyofdata/clusterdock/master/gen-avro-parquet.sh | bash -s
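
#optional sanity check (a minimal sketch, not part of the original script):
#list what landed on HDFS and peek at one of the uploaded files, so any failed
#-put is caught before the downstream Avro/Parquet generation runs
HADOOP_USER_NAME=hdfs hdfs dfs -ls -R /data
HADOOP_USER_NAME=hdfs hdfs dfs -cat /data/movies/movies.csv | head -n 5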