# Initial definitions

In [None]:
%env HADOOP_VERSION 2.9.2
%env HADOOP_PATH hadoop-2.9.2

# Preparing the environment

## Downloading Hadoop

In [None]:
# Superseded releases are removed from the Apache mirrors, so download from the
# permanent Apache archive instead of a regional mirror.
# -O pins the local file name: a re-run overwrites the tarball rather than
# saving a second copy with a ".1" suffix that the next cell would ignore.
!wget -q --show-progress -O hadoop-${HADOOP_VERSION}.tar.gz \
 https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz

## Extracting the compressed files and removing the .tar.gz archive

In [None]:
# Unpack quietly (no -v, so there is no file listing to throw away), then delete the archive.
# To start from a clean tree, first run: rm -r ${HADOOP_PATH}
!tar -xf hadoop-${HADOOP_VERSION}.tar.gz
!rm hadoop-${HADOOP_VERSION}.tar.gz

## Discovering the Java path

In [None]:
# Resolve the real location of javac (readlink -f follows symlinks) and strip the
# last two path components; the printed directory is the candidate JAVA_HOME.
!dirname $(dirname $(readlink -f $(which javac)))

## Setting the Java path envvar

We also append it to the user's `.bashrc` and `.profile`, and to Hadoop's `hadoop-env.sh`, so it is loaded when the nodes open ssh connections.

In [None]:
%env JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64

In [None]:
# Persist JAVA_HOME for login shells, ssh sessions and the Hadoop scripts.
# The value is taken from the JAVA_HOME variable set in the previous cell, so the
# path is defined in one place only.
# grep -qxF checks for the exact line first, so re-running the cell does not keep
# appending duplicates; 2>/dev/null hides the error when the file does not exist yet.
!grep -qxF "export JAVA_HOME=${JAVA_HOME}" ~/.bashrc 2>/dev/null || echo "export JAVA_HOME=${JAVA_HOME}" >> ~/.bashrc
!grep -qxF "export JAVA_HOME=${JAVA_HOME}" ~/.profile 2>/dev/null || echo "export JAVA_HOME=${JAVA_HOME}" >> ~/.profile
!grep -qxF "export JAVA_HOME=${JAVA_HOME}" ${HADOOP_PATH}/etc/hadoop/hadoop-env.sh 2>/dev/null || echo "export JAVA_HOME=${JAVA_HOME}" >> ${HADOOP_PATH}/etc/hadoop/hadoop-env.sh

# Hadoop in Standalone Mode (local)

## MapReduce in the local filesystem - word count example

In [None]:
# Hadoop refuses to start a job whose output directory already exists, so remove
# the result of any previous run first; this keeps the cell re-runnable.
!rm -rf ./output
# Run the bundled wordcount example on a local text file, writing to ./output.
!${HADOOP_PATH}/bin/hadoop jar ${HADOOP_PATH}/share/hadoop/mapreduce/hadoop-mapreduce-examples-${HADOOP_VERSION}.jar wordcount \
 ./resources/examples/newyorknewyork.txt ./output

### Listing files in the output folder

In [None]:
# Show what the local job wrote.
!ls output/

### Reading output file

In [None]:
# Print the result file produced by the local job.
!cat output/part-r-00000

# Hadoop in Pseudo-Distributed Mode

## Preparing the environment

### Starting sshd server

Check `/binder/postBuild` and `/resources/configs/ssh/sshd_config` files for more details

In [None]:
# Start the ssh daemon using the configuration file shipped in this repository (-f).
!/usr/sbin/sshd -f resources/configs/ssh/sshd_config 

### Adding names to known hosts

The commands below establish ssh connections to the host names/IPs that will be used. This step avoids the interactive yes/no host confirmation later.

In [None]:
# Open one throw-away connection per host name so its key is accepted up front.
# The remote command is simply "exit". Note that ssh's -C flag means compression,
# not "command", so it is not needed to run a remote command.
# Options are placed before the destination and use the same key=value form as
# HADOOP_SSH_OPTS.
!ssh -o StrictHostKeyChecking=no -p 8822 $USER@localhost exit
!ssh -o StrictHostKeyChecking=no -p 8822 $USER@0.0.0.0 exit

### Adding ssh options to Hadoop via envvar

* connecting on a different port (`-p 8822`)
* avoiding host key checking (`-o StrictHostKeyChecking=no`)

In [None]:
%env HADOOP_SSH_OPTS= -o StrictHostKeyChecking=no -p 8822

In [None]:
%env PDSH_RCMD_TYPE ssh

### Copying configuration files to the Hadoop folder

Check the configuration files that match the Hadoop version.
Refer to the `/resources/configs/hadoop/` folder.

In [None]:
# Install the version-specific core-site.xml and hdfs-site.xml into Hadoop's config folder.
!cp resources/configs/hadoop/${HADOOP_VERSION}/core-site.xml \
 resources/configs/hadoop/${HADOOP_VERSION}/hdfs-site.xml \
 ${HADOOP_PATH}/etc/hadoop/

## Formatting the filesystem

In [None]:
# Format the NameNode storage without asking for confirmation.
# NOTE(review): with -force, re-running this cell presumably wipes existing HDFS
# metadata - confirm that is intended before running it on a populated cluster.
!${HADOOP_PATH}/bin/hdfs namenode -format -force -nonInteractive

## Starting DFS (NameNode, SecondaryNameNode, and DataNode daemons)

In [None]:
# Launch the HDFS daemons, then list the running Java processes with jps so the
# NameNode, SecondaryNameNode and DataNode can be checked.
!${HADOOP_PATH}/sbin/start-dfs.sh
!jps

## MapReduce - Word count example 

### Creating folders in the distributed file system

In [None]:
# -p creates the missing parent directories in one call and does not fail when the
# directories already exist, so the cell can be re-run safely.
!${HADOOP_PATH}/bin/hdfs dfs -mkdir -p /user/matheus/input/

### Copying a file to a folder in the distributed file system

In [None]:
# Upload the sample text to HDFS. -f overwrites the destination if it already
# exists, so re-running the cell does not fail with a "File exists" error.
!${HADOOP_PATH}/bin/hdfs dfs -put -f ./resources/examples/newyorknewyork.txt /user/matheus/input/

### Listing files in a folder of the distributed file system

In [None]:
# List the contents of the HDFS input folder.
!${HADOOP_PATH}/bin/hdfs dfs -ls /user/matheus/input

### Retrieving the contents of a file in the distributed file system

In [None]:
# Print the uploaded file straight from HDFS.
!${HADOOP_PATH}/bin/hdfs dfs -cat /user/matheus/input/newyorknewyork.txt

### Running MapReduce job in Pseudo-Distributed Mode

In [None]:
# A MapReduce job aborts if its output directory already exists, so delete the
# result of any previous run first (-f: no error when there is nothing to delete).
!${HADOOP_PATH}/bin/hdfs dfs -rm -r -f /user/matheus/output
# Run wordcount over the HDFS input folder. The Hadoop path is used without a
# leading "./", matching the other cells and still working if HADOOP_PATH is absolute.
!${HADOOP_PATH}/bin/hadoop jar ${HADOOP_PATH}/share/hadoop/mapreduce/hadoop-mapreduce-examples-${HADOOP_VERSION}.jar wordcount \
 /user/matheus/input /user/matheus/output

### Listing files in the output folder

In [None]:
# List the files the HDFS job produced.
!${HADOOP_PATH}/bin/hdfs dfs -ls /user/matheus/output/

### Reading output file

In [None]:
# Print the result file of the HDFS job.
!${HADOOP_PATH}/bin/hdfs dfs -cat /user/matheus/output/part-r-00000

# Starting YARN in Pseudo-Distributed Mode

## Preparing the environment

### Copying configuration files to the Hadoop folder

In [None]:
# Install the version-specific mapred-site.xml and yarn-site.xml into Hadoop's config folder.
!cp resources/configs/hadoop/${HADOOP_VERSION}/mapred-site.xml \
 resources/configs/hadoop/${HADOOP_VERSION}/yarn-site.xml \
 ${HADOOP_PATH}/etc/hadoop/

## Starting YARN

In [None]:
# Launch the YARN daemons, then list the running Java processes with jps to
# check that they came up.
!${HADOOP_PATH}/sbin/start-yarn.sh
!jps

## MapReduce via YARN - Word count example 

In [None]:
# A MapReduce job aborts if its output directory already exists, so delete the
# result of any previous run first (-f: no error when there is nothing to delete).
!${HADOOP_PATH}/bin/hdfs dfs -rm -r -f /user/matheus/output2
# Submit wordcount through YARN. The Hadoop path is used without a leading "./",
# matching the other cells and still working if HADOOP_PATH is absolute.
!${HADOOP_PATH}/bin/yarn jar ${HADOOP_PATH}/share/hadoop/mapreduce/hadoop-mapreduce-examples-${HADOOP_VERSION}.jar wordcount \
 /user/matheus/input /user/matheus/output2

### Listing files in the output folder

In [None]:
# List the files the YARN job produced.
!${HADOOP_PATH}/bin/hdfs dfs -ls /user/matheus/output2/

### Reading output file

In [None]:
# Print the result file of the YARN job.
!${HADOOP_PATH}/bin/hdfs dfs -cat /user/matheus/output2/part-r-00000