#!/usr/bin/env bash # Based on gs://dataproc-initialization-actions/jupyter/jupyter.sh # Installs the following on the dataproc master: # conda # jupyter # npm # nodejs # maven # docker # google_compute_engine [pip] # cigar [pip] # Installs the following on dataproc workers: # conda # cigar [pip] set -e ROLE=$(curl -f -s -H Metadata-Flavor:Google http://metadata/computeMetadata/v1/instance/attributes/dataproc-role) INIT_ACTIONS_REPO=$(curl -f -s -H Metadata-Flavor:Google http://metadata/computeMetadata/v1/instance/attributes/INIT_ACTIONS_REPO || true) INIT_ACTIONS_REPO="${INIT_ACTIONS_REPO:-https://github.com/GoogleCloudPlatform/dataproc-initialization-actions.git}" INIT_ACTIONS_BRANCH=$(curl -f -s -H Metadata-Flavor:Google http://metadata/computeMetadata/v1/instance/attributes/INIT_ACTIONS_BRANCH || true) INIT_ACTIONS_BRANCH="${INIT_ACTIONS_BRANCH:-master}" DATAPROC_BUCKET=$(curl -f -s -H Metadata-Flavor:Google http://metadata/computeMetadata/v1/instance/attributes/dataproc-bucket) # Colon-separated list of conda channels to add before installing packages JUPYTER_CONDA_CHANNELS=$(curl -f -s -H Metadata-Flavor:Google http://metadata/computeMetadata/v1/instance/attributes/JUPYTER_CONDA_CHANNELS || true) # Colon-separated list of conda packages to install, for example 'numpy:pandas' JUPYTER_CONDA_PACKAGES=$(curl -f -s -H Metadata-Flavor:Google http://metadata/computeMetadata/v1/instance/attributes/JUPYTER_CONDA_PACKAGES || true) echo "Cloning fresh dataproc-initialization-actions from repo $INIT_ACTIONS_REPO and branch $INIT_ACTIONS_BRANCH..." git clone -b "$INIT_ACTIONS_BRANCH" --single-branch $INIT_ACTIONS_REPO # Ensure we have conda installed. ./dataproc-initialization-actions/conda/bootstrap-conda.sh source /etc/profile.d/conda.sh if [ -n "${JUPYTER_CONDA_CHANNELS}" ]; then echo "Adding custom conda channels '$(echo ${JUPYTER_CONDA_CHANNELS} | tr ':' ' ')'" conda config --add channels $(echo ${JUPYTER_CONDA_CHANNELS} | tr ':' ',') fi if [ -n "${JUPYTER_CONDA_PACKAGES}" ]; then echo "Installing custom conda packages '$(echo ${JUPYTER_CONDA_PACKAGES} | tr ':' ' ')'" conda install $(echo ${JUPYTER_CONDA_PACKAGES} | tr ':' ' ') fi if [[ "${ROLE}" == 'Master' ]]; then conda install jupyter echo "Completed installing Jupyter!" pip install google_compute_engine if gsutil -q stat "gs://$DATAPROC_BUCKET/notebooks/**"; then echo "Pulling notebooks directory to cluster master node..." gsutil -m cp -r gs://$DATAPROC_BUCKET/notebooks /root/ fi ./dataproc-initialization-actions/jupyter/internal/setup-jupyter-kernel.sh ./dataproc-initialization-actions/jupyter/internal/launch-jupyter-kernel.sh # Install docker # install packages to allow apt to use a repository over HTTPS: apt-get -y install \ apt-transport-https ca-certificates curl software-properties-common # add Docker's GPG key: curl -fsSL https://download.docker.com/linux/debian/gpg | apt-key add - # set up the Docker stable repository. add-apt-repository \ "deb [arch=amd64] https://download.docker.com/linux/debian \ $(lsb_release -cs) \ stable" # update the apt package index: apt-get -y update # install maven (jessie-backport) apt-get -y install -t jessie-backports maven echo "Completed installing Maven!" # finally, install docker apt-get -y install docker-ce echo "Completed installing Docker!" # update node and npm version using Debian, as root curl -sL https://deb.nodesource.com/setup_8.x | sudo -E bash - apt-get -y install nodejs echo "Completed installing nodejs!" npm install -g n echo "Completed installing npm!" fi # Install Jupyter extensions (if desired) if [[ ! -v $INSTALL_JUPYTER_EXT ]] then INSTALL_JUPYTER_EXT=false fi if [[ "$INSTALL_JUPYTER_EXT" = true ]] then echo "Installing Jupyter Notebook extensions..." ./dataproc-initialization-actions/jupyter/internal/bootstrap-jupyter-ext.sh echo "Jupyter Notebook extensions installed!" fi