#!/bin/bash
#
# Copyright 2015,2016,2017,2018,2019,2020,2023 Google LLC and contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Initialization action for installing Apache Oozie on a Google Cloud
# Dataproc cluster. This script installs and configures Oozie to run on the
# master node of a Dataproc cluster. The version of Oozie that is installed
# comes from the BigTop repository.
#
# You can find more information about Oozie at http://oozie.apache.org/
# For more information on init actions and Google Cloud Dataproc, see the
# Cloud Dataproc documentation at https://cloud.google.com/dataproc/init-actions
#
# This script should run in under a few minutes.

set -euxo pipefail

OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]')
distribution=$(. /etc/os-release; echo "${ID}${VERSION_ID}")
readonly OS_NAME
readonly master_node=$(/usr/share/google/get_metadata_value attributes/dataproc-master)
readonly ROLE="$(/usr/share/google/get_metadata_value attributes/dataproc-role)"
readonly MAVEN_CENTRAL_URI=https://maven-central.storage-download.googleapis.com/maven2

# Use Python from /usr/bin instead of /opt/conda.
export PATH=/usr/bin:$PATH

# Detect the Dataproc image version from its various names.
if (! test -v DATAPROC_IMAGE_VERSION) && test -v DATAPROC_VERSION; then
  DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
fi

# Pick the Curator version and source directory used later to replace the old
# Curator jars shipped with Oozie.
case "${DATAPROC_IMAGE_VERSION}" in
  "1.3" | "1.4" | "1.5" | "2.0")
    curator_version="2.13.0"
    curator_src="/usr/lib/hadoop/lib"
    ;;
  "2.1" | "2.2")
    curator_version="2.13.0"
    curator_src="/usr/lib/spark/jars"
    ;;
  *)
    echo "unsupported DATAPROC_IMAGE_VERSION: ${DATAPROC_IMAGE_VERSION}" >&2
    exit 1
    ;;
esac
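# The metadata attributes read below are supplied at cluster creation time.
# Illustrative example only; the bucket, cluster, region, and secret names are
# hypothetical placeholders:
#
#   gcloud dataproc clusters create my-cluster \
#     --region=us-central1 \
#     --initialization-actions=gs://my-bucket/oozie.sh \
#     --metadata=oozie-password-secret-name=oozie-db-password,mysql-root-password-secret-name=mysql-root-password,email-smtp-host=smtp.example.com,email-from-address=oozie@example.com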
# Optional cluster metadata; each lookup falls back to a default when the
# attribute is not set.
export METADATA_HTTP_PROXY=$(/usr/share/google/get_metadata_value attributes/http-proxy)
export METADATA_EMAIL_SMTP_HOST=$(/usr/share/google/get_metadata_value attributes/email-smtp-host)
export METADATA_EMAIL_FROM_ADDRESS=$(/usr/share/google/get_metadata_value attributes/email-from-address)
export MYSQL_ROOT_USERNAME=$(/usr/share/google/get_metadata_value attributes/mysql-root-username || echo "root")
export OOZIE_DB_NAME=$(/usr/share/google/get_metadata_value attributes/oozie-db-name || echo "oozie")
export OOZIE_DB_USERNAME=$(/usr/share/google/get_metadata_value attributes/oozie-db-username || echo "oozie")
export OOZIE_PASSWORD_SECRET_NAME=$(/usr/share/google/get_metadata_value attributes/oozie-password-secret-name || echo "secret-name")
export OOZIE_PASSWORD_SECRET_VERSION=$(/usr/share/google/get_metadata_value attributes/oozie-password-secret-version || echo 1)
export OOZIE_PASSWORD=$(gcloud secrets versions access --secret ${OOZIE_PASSWORD_SECRET_NAME} ${OOZIE_PASSWORD_SECRET_VERSION} || echo oozie-password)

# Propagate the optional HTTP proxy to the standard environment variables while
# keeping the metadata server reachable without a proxy.
export http_proxy="${METADATA_HTTP_PROXY}"
export https_proxy="${METADATA_HTTP_PROXY}"
export HTTP_PROXY="${METADATA_HTTP_PROXY}"
export HTTPS_PROXY="${METADATA_HTTP_PROXY}"
export no_proxy=metadata.google.internal
export NO_PROXY=metadata.google.internal

export MYSQL_ROOT_PASSWORD_SECRET_NAME=$(/usr/share/google/get_metadata_value attributes/mysql-root-password-secret-name)
export MYSQL_ROOT_PASSWORD_SECRET_VERSION=$(/usr/share/google/get_metadata_value attributes/mysql-root-password-secret-version || echo 1)
export MYSQL_ROOT_PASSWORD=$(gcloud secrets versions access --secret ${MYSQL_ROOT_PASSWORD_SECRET_NAME} ${MYSQL_ROOT_PASSWORD_SECRET_VERSION} || \
  grep 'password=' /etc/mysql/my.cnf | sed 's/^.*=//' || echo root-password)

# Number of live HDFS datanodes; populated by await_hdfs_datanodes.
NUM_LIVE_DATANODES=0
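# The two gcloud lookups above read passwords from Secret Manager. Illustrative
# setup only; the secret names are hypothetical, must match the *-secret-name
# metadata attributes, and the cluster VM service account needs
# roles/secretmanager.secretAccessor on them:
#
#   echo -n 'oozie-db-pass'   | gcloud secrets create oozie-db-password   --replication-policy=automatic --data-file=-
#   echo -n 'mysql-root-pass' | gcloud secrets create mysql-root-password --replication-policy=automatic --data-file=-
#
# If a secret cannot be read, the script falls back to the defaults above.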
function remove_old_backports {
  # This script uses 'apt-get update' and is therefore potentially dependent on
  # backports repositories which have been archived. To mitigate this problem,
  # remove any reference to backports repos older than oldstable.
  # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157
  oldstable=$(curl -s https://deb.debian.org/debian/dists/oldstable/Release | awk '/^Codename/ {print $2}')
  stable=$(curl -s https://deb.debian.org/debian/dists/stable/Release | awk '/^Codename/ {print $2}')

  matched_files="$(grep -rsil '\-backports' /etc/apt/sources.list*)"
  if [[ -n "$matched_files" ]]; then
    for filename in $matched_files; do
      grep -e "$oldstable-backports" -e "$stable-backports" "$filename" ||
        sed -i -e 's/^.*-backports.*$//' "$filename"
    done
  fi
}

function await_hdfs_datanodes() {
  # Wait for HDFS to come online, backing off 5 seconds more on each attempt.
  tryno=0
  delay=0
  until [[ $tryno -gt 9 || ${NUM_LIVE_DATANODES} -gt 0 ]]; do
    NUM_LIVE_DATANODES=$(sudo -u hdfs hdfs dfsadmin -report -live | perl -ne 'print $1 if /^Live.*\((.*)\):/')
    sleep ${delay}s
    ((tryno = tryno + 1))
    ((delay = tryno * 5))
  done

  if [[ $tryno -gt 9 ]]; then
    echo "hdfs did not come online"
    return 1
  fi
}

function set_oozie_property() {
  local prop_name="$1"
  local prop_val="$2"
  /usr/local/bin/bdconfig set_property \
    --configuration_file '/etc/oozie/conf/oozie-site.xml' \
    --name "${prop_name}" --value "${prop_val}" \
    --clobber
}

function set_hadoop_property() {
  local prop_name="$1"
  local prop_val="$2"
  /usr/local/bin/bdconfig set_property \
    --configuration_file '/etc/hadoop/conf/core-site.xml' \
    --name "${prop_name}" --value "${prop_val}" \
    --clobber
}

function retry_command() {
  local cmd="$1"
  # First retry is immediate
  for ((i = 0; i < 10; i++)); do
    if eval "$cmd"; then
      return 0
    fi
    sleep $((i * 5))
  done
  return 1
}

function min_version() {
  echo -e "$1\n$2" | sort -r -t'.' -n -k1,1 -k2,2 -k3,3 | tail -n1
}
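# Illustrative usage of the helpers above (values are hypothetical):
#   min_version '5.0.0' '4.3.1'                # prints 4.3.1, the lower of the two versions
#   retry_command "hdfs dfsadmin -report"      # up to 10 attempts with a growing 0,5,10,...,45s pause
#   set_oozie_property 'oozie.processing.timezone' 'UTC'   # writes the property into oozie-site.xml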
function configure_ssl() {
  local oozie_home=$(getent passwd oozie | cut -f 6 -d :)
  local domain=$(hostname -d)
  local keystore_file="${oozie_home}/.keystore"
  local keystore_password="password"
  local truststore_file="${oozie_home}/oozie.truststore"
  local certificate_path="${oozie_home}/certificate.cert"
  local certificate_secret_name=

  if [[ "$(hostname -s)" == "${master_node}" ]]; then
    # Generate a self-signed certificate on the primary master and publish the
    # keystore, truststore, and certificate to HDFS for the other masters.
    test -f ${keystore_file} ||
      sudo -u oozie keytool -genkeypair -alias jetty -keystore "${keystore_file}" \
        -keyalg RSA -dname "CN=*.${domain}" \
        -storepass "${keystore_password}" -keypass "${keystore_password}"

    test -f ${certificate_path} ||
      sudo -u oozie keytool -exportcert -alias jetty -file "${certificate_path}" \
        -keystore "${keystore_file}" -storepass "${keystore_password}"

    test -f ${truststore_file} ||
      sudo -u oozie keytool -import -noprompt -alias jetty -file "${certificate_path}" \
        -keystore "${truststore_file}" -storepass "${keystore_password}"

    if [[ ${NUM_LIVE_DATANODES} != 0 ]]; then
      retry_command "hdfs dfs -put -f ${certificate_path} /tmp/oozie.certificate"
      retry_command "hdfs dfs -put -f ${keystore_file} /tmp/oozie.keystore"
      retry_command "hdfs dfs -put -f ${truststore_file} /tmp/oozie.truststore"
    fi
  else
    if [[ ${NUM_LIVE_DATANODES} != 0 ]]; then
      echo "Secondary master; attempting to copy SSL files (truststore, keystore, certificate) from HDFS."
      retry_command "hdfs dfs -get /tmp/oozie.truststore ${truststore_file}"
      retry_command "hdfs dfs -get /tmp/oozie.keystore ${keystore_file}"
      retry_command "hdfs dfs -get /tmp/oozie.certificate ${certificate_path}"
    fi
  fi

  # Configure the Oozie client to use the truststore.
  echo "export OOZIE_CLIENT_OPTS='-Djavax.net.ssl.trustStore=${truststore_file}'" >>/usr/lib/oozie/conf/oozie-client-env.sh

  # Configure the Oozie client to use the HTTPS URL.
  echo "export OOZIE_URL='https://$(hostname -f):11443/oozie'" >>/usr/lib/oozie/conf/oozie-client-env.sh

  set_oozie_property 'oozie.https.enabled' 'true'
  set_oozie_property 'oozie.https.keystore.file' "${keystore_file}"
  set_oozie_property 'oozie.https.keystore.pass' "${keystore_password}"
  set_oozie_property 'oozie.https.truststore.file' "${truststore_file}"
}

function install_oozie() {
  local enable_ssl
  enable_ssl=$(/usr/share/google/get_metadata_value attributes/oozie-enable-ssl || echo "false")

  # Update the package repositories and install Oozie.
  if [[ ${OS_NAME} == rocky ]]; then
    # update dnf proxy
    retry_command "dnf -y -v install oozie"
    # unzip does not come pre-installed on the 2.1-rocky8 image
    if [[ $(echo "${DATAPROC_IMAGE_VERSION} >= 2.1" | bc -l) == 1 ]]; then
      retry_command "dnf -y install unzip"
      find /usr/lib/oozie/lib/ -name 'guava*.jar' -delete
      cp /usr/lib/hadoop/lib/hadoop-shaded-guava-1.1.1.jar /usr/lib/oozie/lib
    fi
    # Add a mysql service dependency to the oozie init script.
    sed -i '/^# Required-Start:/ s/$/ mysqld.service/' /etc/init.d/oozie
    # Set up symlinks for the Hadoop jar dependencies.
    ln -sf /usr/lib/hadoop/hadoop-common.jar \
      /usr/lib/hadoop/hadoop-auth.jar \
      /usr/lib/hadoop/hadoop-annotations.jar \
      /usr/lib/hadoop-hdfs/hadoop-hdfs-client.jar \
      /usr/lib/hadoop-yarn/hadoop-yarn-common.jar \
      /usr/lib/hadoop-yarn/hadoop-yarn-client.jar \
      /usr/lib/hadoop-yarn/hadoop-yarn-server-common.jar \
      /usr/lib/hadoop-yarn/hadoop-yarn-api.jar \
      /usr/lib/hadoop-mapreduce/hadoop-mapreduce-client-jobclient.jar \
      /usr/lib/hadoop-mapreduce/hadoop-mapreduce-client-app.jar \
      /usr/lib/hadoop-mapreduce/hadoop-mapreduce-client-common.jar \
      /usr/lib/hadoop-mapreduce/hadoop-mapreduce-client-core.jar \
      /usr/lib/hadoop-mapreduce/hadoop-mapreduce-client-shuffle.jar /usr/lib/oozie/lib/
  elif [[ ${OS_NAME} == ubuntu ]] || [[ ${OS_NAME} == debian ]]; then
    retry_command "apt-get install -y gnupg2 && apt-key adv --keyserver keyserver.ubuntu.com --recv-keys B7B3B788A8D3785C"
    retry_command "apt-get update --allow-releaseinfo-change"
    retry_command "apt-get install -q -y oozie oozie-client"
  else
    echo "Unsupported OS: '${OS_NAME}'"
    exit 1
  fi

  # Remove the Log4j 2 jar that is not compatible with Log4j 1 and was brought in by Hive 2.
  find /usr/lib/oozie/lib -name "log4j-1.2-api*.jar" -delete

  # Delete redundant SLF4J backend implementations.
  find /usr/lib/oozie/lib -name "slf4j-simple*.jar" -delete
  find /usr/lib/oozie/lib -name "log4j-slf4j-impl*.jar" -delete

  # Redirect Log4j 2 logging to the SLF4J backend.
  local log4j2_version
  log4j2_version=$(
    find /usr/lib/oozie/lib -name "log4j-core*-2.*.jar" | cut -d '/' -f 6 | cut -d '-' -f 3
  )
  log4j2_version=${log4j2_version/.jar/}
  if [[ -n ${log4j2_version} ]]; then
    local log4j2_to_slf4j=log4j-to-slf4j-${log4j2_version}.jar
    local log4j2_to_slf4j_url=${MAVEN_CENTRAL_URI}/org/apache/logging/log4j/log4j-to-slf4j/${log4j2_version}/${log4j2_to_slf4j}
    wget -nv --timeout=30 --tries=5 --retry-connrefused "${log4j2_to_slf4j_url}" -P /usr/lib/oozie/lib
  fi

  # Delete old versions of Jetty jars brought in by dependencies.
  find /usr/lib/oozie/ -name "jetty*-6.*.jar" -delete
  local oozie_version
  oozie_version=$(oozie version 2>&1 | sed -n 's/.*Oozie[^:]\+:[[:blank:]]\+\([0-9]\+\.[0-9]\+\.[0-9]\+\).*/\1/p' | head -n1)
  if [[ $(min_version '5.0.0' "${oozie_version}") == 5.0.0 ]]; then
    find /usr/lib/oozie/ -name "jetty*-7.*.jar" -delete
  fi

  if [[ "$(hostname -s)" == "${master_node}" ]]; then
    local tmp_dir
    tmp_dir=$(mktemp -d -t oozie-install-XXXX)

    # The ext library is needed to enable the Oozie web console.
    wget -nv --timeout=30 --tries=5 --retry-connrefused \
      http://archive.cloudera.com/gplextras/misc/ext-2.2.zip -P "${tmp_dir}"
    unzip -o -q "${tmp_dir}/ext-2.2.zip" -d /var/lib/oozie

    # Install the share lib.
    tar -xzf /usr/lib/oozie/oozie-sharelib.tar.gz -C "${tmp_dir}"

    if [[ $(min_version '5.0.0' "${oozie_version}") != 5.0.0 ]]; then
      # Workaround for an issue where Jackson 1.8 and 1.9 jars are both found on the
      # classpath, causing AbstractMethodError at runtime. hadoop/lib has matching
      # versions of Jackson.
      rm -f "${tmp_dir}"/share/lib/hive2/jackson-*
      cp /usr/lib/hadoop/lib/jackson-* "${tmp_dir}/share/lib/hive2/"
    fi

    if ! hdfs dfs -test -d "/user/oozie"; then
      await_hdfs_datanodes
      if [[ ${NUM_LIVE_DATANODES} != 0 ]]; then
        hadoop fs -mkdir -p /user/oozie/
        hadoop fs -put -f "${tmp_dir}/share" /user/oozie/
        if grep '^dataproc' /etc/passwd; then
          local hdfs_username=dataproc
        else
          local hdfs_username=hdfs
        fi
        sudo -u ${hdfs_username} hadoop fs -chown oozie /user/oozie
      fi
    fi

    # Clean up temporary files.
    rm -rf "${tmp_dir}"
  fi

  # Link the MySQL JDBC driver to the Oozie library directory.
  ln -sf /usr/share/java/mysql.jar /usr/lib/oozie/lib/mysql.jar

  # Set JDBC properties.
  mysql_host=$(/usr/share/google/get_metadata_value attributes/dataproc-master)

  if [[ "${enable_ssl}" == 'true' ]]; then
    configure_ssl
  fi

  set_oozie_property 'oozie.service.JPAService.jdbc.driver' "com.mysql.cj.jdbc.Driver"
  set_oozie_property 'oozie.service.JPAService.jdbc.url' "jdbc:mysql://${mysql_host}/oozie"
  set_oozie_property 'oozie.service.JPAService.jdbc.username' "oozie"
  set_oozie_property 'oozie.service.JPAService.jdbc.password' "${OOZIE_PASSWORD}"
  set_oozie_property 'oozie.email.smtp.host' "${METADATA_EMAIL_SMTP_HOST}"
  set_oozie_property 'oozie.email.from.address' "${METADATA_EMAIL_FROM_ADDRESS}"
  set_oozie_property 'oozie.action.max.output.data' "20000"

  # Set the hostname so that connections are allowed from other hosts (not only localhost).
  set_oozie_property 'oozie.http.hostname' "$(hostname -s)"

  # The following property was requested in a customer case.
  set_oozie_property 'oozie.service.WorkflowAppService.WorkflowDefinitionMaxLength' "1500000"

  # The following 2 properties were added for a customer case.
  set_oozie_property 'oozie.service.URIHandlerService.uri.handlers' "org.apache.oozie.dependency.FSURIHandler,org.apache.oozie.dependency.HCatURIHandler"
  set_oozie_property 'oozie.credentials.credentialclasses' \
    "hcat=org.apache.oozie.action.hadoop.HCatCredentials,hive2=org.apache.oozie.action.hadoop.Hive2Credentials,hbase=org.apache.oozie.action.hadoop.HbaseCredentials"

  # The following 4 properties were provided by the customer platform team for CEAM - Oozie to HCat integration.
  set_oozie_property 'oozie.services.ext' "org.apache.oozie.service.PartitionDependencyManagerService,org.apache.oozie.service.HCatAccessorService"
  set_oozie_property 'oozie.service.HCatAccessorService.hcat.configuration' "/etc/hive/conf.dist/hive-site.xml"
  set_oozie_property 'oozie.service.coord.input.check.requeue.interval' "120000"
  set_oozie_property 'oozie.service.coord.push.check.requeue.interval' "120000"

  # The following properties were added for materialization issues observed in the NDL data lake.
  set_oozie_property 'oozie.service.PurgeService.purge.interval' "86400"
  set_oozie_property 'oozie.service.CallableQueueService.threads' "100"
  set_oozie_property 'oozie.service.CallableQueueService.callable.concurrency' "50"
  set_oozie_property 'oozie.service.CoordMaterializeTriggerService.lookup.interval' "300"
  set_oozie_property 'oozie.service.CoordMaterializeTriggerService.scheduling.interval' "60"
  set_oozie_property 'oozie.service.CoordMaterializeTriggerService.materialization.window' "1500"
  set_oozie_property 'oozie.service.CoordMaterializeTriggerService.callable.batch.size' "10"
  set_oozie_property 'oozie.service.CoordMaterializeTriggerService.materialization.system.limit' "150"
  set_oozie_property 'oozie.service.JPAService.pool.max.active.conn' "50"
  set_oozie_property 'oozie.service.StatusTransitService.backward.support.for.states.without.error' "false"
  set_oozie_property 'oozie.service.ActionCheckerService.action.check.delay' "300"
  set_oozie_property 'oozie.action.retry.policy' "exponential"

  # Hadoop must allow impersonation for Oozie to work properly.
  set_hadoop_property 'hadoop.proxyuser.oozie.groups' '*'
  set_hadoop_property 'hadoop.proxyuser.oozie.hosts' '*'

  set_oozie_property 'oozie.service.HadoopAccessorService.supported.filesystems' 'hdfs,gs'
  set_oozie_property 'fs.AbstractFileSystem.gs.impl' 'com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS'

  # https://biconsult.ru/files/new2/Data%20Lake%20for%20Enterprises.pdf page 784
  set_oozie_property 'oozie.service.ProxyUserService.proxyuser.oozie.hosts' '*'
  set_oozie_property 'oozie.service.ProxyUserService.proxyuser.oozie.groups' '*'

  if [[ "$(hostname -s)" == "${master_node}" ]]; then
    # Create the Oozie user in MySQL. Do this before the copies, since other
    # masters may start up and attempt to connect before the HDFS copies
    # below complete. The other masters need to be able to connect to MySQL.
    retry_command "/usr/bin/mysql -u ${MYSQL_ROOT_USERNAME} --password='${MYSQL_ROOT_PASSWORD}' -e 'use ${OOZIE_DB_NAME}' || /usr/bin/mysqladmin -u ${MYSQL_ROOT_USERNAME} --password='${MYSQL_ROOT_PASSWORD}' create ${OOZIE_DB_NAME}"
    # The SQL statements below are an assumed reconstruction: they create the
    # Oozie database user and grant it access to the Oozie database.
    /usr/bin/mysql -u ${MYSQL_ROOT_USERNAME} --password="${MYSQL_ROOT_PASSWORD}" <<SQL
CREATE USER IF NOT EXISTS '${OOZIE_DB_USERNAME}'@'%' IDENTIFIED BY '${OOZIE_PASSWORD}';
GRANT ALL PRIVILEGES ON ${OOZIE_DB_NAME}.* TO '${OOZIE_DB_USERNAME}'@'%';
FLUSH PRIVILEGES;
SQL

    # Copy Spark and Hive dependencies into the Oozie share lib. Re-extract the
    # share lib so the jars referenced below are available, then build the list
    # of additional jars to upload, which depends on the Dataproc image version.
    # (The OS check below is an assumption; the two per-version jar lists
    # differ only in how Guava is handled on 1.3/1.4 images.)
    tmp_dir=$(mktemp -d -t oozie-install-XXXX)
    tar -xzf /usr/lib/oozie/oozie-sharelib.tar.gz -C "${tmp_dir}"
    ADDITIONAL_JARS=""
    if [[ ${OS_NAME} == rocky ]]; then
      if [[ $(echo "${DATAPROC_IMAGE_VERSION} >= 2.1" | bc -l) == 1 ]]; then
        ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/lib/spark/jars/spark-hadoop-cloud*.jar /usr/lib/spark/jars/hadoop-cloud-storage-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop-common-*.jar ${tmp_dir}/share/lib/hive/woodstox-core-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/stax2-api-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop*.jar /usr/lib/spark/jars/hadoop*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/gcs-connector.jar "
        if [[ -f /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar ]]; then
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar "
        else
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/lib/spark/jars/*spark-metrics-listener*.jar "
        fi
      elif [[ $(echo "${DATAPROC_IMAGE_VERSION} > 1.5" | bc -l) == 1 ]]; then
        ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/lib/spark/jars/spark-hadoop-cloud*.jar /usr/lib/spark/jars/hadoop-cloud-storage-*.jar /usr/lib/spark/jars/re2j-1.1.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/htrace-core4-*-incubating.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop-common-*.jar ${tmp_dir}/share/lib/hive/woodstox-core-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/stax2-api-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop*.jar /usr/lib/spark/jars/hadoop*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/gcs-connector.jar "
        if [[ -f /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar ]]; then
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar "
        else
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/lib/spark/jars/*spark-metrics-listener*.jar "
        fi
      elif [[ $(echo "${DATAPROC_IMAGE_VERSION} > 1.4" | bc -l) == 1 ]]; then
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/htrace-core4-*-incubating.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop-common-*.jar ${tmp_dir}/share/lib/hive/woodstox-core-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/stax2-api-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop*.jar /usr/lib/spark/jars/hadoop*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/gcs-connector.jar "
        if [[ -f /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar ]]; then
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar "
        else
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/lib/spark/jars/*spark-metrics-listener*.jar "
        fi
        find /usr/lib/oozie/lib/ -name 'guava*.jar' -delete
        wget -P /usr/lib/oozie/lib ${MAVEN_CENTRAL_URI}/com/google/guava/guava/11.0.2/guava-11.0.2.jar
      elif [[ $(echo "${DATAPROC_IMAGE_VERSION} >= 1.3" | bc -l) == 1 ]]; then
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop*.jar /usr/lib/spark/jars/hadoop*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/gcs-connector.jar "
        if [[ -f /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar ]]; then
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar "
        else
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/lib/spark/jars/*spark-metrics-listener*.jar "
        fi
        ADDITIONAL_JARS=""
        find /usr/lib/oozie/lib/ -name 'guava*.jar' -delete
        wget -P /usr/lib/oozie/lib ${MAVEN_CENTRAL_URI}/com/google/guava/guava/11.0.2/guava-11.0.2.jar
      elif [[ $(echo "${DATAPROC_IMAGE_VERSION} > 1.2" | bc -l) == 1 ]]; then
        ADDITIONAL_JARS=""
        find /usr/lib/oozie/lib/ -name 'guava*.jar' -delete
        wget -P /usr/lib/oozie/lib ${MAVEN_CENTRAL_URI}/com/google/guava/guava/11.0.2/guava-11.0.2.jar
      else
        echo "unsupported DATAPROC_IMAGE_VERSION: ${DATAPROC_IMAGE_VERSION}" >&2
        exit 1
      fi
    else
      if [[ $(echo "${DATAPROC_IMAGE_VERSION} >= 2.1" | bc -l) == 1 ]]; then
        ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/lib/spark/jars/spark-hadoop-cloud*.jar /usr/lib/spark/jars/hadoop-cloud-storage-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop-common-*.jar ${tmp_dir}/share/lib/hive/woodstox-core-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/stax2-api-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop*.jar /usr/lib/spark/jars/hadoop*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/gcs-connector.jar "
        if [[ -f /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar ]]; then
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar "
        else
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/lib/spark/jars/*spark-metrics-listener*.jar "
        fi
      elif [[ $(echo "${DATAPROC_IMAGE_VERSION} > 1.5" | bc -l) == 1 ]]; then
        ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/lib/spark/jars/spark-hadoop-cloud*.jar /usr/lib/spark/jars/hadoop-cloud-storage-*.jar /usr/lib/spark/jars/re2j-1.1.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/htrace-core4-*-incubating.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop-common-*.jar ${tmp_dir}/share/lib/hive/woodstox-core-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/stax2-api-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop*.jar /usr/lib/spark/jars/hadoop*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/gcs-connector.jar "
        if [[ -f /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar ]]; then
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar "
        else
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/lib/spark/jars/*spark-metrics-listener*.jar "
        fi
      elif [[ $(echo "${DATAPROC_IMAGE_VERSION} > 1.4" | bc -l) == 1 ]]; then
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/htrace-core4-*-incubating.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop-common-*.jar ${tmp_dir}/share/lib/hive/woodstox-core-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/stax2-api-*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop*.jar /usr/lib/spark/jars/hadoop*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/gcs-connector.jar "
        if [[ -f /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar ]]; then
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar "
        else
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/lib/spark/jars/*spark-metrics-listener*.jar "
        fi
        find /usr/lib/oozie/lib/ -name 'guava*.jar' -delete
        wget -P /usr/lib/oozie/lib ${MAVEN_CENTRAL_URI}/com/google/guava/guava/11.0.2/guava-11.0.2.jar
      elif [[ $(echo "${DATAPROC_IMAGE_VERSION} > 1.3" | bc -l) == 1 ]]; then
        ADDITIONAL_JARS="${ADDITIONAL_JARS} ${tmp_dir}/share/lib/hive/hadoop*.jar /usr/lib/spark/jars/hadoop*.jar "
        ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/gcs-connector.jar "
        if [[ -f /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar ]]; then
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/local/share/google/dataproc/lib/spark-metrics-listener.jar "
        else
          ADDITIONAL_JARS="${ADDITIONAL_JARS} /usr/lib/spark/jars/*spark-metrics-listener*.jar "
        fi
      elif [[ $(echo "${DATAPROC_IMAGE_VERSION} > 1.2" | bc -l) == 1 ]]; then
        ADDITIONAL_JARS=""
      else
        echo "unsupported DATAPROC_IMAGE_VERSION: ${DATAPROC_IMAGE_VERSION}" >&2
        exit 1
      fi
    fi

    if [[ ${NUM_LIVE_DATANODES} != 0 ]]; then
      hadoop fs -put -f \
        ${tmp_dir}/share/lib/hive/stax-api-*.jar \
        ${tmp_dir}/share/lib/hive/commons-*.jar \
        /usr/lib/spark/python/lib/py*.zip \
        ${ADDITIONAL_JARS} /user/oozie/share/lib/spark
      hadoop fs -put -f /usr/lib/hive/lib/disruptor*.jar /user/oozie/share/lib/hive
      hadoop fs -put -f /usr/lib/hive/lib/hive-service-*.jar /user/oozie/share/lib/hive2
      # end - copy spark and hive dependencies

      # For Oozie actions, remove Log4j 1.2 from the Oozie sharelib so that the
      # Log4j API classes can be loaded without conflicts.
      res=$(hadoop fs -find /user/oozie/share/lib/ -name "log4j-1.2.*")
      for i in $res; do
        if [[ $(hadoop fs -find $(dirname "$i") -name "log4j-1.2-api*" | wc -l) -gt 0 ]]; then
          hadoop fs -cp -f $i $i-backup
          hadoop fs -rm $i
        fi
      done

      # Clean up temporary files if datanodes are live.
      rm -rf "${tmp_dir}"
    fi
  fi

  if [[ "$(hostname -s)" == "${master_node}" ]]; then
    # Create the Oozie database. Since we are using MySQL,
    # only do this on the master node.
    retry_command "sudo -u oozie /usr/lib/oozie/bin/ooziedb.sh create -run"
  fi

  local gcs_connector_dir="/usr/local/share/google/dataproc/lib"
  if [[ ! -d $gcs_connector_dir ]]; then
    gcs_connector_dir="/usr/lib/hadoop/lib"
  fi
  cp "${gcs_connector_dir}/gcs-connector.jar" /usr/lib/oozie/lib/
  # Detect whether this is an HA cluster configuration and, if so, point Oozie
  # at the ZooKeeper ensemble.
  local additional_nodes
  additional_nodes=$(/usr/share/google/get_metadata_value attributes/dataproc-master-additional | sed 's/,/\n/g' | wc -l)
  if [[ ${additional_nodes} -ge 2 ]]; then
    echo 'Starting configuration for HA'

    # The list of servers is used for proper ZooKeeper configuration;
    # the original port range is replaced with a specific one.
    local servers
    servers=$(grep 'server\.' /usr/lib/zookeeper/conf/zoo.cfg | sed 's/server.//g' | sed 's/:2888:3888//g' | cut -d'=' -f2- | sed 's/\n/,/g' | head -n 3 | sed 's/$/:2181,/g' | xargs -L3 | sed 's/.$//g')

    /usr/local/bin/bdconfig set_property \
      --configuration_file "/etc/oozie/conf/oozie-site.xml" \
      --name 'oozie.services.ext' --value \
      'org.apache.oozie.service.ZKLocksService, org.apache.oozie.service.ZKXLogStreamingService, org.apache.oozie.service.ZKJobsConcurrencyService, org.apache.oozie.service.ZKUUIDService' \
      --clobber
    /usr/local/bin/bdconfig set_property \
      --configuration_file "/etc/oozie/conf/oozie-site.xml" \
      --name 'oozie.zookeeper.connection.string' --value "${servers}" \
      --clobber
  fi

  # Workaround to avoid ClassNotFound issues caused by the old Curator jars on
  # the Oozie classpath.
  if [ -f "/usr/lib/oozie/lib/curator-framework-2.5.0.jar" ]; then
    find /usr/lib/oozie/lib \
      \( -name "curator-framework*.jar" -o \
      -name "curator-recipes*.jar" -o \
      -name "curator-client*.jar" \) \
      -delete
    if [ $(ls ${curator_src}/ | grep "curator.*-${curator_version}.jar" | wc -l) -ne 0 ]; then
      cp ${curator_src}/curator*-${curator_version}.jar /usr/lib/oozie/lib
    fi
  fi

  # Restart the ZooKeeper service.
  if which systemctl >/dev/null && systemctl list-units | grep zookeeper-server >/dev/null; then
    systemctl restart zookeeper-server
  else
    /usr/lib/zookeeper/bin/zkServer.sh restart
  fi

  # HDFS and YARN must be cycled; restart them to pick up the new configuration.
  for service in hadoop-hdfs-namenode hadoop-hdfs-secondarynamenode hadoop-yarn-resourcemanager oozie; do
    if [[ $(systemctl list-unit-files | grep ${service}) != '' ]] && [[ $(systemctl is-enabled ${service}) == 'enabled' ]]; then
      systemctl restart ${service}
    fi
  done

  # Leave safe mode - HDFS will enter safe mode because of the NameNode restart.
  if [[ "$(hostname -s)" == "${master_node}" ]]; then
    case "${DATAPROC_IMAGE_VERSION}" in
      "1.3" | "1.4")
        hadoop dfsadmin -safemode leave
        ;;
      *)
        hdfs dfsadmin -safemode leave
        ;;
    esac
  fi
}
function install_fluentd_configuration() {
  # /etc/google-fluentd/config.d is not created if the cluster was created with
  # the flag dataproc:dataproc.logging.stackdriver.enable=false, so install the
  # Oozie fluentd config only if the directory exists.
  if [[ -d /etc/google-fluentd/config.d ]]; then
    # NOTE: the directive tags and match patterns below are a reconstruction;
    # the original configuration additionally parsed a severity field out of
    # the Oozie log lines with multi_format regex patterns.
    cat <<EOF >/etc/google-fluentd/config.d/oozie_fluentd.conf
#################
#
# Oozie
#
# Fluentd config to tail the oozie log files.
# Currently severity is a separate field from the Cloud Logging log_level.

<source>
  @type tail
  format none
  path /var/log/oozie/*
  pos_file /var/tmp/fluentd.dataproc.oozie.pos
  refresh_interval 2s
  read_from_head true
  tag concat.raw.tail.*
</source>

<match concat.raw.tail.**>
  @type detect_exceptions
  remove_tag_prefix concat
  multiline_flush_interval 0.1
</match>

<filter raw.tail.**>
  @type parser
  key_name message
  <parse>
    @type multi_format
    <pattern>
      format none
    </pattern>
  </parse>
</filter>

<match raw.tail.**>
  @type record_reformer
  renew_record false
  enable_ruby true
  auto_typecast true
  # "tag" is translated into log in Stackdriver
  # Strip the instance name and .log from the filename if present
  tag \${tag_suffix[-2].sub("-#{Socket.gethostname}", "").sub(/\.log\$/, "")}
  # The following can be used when turning on jobid re-logging:
  # dataproc.googleapis.com/process_id \${job}
  filename \${tag_suffix[-2]}
</match>
EOF
    if [[ $(echo "${DATAPROC_IMAGE_VERSION} >= 2.2" | bc -l) == 1 ]]; then
      systemctl reload-or-restart google-fluentd-docker
    else
      systemctl reload-or-restart google-fluentd
    fi
  else
    echo "Skipped fluentd configuration for oozie."
  fi
}

function main() {
  # Remove archived Debian backports repos before any 'apt-get update'.
  if [[ ${OS_NAME} == debian ]] && [[ $(echo "${DATAPROC_IMAGE_VERSION} <= 2.1" | bc -l) == 1 ]]; then
    remove_old_backports
  fi

  # Only run on the master node(s) of the cluster.
  if [[ "${ROLE}" == 'Master' ]]; then
    install_oozie
    install_fluentd_configuration
  fi
}

main
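# Illustrative post-install checks (run manually on the master node; the ports
# assume the defaults configured above):
#   systemctl status oozie
#   oozie admin -oozie http://localhost:11000/oozie -status        # expect: System mode: NORMAL
#   # when oozie-enable-ssl=true, use the HTTPS endpoint instead:
#   oozie admin -oozie "https://$(hostname -f):11443/oozie" -status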