{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## This notebook is part of Hadoop tutorials delivered by IT-DB group" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### SPARK DATAFRAME Hands-On Lab using table from relational database" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Hands-On 1 - Load a relation table into DataFrame" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### First, lets import the bits we need" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from pyspark import SparkContext, SQLContext, SparkConf\n", "import os" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### download the oracle jdbc driver" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "!curl -o /tmp/ojdbc7.jar https://cernbox.cern.ch/index.php/s/e7NJ5jD3qZB7RNO/download" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### This is only required in the notebook mode, in spark-shell, spark-submit, pyspark you can direcly pass --jars (e.g pyspark --jars /tmp/ojdbc7.jar)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars /tmp/ojdbc7.jar pyspark-shell'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Create the SparkContext and SQLContext explicitly" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "sc = SparkContext()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "sqlContext = SQLContext(sc)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Create the jdbc DataFrame as below, please note how to pass the driver, connection url, username and password\n", "*before executing the next cell, please replace PWD, SERVER and PORT with password, database host and port respectively*" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "db_df = sqlContext.read.format('jdbc') \\\n", " .options(driver='oracle.jdbc.driver.OracleDriver',url='jdbc:oracle:thin:meetup/$PWD@$SERVER:$PORT/PIMT_RAC51.cern.ch',dbtable='meetup_data') \\\n", " .load()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Read the data from the dataframe" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "db_df.select(\"EVENT_ID\",\"EVENT_NAME\").show(5)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.10" } }, "nbformat": 4, "nbformat_minor": 0 }