{
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# Display function"
      ],
      "metadata": { "application/vnd.databricks.v1+cell": { "showTitle": false, "cellMetadata": {}, "nuid": "4ba01500-3911-48e3-95d9-c45e951d345d", "inputWidgets": {}, "title": "" } }
    },
    {
      "cell_type": "code",
      "source": [
        "help(display)"
      ],
      "metadata": { "application/vnd.databricks.v1+cell": { "showTitle": false, "cellMetadata": {}, "nuid": "f38205a3-d526-4e23-ba26-a830587a8369", "inputWidgets": {}, "title": "" } },
      "outputs": [],
      "execution_count": 0
    },
    {
      "cell_type": "code",
      "source": [
        "from dataclasses import dataclass\n",
        "\n",
        "# Small record type used to show what display() does with a list of dataclass instances.\n",
        "@dataclass\n",
        "class Element:\n",
        "  atomic_number: int\n",
        "  name: str\n",
        "\n",
        "elements = [\n",
        "  Element(atomic_number=1, name=\"Hydrogen\"),\n",
        "  Element(atomic_number=2, name=\"Helium\"),\n",
        "]\n",
        "display(elements)"
      ],
      "metadata": { "application/vnd.databricks.v1+cell": { "showTitle": false, "cellMetadata": {}, "nuid": "63fa00ba-e437-4934-8273-161721e0232d", "inputWidgets": {}, "title": "" } },
      "outputs": [],
      "execution_count": 0
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Widgets and Magics"
      ],
      "metadata": { "application/vnd.databricks.v1+cell": { "showTitle": false, "cellMetadata": {}, "nuid": "dac656b4-7a28-4b47-b05d-bcd598a7850e", "inputWidgets": {}, "title": "" } }
    },
    {
      "cell_type": "code",
      "source": [
        "from collections import namedtuple\n",
        "\n",
        "Magic = namedtuple(\"Magic\", \"magic_type magic_name\")\n",
        "\n",
        "# The magics manager keeps one mapping per kind (\"line\" / \"cell\"), keyed by magic name.\n",
        "registered_magics = get_ipython().magics_manager.magics\n",
        "line_magics = [(\"line\", magic_name) for magic_name in registered_magics[\"line\"]]\n",
        "cell_magics = [(\"cell\", magic_name) for magic_name in registered_magics[\"cell\"]]\n",
        "\n",
        "display([Magic(*kind_and_name) for kind_and_name in line_magics + cell_magics])"
      ],
      "metadata": { "application/vnd.databricks.v1+cell": { "showTitle": false, "cellMetadata": {}, "nuid": "1c3a0222-217f-4715-a3f4-f91c9b79fb3b", "inputWidgets": {}, "title": "" } },
      "outputs": [],
      "execution_count": 0
    },
    {
      "cell_type": "code",
      "source": [
        "%pwd"
      ],
      "metadata": {
        "application/vnd.databricks.v1+cell": {
          "showTitle": false,
          "cellMetadata": {},
          "nuid":
"02cf5a3b-18a3-448e-b034-990d02743c00",
          "inputWidgets": {},
          "title": ""
        }
      },
      "outputs": [],
      "execution_count": 0
    },
    {
      "cell_type": "code",
      "source": [
        "%%html\n",
        "<!-- NOTE(review): markup reconstructed after the original tags were lost; confirm against the source notebook. -->\n",
        "\n",
        "<h1>Hello SVG!</h1>\n",
        "\n",
        "<svg width=\"100\" height=\"100\">\n",
        "  <circle cx=\"50\" cy=\"50\" r=\"40\" stroke=\"green\" stroke-width=\"4\" fill=\"yellow\" />\n",
        "</svg>"
      ],
      "metadata": { "application/vnd.databricks.v1+cell": { "showTitle": false, "cellMetadata": {}, "nuid": "e6b5816e-075e-49d7-8b55-5a63e3f59452", "inputWidgets": {}, "title": "" } },
      "outputs": [],
      "execution_count": 0
    },
    {
      "cell_type": "code",
      "source": [
        "import ipywidgets as widgets\n",
        "\n",
        "# Keyword arguments make the slider's start value and bounds explicit.\n",
        "record_count = widgets.IntSlider(value=1, min=1, max=1_000_000)\n",
        "record_count"
      ],
      "metadata": { "application/vnd.databricks.v1+cell": { "showTitle": false, "cellMetadata": {}, "nuid": "96d07fb7-e361-4b1d-b07e-c5c1022669db", "inputWidgets": {}, "title": "" } },
      "outputs": [],
      "execution_count": 0
    },
    {
      "cell_type": "code",
      "source": [
        "# Reads the slider's current value, so re-run this cell after moving the slider.\n",
        "spark.range(record_count.value).count()"
      ],
      "metadata": { "application/vnd.databricks.v1+cell": { "showTitle": false, "cellMetadata": {}, "nuid": "a5183203-f5d4-4353-8b77-a98d3648f7c6", "inputWidgets": {}, "title": "" } },
      "outputs": [],
      "execution_count": 0
    },
    {
      "cell_type": "markdown",
      "source": [
        "# FUSE"
      ],
      "metadata": { "application/vnd.databricks.v1+cell": { "showTitle": false, "cellMetadata": {}, "nuid": "ac4c744f-1fb5-471b-9d68-5469286db8dc", "inputWidgets": {}, "title": "" } }
    },
    {
      "cell_type": "code",
      "source": [
        "# take(5) fetches only the first rows instead of collecting the whole file to the driver and slicing.\n",
        "spark.read.text(\"dbfs:/databricks-datasets/README.md\").take(5)"
      ],
      "metadata": { "application/vnd.databricks.v1+cell": { "showTitle": false, "cellMetadata": {}, "nuid": "3daec3e3-6f92-4e1d-85b1-6778e1f6ab3a", "inputWidgets": {}, "title": "" } },
      "outputs": [],
      "execution_count": 0
    },
    {
      "cell_type": "code",
      "source": [
        "%sh ls -lh /dbfs/databricks-datasets/README.md"
      ],
      "metadata": { "application/vnd.databricks.v1+cell": { "showTitle": false, "cellMetadata": {}, "nuid": "c708d98a-1756-4e20-8254-d36ee5c26762", "inputWidgets": {}, "title": "" } },
      "outputs": [],
      "execution_count": 0
    },
    {
      "cell_type": "code",
      "source": [
        "# Same file as above, read through the local /dbfs path with plain Python file I/O.\n",
        "with open(\"/dbfs/databricks-datasets/README.md\", encoding=\"utf-8\") as f:\n",
        "  contents = f.read()\n",
        "\n",
        "print(contents)"
      ],
      "metadata": {
        "application/vnd.databricks.v1+cell": {
          "showTitle": false,
          "cellMetadata": {},
          "nuid":
"4e796e20-b74f-4690-b9d5-ae4fb41f8ece",
          "inputWidgets": {},
          "title": ""
        }
      },
      "outputs": [],
      "execution_count": 0
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Importing via sys.path"
      ],
      "metadata": { "application/vnd.databricks.v1+cell": { "showTitle": false, "cellMetadata": {}, "nuid": "ff731dfd-8b0d-44ed-b03a-864914a0d7a5", "inputWidgets": {}, "title": "" } }
    },
    {
      "cell_type": "code",
      "source": [
        "import sys\n",
        "\n",
        "common_dir = \"/Workspace/Repos/some_repo/common\"\n",
        "# Guard the append so re-running this cell does not pile up duplicate sys.path entries.\n",
        "if common_dir not in sys.path:\n",
        "  sys.path.append(common_dir)\n",
        "\n",
        "import constants"
      ],
      "metadata": { "application/vnd.databricks.v1+cell": { "showTitle": false, "cellMetadata": {}, "nuid": "04951c78-8319-4886-8860-6ef9653d7014", "inputWidgets": {}, "title": "" } },
      "outputs": [],
      "execution_count": 0
    },
    {
      "cell_type": "markdown",
      "source": [
        "# REST API and SDK access from notebooks"
      ],
      "metadata": { "application/vnd.databricks.v1+cell": { "showTitle": false, "cellMetadata": {}, "nuid": "4694449b-21fd-47f9-92a6-4a1fa9a7c36e", "inputWidgets": {}, "title": "" } }
    },
    {
      "cell_type": "code",
      "source": [
        "import requests\n",
        "from pprint import pprint\n",
        "\n",
        "# Both the API URL and the token come from the same notebook context object.\n",
        "notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()\n",
        "api_url = notebook_context.apiUrl().getOrElse(None)\n",
        "token = notebook_context.apiToken().getOrElse(None)\n",
        "\n",
        "print(f\"api_url: {api_url}\")\n",
        "# Never echo the token itself: cell output is saved with the notebook and would expose the credential.\n",
        "print(f\"token: {'<redacted>' if token else None}\")\n",
        "\n",
        "resp = requests.get(\n",
        "  f\"{api_url}/api/2.0/clusters/list\",\n",
        "  headers={\"Authorization\": f\"Bearer {token}\"},\n",
        "  timeout=30,  # seconds; without it a stalled connection would block the cell indefinitely\n",
        ")\n",
        "resp.raise_for_status()  # surface HTTP errors instead of pretty-printing an error body\n",
        "\n",
        "print()\n",
        "pprint(resp.json())"
      ],
      "metadata": { "application/vnd.databricks.v1+cell": { "showTitle": false, "cellMetadata": {}, "nuid": "5a3a2011-a461-49cb-8f03-801b2ae45cfc", "inputWidgets": {}, "title": "" } },
      "outputs": [],
      "execution_count": 0
    },
    {
      "cell_type": "code",
      "source": [
        "%pip install databricks-cli"
      ],
      "metadata": { "application/vnd.databricks.v1+cell": { "showTitle": false, "cellMetadata": {}, "nuid": "d9bc5140-41ff-4ac2-a6bd-21cac5dc0e7d", "inputWidgets": {}, "title": "" } },
      "outputs": [],
"execution_count": 0
    },
    {
      "cell_type": "code",
      "source": [
        "from databricks_cli.sdk import ApiClient, ClusterService\n",
        "\n",
        "# Uses api_url, token and pprint defined by the REST API cell earlier in this section.\n",
        "client = ClusterService(ApiClient(host=api_url, token=token))\n",
        "\n",
        "clusters = client.list_clusters()\n",
        "pprint(clusters)"
      ],
      "metadata": { "application/vnd.databricks.v1+cell": { "showTitle": false, "cellMetadata": {}, "nuid": "7e56f53c-f855-4987-84e3-0ed601d40d1b", "inputWidgets": {}, "title": "" } },
      "outputs": [],
      "execution_count": 0
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Disk Cache"
      ],
      "metadata": { "application/vnd.databricks.v1+cell": { "showTitle": false, "cellMetadata": {}, "nuid": "96591109-1aca-4f08-8474-21c71745b16e", "inputWidgets": {}, "title": "" } }
    },
    {
      "cell_type": "code",
      "source": [
        "# TODO: replace the placeholder with the path of a real Parquet dataset before running this cell.\n",
        "parquet_path = \"<path-to-parquet-dataset>\"\n",
        "spark.read.parquet(parquet_path).describe().display()"
      ],
      "metadata": { "application/vnd.databricks.v1+cell": { "showTitle": false, "cellMetadata": {}, "nuid": "3182566d-eb72-4918-8a95-c837ab070c82", "inputWidgets": {}, "title": "" } },
      "outputs": [],
      "execution_count": 0
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Spark Caching"
      ],
      "metadata": { "application/vnd.databricks.v1+cell": { "showTitle": false, "cellMetadata": {}, "nuid": "afa5fd07-e1ff-429a-a803-b93015435954", "inputWidgets": {}, "title": "" } }
    },
    {
      "cell_type": "code",
      "source": [
        "# DataFrame cache: cache() only marks the data; count() is the action that materialises it.\n",
        "people_df = spark.range(1_000_000).selectExpr(\"id\", \"concat('Person-', id) as name\")\n",
        "people_df.cache().count()\n",
        "\n",
        "# RDD cache: the same mark-then-count pattern through the SparkContext API.\n",
        "squares_rdd = spark.sparkContext.parallelize(range(1_000_000)).map(lambda x: (x, x * x))\n",
        "squares_rdd.cache().count()"
      ],
      "metadata": { "application/vnd.databricks.v1+cell": { "showTitle": false, "cellMetadata": {}, "nuid": "1f9e2678-8006-4f50-bf54-da420640ff15", "inputWidgets": {}, "title": "" } },
      "outputs": [],
      "execution_count": 0
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Ganglia"
      ],
      "metadata": { "application/vnd.databricks.v1+cell": { "showTitle": false, "cellMetadata": {}, "nuid": "9728e3c9-959d-43ae-9002-4b84ac414a68", "inputWidgets": {}, "title": "" } }
    },
    {
      "cell_type": "code",
      "source": [
        "%scala\n",
        "import org.apache.spark.SparkEnv\n",
        "import org.apache.spark.sql.functions.{acos, array_repeat, explode, lit, udf, when}\n",
        "\n",
        "// UDF that reports the id of the executor evaluating the row.\n",
        "val getExecutorId = udf(() => SparkEnv.get.executorId)\n",
        "spark.udf.register(\"getExecutorId\", getExecutorId)\n",
        "\n",
        "// Rows evaluated on executor \"0\" explode a 1000-element array, all other rows a 10-element one,\n",
        "// so that executor receives far more work than the rest.\n",
        "val df = spark.range(10000000)\n",
        "  .selectExpr(\"id\", \"getExecutorId() as executor\")\n",
        "  .withColumn(\"array\", when($\"executor\" === \"0\", array_repeat(lit(0.5), 1000)).otherwise(array_repeat(lit(0.5), 10)))\n",
        "  .withColumn(\"item\", explode($\"array\"))\n",
        "  .drop(\"array\")\n",
        "  .withColumn(\"acos\", acos($\"item\"))\n",
        "\n",
        "// The \"noop\" format executes the full job without persisting any output.\n",
        "df.write.format(\"noop\").mode(\"overwrite\").save()"
      ],
      "metadata": { "application/vnd.databricks.v1+cell": { "showTitle": false, "cellMetadata": {}, "nuid": "87453761-ce73-4fa0-b052-958af3252bc2", "inputWidgets": {}, "title": "" } },
      "outputs": [],
      "execution_count": 0
    },
    {
      "cell_type": "markdown",
      "source": [
        "# SparkSession & SparkContext"
      ],
      "metadata": { "application/vnd.databricks.v1+cell": { "showTitle": false, "cellMetadata": {}, "nuid": "04efa8e2-c3bc-4522-9c61-37a023c401f1", "inputWidgets": {}, "title": "" } }
    },
    {
      "cell_type": "code",
      "source": [
        "%scala\n",
        "println(s\"Spark session instance : ${spark}\")\n",
        "println(s\"Spark context instance : ${spark.sparkContext}\")"
      ],
      "metadata": { "application/vnd.databricks.v1+cell": { "showTitle": false, "cellMetadata": {}, "nuid": "dcaa0d5f-ab7a-404e-a4aa-419514494c1b", "inputWidgets": {}, "title": "" } },
      "outputs": [],
      "execution_count": 0
    }
  ],
  "metadata": {
    "application/vnd.databricks.v1+notebook": {
      "notebookName": "Databricks Tips and Tricks",
      "dashboards": [],
      "notebookMetadata": {
        "pythonIndentUnit": 2,
        "mostRecentlyExecutedCommandWithImplicitDF": {
          "commandId": 1237527421015243,
          "dataframes": [
            "_sqldf"
          ]
        }
      },
      "language": "python",
      "widgets": {},
      "notebookOrigID": 3981444023629489
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}