{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "hdfs-roundtrip-intro",
   "metadata": {},
   "source": [
    "# Spark / HDFS round-trip check\n",
    "\n",
    "Builds a 100-row DataFrame (`id` 1..100 plus a random `value` rounded to 2 decimals), writes it to HDFS as Parquet, reads it back, and plots the result.\n",
    "\n",
    "**Note:** the write uses `mode(\"overwrite\")`, so anything already stored at `hdfs_path` is replaced."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cef3e0a4-3a68-41a7-bb25-cd722526b09c",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "# Namespaced import: pulling `round` in by name would shadow Python's builtin round()\n",
    "from pyspark.sql import functions as F\n",
    "from pyspark.sql.types import StructType, StructField, IntegerType, FloatType\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Seed for the random `value` column so re-runs are repeatable (for the same partitioning)\n",
    "SEED = 42\n",
    "\n",
    "# getOrCreate() reuses an already-running Spark session instead of starting a new one\n",
    "spark = SparkSession.builder.getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "762ceedc-17cb-4bb3-ad23-1f0547986fa3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the base DataFrame: 100 rows with sequential ids 1..100\n",
    "data = [(i,) for i in range(1, 101)]\n",
    "schema = StructType([StructField(\"id\", IntegerType(), True)])\n",
    "df = spark.createDataFrame(data, schema=schema)\n",
    "\n",
    "# Add a seeded random `value` column, rounded to 2 decimal places\n",
    "df = df.withColumn(\"value\", F.round(F.rand(seed=SEED), 2))\n",
    "\n",
    "# Target location on HDFS\n",
    "hdfs_path = \"hdfs:///share/test_data\"\n",
    "\n",
    "# Write as Parquet; \"overwrite\" replaces whatever is already at hdfs_path\n",
    "df.write.mode(\"overwrite\").parquet(hdfs_path)\n",
    "\n",
    "# Read the data back from HDFS\n",
    "df_hdfs = spark.read.parquet(hdfs_path)\n",
    "\n",
    "# Row order is not guaranteed after the Parquet round trip, so sort for a stable preview\n",
    "df_hdfs.orderBy(\"id\").show(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6dc3db38-a646-4794-93db-7ffb8b15a557",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect to pandas on the driver for plotting (fine here: only 100 rows)\n",
    "df_pandas = df_hdfs.toPandas()\n",
    "\n",
    "# Scatter plot of `value` against `id`, using the explicit figure/axes interface\n",
    "fig, ax = plt.subplots(figsize=(10, 6))\n",
    "ax.scatter(df_pandas[\"id\"], df_pandas[\"value\"], color=\"blue\", alpha=0.5)\n",
    "ax.set_xlabel(\"ID\")\n",
    "ax.set_ylabel(\"Value\")\n",
    "ax.set_title(\"Random Test Data from HDFS\")\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}