{
 "cells": [
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "# Loading an Excel workbook into Spark\n",
    "\n",
    "Two alternative approaches to the same file: read it directly through the spark-excel data source, or convert it to CSV with pandas and read the CSV in Spark. Each code cell below is self-contained."
   ],
   "id": "excel-to-spark-intro"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "## Option 1: read the Excel file directly in Spark through the spark-excel data source\n",
    "excel_path = \"/Volumes/test_data.xlsx\"\n",
    "\n",
    "## com.crealytics.spark.excel is not bundled with Spark:\n",
    "## you must add the JARs to your cluster classpath as per README.md\n",
    "excel_spark_df = (\n",
    "    spark.read.format(\"com.crealytics.spark.excel\")\n",
    "    .option(\"header\", \"true\")\n",
    "    .option(\"inferSchema\", \"true\")\n",
    "    .load(excel_path)\n",
    ")\n",
    "\n",
    "excel_spark_df.show()"
   ],
   "id": "4d1c762a078b6ac2"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "## Option 2: use pandas to convert the Excel file to CSV, then read the CSV in Spark\n",
    "import pandas as pd\n",
    "\n",
    "excel_path = \"/Volumes/test_data.xlsx\"\n",
    "csv_path = \"/Volumes/test_data.csv\"\n",
    "\n",
    "# pandas DataFrame; named distinctly from the Spark DataFrames in this notebook\n",
    "excel_pandas_df = pd.read_excel(excel_path)\n",
    "\n",
    "# Write the CSV that Spark reads below; index=False keeps the pandas index out of the file\n",
    "excel_pandas_df.to_csv(csv_path, index=False)\n",
    "\n",
    "# Rich preview of the first rows\n",
    "display(excel_pandas_df.head())\n",
    "\n",
    "# Load CSV back into Spark\n",
    "spark_df = spark.read.csv(csv_path, header=True, inferSchema=True)\n",
    "spark_df.show()\n"
   ],
   "id": "3d929687c9b1c44a"
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 5
}