{
 "cells": [
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "## Read the Excel workbook directly in Spark through the spark-excel data source\n",
    "## (the connector JARs must be on the cluster classpath, as described in README.md)\n",
    "excel_path = \"/Volumes/test_data.xlsx\"\n",
    "\n",
    "# Build the reader as one parenthesized chain: options first, then load the file.\n",
    "df = (\n",
    "    spark.read.format(\"com.crealytics.spark.excel\")\n",
    "    .option(\"header\", \"true\")\n",
    "    .option(\"inferSchema\", \"true\")\n",
    "    .load(excel_path)\n",
    ")\n",
    "\n",
    "# Print the first rows to confirm the load worked.\n",
    "df.show()"
   ],
   "id": "4d1c762a078b6ac2"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "## Using pandas to convert excel into csv and then read in spark\n",
    "import pandas as pd\n",
    "\n",
    "excel_path = \"/Volumes/test_data.xlsx\"\n",
    "csv_path = \"/Volumes/test_data.csv\"\n",
    "\n",
    "# Read the workbook with pandas. A dedicated name is used so that the Spark\n",
    "# DataFrame `df` created in the previous cell is not silently replaced by a\n",
    "# pandas DataFrame (same name, different kind of object).\n",
    "excel_pdf = pd.read_excel(excel_path)\n",
    "\n",
    "# Write the data out as CSV; index=False keeps the pandas row index out of the file.\n",
    "excel_pdf.to_csv(csv_path, index=False)\n",
    "\n",
    "# Preview a few rows using the notebook's rich display instead of plain-text print.\n",
    "display(excel_pdf.head())\n",
    "\n",
    "# Load CSV back into Spark.\n",
    "# pandas escapes an embedded double quote by doubling it and keeps newlines inside\n",
    "# quoted fields, whereas Spark's CSV reader defaults to a backslash escape and\n",
    "# single-line records. Align the reader with what pandas wrote so such cells\n",
    "# survive the round trip intact.\n",
    "spark_df = spark.read.csv(\n",
    "    csv_path,\n",
    "    header=True,\n",
    "    inferSchema=True,\n",
    "    escape='\"',\n",
    "    multiLine=True,\n",
    ")\n",
    "spark_df.show()"
   ],
   "id": "3d929687c9b1c44a"
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 5
}