{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Problem 1\n", "Apply your skills to classify protein foldType with a Decision Tree Classifier\n", "\n", "## Imports" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from mmtfPyspark.ml import SparkMultiClassClassifier, datasetBalancer\n", "from pyspark.sql import SparkSession\n", "from pyspark.sql.functions import *\n", "import mltoolkit" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Configure Spark Session" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "spark = SparkSession.builder.appName(\"Problem-1\").getOrCreate()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## TODO-1: Read in data from parquet file" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "parquetFile = './input_features/'\n", "data = # Your Code Here #" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## TODO-2: Select alpha, beta, alpha+beta fold types" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data = # Your Code Here #\n", "print(f\"Total number of data: {data.count()}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## TODO-3: Downsample data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "label = 'foldType'\n", "\n", "data = # Your Code Here #\n", "print(f\"Dataset size (balanced) : {data.count()}\")\n", "\n", "data.groupby(label).count().show()" ] }, { "cell_type": "markdown", "metadata": {}, "source":
[ "## TODO-4: Decision Tree Classifier with PySpark" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from pyspark.ml.classification import DecisionTreeClassifier\n", "\n", "dtc = # Your Code Here: Create a Decision Tree Classifier instance #\n", "mcc = # Your Code Here: Use MulticlassClassifier wrapper on dtc #\n", "metrics = # Your Code Here: fit data #\n", "for k, v in metrics.items():\n", "    print(f\"{k}\\t{v}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## BONUS: Decision Tree Classifier with sklearn" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.tree import DecisionTreeClassifier\n", "\n", "df = # Your Code Here: convert data to Pandas DataFrame #\n", "dtc = # Your Code Here: Create a Decision Tree Classifier instance #\n", "mcc = # Your Code Here: Use MulticlassClassifier wrapper on dtc #\n", "metrics = # Your Code Here: fit data #\n", "for k, v in metrics.items():\n", "    print(f\"{k}\\t{v}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "spark.stop()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.13" } }, "nbformat": 4, "nbformat_minor": 4 }