{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import json\n", "import pandas as pd\n", "import numpy as np\n", "import missingno as msno\n", "from sklearn.impute import SimpleImputer\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "scrolled": false }, "outputs": [], "source": [ "# Read the column-name -> dtype mapping that is later handed to pd.read_csv.\n", "with open('../dtypes.json', 'r') as dtypes_file:\n", "    dtyp = json.load(dtypes_file)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'id': 'float',\n", " 'flag_tsunami': 'str',\n", " 'year': 'float',\n", " 'month': 'float',\n", " 'day': 'float',\n", " 'hour': 'float',\n", " 'minute': 'float',\n", " 'second': 'float',\n", " 'focal_depth': 'float',\n", " 'eq_primary': 'float',\n", " 'eq_mag_mw': 'float',\n", " 'eq_mag_ms': 'float',\n", " 'eq_mag_mb': 'float',\n", " 'intensity': 'float',\n", " 'country': 'str',\n", " 'state': 'str',\n", " 'location_name': 'str',\n", " 'latitude': 'float',\n", " 'longitude': 'float',\n", " 'region_code': 'str',\n", " 'injuries': 'float',\n", " 'injuries_description': 'str',\n", " 'damage_millions_dollars': 'float',\n", " 'damage_description': 'str',\n", " 'total_injuries': 'float',\n", " 'total_injuries_description': 'str',\n", " 'total_damage_millions_dollars': 'float',\n", " 'total_damage_description': 'str'}" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dtyp" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Parse every column with the dtype declared in dtyp.\n", "data = pd.read_csv('../Datasets/earthquake_data.csv', dtype=dtyp)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Columns holding the description codes whose gaps are filled by the imputer below.\n", "description_features = [\n", "    'injuries_description',\n", "    'damage_description',\n", "    'total_injuries_description',\n", "    'total_damage_description',\n", "]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, 
"outputs": [], "source": [ "# Constant-fill imputer: every NaN becomes the literal string 'NA'.\n", "imp = SimpleImputer(strategy='constant', fill_value='NA', missing_values=np.nan)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Overwrite the description columns with their imputed versions.\n", "data[description_features] = imp.fit_transform(data[description_features])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
damage_millions_dollars
damage_description
10.417211
23.078840
313.818806
43574.998799
NANaN
\n", "
" ], "text/plain": [ " damage_millions_dollars\n", "damage_description \n", "1 0.417211\n", "2 3.078840\n", "3 13.818806\n", "4 3574.998799\n", "NA NaN" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Mean of damage_millions_dollars for each damage_description code.\n", "category_means = data[['damage_description', 'damage_millions_dollars']].groupby('damage_description').mean()\n", "category_means" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'1': 0.4172105263157895,\n", " '2': 3.0788402777777772,\n", " '3': 13.818805970149256,\n", " '4': 3574.9987991266385,\n", " 'NA': -1,\n", " '0': 0}" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Lookup table: description code -> amount to use when the reported amount is missing.\n", "replacement_values = category_means.damage_millions_dollars.to_dict()\n", "replacement_values['NA'] = -1  # 'NA' has no mean (NaN above), so it is mapped to -1\n", "replacement_values['0'] = 0  # code '0' is mapped to zero damage\n", "replacement_values" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# Per-row replacement amount, looked up from each row's description code.\n", "imputed_values = data.damage_description.map(replacement_values)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Fill only the missing amounts, reusing imputed_values instead of recomputing the\n", "# same mapping; amounts that were reported stay untouched.\n", "data['damage_millions_dollars'] = data.damage_millions_dollars.fillna(imputed_values)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "Index(['id', 'year', 'month', 'day', 'hour', 'minute', 'second', 'focal_depth',\n", " 'eq_primary', 'eq_mag_mw', 'eq_mag_ms', 'eq_mag_mb', 'intensity',\n", " 'latitude', 'longitude', 'injuries', 'damage_millions_dollars',\n", " 'total_injuries', 'total_damage_millions_dollars'],\n", " dtype='object')" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Columns with a numeric dtype.\n", "numeric_variables = data.select_dtypes(include=[np.number])\n", "numeric_variables.columns" ] }, { "cell_type": "code", "execution_count": 13, "metadata": 
{}, "outputs": [ { "data": { "text/plain": [ "Index(['flag_tsunami', 'country', 'state', 'location_name', 'region_code',\n", " 'injuries_description', 'damage_description',\n", " 'total_injuries_description', 'total_damage_description'],\n", " dtype='object')" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# np.object was only an alias of the builtin and no longer exists in NumPy >= 1.24,\n", "# so the object-dtype columns are selected with the builtin itself.\n", "object_variables = data.select_dtypes(include=[object])\n", "object_variables.columns" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "NA 4723\n", "1 666\n", "3 347\n", "2 193\n", "4 143\n", "Name: injuries_description, dtype: int64" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Frequency of each injuries_description code; dropna=False would also count NaN entries.\n", "counts = data.injuries_description.value_counts(dropna=False)\n", "counts" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# Frequency of each damage_description code, ordered by code rather than by count.\n", "counts = data.damage_description.value_counts().sort_index()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# Keep only rows where both the injury count and the primary magnitude are present.\n", "data_to_plot = data[~pd.isnull(data.injuries) & ~pd.isnull(data.eq_primary)]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Injuries against primary magnitude, drawn on an explicit Axes object.\n", "fig, ax = plt.subplots(figsize=(12, 9))\n", "\n", "ax.scatter(x=data_to_plot.eq_primary, y=data_to_plot.injuries)\n", "\n", "ax.set_xlabel('Primary earthquake magnitude')\n", "ax.set_ylabel('No. of injuries')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 2 }