{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2017-07-02T23:20:19.547246Z", "start_time": "2017-07-02T23:20:17.981743Z" } }, "outputs": [ { "data": { "text/html": [ " \n", "\n", "\n", " \n", "\n", "
\n", " \n", " BokehJS successfully loaded.\n", "
" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "/home/anand/playspace/data-science-utils/.eggs/statsmodels-0.8.0-py3.6-linux-x86_64.egg/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.\n", " from pandas.core import datetools\n", "/home/anand/anaconda3/envs/analytics/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", " \"This module will be removed in 0.20.\", DeprecationWarning)\n" ] } ], "source": [ "%matplotlib inline\n", "\n", "# Standard library\n", "import datetime\n", "import json\n", "import random\n", "\n", "# Third-party\n", "import bokeh\n", "import numpy as np\n", "import pandas as pd\n", "from bokeh.charts import Histogram\n", "from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource\n", "\n", "# Send Bokeh plots to the notebook output cells\n", "output_notebook()\n", "\n", "# Project-local helpers\n", "from datascienceutils import analyze" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2017-07-02T23:20:19.553172Z", "start_time": "2017-07-02T23:20:19.548834Z" } }, "outputs": [], "source": [ "# Load the Iris data set from the CSV file under ./data\n", "irisDf = pd.read_csv('./data/Iris.csv')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2017-07-02T23:20:59.095578Z", "start_time": "2017-07-02T23:20:19.554732Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "P-value and test statistic for distribution similarity between SepalLengthCm and SepalWidthCm\n" ] }, { "ename": "ModuleNotFoundError", "evalue": "No module named 'permute'", "output_type": "error", "traceback": [ 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mnumColumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mirisDf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mselect_dtypes\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minclude\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnumber\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mcombo\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcombinations\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnumColumns\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0manalyze\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mregression_analyze\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mirisDf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcombo\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcombo\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcheck_vif\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcheck_heteroskedasticity\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/playspace/data-science-utils/datascienceutils/analyze.py\u001b[0m in \u001b[0;36mregression_analyze\u001b[0;34m(df, col1, col2, trainsize, non_linear, check_heteroskedasticity, check_vif, check_dist_similarity, **kwargs)\u001b[0m\n\u001b[1;32m 312\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0mcheck_dist_similarity\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 313\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"P-value and test statistic for distribution similarity between %s and %s\"\u001b[0m\u001b[0;34m%\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcol1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcol2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 314\u001b[0;31m \u001b[0mis_similar_distribution\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcol1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcol2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 315\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 316\u001b[0m \u001b[0mnew_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcol1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcol2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdeep\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/playspace/data-science-utils/datascienceutils/analyze.py\u001b[0m in \u001b[0;36mis_similar_distribution\u001b[0;34m(original_dist, target_dist, test_type)\u001b[0m\n\u001b[1;32m 190\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mis_similar_distribution\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moriginal_dist\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget_dist\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'permutation'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 191\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0mtest_type\u001b[0m\u001b[0;34m==\u001b[0m\u001b[0;34m'permutation'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 192\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mpermute\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtwo_sample\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 193\u001b[0m \u001b[0mkwargs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m'stat'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m't'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'alternative'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m'two-sided'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'seed'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m20\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 194\u001b[0m \u001b[0mp_value\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtwo_sample\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moriginal_dist\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget_dist\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'permute'" ] } ], "source": [ "import importlib.util\n", "from itertools import combinations\n", "\n", "# regression_analyze runs its distribution-similarity test through the\n", "# optional `permute` package and raises ModuleNotFoundError when that\n", "# package is absent. Enable the test only when `permute` can be imported,\n", "# so the pairwise regression analysis still runs on a fresh environment.\n", "hasPermute = importlib.util.find_spec('permute') is not None\n", "if not hasPermute:\n", "    print('permute is not installed; skipping the distribution similarity test')\n", "\n", "# Analyze every unordered pair of numeric columns in the Iris frame\n", "numColumns = irisDf.select_dtypes(include=[np.number]).columns\n", "for col1, col2 in combinations(numColumns, 2):\n", "    analyze.regression_analyze(irisDf, col1, col2, check_vif=False,\n", "                               check_heteroskedasticity=False,\n", "                               check_dist_similarity=hasPermute)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }