{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2020-09-22 16:47:19,276 - whylogs.logs - DEBUG - whylogs.logs logging -> stdout at level DEBUG\n" ] } ], "source": [ "# Just a simple convenience function to send the internal python\n", "# logs to stdout. Definitely not required\n", "from whylogs.logs import display_logging\n", "display_logging('debug')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load data" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idmember_idloan_amntfunded_amntfunded_amnt_invtermint_rateinstallmentgradesub_grade...hardship_payoff_balance_amounthardship_last_payment_amountdisbursement_methoddebt_settlement_flagdebt_settlement_flag_datesettlement_statussettlement_datesettlement_amountsettlement_percentagesettlement_term
090671227NaN4800.04800.04800.036 months13.49162.87CC2...NaNNaNCashNNaNNaNNaNNaNNaNNaN
190060135NaN21600.021600.021600.060 months9.49453.54BB2...NaNNaNCashNNaNNaNNaNNaNNaNNaN
290501423NaN24200.024200.024200.036 months9.49775.09BB2...NaNNaNCashNNaNNaNNaNNaNNaNNaN
390186302NaN3600.03600.03600.036 months11.49118.70BB5...NaNNaNCashNNaNNaNNaNNaNNaNNaN
490805192NaN8000.08000.08000.036 months10.49259.99BB3...NaNNaNCashNNaNNaNNaNNaNNaNNaN
\n", "

5 rows × 151 columns

\n", "
" ], "text/plain": [ " id member_id loan_amnt funded_amnt funded_amnt_inv term \\\n", "0 90671227 NaN 4800.0 4800.0 4800.0 36 months \n", "1 90060135 NaN 21600.0 21600.0 21600.0 60 months \n", "2 90501423 NaN 24200.0 24200.0 24200.0 36 months \n", "3 90186302 NaN 3600.0 3600.0 3600.0 36 months \n", "4 90805192 NaN 8000.0 8000.0 8000.0 36 months \n", "\n", " int_rate installment grade sub_grade ... hardship_payoff_balance_amount \\\n", "0 13.49 162.87 C C2 ... NaN \n", "1 9.49 453.54 B B2 ... NaN \n", "2 9.49 775.09 B B2 ... NaN \n", "3 11.49 118.70 B B5 ... NaN \n", "4 10.49 259.99 B B3 ... NaN \n", "\n", " hardship_last_payment_amount disbursement_method debt_settlement_flag \\\n", "0 NaN Cash N \n", "1 NaN Cash N \n", "2 NaN Cash N \n", "3 NaN Cash N \n", "4 NaN Cash N \n", "\n", " debt_settlement_flag_date settlement_status settlement_date \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "\n", " settlement_amount settlement_percentage settlement_term \n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "\n", "[5 rows x 151 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load some data\n", "df = pd.read_csv('lending_club_1000.csv')\n", "# Split into a test & training set\n", "df_training = df.sample(int(len(df) * 0.8), replace=False, random_state=123)\n", "df_test = df.drop(df_training.index)\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Log dataset sketches" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2020-09-22 16:47:19,339 - whylogs.app.config - DEBUG - Attempting to load config file: None\n", "2020-09-22 16:47:19,340 - whylogs.app.config - DEBUG - Attempting to load config file: .whylogs.yaml\n" ] } ], "source": [ "from whylogs import get_or_create_session\n", "\n", 
"session = get_or_create_session()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Log dataframe" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "session.log_dataframe(df_training, 'training.data')\n", "# Then you could do whatever training or calculations you'd like" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Inspect profiles/statistics" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
columncountnull_countbool_countnumeric_countmaxmeanminstddevnunique_numbers...ununique_str_upperquantile_0.0000quantile_0.0100quantile_0.0500quantile_0.2500quantile_0.5000quantile_0.7500quantile_0.9500quantile_0.9900quantile_1.0000
0num_il_tl200.00.00.0199.043.009.8341710.008.29065734.0...0.00.000.0000001.0000004.0000007.00000014.00000028.00000042.00000043.000000
1open_acc_6m200.00.00.0199.08.001.3567840.001.4207498.0...0.00.000.0000000.0000000.0000001.0000002.0000004.0000007.0000008.000000
2avg_cur_bal200.00.00.0199.072812.0013079.467337244.0014001.002777199.0...0.0244.00425.0000001252.0000003039.0000008200.00000017591.00000043647.00000068086.00000072812.000000
3dti_joint200.00.00.04.020.6514.89250012.353.8639224.0...0.012.3512.35000012.35000013.22000013.35000020.65000020.65000020.65000020.650000
4num_accts_ever_120_pd200.00.00.0199.07.000.5427140.001.2296578.0...0.00.000.0000000.0000000.0000000.0000001.0000003.0000007.0000007.000000
..................................................................
146sec_app_collections_12_mths_ex_med200.00.00.00.00.000.0000000.000.0000000.0...0.0NaNNaNNaNNaNNaNNaNNaNNaNNaN
147emp_length200.00.00.00.00.000.0000000.000.0000000.0...11.0NaNNaNNaNNaNNaNNaNNaNNaNNaN
148last_pymnt_amnt200.00.00.0199.035304.765068.3704520.007696.468449194.0...0.00.007.980000118.699997334.100006771.2299807585.50976622287.58007832954.30859435304.761719
149total_pymnt_inv200.00.00.0199.052583.9715089.0573370.0010349.878426198.0...0.00.00828.9000242734.1599127149.43017612359.34960920929.97070335261.21875051942.23046952583.968750
150debt_settlement_flag200.00.00.00.00.000.0000000.000.0000000.0...2.0NaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", "

151 rows × 32 columns

\n", "
" ], "text/plain": [ " column count null_count bool_count \\\n", "0 num_il_tl 200.0 0.0 0.0 \n", "1 open_acc_6m 200.0 0.0 0.0 \n", "2 avg_cur_bal 200.0 0.0 0.0 \n", "3 dti_joint 200.0 0.0 0.0 \n", "4 num_accts_ever_120_pd 200.0 0.0 0.0 \n", ".. ... ... ... ... \n", "146 sec_app_collections_12_mths_ex_med 200.0 0.0 0.0 \n", "147 emp_length 200.0 0.0 0.0 \n", "148 last_pymnt_amnt 200.0 0.0 0.0 \n", "149 total_pymnt_inv 200.0 0.0 0.0 \n", "150 debt_settlement_flag 200.0 0.0 0.0 \n", "\n", " numeric_count max mean min stddev \\\n", "0 199.0 43.00 9.834171 0.00 8.290657 \n", "1 199.0 8.00 1.356784 0.00 1.420749 \n", "2 199.0 72812.00 13079.467337 244.00 14001.002777 \n", "3 4.0 20.65 14.892500 12.35 3.863922 \n", "4 199.0 7.00 0.542714 0.00 1.229657 \n", ".. ... ... ... ... ... \n", "146 0.0 0.00 0.000000 0.00 0.000000 \n", "147 0.0 0.00 0.000000 0.00 0.000000 \n", "148 199.0 35304.76 5068.370452 0.00 7696.468449 \n", "149 199.0 52583.97 15089.057337 0.00 10349.878426 \n", "150 0.0 0.00 0.000000 0.00 0.000000 \n", "\n", " nunique_numbers ... ununique_str_upper quantile_0.0000 \\\n", "0 34.0 ... 0.0 0.00 \n", "1 8.0 ... 0.0 0.00 \n", "2 199.0 ... 0.0 244.00 \n", "3 4.0 ... 0.0 12.35 \n", "4 8.0 ... 0.0 0.00 \n", ".. ... ... ... ... \n", "146 0.0 ... 0.0 NaN \n", "147 0.0 ... 11.0 NaN \n", "148 194.0 ... 0.0 0.00 \n", "149 198.0 ... 0.0 0.00 \n", "150 0.0 ... 2.0 NaN \n", "\n", " quantile_0.0100 quantile_0.0500 quantile_0.2500 quantile_0.5000 \\\n", "0 0.000000 1.000000 4.000000 7.000000 \n", "1 0.000000 0.000000 0.000000 1.000000 \n", "2 425.000000 1252.000000 3039.000000 8200.000000 \n", "3 12.350000 12.350000 13.220000 13.350000 \n", "4 0.000000 0.000000 0.000000 0.000000 \n", ".. ... ... ... ... 
\n", "146 NaN NaN NaN NaN \n", "147 NaN NaN NaN NaN \n", "148 7.980000 118.699997 334.100006 771.229980 \n", "149 828.900024 2734.159912 7149.430176 12359.349609 \n", "150 NaN NaN NaN NaN \n", "\n", " quantile_0.7500 quantile_0.9500 quantile_0.9900 quantile_1.0000 \n", "0 14.000000 28.000000 42.000000 43.000000 \n", "1 2.000000 4.000000 7.000000 8.000000 \n", "2 17591.000000 43647.000000 68086.000000 72812.000000 \n", "3 20.650000 20.650000 20.650000 20.650000 \n", "4 1.000000 3.000000 7.000000 7.000000 \n", ".. ... ... ... ... \n", "146 NaN NaN NaN NaN \n", "147 NaN NaN NaN NaN \n", "148 7585.509766 22287.580078 32954.308594 35304.761719 \n", "149 20929.970703 35261.218750 51942.230469 52583.968750 \n", "150 NaN NaN NaN NaN \n", "\n", "[151 rows x 32 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# You can also capture the logger response and interact with the generated\n", "# profiles\n", "\n", "# Log the test data\n", "prof = session.log_dataframe(df_test, 'test.data')\n", "summary = prof.flat_summary()\n", "stats_df = summary['summary']\n", "stats_df" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXAAAAD7CAYAAABzGc+QAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/d3fzzAAAACXBIWXMAAAsTAAALEwEAmpwYAAANpklEQVR4nO3dfaxkdX3H8fenrJoULQ/dy3Zj0UsIkvBPV3tLabCtSmt5MIJJYyQN0lSzxpRGjGmzatL65/qcNGls1kLdphTjA1YSaCsSW2NSsBeKPEoAu0RwWS61LbRNNMC3f8zZMl7u3Tt778ze/e6+X8lkzvmdM3e+38zks2fOnN9OqgpJUj8/tdkFSJLWxwCXpKYMcElqygCXpKYMcElqygCXpKbWDPAkpyf5RpL7k9yX5H3D+EeSPJ7kruF28ezLlSQdlLWuA0+yHdheVXcmeQVwB3AZ8Hbgv6vqEzOvUpL0IlvW2qGq9gP7h+VnkjwAvHI9T7Z169aan59fz0Ml6bh1xx13PFVVc8vH1wzwcUnmgdcCtwPnA1cleSewCHygqv7jUI+fn59ncXHxcJ5Sko57SR5daXziLzGTvBz4MnB1VT0NfAY4E9jB6Aj9k6s8bmeSxSSLS0tLh1u3JGkVEwV4kpcwCu/rquoGgKo6UFXPVdXzwGeBc1d6bFXtqaqFqlqYm3vRJwBJ0jpNchVKgGuAB6rqU2Pj28d2extw7/TLkyStZpJz4OcDVwD3JLlrGPsQcHmSHUAB+4D3zKA+SdIqJrkK5VtAVth08/TLkSRNypmYktSUAS5JTRngktSUAS5JTR3WTMzj1fyumybab9/uS2ZciSS9wCNwSWrKAJekpgxwSWrKAJekpgxwSWrKAJekpgxwSWqqzXXgXostST/JI3BJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJamrNAE9yepJvJLk/yX1J3jeMn5rkliQPDfenzL5cSdJBkxyBPwt8oKrOAc4Dfj/JOcAu4NaqOgu4dViXJB0hawZ4Ve2vqjuH5WeAB4BXApcCe4fd9gKXzahGSdIKDusceJJ54LXA7cC2qto/bHoC2Dbd0iRJhzJxgCd5OfBl4Oqqenp8W1UVUKs8bmeSxSSLS0tLGypWkvSCiQI8yUsYhfd1VXXDMHwgyfZh+3bgyZUeW1V7qmqhqhbm5uamUbMkicmuQglwDfBAVX1qbNONwJXD8pXAV6dfniRpNVsm2Od84ArgniR3DWMfAnYDX0jyLuBR4O0zqVCStKI1A7yqvgVklc0XTLccSdKknIkpSU0Z4JLUlAEuSU0Z4JLU1CRXobQyv+umzS5Bko4Ij8AlqSkDXJKaMsAlqSkDXJKaMsAlqSkDXJKaMsAlqSkDXJKaMsAlqSkDXJKaMsAlqSkDXJKaMsAlqSkDXJKaMsAlqSkDXJKaOuZ+0KGDSX90Yt/uS2ZciaTOPAKXpKYMcElqygCXpKYMcElqygCXpKYMcElqygCXpKYMcElqyok8xwAnBknHJ4/AJakpA1ySmjLAJampNQM8ybVJnkxy79jYR5I8nuSu4XbxbMuUJC03yRH454ALVxj/dFXtGG43T7csSdJa1gzwqvom8MMjUIsk6TBs5Bz4VUnuHk6xnDK1iiRJE1lvgH8GOBPYAewHPrnajkl2JllMsri0tLTOp5MkLbeuAK+qA1X1XFU9D3wWOPcQ++6pqoWqWpibm1tvnZKkZdYV4Em2j62+Dbh3tX0lSbOx5lT6JNcDbwC2JnkM+BPgDUl2AAXsA94zuxIlSStZM8Cr6vIVhq+ZQS2SpMPgTExJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJaso
Al6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6SmDHBJasoAl6Smtmx2AVrd/K6bNruENU1a477dl8y4Eun44xG4JDVlgEtSUwa4JDVlgEtSU2sGeJJrkzyZ5N6xsVOT3JLkoeH+lNmWKUlabpIj8M8BFy4b2wXcWlVnAbcO65KkI2jNAK+qbwI/XDZ8KbB3WN4LXDbdsiRJa1nvOfBtVbV/WH4C2DaleiRJE9rwl5hVVUCttj3JziSLSRaXlpY2+nSSpMF6A/xAku0Aw/2Tq+1YVXuqaqGqFubm5tb5dJKk5dYb4DcCVw7LVwJfnU45kqRJTXIZ4fXAPwNnJ3ksybuA3cBvJnkI+I1hXZJ0BK35n1lV1eWrbLpgyrVIkg6DMzElqSkDXJKaMsAlqSl/0EEv4g9JSD14BC5JTRngktSUAS5JTRngktSUAS5JTRngktSUAS5JTRngktSUAS5JTRngktSUAS5JTRngktSUAS5JTRngktSUAS5JTRngktSUAS5JTRngktSUAS5JTRngktSUAS5JTRngktSUAS5JTRngktSUAS5JTW3Z7AJ0fJjfddNE++3bfckx8bzSkeARuCQ1ZYBLUlMGuCQ1ZYBLUlMb+hIzyT7gGeA54NmqWphGUZKktU3jKpQ3VtVTU/g7kqTD4CkUSWpqowFewNeS3JFk5zQKkiRNZqOnUF5fVY8nOQ24Jcl3q+qb4zsMwb4T4FWvetUGn06ajUkn/ByOzZoc5OSl48eGjsCr6vHh/kngK8C5K+yzp6oWqmphbm5uI08nSRqz7gBPcmKSVxxcBt4M3DutwiRJh7aRUyjbgK8kOfh3/qaq/n4qVUmS1rTuAK+q7wG/MMVaJEmHwcsIJakpA1ySmjLAJakpf9BBamIW16qrN4/AJakpA1ySmjLAJakpA1ySmjLAJakpA1ySmjLAJakpA1ySmnIij7TJnKCj9fIIXJKaMsAlqSkDXJKaMsAlqSkDXJKaMsAlqSkDXJKaMsAlqSkn8khqb9LJUPt2XzLjSo4sj8AlqSkDXJKaMsAlqSkDXJKaMsAlqSkDXJKaMsAlqSmvA5c0NdP+cYppX7e9mT+eMYtr0D0Cl6SmDHBJasoAl6SmDHBJampDAZ7kwiQPJnk4ya5pFSVJWtu6AzzJCcCfARcB5wCXJzlnWoVJkg5tI0fg5wIPV9X3qurHwOeBS6dTliRpLRsJ8FcC3x9bf2wYkyQdAamq9T0w+W3gwqp697B+BfDLVXXVsv12AjuH1bOBB5f9qa3AU+sq4uh1rPV0rPUD9tSFPY28uqrmlg9uZCbm48DpY+s/P4z9hKraA+xZ7Y8kWayqhQ3UcdQ51no61voBe+rCng5tI6dQ/gU4K8kZSV4KvAO4cRpFSZLWtu4j8Kp6NslVwD8AJwDXVtV9U6tMknRIG/rPrKrqZuDmDdaw6umVxo61no61fsCeurCnQ1j3l5iSpM3lVHpJamrmAZ7k5CRfSvLdJA8k+ZVh/A+GsfuSfGxs/w8OU/MfTPJbs65vPVbqKcmOJLcluSvJYpJzh32T5E+Hnu5O8rrNrn+5JGcPdR+8PZ3k6iSnJrklyUPD/SnD/p17+vjwut2d5CtJTh57zFH93lutp7HtH0hSSbYO621fp2Fby4w4xHtv+hlRVTO9AXuBdw/LLwVOBt4IfB142TB+2nB/DvAd4GXAGcAjwAmzrnFKPX0NuGgYuxj4x7HlvwMCnAfcvtn1r9HbCcATwKuBjwG7hvFdwEePgZ7eDGwZxj861lOL995KPQ3rpzO6oOBRYOsx8Dq1zohVepp6Rsz0CDzJScCvAdcAVNWPq+o/gfcCu6vqR8P4k8NDLgU+X1U/qqp/Ax5mNGX/qHGIngr4mWG3k4AfDMuXAn9VI7cBJyfZfmSrPiwXAI9U1aOMat87jO8FLhuW2/ZUVV+rqmeH8dsYzV+ABu+9ZcZfJ4BPA3/E6H14UNvXicY
Zscx4T1PPiFmfQjkDWAL+Msm/JvmLJCcCrwF+NcntSf4pyS8N+3eYnr9aT1cDH0/yfeATwAeH/Tv0NO4dwPXD8raq2j8sPwFsG5Y79zTu9xgd+UDjnpJcCjxeVd9Ztk/bnuidEePGe7qaKWfErAN8C/A64DNV9Vrgfxh9FN8CnMro48IfAl9IkhnXMi2r9fRe4P1VdTrwfoYj9E4ympD1VuCLy7fV6LNeu0uWVuspyYeBZ4HrNqOujRjvKclPAx8C/nhzq9qYFV6nzhkBrNjT1DNi1gH+GPBYVd0+rH+JUfg9BtwwfGT4NvA8o/8fYKLp+ZtstZ6uBG4Yxr7ICx/rOvR00EXAnVV1YFg/cPCj3HB/8GNs555I8rvAW4DfGf5hgr49ncnoU+F3kuxjVPedSX6Ovj1B74w4aHlPU8+ImQZ4VT0BfD/J2cPQBcD9wN8y+pKCJK9h9EXgU4ym4r8jycuSnAGcBXx7ljUerkP09APg14exNwEPDcs3Au8cvmk+D/ivsdMSR5vL+clTDTcyetMx3H91bLxlT0kuZHSu+K1V9b9j+x31770x/99TVd1TVadV1XxVzTMKvtcN79O2rxONM2LM8p6mnxFH4FvYHcAicDejF+UURi/GXwP3AncCbxrb/8OMvll+kOEb26PttkpPrwfuYPQN+e3ALw77htEPXzwC3AMsbHb9q/R0IvDvwEljYz8L3Dq80b4OnHoM9PQwo/ONdw23P2/23ntRT8u27+OFq1A6v07dM2KlnqaeEc7ElKSmnIkpSU0Z4JLUlAEuSU0Z4JLUlAEuSU0Z4JLUlAEuSU0Z4JLU1P8B4NichWHMVIwAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# See one of the inspected histograms\n", "hist_data = summary['hist']['fico_range_high']\n", "bins = hist_data['bin_edges']\n", "n = hist_data['counts']\n", "bin_width = np.diff(bins)\n", "\n", "plt.bar(bins[0:-1], n, bin_width, align='edge')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load logged data" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "import glob" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load flat table statistics" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
columncountnull_countbool_countnumeric_countmaxmeanminstddevnunique_numbers...ununique_str_upperquantile_0.0000quantile_0.0100quantile_0.0500quantile_0.2500quantile_0.5000quantile_0.7500quantile_0.9500quantile_0.9900quantile_1.0000
0funded_amnt200.00.00.0199.040000.0016479.8994971000.009811.38494279.0...0.01000.0000001000.0000003325.0000009600.0015000.00000023000.00000035000.00000040000.00000040000.000000
1mo_sin_rcnt_tl200.00.00.0199.046.006.1959800.006.64973526.0...0.00.0000000.0000000.0000002.004.0000008.00000021.00000035.00000046.000000
2open_il_12m200.00.00.0199.04.000.6783920.000.8451205.0...0.00.0000000.0000000.0000000.000.0000001.0000002.0000003.0000004.000000
3installment200.00.00.0199.01300.55486.01809034.96283.607183180.0...0.034.95999936.150002112.139999271.75413.000000668.8599851069.4399411204.5699461300.550049
4bc_open_to_buy200.00.00.0198.088250.0011172.8434340.0014448.281979194.0...0.00.0000000.000000118.0000002011.005719.00000015374.00000042950.00000085587.00000088250.000000
..................................................................
146num_rev_tl_bal_gt_0200.00.00.0199.018.005.9798990.003.35742819.0...0.00.0000001.0000002.0000004.005.0000008.00000013.00000017.00000018.000000
147last_pymnt_d200.00.00.00.00.000.0000000.000.0000000.0...30.0NaNNaNNaNNaNNaNNaNNaNNaNNaN
148percent_bc_gt_75200.00.00.0198.0100.0040.3823230.0033.93326126.0...0.00.0000000.0000000.0000007.7033.29999966.699997100.000000100.000000100.000000
149debt_settlement_flag200.00.00.00.00.000.0000000.000.0000000.0...2.0NaNNaNNaNNaNNaNNaNNaNNaNNaN
150mo_sin_old_il_acct200.00.00.0195.0269.00127.1487183.0049.477824114.0...0.03.0000005.00000028.000000110.00132.000000153.000000209.000000264.000000269.000000
\n", "

151 rows × 32 columns

\n", "
" ], "text/plain": [ " column count null_count bool_count numeric_count \\\n", "0 funded_amnt 200.0 0.0 0.0 199.0 \n", "1 mo_sin_rcnt_tl 200.0 0.0 0.0 199.0 \n", "2 open_il_12m 200.0 0.0 0.0 199.0 \n", "3 installment 200.0 0.0 0.0 199.0 \n", "4 bc_open_to_buy 200.0 0.0 0.0 198.0 \n", ".. ... ... ... ... ... \n", "146 num_rev_tl_bal_gt_0 200.0 0.0 0.0 199.0 \n", "147 last_pymnt_d 200.0 0.0 0.0 0.0 \n", "148 percent_bc_gt_75 200.0 0.0 0.0 198.0 \n", "149 debt_settlement_flag 200.0 0.0 0.0 0.0 \n", "150 mo_sin_old_il_acct 200.0 0.0 0.0 195.0 \n", "\n", " max mean min stddev nunique_numbers ... \\\n", "0 40000.00 16479.899497 1000.00 9811.384942 79.0 ... \n", "1 46.00 6.195980 0.00 6.649735 26.0 ... \n", "2 4.00 0.678392 0.00 0.845120 5.0 ... \n", "3 1300.55 486.018090 34.96 283.607183 180.0 ... \n", "4 88250.00 11172.843434 0.00 14448.281979 194.0 ... \n", ".. ... ... ... ... ... ... \n", "146 18.00 5.979899 0.00 3.357428 19.0 ... \n", "147 0.00 0.000000 0.00 0.000000 0.0 ... \n", "148 100.00 40.382323 0.00 33.933261 26.0 ... \n", "149 0.00 0.000000 0.00 0.000000 0.0 ... \n", "150 269.00 127.148718 3.00 49.477824 114.0 ... \n", "\n", " ununique_str_upper quantile_0.0000 quantile_0.0100 quantile_0.0500 \\\n", "0 0.0 1000.000000 1000.000000 3325.000000 \n", "1 0.0 0.000000 0.000000 0.000000 \n", "2 0.0 0.000000 0.000000 0.000000 \n", "3 0.0 34.959999 36.150002 112.139999 \n", "4 0.0 0.000000 0.000000 118.000000 \n", ".. ... ... ... ... \n", "146 0.0 0.000000 1.000000 2.000000 \n", "147 30.0 NaN NaN NaN \n", "148 0.0 0.000000 0.000000 0.000000 \n", "149 2.0 NaN NaN NaN \n", "150 0.0 3.000000 5.000000 28.000000 \n", "\n", " quantile_0.2500 quantile_0.5000 quantile_0.7500 quantile_0.9500 \\\n", "0 9600.00 15000.000000 23000.000000 35000.000000 \n", "1 2.00 4.000000 8.000000 21.000000 \n", "2 0.00 0.000000 1.000000 2.000000 \n", "3 271.75 413.000000 668.859985 1069.439941 \n", "4 2011.00 5719.000000 15374.000000 42950.000000 \n", ".. ... ... ... ... 
\n", "146 4.00 5.000000 8.000000 13.000000 \n", "147 NaN NaN NaN NaN \n", "148 7.70 33.299999 66.699997 100.000000 \n", "149 NaN NaN NaN NaN \n", "150 110.00 132.000000 153.000000 209.000000 \n", "\n", " quantile_0.9900 quantile_1.0000 \n", "0 40000.000000 40000.000000 \n", "1 35.000000 46.000000 \n", "2 3.000000 4.000000 \n", "3 1204.569946 1300.550049 \n", "4 85587.000000 88250.000000 \n", ".. ... ... \n", "146 17.000000 18.000000 \n", "147 NaN NaN \n", "148 100.000000 100.000000 \n", "149 NaN NaN \n", "150 264.000000 269.000000 \n", "\n", "[151 rows x 32 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load the flat table statistics from the 'test.data' dataset\n", "fnames = glob.glob('whylogs-output/test.data/dataset_summary/flat_table/dataset_summary*.csv')\n", "fnames.sort()\n", "# Load the most recent file\n", "test_stats = pd.read_csv(fnames[-1])\n", "test_stats" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load the full dataset profile sketch" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from whylogs import DatasetProfile\n", "\n", "# Load a dataset profile from the 'test.data' dataset\n", "fnames = glob.glob('whylogs-output/test.data/dataset_profile/protobuf/*.bin')\n", "fnames.sort()\n", "\n", "test_prof = DatasetProfile.read_protobuf(fnames[-1], delimited_file=False)\n", "test_prof" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "---" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2020-09-22 16:47:24,053 - whylogs.app.config - DEBUG - Attempting to load config file: None\n", "2020-09-22 16:47:24,054 - whylogs.app.config - DEBUG - Attempting to load config file: .whylogs.yaml\n" ] } ], "source": [ "# Not necessary, but you 
can reset the WhyLogs session if you want\n", "from whylogs import reset_default_session\n", "reset_default_session()" ] } ], "metadata": { "kernelspec": { "display_name": "whylogs", "language": "python", "name": "whylogs" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }