{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Welcome to\n", " ____ __\n", " / __/__ ___ _____/ /__\n", " _\\ \\/ _ \\/ _ `/ __/ '_/\n", " /__ / .__/\\_,_/_/ /_/\\_\\ version 2.4.0\n", " /_/\n", "\n", "Using Python version 3.6.4 (default, Jan 16 2018 18:10:19)\n", "SparkSession available as 'spark'.\n", "local-1557024762906\n" ] } ], "source": [
"import os, sys, glob, datetime\n",
"\n",
"# specify spark version, python version\n",
"spark_home = \"/home/zero/spark-2.4.0-bin-hadoop2.7\" # MODIFY THIS\n",
"python_path=\"/apps/anaconda3/bin/python\"\n",
"# set environment variables\n",
"os.environ['SPARK_HOME'] = spark_home\n",
"os.environ['PYSPARK_PYTHON'] = python_path\n",
"os.environ['SPARK_LOCAL_IP'] = \"127.0.0.1\"\n",
"\n",
"def setup_spark_env(app_name):\n",
"    \"\"\"Prepare this process for launching a local PySpark shell.\n",
"\n",
"    Puts the python package and py4j archive found under spark_home at the\n",
"    front of sys.path, then stores the submit arguments (local[2] master and\n",
"    an application name made of app_name plus the current timestamp) in the\n",
"    PYSPARK_SUBMIT_ARGS environment variable.\n",
"\n",
"    Raises FileNotFoundError when no py4j-*.zip exists under spark_home.\n",
"    \"\"\"\n",
"    spark_python = os.path.join(spark_home, 'python')\n",
"    py4j_archives = glob.glob(os.path.join(spark_python, 'lib', 'py4j-*.zip'))\n",
"    if not py4j_archives:\n",
"        # a wrong spark_home used to surface as a bare IndexError on [0]\n",
"        raise FileNotFoundError(\n",
"            \"no py4j-*.zip found under {}; check spark_home\".format(\n",
"                os.path.join(spark_python, 'lib')))\n",
"    # skip entries already present so re-running the cell does not stack duplicates\n",
"    sys.path[:0] = [p for p in (spark_python, py4j_archives[0]) if p not in sys.path]\n",
"    # specify Spark application parameters\n",
"    PYSPARK_SUBMIT_ARGS=\"--master local[2]\"\n",
"\n",
"    started_at = datetime.datetime.now().strftime(\"%Y%m%d %H:%M\")\n",
"    os.environ['PYSPARK_SUBMIT_ARGS'] = (PYSPARK_SUBMIT_ARGS\n",
"        + \" --name '%s_%s'\"%(app_name, started_at)\n",
"        + \" pyspark-shell\")\n",
"\n",
"#\n",
"setup_spark_env(\"your_spark_process_name\") # MODIFY THIS\n",
"# launching PySpark application: run Spark's own shell.py in this namespace\n",
"filename=os.path.join(spark_home, 'python/pyspark/shell.py')\n",
"# read through a context manager so the file handle is closed before exec\n",
"with open(filename, \"rb\") as shell_file:\n",
"    shell_source = shell_file.read()\n",
"exec(compile(shell_source, filename, 'exec'))\n",
"sc.setLogLevel('ERROR')\n",
"print(\"{}\".format(sc.applicationId))" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from pyspark.sql import functions as sf\n", "from pyspark.sql import Row\n", "from pyspark.sql.types import *\n", "import numpy as np" ] }, {
"cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import os, math, subprocess\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "# some settings for displaying Pandas results\n", "pd.set_option('display.width', 2000)\n", "pd.set_option('display.max_rows', 500)\n", "pd.set_option('display.max_columns', 500)\n", "pd.set_option('display.precision', 4)\n", "pd.set_option('display.max_colwidth', -1)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[Row(SK_ID_CURR='100002', TARGET='1', NAME_CONTRACT_TYPE='Cash loans', CODE_GENDER='M', FLAG_OWN_CAR='N', FLAG_OWN_REALTY='Y', CNT_CHILDREN='0', AMT_INCOME_TOTAL='202500.0', AMT_CREDIT='406597.5', AMT_ANNUITY='24700.5', AMT_GOODS_PRICE='351000.0', NAME_TYPE_SUITE='Unaccompanied', NAME_INCOME_TYPE='Working', NAME_EDUCATION_TYPE='Secondary / secondary special', NAME_FAMILY_STATUS='Single / not married', NAME_HOUSING_TYPE='House / apartment', REGION_POPULATION_RELATIVE='0.018801', DAYS_BIRTH='-9461', DAYS_EMPLOYED='-637', DAYS_REGISTRATION='-3648.0', DAYS_ID_PUBLISH='-2120', OWN_CAR_AGE=None, FLAG_MOBIL='1', FLAG_EMP_PHONE='1', FLAG_WORK_PHONE='0', FLAG_CONT_MOBILE='1', FLAG_PHONE='1', FLAG_EMAIL='0', OCCUPATION_TYPE='Laborers', CNT_FAM_MEMBERS='1.0', REGION_RATING_CLIENT='2', REGION_RATING_CLIENT_W_CITY='2', WEEKDAY_APPR_PROCESS_START='WEDNESDAY', HOUR_APPR_PROCESS_START='10', REG_REGION_NOT_LIVE_REGION='0', REG_REGION_NOT_WORK_REGION='0', LIVE_REGION_NOT_WORK_REGION='0', REG_CITY_NOT_LIVE_CITY='0', REG_CITY_NOT_WORK_CITY='0', LIVE_CITY_NOT_WORK_CITY='0', ORGANIZATION_TYPE='Business Entity Type 3', EXT_SOURCE_1='0.08303696739132256', EXT_SOURCE_2='0.2629485927471776', EXT_SOURCE_3='0.13937578009978951', APARTMENTS_AVG='0.0247', BASEMENTAREA_AVG='0.0369', YEARS_BEGINEXPLUATATION_AVG='0.9722', YEARS_BUILD_AVG='0.6192', COMMONAREA_AVG='0.0143', 
ELEVATORS_AVG='0.0', ENTRANCES_AVG='0.069', FLOORSMAX_AVG='0.0833', FLOORSMIN_AVG='0.125', LANDAREA_AVG='0.0369', LIVINGAPARTMENTS_AVG='0.0202', LIVINGAREA_AVG='0.019', NONLIVINGAPARTMENTS_AVG='0.0', NONLIVINGAREA_AVG='0.0', APARTMENTS_MODE='0.0252', BASEMENTAREA_MODE='0.0383', YEARS_BEGINEXPLUATATION_MODE='0.9722', YEARS_BUILD_MODE='0.6341', COMMONAREA_MODE='0.0144', ELEVATORS_MODE='0.0', ENTRANCES_MODE='0.069', FLOORSMAX_MODE='0.0833', FLOORSMIN_MODE='0.125', LANDAREA_MODE='0.0377', LIVINGAPARTMENTS_MODE='0.022', LIVINGAREA_MODE='0.0198', NONLIVINGAPARTMENTS_MODE='0.0', NONLIVINGAREA_MODE='0.0', APARTMENTS_MEDI='0.025', BASEMENTAREA_MEDI='0.0369', YEARS_BEGINEXPLUATATION_MEDI='0.9722', YEARS_BUILD_MEDI='0.6243', COMMONAREA_MEDI='0.0144', ELEVATORS_MEDI='0.0', ENTRANCES_MEDI='0.069', FLOORSMAX_MEDI='0.0833', FLOORSMIN_MEDI='0.125', LANDAREA_MEDI='0.0375', LIVINGAPARTMENTS_MEDI='0.0205', LIVINGAREA_MEDI='0.0193', NONLIVINGAPARTMENTS_MEDI='0.0', NONLIVINGAREA_MEDI='0.0', FONDKAPREMONT_MODE='reg oper account', HOUSETYPE_MODE='block of flats', TOTALAREA_MODE='0.0149', WALLSMATERIAL_MODE='Stone, brick', EMERGENCYSTATE_MODE='No', OBS_30_CNT_SOCIAL_CIRCLE='2.0', DEF_30_CNT_SOCIAL_CIRCLE='2.0', OBS_60_CNT_SOCIAL_CIRCLE='2.0', DEF_60_CNT_SOCIAL_CIRCLE='2.0', DAYS_LAST_PHONE_CHANGE='-1134.0', FLAG_DOCUMENT_2='0', FLAG_DOCUMENT_3='1', FLAG_DOCUMENT_4='0', FLAG_DOCUMENT_5='0', FLAG_DOCUMENT_6='0', FLAG_DOCUMENT_7='0', FLAG_DOCUMENT_8='0', FLAG_DOCUMENT_9='0', FLAG_DOCUMENT_10='0', FLAG_DOCUMENT_11='0', FLAG_DOCUMENT_12='0', FLAG_DOCUMENT_13='0', FLAG_DOCUMENT_14='0', FLAG_DOCUMENT_15='0', FLAG_DOCUMENT_16='0', FLAG_DOCUMENT_17='0', FLAG_DOCUMENT_18='0', FLAG_DOCUMENT_19='0', FLAG_DOCUMENT_20='0', FLAG_DOCUMENT_21='0', AMT_REQ_CREDIT_BUREAU_HOUR='0.0', AMT_REQ_CREDIT_BUREAU_DAY='0.0', AMT_REQ_CREDIT_BUREAU_WEEK='0.0', AMT_REQ_CREDIT_BUREAU_MON='0.0', AMT_REQ_CREDIT_BUREAU_QRT='0.0', AMT_REQ_CREDIT_BUREAU_YEAR='1.0')]\n" ] } ], "source": [ "# load data\n", "data_path = 
\"home-credit-default-risk/application_train.csv\"\n", "df = sqlContext.read.format(\"csv\").option(\"header\", \"true\").load(data_path)\n", "print(df.take(1))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total_records: 307511\n" ] } ], "source": [ "total_records = df.count()\n", "print(\"total_records:\", total_records)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SK_ID_CURR (string)\n", "TARGET (string)\n", "NAME_CONTRACT_TYPE (string)\n", "CODE_GENDER (string)\n", "FLAG_OWN_CAR (string)\n", "FLAG_OWN_REALTY (string)\n", "CNT_CHILDREN (string)\n", "AMT_INCOME_TOTAL (string)\n", "AMT_CREDIT (string)\n", "AMT_ANNUITY (string)\n", "AMT_GOODS_PRICE (string)\n", "NAME_TYPE_SUITE (string)\n", "NAME_INCOME_TYPE (string)\n", "NAME_EDUCATION_TYPE (string)\n", "NAME_FAMILY_STATUS (string)\n", "NAME_HOUSING_TYPE (string)\n", "REGION_POPULATION_RELATIVE (string)\n", "DAYS_BIRTH (string)\n", "DAYS_EMPLOYED (string)\n", "DAYS_REGISTRATION (string)\n", "DAYS_ID_PUBLISH (string)\n", "OWN_CAR_AGE (string)\n", "FLAG_MOBIL (string)\n", "FLAG_EMP_PHONE (string)\n", "FLAG_WORK_PHONE (string)\n", "FLAG_CONT_MOBILE (string)\n", "FLAG_PHONE (string)\n", "FLAG_EMAIL (string)\n", "OCCUPATION_TYPE (string)\n", "CNT_FAM_MEMBERS (string)\n", "REGION_RATING_CLIENT (string)\n", "REGION_RATING_CLIENT_W_CITY (string)\n", "WEEKDAY_APPR_PROCESS_START (string)\n", "HOUR_APPR_PROCESS_START (string)\n", "REG_REGION_NOT_LIVE_REGION (string)\n", "REG_REGION_NOT_WORK_REGION (string)\n", "LIVE_REGION_NOT_WORK_REGION (string)\n", "REG_CITY_NOT_LIVE_CITY (string)\n", "REG_CITY_NOT_WORK_CITY (string)\n", "LIVE_CITY_NOT_WORK_CITY (string)\n", "ORGANIZATION_TYPE (string)\n", "EXT_SOURCE_1 (string)\n", "EXT_SOURCE_2 (string)\n", "EXT_SOURCE_3 (string)\n", "APARTMENTS_AVG (string)\n", "BASEMENTAREA_AVG 
(string)\n", "YEARS_BEGINEXPLUATATION_AVG (string)\n", "YEARS_BUILD_AVG (string)\n", "COMMONAREA_AVG (string)\n", "ELEVATORS_AVG (string)\n", "ENTRANCES_AVG (string)\n", "FLOORSMAX_AVG (string)\n", "FLOORSMIN_AVG (string)\n", "LANDAREA_AVG (string)\n", "LIVINGAPARTMENTS_AVG (string)\n", "LIVINGAREA_AVG (string)\n", "NONLIVINGAPARTMENTS_AVG (string)\n", "NONLIVINGAREA_AVG (string)\n", "APARTMENTS_MODE (string)\n", "BASEMENTAREA_MODE (string)\n", "YEARS_BEGINEXPLUATATION_MODE (string)\n", "YEARS_BUILD_MODE (string)\n", "COMMONAREA_MODE (string)\n", "ELEVATORS_MODE (string)\n", "ENTRANCES_MODE (string)\n", "FLOORSMAX_MODE (string)\n", "FLOORSMIN_MODE (string)\n", "LANDAREA_MODE (string)\n", "LIVINGAPARTMENTS_MODE (string)\n", "LIVINGAREA_MODE (string)\n", "NONLIVINGAPARTMENTS_MODE (string)\n", "NONLIVINGAREA_MODE (string)\n", "APARTMENTS_MEDI (string)\n", "BASEMENTAREA_MEDI (string)\n", "YEARS_BEGINEXPLUATATION_MEDI (string)\n", "YEARS_BUILD_MEDI (string)\n", "COMMONAREA_MEDI (string)\n", "ELEVATORS_MEDI (string)\n", "ENTRANCES_MEDI (string)\n", "FLOORSMAX_MEDI (string)\n", "FLOORSMIN_MEDI (string)\n", "LANDAREA_MEDI (string)\n", "LIVINGAPARTMENTS_MEDI (string)\n", "LIVINGAREA_MEDI (string)\n", "NONLIVINGAPARTMENTS_MEDI (string)\n", "NONLIVINGAREA_MEDI (string)\n", "FONDKAPREMONT_MODE (string)\n", "HOUSETYPE_MODE (string)\n", "TOTALAREA_MODE (string)\n", "WALLSMATERIAL_MODE (string)\n", "EMERGENCYSTATE_MODE (string)\n", "OBS_30_CNT_SOCIAL_CIRCLE (string)\n", "DEF_30_CNT_SOCIAL_CIRCLE (string)\n", "OBS_60_CNT_SOCIAL_CIRCLE (string)\n", "DEF_60_CNT_SOCIAL_CIRCLE (string)\n", "DAYS_LAST_PHONE_CHANGE (string)\n", "FLAG_DOCUMENT_2 (string)\n", "FLAG_DOCUMENT_3 (string)\n", "FLAG_DOCUMENT_4 (string)\n", "FLAG_DOCUMENT_5 (string)\n", "FLAG_DOCUMENT_6 (string)\n", "FLAG_DOCUMENT_7 (string)\n", "FLAG_DOCUMENT_8 (string)\n", "FLAG_DOCUMENT_9 (string)\n", "FLAG_DOCUMENT_10 (string)\n", "FLAG_DOCUMENT_11 (string)\n", "FLAG_DOCUMENT_12 (string)\n", "FLAG_DOCUMENT_13 (string)\n", 
"FLAG_DOCUMENT_14 (string)\n", "FLAG_DOCUMENT_15 (string)\n", "FLAG_DOCUMENT_16 (string)\n", "FLAG_DOCUMENT_17 (string)\n", "FLAG_DOCUMENT_18 (string)\n", "FLAG_DOCUMENT_19 (string)\n", "FLAG_DOCUMENT_20 (string)\n", "FLAG_DOCUMENT_21 (string)\n", "AMT_REQ_CREDIT_BUREAU_HOUR (string)\n", "AMT_REQ_CREDIT_BUREAU_DAY (string)\n", "AMT_REQ_CREDIT_BUREAU_WEEK (string)\n", "AMT_REQ_CREDIT_BUREAU_MON (string)\n", "AMT_REQ_CREDIT_BUREAU_QRT (string)\n", "AMT_REQ_CREDIT_BUREAU_YEAR (string)\n" ] } ], "source": [ "# check dtypes\n", "for n, t in df.dtypes:\n", " print(\"{} ({})\".format(n, t))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SK_ID_CURR: 307511 (100.00%)\n", "TARGET: 2 (0.00%)\n", "NAME_CONTRACT_TYPE: 2 (0.00%)\n", "CODE_GENDER: 3 (0.00%)\n", "FLAG_OWN_CAR: 2 (0.00%)\n", "FLAG_OWN_REALTY: 2 (0.00%)\n", "CNT_CHILDREN: 15 (0.00%)\n", "AMT_INCOME_TOTAL: 2548 (0.83%)\n", "AMT_CREDIT: 5603 (1.82%)\n", "AMT_ANNUITY: 13673 (4.45%)\n", "AMT_GOODS_PRICE: 1003 (0.33%)\n", "NAME_TYPE_SUITE: 8 (0.00%)\n", "NAME_INCOME_TYPE: 8 (0.00%)\n", "NAME_EDUCATION_TYPE: 5 (0.00%)\n", "NAME_FAMILY_STATUS: 6 (0.00%)\n", "NAME_HOUSING_TYPE: 6 (0.00%)\n", "REGION_POPULATION_RELATIVE: 81 (0.03%)\n", "DAYS_BIRTH: 17460 (5.68%)\n", "DAYS_EMPLOYED: 12574 (4.09%)\n", "DAYS_REGISTRATION: 15688 (5.10%)\n", "DAYS_ID_PUBLISH: 6168 (2.01%)\n", "OWN_CAR_AGE: 63 (0.02%)\n", "FLAG_MOBIL: 2 (0.00%)\n", "FLAG_EMP_PHONE: 2 (0.00%)\n", "FLAG_WORK_PHONE: 2 (0.00%)\n", "FLAG_CONT_MOBILE: 2 (0.00%)\n", "FLAG_PHONE: 2 (0.00%)\n", "FLAG_EMAIL: 2 (0.00%)\n", "OCCUPATION_TYPE: 19 (0.01%)\n", "CNT_FAM_MEMBERS: 18 (0.01%)\n", "REGION_RATING_CLIENT: 3 (0.00%)\n", "REGION_RATING_CLIENT_W_CITY: 3 (0.00%)\n", "WEEKDAY_APPR_PROCESS_START: 7 (0.00%)\n", "HOUR_APPR_PROCESS_START: 24 (0.01%)\n", "REG_REGION_NOT_LIVE_REGION: 2 (0.00%)\n", "REG_REGION_NOT_WORK_REGION: 2 (0.00%)\n", "LIVE_REGION_NOT_WORK_REGION: 2 (0.00%)\n", 
"REG_CITY_NOT_LIVE_CITY: 2 (0.00%)\n", "REG_CITY_NOT_WORK_CITY: 2 (0.00%)\n", "LIVE_CITY_NOT_WORK_CITY: 2 (0.00%)\n", "ORGANIZATION_TYPE: 58 (0.02%)\n", "EXT_SOURCE_1: 114585 (37.26%)\n", "EXT_SOURCE_2: 119832 (38.97%)\n", "EXT_SOURCE_3: 815 (0.27%)\n", "APARTMENTS_AVG: 2340 (0.76%)\n", "BASEMENTAREA_AVG: 3781 (1.23%)\n", "YEARS_BEGINEXPLUATATION_AVG: 286 (0.09%)\n", "YEARS_BUILD_AVG: 150 (0.05%)\n", "COMMONAREA_AVG: 3182 (1.03%)\n", "ELEVATORS_AVG: 258 (0.08%)\n", "ENTRANCES_AVG: 286 (0.09%)\n", "FLOORSMAX_AVG: 404 (0.13%)\n", "FLOORSMIN_AVG: 306 (0.10%)\n", "LANDAREA_AVG: 3528 (1.15%)\n", "LIVINGAPARTMENTS_AVG: 1869 (0.61%)\n", "LIVINGAREA_AVG: 5200 (1.69%)\n", "NONLIVINGAPARTMENTS_AVG: 387 (0.13%)\n", "NONLIVINGAREA_AVG: 3291 (1.07%)\n", "APARTMENTS_MODE: 761 (0.25%)\n", "BASEMENTAREA_MODE: 3842 (1.25%)\n", "YEARS_BEGINEXPLUATATION_MODE: 222 (0.07%)\n", "YEARS_BUILD_MODE: 155 (0.05%)\n", "COMMONAREA_MODE: 3129 (1.02%)\n", "ELEVATORS_MODE: 27 (0.01%)\n", "ENTRANCES_MODE: 31 (0.01%)\n", "FLOORSMAX_MODE: 26 (0.01%)\n", "FLOORSMIN_MODE: 26 (0.01%)\n", "LANDAREA_MODE: 3564 (1.16%)\n", "LIVINGAPARTMENTS_MODE: 737 (0.24%)\n", "LIVINGAREA_MODE: 5302 (1.72%)\n", "NONLIVINGAPARTMENTS_MODE: 168 (0.05%)\n", "NONLIVINGAREA_MODE: 3328 (1.08%)\n", "APARTMENTS_MEDI: 1149 (0.37%)\n", "BASEMENTAREA_MEDI: 3773 (1.23%)\n", "YEARS_BEGINEXPLUATATION_MEDI: 246 (0.08%)\n", "YEARS_BUILD_MEDI: 152 (0.05%)\n", "COMMONAREA_MEDI: 3203 (1.04%)\n", "ELEVATORS_MEDI: 47 (0.02%)\n", "ENTRANCES_MEDI: 47 (0.02%)\n", "FLOORSMAX_MEDI: 50 (0.02%)\n", "FLOORSMIN_MEDI: 48 (0.02%)\n", "LANDAREA_MEDI: 3561 (1.16%)\n", "LIVINGAPARTMENTS_MEDI: 1098 (0.36%)\n", "LIVINGAREA_MEDI: 5282 (1.72%)\n", "NONLIVINGAPARTMENTS_MEDI: 215 (0.07%)\n", "NONLIVINGAREA_MEDI: 3324 (1.08%)\n", "FONDKAPREMONT_MODE: 5 (0.00%)\n", "HOUSETYPE_MODE: 4 (0.00%)\n", "TOTALAREA_MODE: 5117 (1.66%)\n", "WALLSMATERIAL_MODE: 8 (0.00%)\n", "EMERGENCYSTATE_MODE: 3 (0.00%)\n", "OBS_30_CNT_SOCIAL_CIRCLE: 34 (0.01%)\n", 
"DEF_30_CNT_SOCIAL_CIRCLE: 11 (0.00%)\n", "OBS_60_CNT_SOCIAL_CIRCLE: 34 (0.01%)\n", "DEF_60_CNT_SOCIAL_CIRCLE: 10 (0.00%)\n", "DAYS_LAST_PHONE_CHANGE: 3774 (1.23%)\n", "FLAG_DOCUMENT_2: 2 (0.00%)\n", "FLAG_DOCUMENT_3: 2 (0.00%)\n", "FLAG_DOCUMENT_4: 2 (0.00%)\n", "FLAG_DOCUMENT_5: 2 (0.00%)\n", "FLAG_DOCUMENT_6: 2 (0.00%)\n", "FLAG_DOCUMENT_7: 2 (0.00%)\n", "FLAG_DOCUMENT_8: 2 (0.00%)\n", "FLAG_DOCUMENT_9: 2 (0.00%)\n", "FLAG_DOCUMENT_10: 2 (0.00%)\n", "FLAG_DOCUMENT_11: 2 (0.00%)\n", "FLAG_DOCUMENT_12: 2 (0.00%)\n", "FLAG_DOCUMENT_13: 2 (0.00%)\n", "FLAG_DOCUMENT_14: 2 (0.00%)\n", "FLAG_DOCUMENT_15: 2 (0.00%)\n", "FLAG_DOCUMENT_16: 2 (0.00%)\n", "FLAG_DOCUMENT_17: 2 (0.00%)\n", "FLAG_DOCUMENT_18: 2 (0.00%)\n", "FLAG_DOCUMENT_19: 2 (0.00%)\n", "FLAG_DOCUMENT_20: 2 (0.00%)\n", "FLAG_DOCUMENT_21: 2 (0.00%)\n", "AMT_REQ_CREDIT_BUREAU_HOUR: 6 (0.00%)\n", "AMT_REQ_CREDIT_BUREAU_DAY: 10 (0.00%)\n", "AMT_REQ_CREDIT_BUREAU_WEEK: 10 (0.00%)\n", "AMT_REQ_CREDIT_BUREAU_MON: 25 (0.01%)\n", "AMT_REQ_CREDIT_BUREAU_QRT: 12 (0.00%)\n", "AMT_REQ_CREDIT_BUREAU_YEAR: 26 (0.01%)\n" ] } ], "source": [ "# count distinct\n", "for cname in df.columns:\n", " cnt_dist = df.select(cname).distinct().count()\n", " pct_dist = cnt_dist * 100.0 / total_records\n", " print(\"{}: {} ({:0.2f}%)\".format(cname, cnt_dist, pct_dist))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SK_ID_CURR: 0 (0.00%)\n", "TARGET: 0 (0.00%)\n", "NAME_CONTRACT_TYPE: 0 (0.00%)\n", "CODE_GENDER: 0 (0.00%)\n", "FLAG_OWN_CAR: 0 (0.00%)\n", "FLAG_OWN_REALTY: 0 (0.00%)\n", "CNT_CHILDREN: 0 (0.00%)\n", "AMT_INCOME_TOTAL: 0 (0.00%)\n", "AMT_CREDIT: 0 (0.00%)\n", "AMT_ANNUITY: 12 (0.00%)\n", "AMT_GOODS_PRICE: 278 (0.09%)\n", "NAME_TYPE_SUITE: 1292 (0.42%)\n", "NAME_INCOME_TYPE: 0 (0.00%)\n", "NAME_EDUCATION_TYPE: 0 (0.00%)\n", "NAME_FAMILY_STATUS: 0 (0.00%)\n", "NAME_HOUSING_TYPE: 0 (0.00%)\n", "REGION_POPULATION_RELATIVE: 0 (0.00%)\n", 
"DAYS_BIRTH: 0 (0.00%)\n", "DAYS_EMPLOYED: 0 (0.00%)\n", "DAYS_REGISTRATION: 0 (0.00%)\n", "DAYS_ID_PUBLISH: 0 (0.00%)\n", "OWN_CAR_AGE: 202929 (65.99%)\n", "FLAG_MOBIL: 0 (0.00%)\n", "FLAG_EMP_PHONE: 0 (0.00%)\n", "FLAG_WORK_PHONE: 0 (0.00%)\n", "FLAG_CONT_MOBILE: 0 (0.00%)\n", "FLAG_PHONE: 0 (0.00%)\n", "FLAG_EMAIL: 0 (0.00%)\n", "OCCUPATION_TYPE: 96391 (31.35%)\n", "CNT_FAM_MEMBERS: 2 (0.00%)\n", "REGION_RATING_CLIENT: 0 (0.00%)\n", "REGION_RATING_CLIENT_W_CITY: 0 (0.00%)\n", "WEEKDAY_APPR_PROCESS_START: 0 (0.00%)\n", "HOUR_APPR_PROCESS_START: 0 (0.00%)\n", "REG_REGION_NOT_LIVE_REGION: 0 (0.00%)\n", "REG_REGION_NOT_WORK_REGION: 0 (0.00%)\n", "LIVE_REGION_NOT_WORK_REGION: 0 (0.00%)\n", "REG_CITY_NOT_LIVE_CITY: 0 (0.00%)\n", "REG_CITY_NOT_WORK_CITY: 0 (0.00%)\n", "LIVE_CITY_NOT_WORK_CITY: 0 (0.00%)\n", "ORGANIZATION_TYPE: 0 (0.00%)\n", "EXT_SOURCE_1: 173378 (56.38%)\n", "EXT_SOURCE_2: 660 (0.21%)\n", "EXT_SOURCE_3: 60965 (19.83%)\n", "APARTMENTS_AVG: 156061 (50.75%)\n", "BASEMENTAREA_AVG: 179943 (58.52%)\n", "YEARS_BEGINEXPLUATATION_AVG: 150007 (48.78%)\n", "YEARS_BUILD_AVG: 204488 (66.50%)\n", "COMMONAREA_AVG: 214865 (69.87%)\n", "ELEVATORS_AVG: 163891 (53.30%)\n", "ENTRANCES_AVG: 154828 (50.35%)\n", "FLOORSMAX_AVG: 153020 (49.76%)\n", "FLOORSMIN_AVG: 208642 (67.85%)\n", "LANDAREA_AVG: 182590 (59.38%)\n", "LIVINGAPARTMENTS_AVG: 210199 (68.35%)\n", "LIVINGAREA_AVG: 154350 (50.19%)\n", "NONLIVINGAPARTMENTS_AVG: 213514 (69.43%)\n", "NONLIVINGAREA_AVG: 169682 (55.18%)\n", "APARTMENTS_MODE: 156061 (50.75%)\n", "BASEMENTAREA_MODE: 179943 (58.52%)\n", "YEARS_BEGINEXPLUATATION_MODE: 150007 (48.78%)\n", "YEARS_BUILD_MODE: 204488 (66.50%)\n", "COMMONAREA_MODE: 214865 (69.87%)\n", "ELEVATORS_MODE: 163891 (53.30%)\n", "ENTRANCES_MODE: 154828 (50.35%)\n", "FLOORSMAX_MODE: 153020 (49.76%)\n", "FLOORSMIN_MODE: 208642 (67.85%)\n", "LANDAREA_MODE: 182590 (59.38%)\n", "LIVINGAPARTMENTS_MODE: 210199 (68.35%)\n", "LIVINGAREA_MODE: 154350 (50.19%)\n", "NONLIVINGAPARTMENTS_MODE: 
213514 (69.43%)\n", "NONLIVINGAREA_MODE: 169682 (55.18%)\n", "APARTMENTS_MEDI: 156061 (50.75%)\n", "BASEMENTAREA_MEDI: 179943 (58.52%)\n", "YEARS_BEGINEXPLUATATION_MEDI: 150007 (48.78%)\n", "YEARS_BUILD_MEDI: 204488 (66.50%)\n", "COMMONAREA_MEDI: 214865 (69.87%)\n", "ELEVATORS_MEDI: 163891 (53.30%)\n", "ENTRANCES_MEDI: 154828 (50.35%)\n", "FLOORSMAX_MEDI: 153020 (49.76%)\n", "FLOORSMIN_MEDI: 208642 (67.85%)\n", "LANDAREA_MEDI: 182590 (59.38%)\n", "LIVINGAPARTMENTS_MEDI: 210199 (68.35%)\n", "LIVINGAREA_MEDI: 154350 (50.19%)\n", "NONLIVINGAPARTMENTS_MEDI: 213514 (69.43%)\n", "NONLIVINGAREA_MEDI: 169682 (55.18%)\n", "FONDKAPREMONT_MODE: 210295 (68.39%)\n", "HOUSETYPE_MODE: 154297 (50.18%)\n", "TOTALAREA_MODE: 148431 (48.27%)\n", "WALLSMATERIAL_MODE: 156341 (50.84%)\n", "EMERGENCYSTATE_MODE: 145755 (47.40%)\n", "OBS_30_CNT_SOCIAL_CIRCLE: 1021 (0.33%)\n", "DEF_30_CNT_SOCIAL_CIRCLE: 1021 (0.33%)\n", "OBS_60_CNT_SOCIAL_CIRCLE: 1021 (0.33%)\n", "DEF_60_CNT_SOCIAL_CIRCLE: 1021 (0.33%)\n", "DAYS_LAST_PHONE_CHANGE: 1 (0.00%)\n", "FLAG_DOCUMENT_2: 0 (0.00%)\n", "FLAG_DOCUMENT_3: 0 (0.00%)\n", "FLAG_DOCUMENT_4: 0 (0.00%)\n", "FLAG_DOCUMENT_5: 0 (0.00%)\n", "FLAG_DOCUMENT_6: 0 (0.00%)\n", "FLAG_DOCUMENT_7: 0 (0.00%)\n", "FLAG_DOCUMENT_8: 0 (0.00%)\n", "FLAG_DOCUMENT_9: 0 (0.00%)\n", "FLAG_DOCUMENT_10: 0 (0.00%)\n", "FLAG_DOCUMENT_11: 0 (0.00%)\n", "FLAG_DOCUMENT_12: 0 (0.00%)\n", "FLAG_DOCUMENT_13: 0 (0.00%)\n", "FLAG_DOCUMENT_14: 0 (0.00%)\n", "FLAG_DOCUMENT_15: 0 (0.00%)\n", "FLAG_DOCUMENT_16: 0 (0.00%)\n", "FLAG_DOCUMENT_17: 0 (0.00%)\n", "FLAG_DOCUMENT_18: 0 (0.00%)\n", "FLAG_DOCUMENT_19: 0 (0.00%)\n", "FLAG_DOCUMENT_20: 0 (0.00%)\n", "FLAG_DOCUMENT_21: 0 (0.00%)\n", "AMT_REQ_CREDIT_BUREAU_HOUR: 41519 (13.50%)\n", "AMT_REQ_CREDIT_BUREAU_DAY: 41519 (13.50%)\n", "AMT_REQ_CREDIT_BUREAU_WEEK: 41519 (13.50%)\n", "AMT_REQ_CREDIT_BUREAU_MON: 41519 (13.50%)\n", "AMT_REQ_CREDIT_BUREAU_QRT: 41519 (13.50%)\n", "AMT_REQ_CREDIT_BUREAU_YEAR: 41519 (13.50%)\n" ] } ], "source": [ "# count 
NULL\n",
"# one aggregation job over all columns instead of one Spark job per column;\n",
"# count() only counts the rows where when() produced a value, i.e. the NULL rows\n",
"null_counts = df.agg(*[\n",
"    sf.count(sf.when(sf.col(c).isNull(), 1)).alias(c) for c in df.columns\n",
"]).first()\n",
"for cname in df.columns:\n",
"    cnt_null = null_counts[cname]\n",
"    pct_miss = cnt_null * 100.0 / total_records\n",
"    print(\"{}: {} ({:0.2f}%)\".format(cname, cnt_null, pct_miss))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SK_ID_CURR: 0 (0.00%)\n", "TARGET: 282686 (91.93%)\n", "NAME_CONTRACT_TYPE: 0 (0.00%)\n", "CODE_GENDER: 0 (0.00%)\n", "FLAG_OWN_CAR: 0 (0.00%)\n", "FLAG_OWN_REALTY: 0 (0.00%)\n", "CNT_CHILDREN: 215371 (70.04%)\n", "AMT_INCOME_TOTAL: 0 (0.00%)\n", "AMT_CREDIT: 0 (0.00%)\n", "AMT_ANNUITY: 0 (0.00%)\n", "AMT_GOODS_PRICE: 0 (0.00%)\n", "NAME_TYPE_SUITE: 0 (0.00%)\n", "NAME_INCOME_TYPE: 0 (0.00%)\n", "NAME_EDUCATION_TYPE: 0 (0.00%)\n", "NAME_FAMILY_STATUS: 0 (0.00%)\n", "NAME_HOUSING_TYPE: 0 (0.00%)\n", "REGION_POPULATION_RELATIVE: 0 (0.00%)\n", "DAYS_BIRTH: 0 (0.00%)\n", "DAYS_EMPLOYED: 2 (0.00%)\n", "DAYS_REGISTRATION: 80 (0.03%)\n", "DAYS_ID_PUBLISH: 16 (0.01%)\n", "OWN_CAR_AGE: 2134 (0.69%)\n", "FLAG_MOBIL: 1 (0.00%)\n", "FLAG_EMP_PHONE: 55386 (18.01%)\n", "FLAG_WORK_PHONE: 246203 (80.06%)\n", "FLAG_CONT_MOBILE: 574 (0.19%)\n", "FLAG_PHONE: 221080 (71.89%)\n", "FLAG_EMAIL: 290069 (94.33%)\n", "OCCUPATION_TYPE: 0 (0.00%)\n", "CNT_FAM_MEMBERS: 0 (0.00%)\n", "REGION_RATING_CLIENT: 0 (0.00%)\n", "REGION_RATING_CLIENT_W_CITY: 0 (0.00%)\n", "WEEKDAY_APPR_PROCESS_START: 0 (0.00%)\n", "HOUR_APPR_PROCESS_START: 40 (0.01%)\n", "REG_REGION_NOT_LIVE_REGION: 302854 (98.49%)\n", "REG_REGION_NOT_WORK_REGION: 291899 (94.92%)\n", "LIVE_REGION_NOT_WORK_REGION: 295008 (95.93%)\n", "REG_CITY_NOT_LIVE_CITY: 283472 (92.18%)\n", "REG_CITY_NOT_WORK_CITY: 236644 (76.95%)\n", "LIVE_CITY_NOT_WORK_CITY: 252296 (82.04%)\n", "ORGANIZATION_TYPE: 0 (0.00%)\n", "EXT_SOURCE_1: 0 (0.00%)\n", "EXT_SOURCE_2: 0 (0.00%)\n", "EXT_SOURCE_3: 0 (0.00%)\n", "APARTMENTS_AVG: 751 (0.24%)\n", "BASEMENTAREA_AVG: 14745 (4.79%)\n", "YEARS_BEGINEXPLUATATION_AVG: 514 
(0.17%)\n", "YEARS_BUILD_AVG: 102 (0.03%)\n", "COMMONAREA_AVG: 8442 (2.75%)\n", "ELEVATORS_AVG: 85718 (27.87%)\n", "ENTRANCES_AVG: 323 (0.11%)\n", "FLOORSMAX_AVG: 2938 (0.96%)\n", "FLOORSMIN_AVG: 2320 (0.75%)\n", "LANDAREA_AVG: 15600 (5.07%)\n", "LIVINGAPARTMENTS_AVG: 418 (0.14%)\n", "LIVINGAREA_AVG: 284 (0.09%)\n", "NONLIVINGAPARTMENTS_AVG: 54549 (17.74%)\n", "NONLIVINGAREA_AVG: 58735 (19.10%)\n", "APARTMENTS_MODE: 976 (0.32%)\n", "BASEMENTAREA_MODE: 16598 (5.40%)\n", "YEARS_BEGINEXPLUATATION_MODE: 142 (0.05%)\n", "YEARS_BUILD_MODE: 103 (0.03%)\n", "COMMONAREA_MODE: 9690 (3.15%)\n", "ELEVATORS_MODE: 89498 (29.10%)\n", "ENTRANCES_MODE: 387 (0.13%)\n", "FLOORSMAX_MODE: 3415 (1.11%)\n", "FLOORSMIN_MODE: 2517 (0.82%)\n", "LANDAREA_MODE: 17453 (5.68%)\n", "LIVINGAPARTMENTS_MODE: 519 (0.17%)\n", "LIVINGAREA_MODE: 444 (0.14%)\n", "NONLIVINGAPARTMENTS_MODE: 59255 (19.27%)\n", "NONLIVINGAREA_MODE: 67126 (21.83%)\n", "APARTMENTS_MEDI: 771 (0.25%)\n", "BASEMENTAREA_MEDI: 14991 (4.87%)\n", "YEARS_BEGINEXPLUATATION_MEDI: 548 (0.18%)\n", "YEARS_BUILD_MEDI: 101 (0.03%)\n", "COMMONAREA_MEDI: 8691 (2.83%)\n", "ELEVATORS_MEDI: 87026 (28.30%)\n", "ENTRANCES_MEDI: 329 (0.11%)\n", "FLOORSMAX_MEDI: 2995 (0.97%)\n", "FLOORSMIN_MEDI: 2351 (0.76%)\n", "LANDAREA_MEDI: 15919 (5.18%)\n", "LIVINGAPARTMENTS_MEDI: 433 (0.14%)\n", "LIVINGAREA_MEDI: 299 (0.10%)\n", "NONLIVINGAPARTMENTS_MEDI: 56097 (18.24%)\n", "NONLIVINGAREA_MEDI: 60954 (19.82%)\n", "FONDKAPREMONT_MODE: 0 (0.00%)\n", "HOUSETYPE_MODE: 0 (0.00%)\n", "TOTALAREA_MODE: 582 (0.19%)\n", "WALLSMATERIAL_MODE: 0 (0.00%)\n", "EMERGENCYSTATE_MODE: 0 (0.00%)\n", "OBS_30_CNT_SOCIAL_CIRCLE: 163910 (53.30%)\n", "DEF_30_CNT_SOCIAL_CIRCLE: 271324 (88.23%)\n", "OBS_60_CNT_SOCIAL_CIRCLE: 164666 (53.55%)\n", "DEF_60_CNT_SOCIAL_CIRCLE: 280721 (91.29%)\n", "DAYS_LAST_PHONE_CHANGE: 37672 (12.25%)\n", "FLAG_DOCUMENT_2: 307498 (100.00%)\n", "FLAG_DOCUMENT_3: 89171 (29.00%)\n", "FLAG_DOCUMENT_4: 307486 (99.99%)\n", "FLAG_DOCUMENT_5: 302863 (98.49%)\n", 
"FLAG_DOCUMENT_6: 280433 (91.19%)\n", "FLAG_DOCUMENT_7: 307452 (99.98%)\n", "FLAG_DOCUMENT_8: 282487 (91.86%)\n", "FLAG_DOCUMENT_9: 306313 (99.61%)\n", "FLAG_DOCUMENT_10: 307504 (100.00%)\n", "FLAG_DOCUMENT_11: 306308 (99.61%)\n", "FLAG_DOCUMENT_12: 307509 (100.00%)\n", "FLAG_DOCUMENT_13: 306427 (99.65%)\n", "FLAG_DOCUMENT_14: 306608 (99.71%)\n", "FLAG_DOCUMENT_15: 307139 (99.88%)\n", "FLAG_DOCUMENT_16: 304458 (99.01%)\n", "FLAG_DOCUMENT_17: 307429 (99.97%)\n", "FLAG_DOCUMENT_18: 305011 (99.19%)\n", "FLAG_DOCUMENT_19: 307328 (99.94%)\n", "FLAG_DOCUMENT_20: 307355 (99.95%)\n", "FLAG_DOCUMENT_21: 307408 (99.97%)\n", "AMT_REQ_CREDIT_BUREAU_HOUR: 264366 (85.97%)\n", "AMT_REQ_CREDIT_BUREAU_DAY: 264503 (86.01%)\n", "AMT_REQ_CREDIT_BUREAU_WEEK: 257456 (83.72%)\n", "AMT_REQ_CREDIT_BUREAU_MON: 222233 (72.27%)\n", "AMT_REQ_CREDIT_BUREAU_QRT: 215417 (70.05%)\n", "AMT_REQ_CREDIT_BUREAU_YEAR: 71801 (23.35%)\n" ] } ], "source": [ "# count zeros\n", "for cname in df.columns:\n", " cnt_zeros = df.where(\"{} = 0.0\".format(cname)).count()\n", " pct_zeros = cnt_zeros * 100.0 / total_records\n", " print(\"{}: {} ({:0.2f}%)\".format(cname, cnt_zeros, pct_zeros))" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SK_ID_CURR: 0 (0.00%)\n", "TARGET: 0 (0.00%)\n", "NAME_CONTRACT_TYPE: 0 (0.00%)\n", "CODE_GENDER: 0 (0.00%)\n", "FLAG_OWN_CAR: 0 (0.00%)\n", "FLAG_OWN_REALTY: 0 (0.00%)\n", "CNT_CHILDREN: 0 (0.00%)\n", "AMT_INCOME_TOTAL: 0 (0.00%)\n", "AMT_CREDIT: 0 (0.00%)\n", "AMT_ANNUITY: 0 (0.00%)\n", "AMT_GOODS_PRICE: 0 (0.00%)\n", "NAME_TYPE_SUITE: 0 (0.00%)\n", "NAME_INCOME_TYPE: 0 (0.00%)\n", "NAME_EDUCATION_TYPE: 0 (0.00%)\n", "NAME_FAMILY_STATUS: 0 (0.00%)\n", "NAME_HOUSING_TYPE: 0 (0.00%)\n", "REGION_POPULATION_RELATIVE: 0 (0.00%)\n", "DAYS_BIRTH: 307511 (100.00%)\n", "DAYS_EMPLOYED: 252135 (81.99%)\n", "DAYS_REGISTRATION: 307431 (99.97%)\n", "DAYS_ID_PUBLISH: 307495 (99.99%)\n", "OWN_CAR_AGE: 0 
(0.00%)\n", "FLAG_MOBIL: 0 (0.00%)\n", "FLAG_EMP_PHONE: 0 (0.00%)\n", "FLAG_WORK_PHONE: 0 (0.00%)\n", "FLAG_CONT_MOBILE: 0 (0.00%)\n", "FLAG_PHONE: 0 (0.00%)\n", "FLAG_EMAIL: 0 (0.00%)\n", "OCCUPATION_TYPE: 0 (0.00%)\n", "CNT_FAM_MEMBERS: 0 (0.00%)\n", "REGION_RATING_CLIENT: 0 (0.00%)\n", "REGION_RATING_CLIENT_W_CITY: 0 (0.00%)\n", "WEEKDAY_APPR_PROCESS_START: 0 (0.00%)\n", "HOUR_APPR_PROCESS_START: 0 (0.00%)\n", "REG_REGION_NOT_LIVE_REGION: 0 (0.00%)\n", "REG_REGION_NOT_WORK_REGION: 0 (0.00%)\n", "LIVE_REGION_NOT_WORK_REGION: 0 (0.00%)\n", "REG_CITY_NOT_LIVE_CITY: 0 (0.00%)\n", "REG_CITY_NOT_WORK_CITY: 0 (0.00%)\n", "LIVE_CITY_NOT_WORK_CITY: 0 (0.00%)\n", "ORGANIZATION_TYPE: 0 (0.00%)\n", "EXT_SOURCE_1: 0 (0.00%)\n", "EXT_SOURCE_2: 0 (0.00%)\n", "EXT_SOURCE_3: 0 (0.00%)\n", "APARTMENTS_AVG: 0 (0.00%)\n", "BASEMENTAREA_AVG: 0 (0.00%)\n", "YEARS_BEGINEXPLUATATION_AVG: 0 (0.00%)\n", "YEARS_BUILD_AVG: 0 (0.00%)\n", "COMMONAREA_AVG: 0 (0.00%)\n", "ELEVATORS_AVG: 0 (0.00%)\n", "ENTRANCES_AVG: 0 (0.00%)\n", "FLOORSMAX_AVG: 0 (0.00%)\n", "FLOORSMIN_AVG: 0 (0.00%)\n", "LANDAREA_AVG: 0 (0.00%)\n", "LIVINGAPARTMENTS_AVG: 0 (0.00%)\n", "LIVINGAREA_AVG: 0 (0.00%)\n", "NONLIVINGAPARTMENTS_AVG: 0 (0.00%)\n", "NONLIVINGAREA_AVG: 0 (0.00%)\n", "APARTMENTS_MODE: 0 (0.00%)\n", "BASEMENTAREA_MODE: 0 (0.00%)\n", "YEARS_BEGINEXPLUATATION_MODE: 0 (0.00%)\n", "YEARS_BUILD_MODE: 0 (0.00%)\n", "COMMONAREA_MODE: 0 (0.00%)\n", "ELEVATORS_MODE: 0 (0.00%)\n", "ENTRANCES_MODE: 0 (0.00%)\n", "FLOORSMAX_MODE: 0 (0.00%)\n", "FLOORSMIN_MODE: 0 (0.00%)\n", "LANDAREA_MODE: 0 (0.00%)\n", "LIVINGAPARTMENTS_MODE: 0 (0.00%)\n", "LIVINGAREA_MODE: 0 (0.00%)\n", "NONLIVINGAPARTMENTS_MODE: 0 (0.00%)\n", "NONLIVINGAREA_MODE: 0 (0.00%)\n", "APARTMENTS_MEDI: 0 (0.00%)\n", "BASEMENTAREA_MEDI: 0 (0.00%)\n", "YEARS_BEGINEXPLUATATION_MEDI: 0 (0.00%)\n", "YEARS_BUILD_MEDI: 0 (0.00%)\n", "COMMONAREA_MEDI: 0 (0.00%)\n", "ELEVATORS_MEDI: 0 (0.00%)\n", "ENTRANCES_MEDI: 0 (0.00%)\n", "FLOORSMAX_MEDI: 0 (0.00%)\n", 
"FLOORSMIN_MEDI: 0 (0.00%)\n", "LANDAREA_MEDI: 0 (0.00%)\n", "LIVINGAPARTMENTS_MEDI: 0 (0.00%)\n", "LIVINGAREA_MEDI: 0 (0.00%)\n", "NONLIVINGAPARTMENTS_MEDI: 0 (0.00%)\n", "NONLIVINGAREA_MEDI: 0 (0.00%)\n", "FONDKAPREMONT_MODE: 0 (0.00%)\n", "HOUSETYPE_MODE: 0 (0.00%)\n", "TOTALAREA_MODE: 0 (0.00%)\n", "WALLSMATERIAL_MODE: 0 (0.00%)\n", "EMERGENCYSTATE_MODE: 0 (0.00%)\n", "OBS_30_CNT_SOCIAL_CIRCLE: 0 (0.00%)\n", "DEF_30_CNT_SOCIAL_CIRCLE: 0 (0.00%)\n", "OBS_60_CNT_SOCIAL_CIRCLE: 0 (0.00%)\n", "DEF_60_CNT_SOCIAL_CIRCLE: 0 (0.00%)\n", "DAYS_LAST_PHONE_CHANGE: 269838 (87.75%)\n", "FLAG_DOCUMENT_2: 0 (0.00%)\n", "FLAG_DOCUMENT_3: 0 (0.00%)\n", "FLAG_DOCUMENT_4: 0 (0.00%)\n", "FLAG_DOCUMENT_5: 0 (0.00%)\n", "FLAG_DOCUMENT_6: 0 (0.00%)\n", "FLAG_DOCUMENT_7: 0 (0.00%)\n", "FLAG_DOCUMENT_8: 0 (0.00%)\n", "FLAG_DOCUMENT_9: 0 (0.00%)\n", "FLAG_DOCUMENT_10: 0 (0.00%)\n", "FLAG_DOCUMENT_11: 0 (0.00%)\n", "FLAG_DOCUMENT_12: 0 (0.00%)\n", "FLAG_DOCUMENT_13: 0 (0.00%)\n", "FLAG_DOCUMENT_14: 0 (0.00%)\n", "FLAG_DOCUMENT_15: 0 (0.00%)\n", "FLAG_DOCUMENT_16: 0 (0.00%)\n", "FLAG_DOCUMENT_17: 0 (0.00%)\n", "FLAG_DOCUMENT_18: 0 (0.00%)\n", "FLAG_DOCUMENT_19: 0 (0.00%)\n", "FLAG_DOCUMENT_20: 0 (0.00%)\n", "FLAG_DOCUMENT_21: 0 (0.00%)\n", "AMT_REQ_CREDIT_BUREAU_HOUR: 0 (0.00%)\n", "AMT_REQ_CREDIT_BUREAU_DAY: 0 (0.00%)\n", "AMT_REQ_CREDIT_BUREAU_WEEK: 0 (0.00%)\n", "AMT_REQ_CREDIT_BUREAU_MON: 0 (0.00%)\n", "AMT_REQ_CREDIT_BUREAU_QRT: 0 (0.00%)\n", "AMT_REQ_CREDIT_BUREAU_YEAR: 0 (0.00%)\n" ] } ], "source": [ "# count negative\n", "for cname in df.columns:\n", " cnt_neg = df.where(\"{} < 0\".format(cname)).count()\n", " pct_neg = cnt_neg * 100.0 / total_records\n", " print(\"{}: {} ({:0.2f}%)\".format(cname, cnt_neg, pct_neg))" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234
summarycountmeanstddevminmax
TARGET3075110.080728819456864960.2724186456483954601
NAME_CONTRACT_TYPE307511NoneNoneCash loansRevolving loans
CODE_GENDER307511NoneNoneFXNA
FLAG_OWN_CAR307511NoneNoneNY
FLAG_OWN_REALTY307511NoneNoneNY
CNT_CHILDREN3075110.41705174774235720.72212138443762609
AMT_INCOME_TOTAL307511168797.91929698453237123.14627885324100071.099900.0
AMT_CREDIT307511599025.9997057016402490.776995854451000417.5999886.5
AMT_ANNUITY30749927108.57390918344414493.737315118291100017.09999.0
AMT_GOODS_PRICE307233538396.2074288895369446.46054005761003500.0999000.0
NAME_TYPE_SUITE306219NoneNoneChildrenUnaccompanied
NAME_INCOME_TYPE307511NoneNoneBusinessmanWorking
NAME_EDUCATION_TYPE307511NoneNoneAcademic degreeSecondary / secondary special
NAME_FAMILY_STATUS307511NoneNoneCivil marriageWidow
NAME_HOUSING_TYPE307511NoneNoneCo-op apartmentWith parents
REGION_POPULATION_RELATIVE3075110.020868112057790860.013831280122704650.000290.072508
DAYS_BIRTH307511-16036.9950668431374363.98863178559-10000-9999
DAYS_EMPLOYED30751163815.04590404896141275.76651872776-1365243
DAYS_REGISTRATION307511-4986.1203275384183522.8863209630895-1.00.0
DAYS_ID_PUBLISH307511-2994.20237324843671509.4504190030277-10
OWN_CAR_AGE10458212.06109081868772711.9448115822427710.091.0
FLAG_MOBIL3075110.99999674808380830.001803307015351482801
FLAG_EMP_PHONE3075110.81988936981116120.3842801989387647501
FLAG_WORK_PHONE3075110.19936847787558820.3995262281502293701
FLAG_CONT_MOBILE3075110.99813340010601250.0431638941424323101
FLAG_PHONE3075110.281066368357554660.4495205468567573501
FLAG_EMAIL3075110.05671992221416470.231307039722708201
OCCUPATION_TYPE211120NoneNoneAccountantsWaiters/barmen staff
CNT_FAM_MEMBERS3075092.1526654504421010.91068156917929451.09.0
REGION_RATING_CLIENT3075112.05246316391933940.509033902815677613
REGION_RATING_CLIENT_W_CITY3075112.0315208236453330.50273703291476713
WEEKDAY_APPR_PROCESS_START307511NoneNoneFRIDAYWEDNESDAY
HOUR_APPR_PROCESS_START30751112.0634188695688943.265832255437871409
REG_REGION_NOT_LIVE_REGION3075110.0151441737043552910.1221264762821528801
REG_REGION_NOT_WORK_REGION3075110.050768915583507580.2195258287969604501
LIVE_REGION_NOT_WORK_REGION3075110.040658708143773720.1974986188284246201
REG_CITY_NOT_LIVE_CITY3075110.078172813330254860.268443772373404401
REG_CITY_NOT_WORK_CITY3075110.230453544751244660.4211238359138969601
LIVE_CITY_NOT_WORK_CITY3075110.179554552520072460.3838166153855961401
ORGANIZATION_TYPE307511NoneNoneAdvertisingXNA
EXT_SOURCE_11341330.50212980565666610.211062249273924530.0145681324124455870.9626927705613059
EXT_SOURCE_23068510.51439267413083940.191060154984934950.000107950291678108049.936476188005656e-06
EXT_SOURCE_32465460.51085290617992630.194844364463748640.00052726523870988170.8960095494948396
APARTMENTS_AVG1514500.117440499174646430.108240291300322260.01.0
BASEMENTAREA_AVG1275680.088442219051800510.082438158735684770.01.0
YEARS_BEGINEXPLUATATION_AVG1575040.97773485816221610.0592233143583626860.01.0
YEARS_BUILD_AVG1030230.75247143259272840.113279926632246810.01.0
COMMONAREA_AVG926460.0446207154113510.07603574505040910.01.0
ELEVATORS_AVG1436200.078941512324166230.134576001100343980.01.0
ENTRANCES_AVG1526830.14972467006798340.100049120760359070.01.0
FLOORSMAX_AVG1544910.226281907036647160.14464069954800420.01.0
FLOORSMIN_AVG988690.231893500490564540.161380288800137630.01.0
LANDAREA_AVG1249210.066333184172397060.081183640701793740.01.0
LIVINGAPARTMENTS_AVG973120.100774774950673760.092576133960497740.01.0
LIVINGAREA_AVG1531610.10739901933259950.110564523183713070.01.0
NONLIVINGAPARTMENTS_AVG939970.0088086726172090910.047731662050347950.01.0
NONLIVINGAREA_AVG1378290.0283577570757967450.069523183321235960.01.0
APARTMENTS_MODE1514500.114231006932996290.107936039087532740.01.0
BASEMENTAREA_MODE1275680.087543212247584960.084307174869245560.01.0
YEARS_BEGINEXPLUATATION_MODE1575040.97706537294286610.064575437080480070.01.0
YEARS_BUILD_MODE1030230.75963732273374180.110111027341948150.01.0
COMMONAREA_MODE926460.042553137750146290.074444522538391410.01.0
ELEVATORS_MODE1436200.074489736109167970.13225614415050660.01.0
ENTRANCES_MODE1526830.145192658645624940.100976988160246580.01.0
FLOORSMAX_MODE1544910.222315047478475140.143709406595315730.01.0
FLOORSMIN_MODE988690.228058492550764630.161159771495475750.01.0
LANDAREA_MODE1249210.064957684456576530.081750277808435090.01.0
LIVINGAPARTMENTS_MODE973120.105644856749435060.097880446578793680.01.0
LIVINGAREA_MODE1531610.105975050437121850.11184526587783430.01.0
NONLIVINGAPARTMENTS_MODE939970.0080763875442819120.046276266219835640.01.0
NONLIVINGAREA_MODE1378290.027022319685987660.070253859043944470.01.0
APARTMENTS_MEDI1514500.117849920765932050.109075906001153090.01.0
BASEMENTAREA_MEDI1275680.087954854665747650.08217874951463420.01.0
YEARS_BEGINEXPLUATATION_MEDI1575040.97775226406936930.05989731850511960.01.0
YEARS_BUILD_MEDI1030230.75574627219167150.112066309644043810.01.0
COMMONAREA_MEDI926460.04459510178529090.076144262240914570.01.0
ELEVATORS_MEDI1436200.07807784431135320.134467147690674440.01.0
ENTRANCES_MEDI1526830.149212780728629980.10036839449763240.01.0
FLOORSMAX_MEDI1544910.225896590092614760.145067025919351170.01.0
FLOORSMIN_MEDI988690.23162493804935410.161933541457155740.01.0
LANDAREA_MEDI1249210.067168749049399720.082167010280071980.01.0
LIVINGAPARTMENTS_MEDI973120.101954473240732690.093642332711538450.01.0
LIVINGAREA_MEDI1531610.108606736048995680.112260258675347920.01.0
NONLIVINGAPARTMENTS_MEDI939970.0086510133302103220.047414727907802750.01.0
NONLIVINGAREA_MEDI1378290.0282359205972621580.070166481506824890.01.0
FONDKAPREMONT_MODE97216NoneNonenot specifiedreg oper spec account
HOUSETYPE_MODE153214NoneNoneblock of flatsterraced house
TOTALAREA_MODE1590800.10254666268544120.107462324149618860.01.0
WALLSMATERIAL_MODE151170NoneNoneBlockWooden
EMERGENCYSTATE_MODE161756NoneNoneNoYes
OBS_30_CNT_SOCIAL_CIRCLE3064901.42224542399425752.40098874610901270.09.0
DEF_30_CNT_SOCIAL_CIRCLE3064900.14342066625338510.446698429381527150.08.0
OBS_60_CNT_SOCIAL_CIRCLE3064901.40529217919018562.379803351979390.09.0
DEF_60_CNT_SOCIAL_CIRCLE3064900.100048941237887050.36229080397557310.07.0
DAYS_LAST_PHONE_CHANGE307510-962.8587883320868826.8084870406575-1.00.0
FLAG_DOCUMENT_23075114.2274910491006824E-50.006501789045489792501
FLAG_DOCUMENT_33075110.71002338127741770.4537519684327382401
FLAG_DOCUMENT_43075118.129790479039775E-50.00901618321655084501
FLAG_DOCUMENT_53075110.0151149064586307490.1220102228135413301
FLAG_DOCUMENT_63075110.08805538663657560.2833758928629923601
FLAG_DOCUMENT_73075111.9186305530533867E-40.01385015767701744601
FLAG_DOCUMENT_83075110.081375950778996520.273412048944512901
FLAG_DOCUMENT_93075110.003895795597555860.0622947108003934901
FLAG_DOCUMENT_103075112.276341334131137E-50.004771055354069201
FLAG_DOCUMENT_113075110.0039120551785139390.0624240632668451501
FLAG_DOCUMENT_123075116.503832383231819E-60.002550257091597873601
FLAG_DOCUMENT_133075110.0035250771517116460.0592677180737530901
FLAG_DOCUMENT_143075110.00293648032102916640.0541097673764288101
FLAG_DOCUMENT_153075110.00120971282328111830.0347599388276926401
FLAG_DOCUMENT_163075110.0099281001330033730.0991441623378493401
FLAG_DOCUMENT_173075112.666571277125046E-40.01632748874159662201
FLAG_DOCUMENT_183075110.0081297904790397740.0897982361093961201
FLAG_DOCUMENT_193075115.951006630657115E-40.0243874650658623901
FLAG_DOCUMENT_203075115.072989258920819E-40.02251762026844606301
FLAG_DOCUMENT_213075113.349473677364387E-40.01829853182243754501
AMT_REQ_CREDIT_BUREAU_HOUR2659920.0064024481939306450.083849128447476580.04.0
AMT_REQ_CREDIT_BUREAU_DAY2659920.00700021053264759850.110757406324354460.09.0
AMT_REQ_CREDIT_BUREAU_WEEK2659920.03436193569731420.20468487581282440.08.0
AMT_REQ_CREDIT_BUREAU_MON2659920.267395260007819770.91600239615261790.09.0
AMT_REQ_CREDIT_BUREAU_QRT2659920.265474149598484140.79405564832075470.08.0
AMT_REQ_CREDIT_BUREAU_YEAR2659921.8999744353213631.8692949981815610.09.0
\n", "
" ], "text/plain": [ " 0 1 2 3 4\n", "summary count mean stddev min max \n", "TARGET 307511 0.08072881945686496 0.27241864564839546 0 1 \n", "NAME_CONTRACT_TYPE 307511 None None Cash loans Revolving loans \n", "CODE_GENDER 307511 None None F XNA \n", "FLAG_OWN_CAR 307511 None None N Y \n", "FLAG_OWN_REALTY 307511 None None N Y \n", "CNT_CHILDREN 307511 0.4170517477423572 0.722121384437626 0 9 \n", "AMT_INCOME_TOTAL 307511 168797.91929698453 237123.14627885324 100071.0 99900.0 \n", "AMT_CREDIT 307511 599025.9997057016 402490.77699585445 1000417.5 999886.5 \n", "AMT_ANNUITY 307499 27108.573909183444 14493.737315118291 100017.0 9999.0 \n", "AMT_GOODS_PRICE 307233 538396.2074288895 369446.4605400576 1003500.0 999000.0 \n", "NAME_TYPE_SUITE 306219 None None Children Unaccompanied \n", "NAME_INCOME_TYPE 307511 None None Businessman Working \n", "NAME_EDUCATION_TYPE 307511 None None Academic degree Secondary / secondary special\n", "NAME_FAMILY_STATUS 307511 None None Civil marriage Widow \n", "NAME_HOUSING_TYPE 307511 None None Co-op apartment With parents \n", "REGION_POPULATION_RELATIVE 307511 0.02086811205779086 0.01383128012270465 0.00029 0.072508 \n", "DAYS_BIRTH 307511 -16036.995066843137 4363.98863178559 -10000 -9999 \n", "DAYS_EMPLOYED 307511 63815.04590404896 141275.76651872776 -1 365243 \n", "DAYS_REGISTRATION 307511 -4986.120327538418 3522.8863209630895 -1.0 0.0 \n", "DAYS_ID_PUBLISH 307511 -2994.2023732484367 1509.4504190030277 -1 0 \n", "OWN_CAR_AGE 104582 12.061090818687727 11.944811582242771 0.0 91.0 \n", "FLAG_MOBIL 307511 0.9999967480838083 0.0018033070153514828 0 1 \n", "FLAG_EMP_PHONE 307511 0.8198893698111612 0.38428019893876475 0 1 \n", "FLAG_WORK_PHONE 307511 0.1993684778755882 0.39952622815022937 0 1 \n", "FLAG_CONT_MOBILE 307511 0.9981334001060125 0.04316389414243231 0 1 \n", "FLAG_PHONE 307511 0.28106636835755466 0.44952054685675735 0 1 \n", "FLAG_EMAIL 307511 0.0567199222141647 0.2313070397227082 0 1 \n", "OCCUPATION_TYPE 211120 None None 
Accountants Waiters/barmen staff \n", "CNT_FAM_MEMBERS 307509 2.152665450442101 0.9106815691792945 1.0 9.0 \n", "REGION_RATING_CLIENT 307511 2.0524631639193394 0.5090339028156776 1 3 \n", "REGION_RATING_CLIENT_W_CITY 307511 2.031520823645333 0.502737032914767 1 3 \n", "WEEKDAY_APPR_PROCESS_START 307511 None None FRIDAY WEDNESDAY \n", "HOUR_APPR_PROCESS_START 307511 12.063418869568894 3.2658322554378714 0 9 \n", "REG_REGION_NOT_LIVE_REGION 307511 0.015144173704355291 0.12212647628215288 0 1 \n", "REG_REGION_NOT_WORK_REGION 307511 0.05076891558350758 0.21952582879696045 0 1 \n", "LIVE_REGION_NOT_WORK_REGION 307511 0.04065870814377372 0.19749861882842462 0 1 \n", "REG_CITY_NOT_LIVE_CITY 307511 0.07817281333025486 0.2684437723734044 0 1 \n", "REG_CITY_NOT_WORK_CITY 307511 0.23045354475124466 0.42112383591389696 0 1 \n", "LIVE_CITY_NOT_WORK_CITY 307511 0.17955455252007246 0.38381661538559614 0 1 \n", "ORGANIZATION_TYPE 307511 None None Advertising XNA \n", "EXT_SOURCE_1 134133 0.5021298056566661 0.21106224927392453 0.014568132412445587 0.9626927705613059 \n", "EXT_SOURCE_2 306851 0.5143926741308394 0.19106015498493495 0.00010795029167810804 9.936476188005656e-06 \n", "EXT_SOURCE_3 246546 0.5108529061799263 0.19484436446374864 0.0005272652387098817 0.8960095494948396 \n", "APARTMENTS_AVG 151450 0.11744049917464643 0.10824029130032226 0.0 1.0 \n", "BASEMENTAREA_AVG 127568 0.08844221905180051 0.08243815873568477 0.0 1.0 \n", "YEARS_BEGINEXPLUATATION_AVG 157504 0.9777348581622161 0.059223314358362686 0.0 1.0 \n", "YEARS_BUILD_AVG 103023 0.7524714325927284 0.11327992663224681 0.0 1.0 \n", "COMMONAREA_AVG 92646 0.044620715411351 0.0760357450504091 0.0 1.0 \n", "ELEVATORS_AVG 143620 0.07894151232416623 0.13457600110034398 0.0 1.0 \n", "ENTRANCES_AVG 152683 0.1497246700679834 0.10004912076035907 0.0 1.0 \n", "FLOORSMAX_AVG 154491 0.22628190703664716 0.1446406995480042 0.0 1.0 \n", "FLOORSMIN_AVG 98869 0.23189350049056454 0.16138028880013763 0.0 1.0 \n", "LANDAREA_AVG 124921 
0.06633318417239706 0.08118364070179374 0.0 1.0 \n", "LIVINGAPARTMENTS_AVG 97312 0.10077477495067376 0.09257613396049774 0.0 1.0 \n", "LIVINGAREA_AVG 153161 0.1073990193325995 0.11056452318371307 0.0 1.0 \n", "NONLIVINGAPARTMENTS_AVG 93997 0.008808672617209091 0.04773166205034795 0.0 1.0 \n", "NONLIVINGAREA_AVG 137829 0.028357757075796745 0.06952318332123596 0.0 1.0 \n", "APARTMENTS_MODE 151450 0.11423100693299629 0.10793603908753274 0.0 1.0 \n", "BASEMENTAREA_MODE 127568 0.08754321224758496 0.08430717486924556 0.0 1.0 \n", "YEARS_BEGINEXPLUATATION_MODE 157504 0.9770653729428661 0.06457543708048007 0.0 1.0 \n", "YEARS_BUILD_MODE 103023 0.7596373227337418 0.11011102734194815 0.0 1.0 \n", "COMMONAREA_MODE 92646 0.04255313775014629 0.07444452253839141 0.0 1.0 \n", "ELEVATORS_MODE 143620 0.07448973610916797 0.1322561441505066 0.0 1.0 \n", "ENTRANCES_MODE 152683 0.14519265864562494 0.10097698816024658 0.0 1.0 \n", "FLOORSMAX_MODE 154491 0.22231504747847514 0.14370940659531573 0.0 1.0 \n", "FLOORSMIN_MODE 98869 0.22805849255076463 0.16115977149547575 0.0 1.0 \n", "LANDAREA_MODE 124921 0.06495768445657653 0.08175027780843509 0.0 1.0 \n", "LIVINGAPARTMENTS_MODE 97312 0.10564485674943506 0.09788044657879368 0.0 1.0 \n", "LIVINGAREA_MODE 153161 0.10597505043712185 0.1118452658778343 0.0 1.0 \n", "NONLIVINGAPARTMENTS_MODE 93997 0.008076387544281912 0.04627626621983564 0.0 1.0 \n", "NONLIVINGAREA_MODE 137829 0.02702231968598766 0.07025385904394447 0.0 1.0 \n", "APARTMENTS_MEDI 151450 0.11784992076593205 0.10907590600115309 0.0 1.0 \n", "BASEMENTAREA_MEDI 127568 0.08795485466574765 0.0821787495146342 0.0 1.0 \n", "YEARS_BEGINEXPLUATATION_MEDI 157504 0.9777522640693693 0.0598973185051196 0.0 1.0 \n", "YEARS_BUILD_MEDI 103023 0.7557462721916715 0.11206630964404381 0.0 1.0 \n", "COMMONAREA_MEDI 92646 0.0445951017852909 0.07614426224091457 0.0 1.0 \n", "ELEVATORS_MEDI 143620 0.0780778443113532 0.13446714769067444 0.0 1.0 \n", "ENTRANCES_MEDI 152683 0.14921278072862998 
0.1003683944976324 0.0 1.0 \n", "FLOORSMAX_MEDI 154491 0.22589659009261476 0.14506702591935117 0.0 1.0 \n", "FLOORSMIN_MEDI 98869 0.2316249380493541 0.16193354145715574 0.0 1.0 \n", "LANDAREA_MEDI 124921 0.06716874904939972 0.08216701028007198 0.0 1.0 \n", "LIVINGAPARTMENTS_MEDI 97312 0.10195447324073269 0.09364233271153845 0.0 1.0 \n", "LIVINGAREA_MEDI 153161 0.10860673604899568 0.11226025867534792 0.0 1.0 \n", "NONLIVINGAPARTMENTS_MEDI 93997 0.008651013330210322 0.04741472790780275 0.0 1.0 \n", "NONLIVINGAREA_MEDI 137829 0.028235920597262158 0.07016648150682489 0.0 1.0 \n", "FONDKAPREMONT_MODE 97216 None None not specified reg oper spec account \n", "HOUSETYPE_MODE 153214 None None block of flats terraced house \n", "TOTALAREA_MODE 159080 0.1025466626854412 0.10746232414961886 0.0 1.0 \n", "WALLSMATERIAL_MODE 151170 None None Block Wooden \n", "EMERGENCYSTATE_MODE 161756 None None No Yes \n", "OBS_30_CNT_SOCIAL_CIRCLE 306490 1.4222454239942575 2.4009887461090127 0.0 9.0 \n", "DEF_30_CNT_SOCIAL_CIRCLE 306490 0.1434206662533851 0.44669842938152715 0.0 8.0 \n", "OBS_60_CNT_SOCIAL_CIRCLE 306490 1.4052921791901856 2.37980335197939 0.0 9.0 \n", "DEF_60_CNT_SOCIAL_CIRCLE 306490 0.10004894123788705 0.3622908039755731 0.0 7.0 \n", "DAYS_LAST_PHONE_CHANGE 307510 -962.8587883320868 826.8084870406575 -1.0 0.0 \n", "FLAG_DOCUMENT_2 307511 4.2274910491006824E-5 0.0065017890454897925 0 1 \n", "FLAG_DOCUMENT_3 307511 0.7100233812774177 0.45375196843273824 0 1 \n", "FLAG_DOCUMENT_4 307511 8.129790479039775E-5 0.009016183216550845 0 1 \n", "FLAG_DOCUMENT_5 307511 0.015114906458630749 0.12201022281354133 0 1 \n", "FLAG_DOCUMENT_6 307511 0.0880553866365756 0.28337589286299236 0 1 \n", "FLAG_DOCUMENT_7 307511 1.9186305530533867E-4 0.013850157677017446 0 1 \n", "FLAG_DOCUMENT_8 307511 0.08137595077899652 0.2734120489445129 0 1 \n", "FLAG_DOCUMENT_9 307511 0.00389579559755586 0.06229471080039349 0 1 \n", "FLAG_DOCUMENT_10 307511 2.276341334131137E-5 0.0047710553540692 0 1 \n", 
"FLAG_DOCUMENT_11 307511 0.003912055178513939 0.06242406326684515 0 1 \n", "FLAG_DOCUMENT_12 307511 6.503832383231819E-6 0.0025502570915978736 0 1 \n", "FLAG_DOCUMENT_13 307511 0.003525077151711646 0.05926771807375309 0 1 \n", "FLAG_DOCUMENT_14 307511 0.0029364803210291664 0.05410976737642881 0 1 \n", "FLAG_DOCUMENT_15 307511 0.0012097128232811183 0.03475993882769264 0 1 \n", "FLAG_DOCUMENT_16 307511 0.009928100133003373 0.09914416233784934 0 1 \n", "FLAG_DOCUMENT_17 307511 2.666571277125046E-4 0.016327488741596622 0 1 \n", "FLAG_DOCUMENT_18 307511 0.008129790479039774 0.08979823610939612 0 1 \n", "FLAG_DOCUMENT_19 307511 5.951006630657115E-4 0.02438746506586239 0 1 \n", "FLAG_DOCUMENT_20 307511 5.072989258920819E-4 0.022517620268446063 0 1 \n", "FLAG_DOCUMENT_21 307511 3.349473677364387E-4 0.018298531822437545 0 1 \n", "AMT_REQ_CREDIT_BUREAU_HOUR 265992 0.006402448193930645 0.08384912844747658 0.0 4.0 \n", "AMT_REQ_CREDIT_BUREAU_DAY 265992 0.0070002105326475985 0.11075740632435446 0.0 9.0 \n", "AMT_REQ_CREDIT_BUREAU_WEEK 265992 0.0343619356973142 0.2046848758128244 0.0 8.0 \n", "AMT_REQ_CREDIT_BUREAU_MON 265992 0.26739526000781977 0.9160023961526179 0.0 9.0 \n", "AMT_REQ_CREDIT_BUREAU_QRT 265992 0.26547414959848414 0.7940556483207547 0.0 8.0 \n", "AMT_REQ_CREDIT_BUREAU_YEAR 265992 1.899974435321363 1.869294998181561 0.0 9.0 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# stats for number\n", "ls_features = [cname for cname in df.columns if cname != \"SK_ID_CURR\"]\n", "pdf_stats = df.select(ls_features).describe().toPandas()\n", "pdf_stats.T" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }