{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Welcome to\n", " ____ __\n", " / __/__ ___ _____/ /__\n", " _\\ \\/ _ \\/ _ `/ __/ '_/\n", " /__ / .__/\\_,_/_/ /_/\\_\\ version 2.4.0\n", " /_/\n", "\n", "Using Python version 3.6.4 (default, Jan 16 2018 18:10:19)\n", "SparkSession available as 'spark'.\n", "local-1557024762906\n" ] } ], "source": [ "import os, sys, glob, datetime\n", "\n", "# specify spark version, python version\n", "spark_home = \"/home/zero/spark-2.4.0-bin-hadoop2.7\" # MODIFY THIS\n", "python_path=\"/apps/anaconda3/bin/python\"\n", "# set environment variables\n", "os.environ['SPARK_HOME'] = spark_home\n", "os.environ['PYSPARK_PYTHON'] = python_path\n", "os.environ['SPARK_LOCAL_IP'] = \"127.0.0.1\"\n", "\n", "def setup_spark_env(app_name):\n", " # set environment variables\n", " spark_python = os.path.join(spark_home, 'python')\n", " py4j = glob.glob(os.path.join(spark_python, 'lib', 'py4j-*.zip'))[0]\n", " sys.path[:0] = [spark_python, py4j]\n", " # specify Spark application parameters\n", " PYSPARK_SUBMIT_ARGS=\"--master local[2]\"\n", "\n", " os.environ['PYSPARK_SUBMIT_ARGS'] = (PYSPARK_SUBMIT_ARGS \n", " + \" --name '%s_%s'\"%(app_name, datetime.datetime.now().strftime(\"%Y%m%d %H:%M\")) \n", " + \" pyspark-shell\") \n", " return\n", "\n", "#\n", "setup_spark_env(\"your_spark_process_name\") # MODIFY THIS\n", "# launching PySpark application\n", "# execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))\n", "filename=os.path.join(spark_home, 'python/pyspark/shell.py')\n", "exec(compile(open(filename, \"rb\").read(), filename, 'exec'))\n", "sc.setLogLevel('ERROR')\n", "print(\"{}\".format(sc.applicationId))" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from pyspark.sql import functions as sf\n", "from pyspark.sql import Row\n", "from pyspark.sql.types import *\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import os, math, subprocess\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "# some settings for displaying Pandas results\n", "pd.set_option('display.width', 2000)\n", "pd.set_option('display.max_rows', 500)\n", "pd.set_option('display.max_columns', 500)\n", "pd.set_option('display.precision', 4)\n", "pd.set_option('display.max_colwidth', -1)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[Row(SK_ID_CURR='100002', TARGET='1', NAME_CONTRACT_TYPE='Cash loans', CODE_GENDER='M', FLAG_OWN_CAR='N', FLAG_OWN_REALTY='Y', CNT_CHILDREN='0', AMT_INCOME_TOTAL='202500.0', AMT_CREDIT='406597.5', AMT_ANNUITY='24700.5', AMT_GOODS_PRICE='351000.0', NAME_TYPE_SUITE='Unaccompanied', NAME_INCOME_TYPE='Working', NAME_EDUCATION_TYPE='Secondary / secondary special', NAME_FAMILY_STATUS='Single / not married', NAME_HOUSING_TYPE='House / apartment', REGION_POPULATION_RELATIVE='0.018801', DAYS_BIRTH='-9461', DAYS_EMPLOYED='-637', DAYS_REGISTRATION='-3648.0', DAYS_ID_PUBLISH='-2120', OWN_CAR_AGE=None, FLAG_MOBIL='1', FLAG_EMP_PHONE='1', FLAG_WORK_PHONE='0', FLAG_CONT_MOBILE='1', FLAG_PHONE='1', FLAG_EMAIL='0', OCCUPATION_TYPE='Laborers', CNT_FAM_MEMBERS='1.0', REGION_RATING_CLIENT='2', REGION_RATING_CLIENT_W_CITY='2', WEEKDAY_APPR_PROCESS_START='WEDNESDAY', HOUR_APPR_PROCESS_START='10', REG_REGION_NOT_LIVE_REGION='0', REG_REGION_NOT_WORK_REGION='0', LIVE_REGION_NOT_WORK_REGION='0', REG_CITY_NOT_LIVE_CITY='0', REG_CITY_NOT_WORK_CITY='0', LIVE_CITY_NOT_WORK_CITY='0', ORGANIZATION_TYPE='Business Entity Type 3', EXT_SOURCE_1='0.08303696739132256', EXT_SOURCE_2='0.2629485927471776', EXT_SOURCE_3='0.13937578009978951', APARTMENTS_AVG='0.0247', BASEMENTAREA_AVG='0.0369', YEARS_BEGINEXPLUATATION_AVG='0.9722', YEARS_BUILD_AVG='0.6192', COMMONAREA_AVG='0.0143', ELEVATORS_AVG='0.0', ENTRANCES_AVG='0.069', FLOORSMAX_AVG='0.0833', FLOORSMIN_AVG='0.125', LANDAREA_AVG='0.0369', LIVINGAPARTMENTS_AVG='0.0202', LIVINGAREA_AVG='0.019', NONLIVINGAPARTMENTS_AVG='0.0', NONLIVINGAREA_AVG='0.0', APARTMENTS_MODE='0.0252', BASEMENTAREA_MODE='0.0383', YEARS_BEGINEXPLUATATION_MODE='0.9722', YEARS_BUILD_MODE='0.6341', COMMONAREA_MODE='0.0144', ELEVATORS_MODE='0.0', ENTRANCES_MODE='0.069', FLOORSMAX_MODE='0.0833', FLOORSMIN_MODE='0.125', LANDAREA_MODE='0.0377', LIVINGAPARTMENTS_MODE='0.022', LIVINGAREA_MODE='0.0198', NONLIVINGAPARTMENTS_MODE='0.0', NONLIVINGAREA_MODE='0.0', APARTMENTS_MEDI='0.025', BASEMENTAREA_MEDI='0.0369', YEARS_BEGINEXPLUATATION_MEDI='0.9722', YEARS_BUILD_MEDI='0.6243', COMMONAREA_MEDI='0.0144', ELEVATORS_MEDI='0.0', ENTRANCES_MEDI='0.069', FLOORSMAX_MEDI='0.0833', FLOORSMIN_MEDI='0.125', LANDAREA_MEDI='0.0375', LIVINGAPARTMENTS_MEDI='0.0205', LIVINGAREA_MEDI='0.0193', NONLIVINGAPARTMENTS_MEDI='0.0', NONLIVINGAREA_MEDI='0.0', FONDKAPREMONT_MODE='reg oper account', HOUSETYPE_MODE='block of flats', TOTALAREA_MODE='0.0149', WALLSMATERIAL_MODE='Stone, brick', EMERGENCYSTATE_MODE='No', OBS_30_CNT_SOCIAL_CIRCLE='2.0', DEF_30_CNT_SOCIAL_CIRCLE='2.0', OBS_60_CNT_SOCIAL_CIRCLE='2.0', DEF_60_CNT_SOCIAL_CIRCLE='2.0', DAYS_LAST_PHONE_CHANGE='-1134.0', FLAG_DOCUMENT_2='0', FLAG_DOCUMENT_3='1', FLAG_DOCUMENT_4='0', FLAG_DOCUMENT_5='0', FLAG_DOCUMENT_6='0', FLAG_DOCUMENT_7='0', FLAG_DOCUMENT_8='0', FLAG_DOCUMENT_9='0', FLAG_DOCUMENT_10='0', FLAG_DOCUMENT_11='0', FLAG_DOCUMENT_12='0', FLAG_DOCUMENT_13='0', FLAG_DOCUMENT_14='0', FLAG_DOCUMENT_15='0', FLAG_DOCUMENT_16='0', FLAG_DOCUMENT_17='0', FLAG_DOCUMENT_18='0', FLAG_DOCUMENT_19='0', FLAG_DOCUMENT_20='0', FLAG_DOCUMENT_21='0', AMT_REQ_CREDIT_BUREAU_HOUR='0.0', AMT_REQ_CREDIT_BUREAU_DAY='0.0', AMT_REQ_CREDIT_BUREAU_WEEK='0.0', AMT_REQ_CREDIT_BUREAU_MON='0.0', AMT_REQ_CREDIT_BUREAU_QRT='0.0', AMT_REQ_CREDIT_BUREAU_YEAR='1.0')]\n" ] } ], "source": [ "# load data\n", "data_path = \"home-credit-default-risk/application_train.csv\"\n", "df = sqlContext.read.format(\"csv\").option(\"header\", \"true\").load(data_path)\n", "print(df.take(1))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total_records: 307511\n" ] } ], "source": [ "total_records = df.count()\n", "print(\"total_records:\", total_records)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SK_ID_CURR (string)\n", "TARGET (string)\n", "NAME_CONTRACT_TYPE (string)\n", "CODE_GENDER (string)\n", "FLAG_OWN_CAR (string)\n", "FLAG_OWN_REALTY (string)\n", "CNT_CHILDREN (string)\n", "AMT_INCOME_TOTAL (string)\n", "AMT_CREDIT (string)\n", "AMT_ANNUITY (string)\n", "AMT_GOODS_PRICE (string)\n", "NAME_TYPE_SUITE (string)\n", "NAME_INCOME_TYPE (string)\n", "NAME_EDUCATION_TYPE (string)\n", "NAME_FAMILY_STATUS (string)\n", "NAME_HOUSING_TYPE (string)\n", "REGION_POPULATION_RELATIVE (string)\n", "DAYS_BIRTH (string)\n", "DAYS_EMPLOYED (string)\n", "DAYS_REGISTRATION (string)\n", "DAYS_ID_PUBLISH (string)\n", "OWN_CAR_AGE (string)\n", "FLAG_MOBIL (string)\n", "FLAG_EMP_PHONE (string)\n", "FLAG_WORK_PHONE (string)\n", "FLAG_CONT_MOBILE (string)\n", "FLAG_PHONE (string)\n", "FLAG_EMAIL (string)\n", "OCCUPATION_TYPE (string)\n", "CNT_FAM_MEMBERS (string)\n", "REGION_RATING_CLIENT (string)\n", "REGION_RATING_CLIENT_W_CITY (string)\n", "WEEKDAY_APPR_PROCESS_START (string)\n", "HOUR_APPR_PROCESS_START (string)\n", "REG_REGION_NOT_LIVE_REGION (string)\n", "REG_REGION_NOT_WORK_REGION (string)\n", "LIVE_REGION_NOT_WORK_REGION (string)\n", "REG_CITY_NOT_LIVE_CITY (string)\n", "REG_CITY_NOT_WORK_CITY (string)\n", "LIVE_CITY_NOT_WORK_CITY (string)\n", "ORGANIZATION_TYPE (string)\n", "EXT_SOURCE_1 (string)\n", "EXT_SOURCE_2 (string)\n", "EXT_SOURCE_3 (string)\n", "APARTMENTS_AVG (string)\n", "BASEMENTAREA_AVG (string)\n", "YEARS_BEGINEXPLUATATION_AVG (string)\n", "YEARS_BUILD_AVG (string)\n", "COMMONAREA_AVG (string)\n", "ELEVATORS_AVG (string)\n", "ENTRANCES_AVG (string)\n", "FLOORSMAX_AVG (string)\n", "FLOORSMIN_AVG (string)\n", "LANDAREA_AVG (string)\n", "LIVINGAPARTMENTS_AVG (string)\n", "LIVINGAREA_AVG (string)\n", "NONLIVINGAPARTMENTS_AVG (string)\n", "NONLIVINGAREA_AVG (string)\n", "APARTMENTS_MODE (string)\n", "BASEMENTAREA_MODE (string)\n", "YEARS_BEGINEXPLUATATION_MODE (string)\n", "YEARS_BUILD_MODE (string)\n", "COMMONAREA_MODE (string)\n", "ELEVATORS_MODE (string)\n", "ENTRANCES_MODE (string)\n", "FLOORSMAX_MODE (string)\n", "FLOORSMIN_MODE (string)\n", "LANDAREA_MODE (string)\n", "LIVINGAPARTMENTS_MODE (string)\n", "LIVINGAREA_MODE (string)\n", "NONLIVINGAPARTMENTS_MODE (string)\n", "NONLIVINGAREA_MODE (string)\n", "APARTMENTS_MEDI (string)\n", "BASEMENTAREA_MEDI (string)\n", "YEARS_BEGINEXPLUATATION_MEDI (string)\n", "YEARS_BUILD_MEDI (string)\n", "COMMONAREA_MEDI (string)\n", "ELEVATORS_MEDI (string)\n", "ENTRANCES_MEDI (string)\n", "FLOORSMAX_MEDI (string)\n", "FLOORSMIN_MEDI (string)\n", "LANDAREA_MEDI (string)\n", "LIVINGAPARTMENTS_MEDI (string)\n", "LIVINGAREA_MEDI (string)\n", "NONLIVINGAPARTMENTS_MEDI (string)\n", "NONLIVINGAREA_MEDI (string)\n", "FONDKAPREMONT_MODE (string)\n", "HOUSETYPE_MODE (string)\n", "TOTALAREA_MODE (string)\n", "WALLSMATERIAL_MODE (string)\n", "EMERGENCYSTATE_MODE (string)\n", "OBS_30_CNT_SOCIAL_CIRCLE (string)\n", "DEF_30_CNT_SOCIAL_CIRCLE (string)\n", "OBS_60_CNT_SOCIAL_CIRCLE (string)\n", "DEF_60_CNT_SOCIAL_CIRCLE (string)\n", "DAYS_LAST_PHONE_CHANGE (string)\n", "FLAG_DOCUMENT_2 (string)\n", "FLAG_DOCUMENT_3 (string)\n", "FLAG_DOCUMENT_4 (string)\n", "FLAG_DOCUMENT_5 (string)\n", "FLAG_DOCUMENT_6 (string)\n", "FLAG_DOCUMENT_7 (string)\n", "FLAG_DOCUMENT_8 (string)\n", "FLAG_DOCUMENT_9 (string)\n", "FLAG_DOCUMENT_10 (string)\n", "FLAG_DOCUMENT_11 (string)\n", "FLAG_DOCUMENT_12 (string)\n", "FLAG_DOCUMENT_13 (string)\n", "FLAG_DOCUMENT_14 (string)\n", "FLAG_DOCUMENT_15 (string)\n", "FLAG_DOCUMENT_16 (string)\n", "FLAG_DOCUMENT_17 (string)\n", "FLAG_DOCUMENT_18 (string)\n", "FLAG_DOCUMENT_19 (string)\n", "FLAG_DOCUMENT_20 (string)\n", "FLAG_DOCUMENT_21 (string)\n", "AMT_REQ_CREDIT_BUREAU_HOUR (string)\n", "AMT_REQ_CREDIT_BUREAU_DAY (string)\n", "AMT_REQ_CREDIT_BUREAU_WEEK (string)\n", "AMT_REQ_CREDIT_BUREAU_MON (string)\n", "AMT_REQ_CREDIT_BUREAU_QRT (string)\n", "AMT_REQ_CREDIT_BUREAU_YEAR (string)\n" ] } ], "source": [ "# check dtypes\n", "for n, t in df.dtypes:\n", " print(\"{} ({})\".format(n, t))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SK_ID_CURR: 307511 (100.00%)\n", "TARGET: 2 (0.00%)\n", "NAME_CONTRACT_TYPE: 2 (0.00%)\n", "CODE_GENDER: 3 (0.00%)\n", "FLAG_OWN_CAR: 2 (0.00%)\n", "FLAG_OWN_REALTY: 2 (0.00%)\n", "CNT_CHILDREN: 15 (0.00%)\n", "AMT_INCOME_TOTAL: 2548 (0.83%)\n", "AMT_CREDIT: 5603 (1.82%)\n", "AMT_ANNUITY: 13673 (4.45%)\n", "AMT_GOODS_PRICE: 1003 (0.33%)\n", "NAME_TYPE_SUITE: 8 (0.00%)\n", "NAME_INCOME_TYPE: 8 (0.00%)\n", "NAME_EDUCATION_TYPE: 5 (0.00%)\n", "NAME_FAMILY_STATUS: 6 (0.00%)\n", "NAME_HOUSING_TYPE: 6 (0.00%)\n", "REGION_POPULATION_RELATIVE: 81 (0.03%)\n", "DAYS_BIRTH: 17460 (5.68%)\n", "DAYS_EMPLOYED: 12574 (4.09%)\n", "DAYS_REGISTRATION: 15688 (5.10%)\n", "DAYS_ID_PUBLISH: 6168 (2.01%)\n", "OWN_CAR_AGE: 63 (0.02%)\n", "FLAG_MOBIL: 2 (0.00%)\n", "FLAG_EMP_PHONE: 2 (0.00%)\n", "FLAG_WORK_PHONE: 2 (0.00%)\n", "FLAG_CONT_MOBILE: 2 (0.00%)\n", "FLAG_PHONE: 2 (0.00%)\n", "FLAG_EMAIL: 2 (0.00%)\n", "OCCUPATION_TYPE: 19 (0.01%)\n", "CNT_FAM_MEMBERS: 18 (0.01%)\n", "REGION_RATING_CLIENT: 3 (0.00%)\n", "REGION_RATING_CLIENT_W_CITY: 3 (0.00%)\n", "WEEKDAY_APPR_PROCESS_START: 7 (0.00%)\n", "HOUR_APPR_PROCESS_START: 24 (0.01%)\n", "REG_REGION_NOT_LIVE_REGION: 2 (0.00%)\n", "REG_REGION_NOT_WORK_REGION: 2 (0.00%)\n", "LIVE_REGION_NOT_WORK_REGION: 2 (0.00%)\n", "REG_CITY_NOT_LIVE_CITY: 2 (0.00%)\n", "REG_CITY_NOT_WORK_CITY: 2 (0.00%)\n", "LIVE_CITY_NOT_WORK_CITY: 2 (0.00%)\n", "ORGANIZATION_TYPE: 58 (0.02%)\n", "EXT_SOURCE_1: 114585 (37.26%)\n", "EXT_SOURCE_2: 119832 (38.97%)\n", "EXT_SOURCE_3: 815 (0.27%)\n", "APARTMENTS_AVG: 2340 (0.76%)\n", "BASEMENTAREA_AVG: 3781 (1.23%)\n", "YEARS_BEGINEXPLUATATION_AVG: 286 (0.09%)\n", "YEARS_BUILD_AVG: 150 (0.05%)\n", "COMMONAREA_AVG: 3182 (1.03%)\n", "ELEVATORS_AVG: 258 (0.08%)\n", "ENTRANCES_AVG: 286 (0.09%)\n", "FLOORSMAX_AVG: 404 (0.13%)\n", "FLOORSMIN_AVG: 306 (0.10%)\n", "LANDAREA_AVG: 3528 (1.15%)\n", "LIVINGAPARTMENTS_AVG: 1869 (0.61%)\n", "LIVINGAREA_AVG: 5200 (1.69%)\n", "NONLIVINGAPARTMENTS_AVG: 387 (0.13%)\n", "NONLIVINGAREA_AVG: 3291 (1.07%)\n", "APARTMENTS_MODE: 761 (0.25%)\n", "BASEMENTAREA_MODE: 3842 (1.25%)\n", "YEARS_BEGINEXPLUATATION_MODE: 222 (0.07%)\n", "YEARS_BUILD_MODE: 155 (0.05%)\n", "COMMONAREA_MODE: 3129 (1.02%)\n", "ELEVATORS_MODE: 27 (0.01%)\n", "ENTRANCES_MODE: 31 (0.01%)\n", "FLOORSMAX_MODE: 26 (0.01%)\n", "FLOORSMIN_MODE: 26 (0.01%)\n", "LANDAREA_MODE: 3564 (1.16%)\n", "LIVINGAPARTMENTS_MODE: 737 (0.24%)\n", "LIVINGAREA_MODE: 5302 (1.72%)\n", "NONLIVINGAPARTMENTS_MODE: 168 (0.05%)\n", "NONLIVINGAREA_MODE: 3328 (1.08%)\n", "APARTMENTS_MEDI: 1149 (0.37%)\n", "BASEMENTAREA_MEDI: 3773 (1.23%)\n", "YEARS_BEGINEXPLUATATION_MEDI: 246 (0.08%)\n", "YEARS_BUILD_MEDI: 152 (0.05%)\n", "COMMONAREA_MEDI: 3203 (1.04%)\n", "ELEVATORS_MEDI: 47 (0.02%)\n", "ENTRANCES_MEDI: 47 (0.02%)\n", "FLOORSMAX_MEDI: 50 (0.02%)\n", "FLOORSMIN_MEDI: 48 (0.02%)\n", "LANDAREA_MEDI: 3561 (1.16%)\n", "LIVINGAPARTMENTS_MEDI: 1098 (0.36%)\n", "LIVINGAREA_MEDI: 5282 (1.72%)\n", "NONLIVINGAPARTMENTS_MEDI: 215 (0.07%)\n", "NONLIVINGAREA_MEDI: 3324 (1.08%)\n", "FONDKAPREMONT_MODE: 5 (0.00%)\n", "HOUSETYPE_MODE: 4 (0.00%)\n", "TOTALAREA_MODE: 5117 (1.66%)\n", "WALLSMATERIAL_MODE: 8 (0.00%)\n", "EMERGENCYSTATE_MODE: 3 (0.00%)\n", "OBS_30_CNT_SOCIAL_CIRCLE: 34 (0.01%)\n", "DEF_30_CNT_SOCIAL_CIRCLE: 11 (0.00%)\n", "OBS_60_CNT_SOCIAL_CIRCLE: 34 (0.01%)\n", "DEF_60_CNT_SOCIAL_CIRCLE: 10 (0.00%)\n", "DAYS_LAST_PHONE_CHANGE: 3774 (1.23%)\n", "FLAG_DOCUMENT_2: 2 (0.00%)\n", "FLAG_DOCUMENT_3: 2 (0.00%)\n", "FLAG_DOCUMENT_4: 2 (0.00%)\n", "FLAG_DOCUMENT_5: 2 (0.00%)\n", "FLAG_DOCUMENT_6: 2 (0.00%)\n", "FLAG_DOCUMENT_7: 2 (0.00%)\n", "FLAG_DOCUMENT_8: 2 (0.00%)\n", "FLAG_DOCUMENT_9: 2 (0.00%)\n", "FLAG_DOCUMENT_10: 2 (0.00%)\n", "FLAG_DOCUMENT_11: 2 (0.00%)\n", "FLAG_DOCUMENT_12: 2 (0.00%)\n", "FLAG_DOCUMENT_13: 2 (0.00%)\n", "FLAG_DOCUMENT_14: 2 (0.00%)\n", "FLAG_DOCUMENT_15: 2 (0.00%)\n", "FLAG_DOCUMENT_16: 2 (0.00%)\n", "FLAG_DOCUMENT_17: 2 (0.00%)\n", "FLAG_DOCUMENT_18: 2 (0.00%)\n", "FLAG_DOCUMENT_19: 2 (0.00%)\n", "FLAG_DOCUMENT_20: 2 (0.00%)\n", "FLAG_DOCUMENT_21: 2 (0.00%)\n", "AMT_REQ_CREDIT_BUREAU_HOUR: 6 (0.00%)\n", "AMT_REQ_CREDIT_BUREAU_DAY: 10 (0.00%)\n", "AMT_REQ_CREDIT_BUREAU_WEEK: 10 (0.00%)\n", "AMT_REQ_CREDIT_BUREAU_MON: 25 (0.01%)\n", "AMT_REQ_CREDIT_BUREAU_QRT: 12 (0.00%)\n", "AMT_REQ_CREDIT_BUREAU_YEAR: 26 (0.01%)\n" ] } ], "source": [ "# count distinct\n", "for cname in df.columns:\n", " cnt_dist = df.select(cname).distinct().count()\n", " pct_dist = cnt_dist * 100.0 / total_records\n", " print(\"{}: {} ({:0.2f}%)\".format(cname, cnt_dist, pct_dist))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SK_ID_CURR: 0 (0.00%)\n", "TARGET: 0 (0.00%)\n", "NAME_CONTRACT_TYPE: 0 (0.00%)\n", "CODE_GENDER: 0 (0.00%)\n", "FLAG_OWN_CAR: 0 (0.00%)\n", "FLAG_OWN_REALTY: 0 (0.00%)\n", "CNT_CHILDREN: 0 (0.00%)\n", "AMT_INCOME_TOTAL: 0 (0.00%)\n", "AMT_CREDIT: 0 (0.00%)\n", "AMT_ANNUITY: 12 (0.00%)\n", "AMT_GOODS_PRICE: 278 (0.09%)\n", "NAME_TYPE_SUITE: 1292 (0.42%)\n", "NAME_INCOME_TYPE: 0 (0.00%)\n", "NAME_EDUCATION_TYPE: 0 (0.00%)\n", "NAME_FAMILY_STATUS: 0 (0.00%)\n", "NAME_HOUSING_TYPE: 0 (0.00%)\n", "REGION_POPULATION_RELATIVE: 0 (0.00%)\n", "DAYS_BIRTH: 0 (0.00%)\n", "DAYS_EMPLOYED: 0 (0.00%)\n", "DAYS_REGISTRATION: 0 (0.00%)\n", "DAYS_ID_PUBLISH: 0 (0.00%)\n", "OWN_CAR_AGE: 202929 (65.99%)\n", "FLAG_MOBIL: 0 (0.00%)\n", "FLAG_EMP_PHONE: 0 (0.00%)\n", "FLAG_WORK_PHONE: 0 (0.00%)\n", "FLAG_CONT_MOBILE: 0 (0.00%)\n", "FLAG_PHONE: 0 (0.00%)\n", "FLAG_EMAIL: 0 (0.00%)\n", "OCCUPATION_TYPE: 96391 (31.35%)\n", "CNT_FAM_MEMBERS: 2 (0.00%)\n", "REGION_RATING_CLIENT: 0 (0.00%)\n", "REGION_RATING_CLIENT_W_CITY: 0 (0.00%)\n", "WEEKDAY_APPR_PROCESS_START: 0 (0.00%)\n", "HOUR_APPR_PROCESS_START: 0 (0.00%)\n", "REG_REGION_NOT_LIVE_REGION: 0 (0.00%)\n", "REG_REGION_NOT_WORK_REGION: 0 (0.00%)\n", "LIVE_REGION_NOT_WORK_REGION: 0 (0.00%)\n", "REG_CITY_NOT_LIVE_CITY: 0 (0.00%)\n", "REG_CITY_NOT_WORK_CITY: 0 (0.00%)\n", "LIVE_CITY_NOT_WORK_CITY: 0 (0.00%)\n", "ORGANIZATION_TYPE: 0 (0.00%)\n", "EXT_SOURCE_1: 173378 (56.38%)\n", "EXT_SOURCE_2: 660 (0.21%)\n", "EXT_SOURCE_3: 60965 (19.83%)\n", "APARTMENTS_AVG: 156061 (50.75%)\n", "BASEMENTAREA_AVG: 179943 (58.52%)\n", "YEARS_BEGINEXPLUATATION_AVG: 150007 (48.78%)\n", "YEARS_BUILD_AVG: 204488 (66.50%)\n", "COMMONAREA_AVG: 214865 (69.87%)\n", "ELEVATORS_AVG: 163891 (53.30%)\n", "ENTRANCES_AVG: 154828 (50.35%)\n", "FLOORSMAX_AVG: 153020 (49.76%)\n", "FLOORSMIN_AVG: 208642 (67.85%)\n", "LANDAREA_AVG: 182590 (59.38%)\n", "LIVINGAPARTMENTS_AVG: 210199 (68.35%)\n", "LIVINGAREA_AVG: 154350 (50.19%)\n", "NONLIVINGAPARTMENTS_AVG: 213514 (69.43%)\n", "NONLIVINGAREA_AVG: 169682 (55.18%)\n", "APARTMENTS_MODE: 156061 (50.75%)\n", "BASEMENTAREA_MODE: 179943 (58.52%)\n", "YEARS_BEGINEXPLUATATION_MODE: 150007 (48.78%)\n", "YEARS_BUILD_MODE: 204488 (66.50%)\n", "COMMONAREA_MODE: 214865 (69.87%)\n", "ELEVATORS_MODE: 163891 (53.30%)\n", "ENTRANCES_MODE: 154828 (50.35%)\n", "FLOORSMAX_MODE: 153020 (49.76%)\n", "FLOORSMIN_MODE: 208642 (67.85%)\n", "LANDAREA_MODE: 182590 (59.38%)\n", "LIVINGAPARTMENTS_MODE: 210199 (68.35%)\n", "LIVINGAREA_MODE: 154350 (50.19%)\n", "NONLIVINGAPARTMENTS_MODE: 213514 (69.43%)\n", "NONLIVINGAREA_MODE: 169682 (55.18%)\n", "APARTMENTS_MEDI: 156061 (50.75%)\n", "BASEMENTAREA_MEDI: 179943 (58.52%)\n", "YEARS_BEGINEXPLUATATION_MEDI: 150007 (48.78%)\n", "YEARS_BUILD_MEDI: 204488 (66.50%)\n", "COMMONAREA_MEDI: 214865 (69.87%)\n", "ELEVATORS_MEDI: 163891 (53.30%)\n", "ENTRANCES_MEDI: 154828 (50.35%)\n", "FLOORSMAX_MEDI: 153020 (49.76%)\n", "FLOORSMIN_MEDI: 208642 (67.85%)\n", "LANDAREA_MEDI: 182590 (59.38%)\n", "LIVINGAPARTMENTS_MEDI: 210199 (68.35%)\n", "LIVINGAREA_MEDI: 154350 (50.19%)\n", "NONLIVINGAPARTMENTS_MEDI: 213514 (69.43%)\n", "NONLIVINGAREA_MEDI: 169682 (55.18%)\n", "FONDKAPREMONT_MODE: 210295 (68.39%)\n", "HOUSETYPE_MODE: 154297 (50.18%)\n", "TOTALAREA_MODE: 148431 (48.27%)\n", "WALLSMATERIAL_MODE: 156341 (50.84%)\n", "EMERGENCYSTATE_MODE: 145755 (47.40%)\n", "OBS_30_CNT_SOCIAL_CIRCLE: 1021 (0.33%)\n", "DEF_30_CNT_SOCIAL_CIRCLE: 1021 (0.33%)\n", "OBS_60_CNT_SOCIAL_CIRCLE: 1021 (0.33%)\n", "DEF_60_CNT_SOCIAL_CIRCLE: 1021 (0.33%)\n", "DAYS_LAST_PHONE_CHANGE: 1 (0.00%)\n", "FLAG_DOCUMENT_2: 0 (0.00%)\n", "FLAG_DOCUMENT_3: 0 (0.00%)\n", "FLAG_DOCUMENT_4: 0 (0.00%)\n", "FLAG_DOCUMENT_5: 0 (0.00%)\n", "FLAG_DOCUMENT_6: 0 (0.00%)\n", "FLAG_DOCUMENT_7: 0 (0.00%)\n", "FLAG_DOCUMENT_8: 0 (0.00%)\n", "FLAG_DOCUMENT_9: 0 (0.00%)\n", "FLAG_DOCUMENT_10: 0 (0.00%)\n", "FLAG_DOCUMENT_11: 0 (0.00%)\n", "FLAG_DOCUMENT_12: 0 (0.00%)\n", "FLAG_DOCUMENT_13: 0 (0.00%)\n", "FLAG_DOCUMENT_14: 0 (0.00%)\n", "FLAG_DOCUMENT_15: 0 (0.00%)\n", "FLAG_DOCUMENT_16: 0 (0.00%)\n", "FLAG_DOCUMENT_17: 0 (0.00%)\n", "FLAG_DOCUMENT_18: 0 (0.00%)\n", "FLAG_DOCUMENT_19: 0 (0.00%)\n", "FLAG_DOCUMENT_20: 0 (0.00%)\n", "FLAG_DOCUMENT_21: 0 (0.00%)\n", "AMT_REQ_CREDIT_BUREAU_HOUR: 41519 (13.50%)\n", "AMT_REQ_CREDIT_BUREAU_DAY: 41519 (13.50%)\n", "AMT_REQ_CREDIT_BUREAU_WEEK: 41519 (13.50%)\n", "AMT_REQ_CREDIT_BUREAU_MON: 41519 (13.50%)\n", "AMT_REQ_CREDIT_BUREAU_QRT: 41519 (13.50%)\n", "AMT_REQ_CREDIT_BUREAU_YEAR: 41519 (13.50%)\n" ] } ], "source": [ "# count NULL\n", "for cname in df.columns:\n", " cnt_null = df.where(\"{} is NULL\".format(cname)).count()\n", " pct_miss = cnt_null * 100.0 / total_records\n", " print(\"{}: {} ({:0.2f}%)\".format(cname, cnt_null, pct_miss))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SK_ID_CURR: 0 (0.00%)\n", "TARGET: 282686 (91.93%)\n", "NAME_CONTRACT_TYPE: 0 (0.00%)\n", "CODE_GENDER: 0 (0.00%)\n", "FLAG_OWN_CAR: 0 (0.00%)\n", "FLAG_OWN_REALTY: 0 (0.00%)\n", "CNT_CHILDREN: 215371 (70.04%)\n", "AMT_INCOME_TOTAL: 0 (0.00%)\n", "AMT_CREDIT: 0 (0.00%)\n", "AMT_ANNUITY: 0 (0.00%)\n", "AMT_GOODS_PRICE: 0 (0.00%)\n", "NAME_TYPE_SUITE: 0 (0.00%)\n", "NAME_INCOME_TYPE: 0 (0.00%)\n", "NAME_EDUCATION_TYPE: 0 (0.00%)\n", "NAME_FAMILY_STATUS: 0 (0.00%)\n", "NAME_HOUSING_TYPE: 0 (0.00%)\n", "REGION_POPULATION_RELATIVE: 0 (0.00%)\n", "DAYS_BIRTH: 0 (0.00%)\n", "DAYS_EMPLOYED: 2 (0.00%)\n", "DAYS_REGISTRATION: 80 (0.03%)\n", "DAYS_ID_PUBLISH: 16 (0.01%)\n", "OWN_CAR_AGE: 2134 (0.69%)\n", "FLAG_MOBIL: 1 (0.00%)\n", "FLAG_EMP_PHONE: 55386 (18.01%)\n", "FLAG_WORK_PHONE: 246203 (80.06%)\n", "FLAG_CONT_MOBILE: 574 (0.19%)\n", "FLAG_PHONE: 221080 (71.89%)\n", "FLAG_EMAIL: 290069 (94.33%)\n", "OCCUPATION_TYPE: 0 (0.00%)\n", "CNT_FAM_MEMBERS: 0 (0.00%)\n", "REGION_RATING_CLIENT: 0 (0.00%)\n", "REGION_RATING_CLIENT_W_CITY: 0 (0.00%)\n", "WEEKDAY_APPR_PROCESS_START: 0 (0.00%)\n", "HOUR_APPR_PROCESS_START: 40 (0.01%)\n", "REG_REGION_NOT_LIVE_REGION: 302854 (98.49%)\n", "REG_REGION_NOT_WORK_REGION: 291899 (94.92%)\n", "LIVE_REGION_NOT_WORK_REGION: 295008 (95.93%)\n", "REG_CITY_NOT_LIVE_CITY: 283472 (92.18%)\n", "REG_CITY_NOT_WORK_CITY: 236644 (76.95%)\n", "LIVE_CITY_NOT_WORK_CITY: 252296 (82.04%)\n", "ORGANIZATION_TYPE: 0 (0.00%)\n", "EXT_SOURCE_1: 0 (0.00%)\n", "EXT_SOURCE_2: 0 (0.00%)\n", "EXT_SOURCE_3: 0 (0.00%)\n", "APARTMENTS_AVG: 751 (0.24%)\n", "BASEMENTAREA_AVG: 14745 (4.79%)\n", "YEARS_BEGINEXPLUATATION_AVG: 514 (0.17%)\n", "YEARS_BUILD_AVG: 102 (0.03%)\n", "COMMONAREA_AVG: 8442 (2.75%)\n", "ELEVATORS_AVG: 85718 (27.87%)\n", "ENTRANCES_AVG: 323 (0.11%)\n", "FLOORSMAX_AVG: 2938 (0.96%)\n", "FLOORSMIN_AVG: 2320 (0.75%)\n", "LANDAREA_AVG: 15600 (5.07%)\n", "LIVINGAPARTMENTS_AVG: 418 (0.14%)\n", "LIVINGAREA_AVG: 284 (0.09%)\n", "NONLIVINGAPARTMENTS_AVG: 54549 (17.74%)\n", "NONLIVINGAREA_AVG: 58735 (19.10%)\n", "APARTMENTS_MODE: 976 (0.32%)\n", "BASEMENTAREA_MODE: 16598 (5.40%)\n", "YEARS_BEGINEXPLUATATION_MODE: 142 (0.05%)\n", "YEARS_BUILD_MODE: 103 (0.03%)\n", "COMMONAREA_MODE: 9690 (3.15%)\n", "ELEVATORS_MODE: 89498 (29.10%)\n", "ENTRANCES_MODE: 387 (0.13%)\n", "FLOORSMAX_MODE: 3415 (1.11%)\n", "FLOORSMIN_MODE: 2517 (0.82%)\n", "LANDAREA_MODE: 17453 (5.68%)\n", "LIVINGAPARTMENTS_MODE: 519 (0.17%)\n", "LIVINGAREA_MODE: 444 (0.14%)\n", "NONLIVINGAPARTMENTS_MODE: 59255 (19.27%)\n", "NONLIVINGAREA_MODE: 67126 (21.83%)\n", "APARTMENTS_MEDI: 771 (0.25%)\n", "BASEMENTAREA_MEDI: 14991 (4.87%)\n", "YEARS_BEGINEXPLUATATION_MEDI: 548 (0.18%)\n", "YEARS_BUILD_MEDI: 101 (0.03%)\n", "COMMONAREA_MEDI: 8691 (2.83%)\n", "ELEVATORS_MEDI: 87026 (28.30%)\n", "ENTRANCES_MEDI: 329 (0.11%)\n", "FLOORSMAX_MEDI: 2995 (0.97%)\n", "FLOORSMIN_MEDI: 2351 (0.76%)\n", "LANDAREA_MEDI: 15919 (5.18%)\n", "LIVINGAPARTMENTS_MEDI: 433 (0.14%)\n", "LIVINGAREA_MEDI: 299 (0.10%)\n", "NONLIVINGAPARTMENTS_MEDI: 56097 (18.24%)\n", "NONLIVINGAREA_MEDI: 60954 (19.82%)\n", "FONDKAPREMONT_MODE: 0 (0.00%)\n", "HOUSETYPE_MODE: 0 (0.00%)\n", "TOTALAREA_MODE: 582 (0.19%)\n", "WALLSMATERIAL_MODE: 0 (0.00%)\n", "EMERGENCYSTATE_MODE: 0 (0.00%)\n", "OBS_30_CNT_SOCIAL_CIRCLE: 163910 (53.30%)\n", "DEF_30_CNT_SOCIAL_CIRCLE: 271324 (88.23%)\n", "OBS_60_CNT_SOCIAL_CIRCLE: 164666 (53.55%)\n", "DEF_60_CNT_SOCIAL_CIRCLE: 280721 (91.29%)\n", "DAYS_LAST_PHONE_CHANGE: 37672 (12.25%)\n", "FLAG_DOCUMENT_2: 307498 (100.00%)\n", "FLAG_DOCUMENT_3: 89171 (29.00%)\n", "FLAG_DOCUMENT_4: 307486 (99.99%)\n", "FLAG_DOCUMENT_5: 302863 (98.49%)\n", "FLAG_DOCUMENT_6: 280433 (91.19%)\n", "FLAG_DOCUMENT_7: 307452 (99.98%)\n", "FLAG_DOCUMENT_8: 282487 (91.86%)\n", "FLAG_DOCUMENT_9: 306313 (99.61%)\n", "FLAG_DOCUMENT_10: 307504 (100.00%)\n", "FLAG_DOCUMENT_11: 306308 (99.61%)\n", "FLAG_DOCUMENT_12: 307509 (100.00%)\n", "FLAG_DOCUMENT_13: 306427 (99.65%)\n", "FLAG_DOCUMENT_14: 306608 (99.71%)\n", "FLAG_DOCUMENT_15: 307139 (99.88%)\n", "FLAG_DOCUMENT_16: 304458 (99.01%)\n", "FLAG_DOCUMENT_17: 307429 (99.97%)\n", "FLAG_DOCUMENT_18: 305011 (99.19%)\n", "FLAG_DOCUMENT_19: 307328 (99.94%)\n", "FLAG_DOCUMENT_20: 307355 (99.95%)\n", "FLAG_DOCUMENT_21: 307408 (99.97%)\n", "AMT_REQ_CREDIT_BUREAU_HOUR: 264366 (85.97%)\n", "AMT_REQ_CREDIT_BUREAU_DAY: 264503 (86.01%)\n", "AMT_REQ_CREDIT_BUREAU_WEEK: 257456 (83.72%)\n", "AMT_REQ_CREDIT_BUREAU_MON: 222233 (72.27%)\n", "AMT_REQ_CREDIT_BUREAU_QRT: 215417 (70.05%)\n", "AMT_REQ_CREDIT_BUREAU_YEAR: 71801 (23.35%)\n" ] } ], "source": [ "# count zeros\n", "for cname in df.columns:\n", " cnt_zeros = df.where(\"{} = 0.0\".format(cname)).count()\n", " pct_zeros = cnt_zeros * 100.0 / total_records\n", " print(\"{}: {} ({:0.2f}%)\".format(cname, cnt_zeros, pct_zeros))" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SK_ID_CURR: 0 (0.00%)\n", "TARGET: 0 (0.00%)\n", "NAME_CONTRACT_TYPE: 0 (0.00%)\n", "CODE_GENDER: 0 (0.00%)\n", "FLAG_OWN_CAR: 0 (0.00%)\n", "FLAG_OWN_REALTY: 0 (0.00%)\n", "CNT_CHILDREN: 0 (0.00%)\n", "AMT_INCOME_TOTAL: 0 (0.00%)\n", "AMT_CREDIT: 0 (0.00%)\n", "AMT_ANNUITY: 0 (0.00%)\n", "AMT_GOODS_PRICE: 0 (0.00%)\n", "NAME_TYPE_SUITE: 0 (0.00%)\n", "NAME_INCOME_TYPE: 0 (0.00%)\n", "NAME_EDUCATION_TYPE: 0 (0.00%)\n", "NAME_FAMILY_STATUS: 0 (0.00%)\n", "NAME_HOUSING_TYPE: 0 (0.00%)\n", "REGION_POPULATION_RELATIVE: 0 (0.00%)\n", "DAYS_BIRTH: 307511 (100.00%)\n", "DAYS_EMPLOYED: 252135 (81.99%)\n", "DAYS_REGISTRATION: 307431 (99.97%)\n", "DAYS_ID_PUBLISH: 307495 (99.99%)\n", "OWN_CAR_AGE: 0 (0.00%)\n", "FLAG_MOBIL: 0 (0.00%)\n", "FLAG_EMP_PHONE: 0 (0.00%)\n", "FLAG_WORK_PHONE: 0 (0.00%)\n", "FLAG_CONT_MOBILE: 0 (0.00%)\n", "FLAG_PHONE: 0 (0.00%)\n", "FLAG_EMAIL: 0 (0.00%)\n", "OCCUPATION_TYPE: 0 (0.00%)\n", "CNT_FAM_MEMBERS: 0 (0.00%)\n", "REGION_RATING_CLIENT: 0 (0.00%)\n", "REGION_RATING_CLIENT_W_CITY: 0 (0.00%)\n", "WEEKDAY_APPR_PROCESS_START: 0 (0.00%)\n", "HOUR_APPR_PROCESS_START: 0 (0.00%)\n", "REG_REGION_NOT_LIVE_REGION: 0 (0.00%)\n", "REG_REGION_NOT_WORK_REGION: 0 (0.00%)\n", "LIVE_REGION_NOT_WORK_REGION: 0 (0.00%)\n", "REG_CITY_NOT_LIVE_CITY: 0 (0.00%)\n", "REG_CITY_NOT_WORK_CITY: 0 (0.00%)\n", "LIVE_CITY_NOT_WORK_CITY: 0 (0.00%)\n", "ORGANIZATION_TYPE: 0 (0.00%)\n", "EXT_SOURCE_1: 0 (0.00%)\n", "EXT_SOURCE_2: 0 (0.00%)\n", "EXT_SOURCE_3: 0 (0.00%)\n", "APARTMENTS_AVG: 0 (0.00%)\n", "BASEMENTAREA_AVG: 0 (0.00%)\n", "YEARS_BEGINEXPLUATATION_AVG: 0 (0.00%)\n", "YEARS_BUILD_AVG: 0 (0.00%)\n", "COMMONAREA_AVG: 0 (0.00%)\n", "ELEVATORS_AVG: 0 (0.00%)\n", "ENTRANCES_AVG: 0 (0.00%)\n", "FLOORSMAX_AVG: 0 (0.00%)\n", "FLOORSMIN_AVG: 0 (0.00%)\n", "LANDAREA_AVG: 0 (0.00%)\n", "LIVINGAPARTMENTS_AVG: 0 (0.00%)\n", "LIVINGAREA_AVG: 0 (0.00%)\n", "NONLIVINGAPARTMENTS_AVG: 0 (0.00%)\n", "NONLIVINGAREA_AVG: 0 (0.00%)\n", "APARTMENTS_MODE: 0 (0.00%)\n", "BASEMENTAREA_MODE: 0 (0.00%)\n", "YEARS_BEGINEXPLUATATION_MODE: 0 (0.00%)\n", "YEARS_BUILD_MODE: 0 (0.00%)\n", "COMMONAREA_MODE: 0 (0.00%)\n", "ELEVATORS_MODE: 0 (0.00%)\n", "ENTRANCES_MODE: 0 (0.00%)\n", "FLOORSMAX_MODE: 0 (0.00%)\n", "FLOORSMIN_MODE: 0 (0.00%)\n", "LANDAREA_MODE: 0 (0.00%)\n", "LIVINGAPARTMENTS_MODE: 0 (0.00%)\n", "LIVINGAREA_MODE: 0 (0.00%)\n", "NONLIVINGAPARTMENTS_MODE: 0 (0.00%)\n", "NONLIVINGAREA_MODE: 0 (0.00%)\n", "APARTMENTS_MEDI: 0 (0.00%)\n", "BASEMENTAREA_MEDI: 0 (0.00%)\n", "YEARS_BEGINEXPLUATATION_MEDI: 0 (0.00%)\n", "YEARS_BUILD_MEDI: 0 (0.00%)\n", "COMMONAREA_MEDI: 0 (0.00%)\n", "ELEVATORS_MEDI: 0 (0.00%)\n", "ENTRANCES_MEDI: 0 (0.00%)\n", "FLOORSMAX_MEDI: 0 (0.00%)\n", "FLOORSMIN_MEDI: 0 (0.00%)\n", "LANDAREA_MEDI: 0 (0.00%)\n", "LIVINGAPARTMENTS_MEDI: 0 (0.00%)\n", "LIVINGAREA_MEDI: 0 (0.00%)\n", "NONLIVINGAPARTMENTS_MEDI: 0 (0.00%)\n", "NONLIVINGAREA_MEDI: 0 (0.00%)\n", "FONDKAPREMONT_MODE: 0 (0.00%)\n", "HOUSETYPE_MODE: 0 (0.00%)\n", "TOTALAREA_MODE: 0 (0.00%)\n", "WALLSMATERIAL_MODE: 0 (0.00%)\n", "EMERGENCYSTATE_MODE: 0 (0.00%)\n", "OBS_30_CNT_SOCIAL_CIRCLE: 0 (0.00%)\n", "DEF_30_CNT_SOCIAL_CIRCLE: 0 (0.00%)\n", "OBS_60_CNT_SOCIAL_CIRCLE: 0 (0.00%)\n", "DEF_60_CNT_SOCIAL_CIRCLE: 0 (0.00%)\n", "DAYS_LAST_PHONE_CHANGE: 269838 (87.75%)\n", "FLAG_DOCUMENT_2: 0 (0.00%)\n", "FLAG_DOCUMENT_3: 0 (0.00%)\n", "FLAG_DOCUMENT_4: 0 (0.00%)\n", "FLAG_DOCUMENT_5: 0 (0.00%)\n", "FLAG_DOCUMENT_6: 0 (0.00%)\n", "FLAG_DOCUMENT_7: 0 (0.00%)\n", "FLAG_DOCUMENT_8: 0 (0.00%)\n", "FLAG_DOCUMENT_9: 0 (0.00%)\n", "FLAG_DOCUMENT_10: 0 (0.00%)\n", "FLAG_DOCUMENT_11: 0 (0.00%)\n", "FLAG_DOCUMENT_12: 0 (0.00%)\n", "FLAG_DOCUMENT_13: 0 (0.00%)\n", "FLAG_DOCUMENT_14: 0 (0.00%)\n", "FLAG_DOCUMENT_15: 0 (0.00%)\n", "FLAG_DOCUMENT_16: 0 (0.00%)\n", "FLAG_DOCUMENT_17: 0 (0.00%)\n", "FLAG_DOCUMENT_18: 0 (0.00%)\n", "FLAG_DOCUMENT_19: 0 (0.00%)\n", "FLAG_DOCUMENT_20: 0 (0.00%)\n", "FLAG_DOCUMENT_21: 0 (0.00%)\n", "AMT_REQ_CREDIT_BUREAU_HOUR: 0 (0.00%)\n", "AMT_REQ_CREDIT_BUREAU_DAY: 0 (0.00%)\n", "AMT_REQ_CREDIT_BUREAU_WEEK: 0 (0.00%)\n", "AMT_REQ_CREDIT_BUREAU_MON: 0 (0.00%)\n", "AMT_REQ_CREDIT_BUREAU_QRT: 0 (0.00%)\n", "AMT_REQ_CREDIT_BUREAU_YEAR: 0 (0.00%)\n" ] } ], "source": [ "# count negative\n", "for cname in df.columns:\n", " cnt_neg = df.where(\"{} < 0\".format(cname)).count()\n", " pct_neg = cnt_neg * 100.0 / total_records\n", " print(\"{}: {} ({:0.2f}%)\".format(cname, cnt_neg, pct_neg))" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | 0 | \n", "1 | \n", "2 | \n", "3 | \n", "4 | \n", "
---|---|---|---|---|---|
summary | \n", "count | \n", "mean | \n", "stddev | \n", "min | \n", "max | \n", "
TARGET | \n", "307511 | \n", "0.08072881945686496 | \n", "0.27241864564839546 | \n", "0 | \n", "1 | \n", "
NAME_CONTRACT_TYPE | \n", "307511 | \n", "None | \n", "None | \n", "Cash loans | \n", "Revolving loans | \n", "
CODE_GENDER | \n", "307511 | \n", "None | \n", "None | \n", "F | \n", "XNA | \n", "
FLAG_OWN_CAR | \n", "307511 | \n", "None | \n", "None | \n", "N | \n", "Y | \n", "
FLAG_OWN_REALTY | \n", "307511 | \n", "None | \n", "None | \n", "N | \n", "Y | \n", "
CNT_CHILDREN | \n", "307511 | \n", "0.4170517477423572 | \n", "0.722121384437626 | \n", "0 | \n", "9 | \n", "
AMT_INCOME_TOTAL | \n", "307511 | \n", "168797.91929698453 | \n", "237123.14627885324 | \n", "100071.0 | \n", "99900.0 | \n", "
AMT_CREDIT | \n", "307511 | \n", "599025.9997057016 | \n", "402490.77699585445 | \n", "1000417.5 | \n", "999886.5 | \n", "
AMT_ANNUITY | \n", "307499 | \n", "27108.573909183444 | \n", "14493.737315118291 | \n", "100017.0 | \n", "9999.0 | \n", "
AMT_GOODS_PRICE | \n", "307233 | \n", "538396.2074288895 | \n", "369446.4605400576 | \n", "1003500.0 | \n", "999000.0 | \n", "
NAME_TYPE_SUITE | \n", "306219 | \n", "None | \n", "None | \n", "Children | \n", "Unaccompanied | \n", "
NAME_INCOME_TYPE | \n", "307511 | \n", "None | \n", "None | \n", "Businessman | \n", "Working | \n", "
NAME_EDUCATION_TYPE | \n", "307511 | \n", "None | \n", "None | \n", "Academic degree | \n", "Secondary / secondary special | \n", "
NAME_FAMILY_STATUS | \n", "307511 | \n", "None | \n", "None | \n", "Civil marriage | \n", "Widow | \n", "
NAME_HOUSING_TYPE | \n", "307511 | \n", "None | \n", "None | \n", "Co-op apartment | \n", "With parents | \n", "
REGION_POPULATION_RELATIVE | \n", "307511 | \n", "0.02086811205779086 | \n", "0.01383128012270465 | \n", "0.00029 | \n", "0.072508 | \n", "
DAYS_BIRTH | \n", "307511 | \n", "-16036.995066843137 | \n", "4363.98863178559 | \n", "-10000 | \n", "-9999 | \n", "
DAYS_EMPLOYED | \n", "307511 | \n", "63815.04590404896 | \n", "141275.76651872776 | \n", "-1 | \n", "365243 | \n", "
DAYS_REGISTRATION | \n", "307511 | \n", "-4986.120327538418 | \n", "3522.8863209630895 | \n", "-1.0 | \n", "0.0 | \n", "
DAYS_ID_PUBLISH | \n", "307511 | \n", "-2994.2023732484367 | \n", "1509.4504190030277 | \n", "-1 | \n", "0 | \n", "
OWN_CAR_AGE | \n", "104582 | \n", "12.061090818687727 | \n", "11.944811582242771 | \n", "0.0 | \n", "91.0 | \n", "
FLAG_MOBIL | \n", "307511 | \n", "0.9999967480838083 | \n", "0.0018033070153514828 | \n", "0 | \n", "1 | \n", "
FLAG_EMP_PHONE | \n", "307511 | \n", "0.8198893698111612 | \n", "0.38428019893876475 | \n", "0 | \n", "1 | \n", "
FLAG_WORK_PHONE | \n", "307511 | \n", "0.1993684778755882 | \n", "0.39952622815022937 | \n", "0 | \n", "1 | \n", "
FLAG_CONT_MOBILE | \n", "307511 | \n", "0.9981334001060125 | \n", "0.04316389414243231 | \n", "0 | \n", "1 | \n", "
FLAG_PHONE | \n", "307511 | \n", "0.28106636835755466 | \n", "0.44952054685675735 | \n", "0 | \n", "1 | \n", "
FLAG_EMAIL | \n", "307511 | \n", "0.0567199222141647 | \n", "0.2313070397227082 | \n", "0 | \n", "1 | \n", "
OCCUPATION_TYPE | \n", "211120 | \n", "None | \n", "None | \n", "Accountants | \n", "Waiters/barmen staff | \n", "
CNT_FAM_MEMBERS | \n", "307509 | \n", "2.152665450442101 | \n", "0.9106815691792945 | \n", "1.0 | \n", "9.0 | \n", "
REGION_RATING_CLIENT | \n", "307511 | \n", "2.0524631639193394 | \n", "0.5090339028156776 | \n", "1 | \n", "3 | \n", "
REGION_RATING_CLIENT_W_CITY | \n", "307511 | \n", "2.031520823645333 | \n", "0.502737032914767 | \n", "1 | \n", "3 | \n", "
WEEKDAY_APPR_PROCESS_START | \n", "307511 | \n", "None | \n", "None | \n", "FRIDAY | \n", "WEDNESDAY | \n", "
HOUR_APPR_PROCESS_START | \n", "307511 | \n", "12.063418869568894 | \n", "3.2658322554378714 | \n", "0 | \n", "9 | \n", "
REG_REGION_NOT_LIVE_REGION | \n", "307511 | \n", "0.015144173704355291 | \n", "0.12212647628215288 | \n", "0 | \n", "1 | \n", "
REG_REGION_NOT_WORK_REGION | \n", "307511 | \n", "0.05076891558350758 | \n", "0.21952582879696045 | \n", "0 | \n", "1 | \n", "
LIVE_REGION_NOT_WORK_REGION | \n", "307511 | \n", "0.04065870814377372 | \n", "0.19749861882842462 | \n", "0 | \n", "1 | \n", "
REG_CITY_NOT_LIVE_CITY | \n", "307511 | \n", "0.07817281333025486 | \n", "0.2684437723734044 | \n", "0 | \n", "1 | \n", "
REG_CITY_NOT_WORK_CITY | \n", "307511 | \n", "0.23045354475124466 | \n", "0.42112383591389696 | \n", "0 | \n", "1 | \n", "
LIVE_CITY_NOT_WORK_CITY | \n", "307511 | \n", "0.17955455252007246 | \n", "0.38381661538559614 | \n", "0 | \n", "1 | \n", "
ORGANIZATION_TYPE | \n", "307511 | \n", "None | \n", "None | \n", "Advertising | \n", "XNA | \n", "
EXT_SOURCE_1 | \n", "134133 | \n", "0.5021298056566661 | \n", "0.21106224927392453 | \n", "0.014568132412445587 | \n", "0.9626927705613059 | \n", "
EXT_SOURCE_2 | \n", "306851 | \n", "0.5143926741308394 | \n", "0.19106015498493495 | \n", "0.00010795029167810804 | \n", "9.936476188005656e-06 | \n", "
EXT_SOURCE_3 | \n", "246546 | \n", "0.5108529061799263 | \n", "0.19484436446374864 | \n", "0.0005272652387098817 | \n", "0.8960095494948396 | \n", "
APARTMENTS_AVG | \n", "151450 | \n", "0.11744049917464643 | \n", "0.10824029130032226 | \n", "0.0 | \n", "1.0 | \n", "
BASEMENTAREA_AVG | \n", "127568 | \n", "0.08844221905180051 | \n", "0.08243815873568477 | \n", "0.0 | \n", "1.0 | \n", "
YEARS_BEGINEXPLUATATION_AVG | \n", "157504 | \n", "0.9777348581622161 | \n", "0.059223314358362686 | \n", "0.0 | \n", "1.0 | \n", "
YEARS_BUILD_AVG | \n", "103023 | \n", "0.7524714325927284 | \n", "0.11327992663224681 | \n", "0.0 | \n", "1.0 | \n", "
COMMONAREA_AVG | \n", "92646 | \n", "0.044620715411351 | \n", "0.0760357450504091 | \n", "0.0 | \n", "1.0 | \n", "
ELEVATORS_AVG | \n", "143620 | \n", "0.07894151232416623 | \n", "0.13457600110034398 | \n", "0.0 | \n", "1.0 | \n", "
ENTRANCES_AVG | \n", "152683 | \n", "0.1497246700679834 | \n", "0.10004912076035907 | \n", "0.0 | \n", "1.0 | \n", "
FLOORSMAX_AVG | \n", "154491 | \n", "0.22628190703664716 | \n", "0.1446406995480042 | \n", "0.0 | \n", "1.0 | \n", "
FLOORSMIN_AVG | \n", "98869 | \n", "0.23189350049056454 | \n", "0.16138028880013763 | \n", "0.0 | \n", "1.0 | \n", "
LANDAREA_AVG | \n", "124921 | \n", "0.06633318417239706 | \n", "0.08118364070179374 | \n", "0.0 | \n", "1.0 | \n", "
LIVINGAPARTMENTS_AVG | \n", "97312 | \n", "0.10077477495067376 | \n", "0.09257613396049774 | \n", "0.0 | \n", "1.0 | \n", "
LIVINGAREA_AVG | \n", "153161 | \n", "0.1073990193325995 | \n", "0.11056452318371307 | \n", "0.0 | \n", "1.0 | \n", "
NONLIVINGAPARTMENTS_AVG | \n", "93997 | \n", "0.008808672617209091 | \n", "0.04773166205034795 | \n", "0.0 | \n", "1.0 | \n", "
NONLIVINGAREA_AVG | \n", "137829 | \n", "0.028357757075796745 | \n", "0.06952318332123596 | \n", "0.0 | \n", "1.0 | \n", "
APARTMENTS_MODE | \n", "151450 | \n", "0.11423100693299629 | \n", "0.10793603908753274 | \n", "0.0 | \n", "1.0 | \n", "
BASEMENTAREA_MODE | \n", "127568 | \n", "0.08754321224758496 | \n", "0.08430717486924556 | \n", "0.0 | \n", "1.0 | \n", "
YEARS_BEGINEXPLUATATION_MODE | \n", "157504 | \n", "0.9770653729428661 | \n", "0.06457543708048007 | \n", "0.0 | \n", "1.0 | \n", "
YEARS_BUILD_MODE | \n", "103023 | \n", "0.7596373227337418 | \n", "0.11011102734194815 | \n", "0.0 | \n", "1.0 | \n", "
COMMONAREA_MODE | \n", "92646 | \n", "0.04255313775014629 | \n", "0.07444452253839141 | \n", "0.0 | \n", "1.0 | \n", "
ELEVATORS_MODE | \n", "143620 | \n", "0.07448973610916797 | \n", "0.1322561441505066 | \n", "0.0 | \n", "1.0 | \n", "
ENTRANCES_MODE | \n", "152683 | \n", "0.14519265864562494 | \n", "0.10097698816024658 | \n", "0.0 | \n", "1.0 | \n", "
FLOORSMAX_MODE | \n", "154491 | \n", "0.22231504747847514 | \n", "0.14370940659531573 | \n", "0.0 | \n", "1.0 | \n", "
FLOORSMIN_MODE | \n", "98869 | \n", "0.22805849255076463 | \n", "0.16115977149547575 | \n", "0.0 | \n", "1.0 | \n", "
LANDAREA_MODE | \n", "124921 | \n", "0.06495768445657653 | \n", "0.08175027780843509 | \n", "0.0 | \n", "1.0 | \n", "
LIVINGAPARTMENTS_MODE | \n", "97312 | \n", "0.10564485674943506 | \n", "0.09788044657879368 | \n", "0.0 | \n", "1.0 | \n", "
LIVINGAREA_MODE | \n", "153161 | \n", "0.10597505043712185 | \n", "0.1118452658778343 | \n", "0.0 | \n", "1.0 | \n", "
NONLIVINGAPARTMENTS_MODE | \n", "93997 | \n", "0.008076387544281912 | \n", "0.04627626621983564 | \n", "0.0 | \n", "1.0 | \n", "
NONLIVINGAREA_MODE | \n", "137829 | \n", "0.02702231968598766 | \n", "0.07025385904394447 | \n", "0.0 | \n", "1.0 | \n", "
APARTMENTS_MEDI | \n", "151450 | \n", "0.11784992076593205 | \n", "0.10907590600115309 | \n", "0.0 | \n", "1.0 | \n", "
BASEMENTAREA_MEDI | \n", "127568 | \n", "0.08795485466574765 | \n", "0.0821787495146342 | \n", "0.0 | \n", "1.0 | \n", "
YEARS_BEGINEXPLUATATION_MEDI | \n", "157504 | \n", "0.9777522640693693 | \n", "0.0598973185051196 | \n", "0.0 | \n", "1.0 | \n", "
YEARS_BUILD_MEDI | \n", "103023 | \n", "0.7557462721916715 | \n", "0.11206630964404381 | \n", "0.0 | \n", "1.0 | \n", "
COMMONAREA_MEDI | \n", "92646 | \n", "0.0445951017852909 | \n", "0.07614426224091457 | \n", "0.0 | \n", "1.0 | \n", "
ELEVATORS_MEDI | \n", "143620 | \n", "0.0780778443113532 | \n", "0.13446714769067444 | \n", "0.0 | \n", "1.0 | \n", "
ENTRANCES_MEDI | \n", "152683 | \n", "0.14921278072862998 | \n", "0.1003683944976324 | \n", "0.0 | \n", "1.0 | \n", "
FLOORSMAX_MEDI | \n", "154491 | \n", "0.22589659009261476 | \n", "0.14506702591935117 | \n", "0.0 | \n", "1.0 | \n", "
FLOORSMIN_MEDI | \n", "98869 | \n", "0.2316249380493541 | \n", "0.16193354145715574 | \n", "0.0 | \n", "1.0 | \n", "
LANDAREA_MEDI | \n", "124921 | \n", "0.06716874904939972 | \n", "0.08216701028007198 | \n", "0.0 | \n", "1.0 | \n", "
LIVINGAPARTMENTS_MEDI | \n", "97312 | \n", "0.10195447324073269 | \n", "0.09364233271153845 | \n", "0.0 | \n", "1.0 | \n", "
LIVINGAREA_MEDI | \n", "153161 | \n", "0.10860673604899568 | \n", "0.11226025867534792 | \n", "0.0 | \n", "1.0 | \n", "
NONLIVINGAPARTMENTS_MEDI | \n", "93997 | \n", "0.008651013330210322 | \n", "0.04741472790780275 | \n", "0.0 | \n", "1.0 | \n", "
NONLIVINGAREA_MEDI | \n", "137829 | \n", "0.028235920597262158 | \n", "0.07016648150682489 | \n", "0.0 | \n", "1.0 | \n", "
FONDKAPREMONT_MODE | \n", "97216 | \n", "None | \n", "None | \n", "not specified | \n", "reg oper spec account | \n", "
HOUSETYPE_MODE | \n", "153214 | \n", "None | \n", "None | \n", "block of flats | \n", "terraced house | \n", "
TOTALAREA_MODE | \n", "159080 | \n", "0.1025466626854412 | \n", "0.10746232414961886 | \n", "0.0 | \n", "1.0 | \n", "
WALLSMATERIAL_MODE | \n", "151170 | \n", "None | \n", "None | \n", "Block | \n", "Wooden | \n", "
EMERGENCYSTATE_MODE | \n", "161756 | \n", "None | \n", "None | \n", "No | \n", "Yes | \n", "
OBS_30_CNT_SOCIAL_CIRCLE | \n", "306490 | \n", "1.4222454239942575 | \n", "2.4009887461090127 | \n", "0.0 | \n", "9.0 | \n", "
DEF_30_CNT_SOCIAL_CIRCLE | \n", "306490 | \n", "0.1434206662533851 | \n", "0.44669842938152715 | \n", "0.0 | \n", "8.0 | \n", "
OBS_60_CNT_SOCIAL_CIRCLE | \n", "306490 | \n", "1.4052921791901856 | \n", "2.37980335197939 | \n", "0.0 | \n", "9.0 | \n", "
DEF_60_CNT_SOCIAL_CIRCLE | \n", "306490 | \n", "0.10004894123788705 | \n", "0.3622908039755731 | \n", "0.0 | \n", "7.0 | \n", "
DAYS_LAST_PHONE_CHANGE | \n", "307510 | \n", "-962.8587883320868 | \n", "826.8084870406575 | \n", "-1.0 | \n", "0.0 | \n", "
FLAG_DOCUMENT_2 | \n", "307511 | \n", "4.2274910491006824E-5 | \n", "0.0065017890454897925 | \n", "0 | \n", "1 | \n", "
FLAG_DOCUMENT_3 | \n", "307511 | \n", "0.7100233812774177 | \n", "0.45375196843273824 | \n", "0 | \n", "1 | \n", "
FLAG_DOCUMENT_4 | \n", "307511 | \n", "8.129790479039775E-5 | \n", "0.009016183216550845 | \n", "0 | \n", "1 | \n", "
FLAG_DOCUMENT_5 | \n", "307511 | \n", "0.015114906458630749 | \n", "0.12201022281354133 | \n", "0 | \n", "1 | \n", "
FLAG_DOCUMENT_6 | \n", "307511 | \n", "0.0880553866365756 | \n", "0.28337589286299236 | \n", "0 | \n", "1 | \n", "
FLAG_DOCUMENT_7 | \n", "307511 | \n", "1.9186305530533867E-4 | \n", "0.013850157677017446 | \n", "0 | \n", "1 | \n", "
FLAG_DOCUMENT_8 | \n", "307511 | \n", "0.08137595077899652 | \n", "0.2734120489445129 | \n", "0 | \n", "1 | \n", "
FLAG_DOCUMENT_9 | \n", "307511 | \n", "0.00389579559755586 | \n", "0.06229471080039349 | \n", "0 | \n", "1 | \n", "
FLAG_DOCUMENT_10 | \n", "307511 | \n", "2.276341334131137E-5 | \n", "0.0047710553540692 | \n", "0 | \n", "1 | \n", "
FLAG_DOCUMENT_11 | \n", "307511 | \n", "0.003912055178513939 | \n", "0.06242406326684515 | \n", "0 | \n", "1 | \n", "
FLAG_DOCUMENT_12 | \n", "307511 | \n", "6.503832383231819E-6 | \n", "0.0025502570915978736 | \n", "0 | \n", "1 | \n", "
FLAG_DOCUMENT_13 | \n", "307511 | \n", "0.003525077151711646 | \n", "0.05926771807375309 | \n", "0 | \n", "1 | \n", "
FLAG_DOCUMENT_14 | \n", "307511 | \n", "0.0029364803210291664 | \n", "0.05410976737642881 | \n", "0 | \n", "1 | \n", "
FLAG_DOCUMENT_15 | \n", "307511 | \n", "0.0012097128232811183 | \n", "0.03475993882769264 | \n", "0 | \n", "1 | \n", "
FLAG_DOCUMENT_16 | \n", "307511 | \n", "0.009928100133003373 | \n", "0.09914416233784934 | \n", "0 | \n", "1 | \n", "
FLAG_DOCUMENT_17 | \n", "307511 | \n", "2.666571277125046E-4 | \n", "0.016327488741596622 | \n", "0 | \n", "1 | \n", "
FLAG_DOCUMENT_18 | \n", "307511 | \n", "0.008129790479039774 | \n", "0.08979823610939612 | \n", "0 | \n", "1 | \n", "
FLAG_DOCUMENT_19 | \n", "307511 | \n", "5.951006630657115E-4 | \n", "0.02438746506586239 | \n", "0 | \n", "1 | \n", "
FLAG_DOCUMENT_20 | \n", "307511 | \n", "5.072989258920819E-4 | \n", "0.022517620268446063 | \n", "0 | \n", "1 | \n", "
FLAG_DOCUMENT_21 | \n", "307511 | \n", "3.349473677364387E-4 | \n", "0.018298531822437545 | \n", "0 | \n", "1 | \n", "
AMT_REQ_CREDIT_BUREAU_HOUR | \n", "265992 | \n", "0.006402448193930645 | \n", "0.08384912844747658 | \n", "0.0 | \n", "4.0 | \n", "
AMT_REQ_CREDIT_BUREAU_DAY | \n", "265992 | \n", "0.0070002105326475985 | \n", "0.11075740632435446 | \n", "0.0 | \n", "9.0 | \n", "
AMT_REQ_CREDIT_BUREAU_WEEK | \n", "265992 | \n", "0.0343619356973142 | \n", "0.2046848758128244 | \n", "0.0 | \n", "8.0 | \n", "
AMT_REQ_CREDIT_BUREAU_MON | \n", "265992 | \n", "0.26739526000781977 | \n", "0.9160023961526179 | \n", "0.0 | \n", "9.0 | \n", "
AMT_REQ_CREDIT_BUREAU_QRT | \n", "265992 | \n", "0.26547414959848414 | \n", "0.7940556483207547 | \n", "0.0 | \n", "8.0 | \n", "
AMT_REQ_CREDIT_BUREAU_YEAR | \n", "265992 | \n", "1.899974435321363 | \n", "1.869294998181561 | \n", "0.0 | \n", "9.0 | \n", "