{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2018-11-20T12:28:45.032449Z", "start_time": "2018-11-20T12:28:45.018999Z" }, "init_cell": true, "slideshow": { "slide_type": "skip" } }, "outputs": [ { "data": { "text/html": [ " \n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%HTML\n", " " ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2018-11-20T12:28:49.173882Z", "start_time": "2018-11-20T12:28:45.039407Z" }, "init_cell": true, "slideshow": { "slide_type": "skip" } }, "outputs": [ { "data": { "text/html": [ "" ], "text/vnd.plotly.v1+html": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "" ], "text/vnd.plotly.v1+html": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from IPython.display import Image\n", "\n", "import warnings\n", "warnings.filterwarnings(\"ignore\", category=DeprecationWarning) \n", "warnings.filterwarnings(\"ignore\", category=FutureWarning)\n", "\n", "from matplotlib import pyplot as plt\n", "plt.rcParams.update({'figure.max_open_warning': 0})\n", "graph_figsize = (10,6) # I'm forgetful and lazy\n", "plt.rcParams.update({'figure.figsize': graph_figsize})\n", "\n", "%matplotlib nbagg\n", " # Jupyter Magics!\n", "import geopandas as gp\n", "import urllib.request\n", "import zipfile\n", "from pathlib import Path\n", "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "from itertools import chain\n", "from collections import defaultdict\n", " \n", "import moviepy.editor as mpy\n", "import moviepy.video as mpyv\n", "\n", "import plotly.plotly as py\n", "from plotly.offline import init_notebook_mode, plot, iplot\n", "import plotly.graph_objs as go\n", "from datetime import datetime\n", "init_notebook_mode(connected=False)\n", "import cufflinks\n", "\n", "from bs4 import BeautifulSoup\n", "import requests\n", "from tqdm import tqdm_notebook as tqdm\n", "\n", "from ckanapi import RemoteCKAN # it's on pip\n", "def build_odni_connection():\n", " \"\"\"Be nice to OpenDataNI and tell them how old I am. (And that it's me)\"\"\"\n", " version_no = (pd.to_datetime('now') -\n", " pd.to_datetime('1988/05/17')).days/365\n", " ua = f'@Bolster/{version_no:.2f} (+http://bolster.online/)'\n", " return RemoteCKAN('https://www.opendatani.gov.uk/', user_agent=ua)" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2018-06-05T16:50:13.356742Z", "start_time": "2018-06-05T16:50:13.352035Z" }, "slideshow": { "slide_type": "slide" } }, "source": [ "# Extraction, Transformation and Loading\n", "\n", "* Get the data\n", "* Clean the data\n", "* Store the data\n", "\n", "## Extraction: \n", "* use BS4 to walk the Department of Education datasets for post primary school level data for:\n", "* * " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T10:04:36.310286Z", "start_time": "2018-11-08T10:04:08.037938Z" }, "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [ "def get_year_files_for_enrolement_data(dest_base_dir):\n", " base_url = \"https://www.education-ni.gov.uk\"\n", " year_files = defaultdict(list)\n", " listing_path = \"/articles/school-enrolments-school-level-data\"\n", " soup = BeautifulSoup(requests.get(base_url+listing_path).text, 'lxml')\n", " for link in tqdm(soup.find_all('a'),\n", " desc='years',\n", " leave=False\n", " ):\n", " if 'School enrolments - school level data 20' in ' '.join(map(str, link.contents)):\n", " year = link.get('href')[-6:-2]\n", " # Follow year and get all the relevant files\n", " year_soup = BeautifulSoup(\n", " requests.get(base_url+link.get('href')).text,\n", " 'lxml'\n", " )\n", " for link in tqdm(year_soup.find_all('a'), \n", " desc=f'files in {year}',\n", " leave=False\n", " ):\n", " year_dir = dest_base_dir.joinpath(str(year))\n", " year_dir.mkdir(parents=True, exist_ok=True)\n", " href = link.get('href', \"\")\n", " filename = href.split('/')[-1]\n", " contents = ' '.join(map(str, link.contents))\n", " if 'xls' in href.split('.')[-1].lower():\n", " # See this requirement right 👆\n", " # Mix of xls, XLSX, xlsx and XLS\n", " dest_file = year_dir/filename\n", " if not dest_file.exists():\n", " urllib.request.urlretrieve(\n", " href, dest_file\n", " )\n", " year_files[year].append(filename)\n", "\n", " return year_files\n", "\n", "\n", "dest = Path('./data/education-ni/')\n", "year_files = get_year_files_for_enrolement_data(dest)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T10:04:55.503454Z", "start_time": "2018-11-08T10:04:54.734470Z" }, "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [ "sheets = defaultdict(list)\n", "for year, files in year_files.items():\n", " for file in files:\n", " if 'post' in file:\n", " xls = pd.ExcelFile(f'data/education-ni/{year}/{file}')\n", " sheets[year].extend(xls.sheet_names)\n", "dict(sheets)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T10:05:01.221132Z", "start_time": "2018-11-08T10:05:01.214289Z" }, "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [ "from collections import Counter\n", "all_sheet_names = Counter([_ for d in sheets.values() for _ in d])\n", "all_sheet_names.most_common()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T10:05:04.375506Z", "start_time": "2018-11-08T10:05:04.342869Z" }, "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [ "df = pd.DataFrame.from_dict({\n", " year: [sn in sheet_names for sn in all_sheet_names] \n", " for year, sheet_names in sheets.items()\n", "}, orient='index')\n", "df.columns=all_sheet_names.keys()\n", "df.T" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T10:05:10.836538Z", "start_time": "2018-11-08T10:05:10.335106Z" }, "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [ "import seaborn as sns\n", "f,ax = plt.subplots(figsize=graph_figsize)\n", "sns.heatmap(df.T, ax=ax)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T10:17:30.388833Z", "start_time": "2018-11-08T10:17:30.353368Z" }, "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [ "def parse_reference_table(xls):\n", " \"\"\"\n", " From an ExcelFile, clean up:\n", " * School Mgmt Type disaster\n", " * Inconsistent header depth\n", " * Multi-row header names\n", " * inconsistent headers (ref_key_map)\n", " * inconsistent col order\n", " * inconsistent caps/spacing (strip|lower)\n", " \"\"\"\n", " cols= [\n", " 'de ref',\n", " 'school name',\n", " 'school type',\n", " 'address 1',\n", " 'postcode',\n", " 'urban_rural',\n", " 'school management type',\n", " 'district council',\n", " 'parliamentary constituency',\n", " 'town'\n", " ]\n", " \n", " categories = [\n", " 'school type',\n", " 'urban_rural',\n", " 'school management type',\n", " 'district council',\n", " 'parliamentary constituency',\n", " 'town'\n", " ]\n", "\n", " ref_key_map={\n", " 'denino':'de ref',\n", " 'urban/ rural': 'urban_rural',\n", " 'schoolname': 'school name'\n", " }\n", " \n", " reference_value_rename = {\n", " 'school management type':{\n", " 'gmi':'integrated',\n", " 'controlled integrated':'integrated',\n", " 'roman catholic maintained':'rc maintained',\n", " 'grant maintained integrated':'integrated',\n", " 'voluntary - other managed':'voluntary',\n", " 'voluntary - rc managed':'voluntary',\n", " 'catholic maintained':'rc maintained'\n", " }\n", " }\n", " join_n_strip_n_lower = lambda l: ' '.join(l).strip().lower()\n", " \n", " if 'reference data' in xls.sheet_names:\n", " df = pd.read_excel(xls, 'reference data', header=None)\n", " h_range = 2 if isinstance(df.ix[3,0], int) else 3\n", " try:\n", " df.columns=df.ix[1:h_range].fillna('').apply(join_n_strip_n_lower, axis=0).values\n", " df.rename(columns=ref_key_map, inplace=True)\n", " df = df.drop(df.index[0:h_range+1]).reset_index(drop=True)\n", " df = df[cols]\n", " df['de ref'] = df['de ref'].astype(int)\n", " df.set_index('de ref', inplace=True)\n", " for c in df:\n", " df[c]=df[c].str.lower().str.strip() \n", " df.replace(reference_value_rename, inplace=True)\n", " for c in categories:\n", " df[c].fillna('NA', inplace=True)\n", " df[c] = df[c].astype('category')\n", " except TypeError as e:\n", " print(e)\n", " else:\n", " df=None\n", " return df\n", "\n", "def parse_enrolments_table(xls):\n", " \"\"\"From an ExcelFile, clean up:\n", " * Inconsistent header depth\n", " * fucked up nans/nulls all over the place\n", " * inconsistent *footer* depth...\n", " * Set de ref as join index and drop pointless fields\n", " \"\"\"\n", " def join_n_strip_n_lower(l): return ' '.join(l).strip().lower()\n", "\n", " def strip_n_lower(s): return s.strip().lower()\n", "\n", " def unyearify(s): return int(s.replace('year ', ''))\n", "\n", " if 'enrolments' in xls.sheet_names:\n", " df = pd.read_excel(xls, 'enrolments', header=None, skip_footer=5)\n", " h_range = 2 if isinstance(df.ix[3, 0], int) else 3\n", " try:\n", " df.columns = df.ix[3].fillna('').apply(strip_n_lower).values\n", " df = df.drop(df.index[0:h_range+1]).reset_index(drop=True)\n", " df.dropna(how='all', inplace=True, axis=0)\n", " df['de ref'] = df['de ref'].astype(int)\n", " df.drop('schoolname', axis=1, inplace=True)\n", " df.drop('total pupils', axis=1, inplace=True)\n", " df.set_index('de ref', inplace=True)\n", " df.rename(columns=unyearify, inplace=True)\n", " df = df.astype(float)\n", " except TypeError as e:\n", " print(e)\n", " else:\n", " df = None\n", " return df\n", "\n", "def parse_fsm_table(xls):\n", " \"\"\"From an ExcelFile, clean up:\n", " * Inconsistent header depth\n", " * fucked up nans/nulls all over the place\n", " * inconsistent *footer* depth...\n", " * Set de ref as join index and drop pointless fields\n", " \"\"\"\n", " join_n_strip_n_lower = lambda l: ' '.join(l).strip().lower()\n", " strip_n_lower = lambda s: s.strip().lower()\n", " unyearify = lambda s: int(s.replace('year ',''))\n", " \n", " if 'free school meals' in xls.sheet_names:\n", " df = pd.read_excel(xls, 'free school meals', header=None, skip_footer=5)\n", " h_range = 2 if isinstance(df.ix[3,0], int) else 3\n", " try:\n", " df.columns=df.ix[3].fillna('').apply(strip_n_lower).values\n", " df = df.drop(df.index[0:h_range+1]).reset_index(drop=True)\n", " df.dropna(how='all', inplace=True, axis=0)\n", " df['de ref'] = df['de ref'].astype(int)\n", " df.drop('schoolname',axis=1, inplace=True)\n", " df.drop('free school meals', axis=1, inplace=True)\n", " df.set_index('de ref', inplace=True)\n", " df.replace('#',pd.np.nan, inplace=True) # # = Undisclosed\n", " df.replace('*',2.0, inplace=True) # * == < 5\n", " df.replace('!',1, inplace=True) # ! avoid identification, so it's prob one or two\n", " \n", " df=df.astype(float)\n", " except TypeError as e:\n", " print(e)\n", " else:\n", " df=None\n", " return df\n", "\n", "def parse_available_table(xls):\n", " \"\"\"From an ExcelFile, clean up:\n", " * Inconsistent header depth\n", " * fucked up nans/nulls all over the place\n", " * inconsistent *footer* depth...\n", " * Set de ref as join index and drop pointless fields\n", " * Totally different schemas between years\n", " * Inconsistent metric naming\n", " * non numerical data flags (*/!)\n", " \"\"\"\n", " ref_key_map={\n", " 'schoolname': 'school name',\n", " 'total unfilled places': 'available places',\n", " 'unfilled places': 'available places',\n", " 'total approved enrolment number': 'approved enrolments'\n", " }\n", " \n", " join_n_strip_n_lower = lambda l: ' '.join(l).strip().lower()\n", " \n", " if 'School level data' in xls.sheet_names:\n", " df = pd.read_excel(xls, 'School level data', header=None)\n", " h_range = 2 if isinstance(df.ix[3,0], int) else 3\n", " elif 'unfilled places' in xls.sheet_names:\n", " df = pd.read_excel(xls, 'unfilled places', header=None)\n", " h_range = 2 if isinstance(df.ix[3,0], int) else 3\n", " else:\n", " df=None\n", " \n", " if df is not None:\n", " try:\n", " df.columns=df.ix[1:h_range].fillna('').apply(join_n_strip_n_lower, axis=0).values\n", " df.rename(columns=ref_key_map, inplace=True)\n", " df = df.drop(df.index[0:h_range+1]).reset_index(drop=True)\n", " df=df.applymap(lambda x: np.nan if isinstance(x, str) and x.isspace() else x)\n", " df.dropna(how='all', axis=1, inplace=True)\n", " df.dropna(how='any', axis=0, inplace=True)\n", " if df.shape[1] == 6: # recent doesn't have fecking headers\n", " cols = list(df.columns)\n", " cols[0] = 'de ref'\n", " cols[1] = 'school name'\n", " df.columns=cols\n", " df.drop('school name', axis=1, inplace=True)\n", " df['de ref'] = df['de ref'].astype(int)\n", " df.set_index('de ref', inplace=True)\n", " df.replace('*',2.0, inplace=True) # * == < 5\n", " df.replace('!',1, inplace=True) # ! avoid identification, so it's prob one or two\n", " df.dropna(how='all', inplace=True, axis=1)\n", " df.astype(int, inplace=True)\n", " \n", "\n", " \n", " except TypeError as e:\n", " print(e)\n", "\n", " return df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T10:49:35.294506Z", "start_time": "2018-11-08T10:49:33.999419Z" }, "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [ "re_dfs={}\n", "av_dfs={}\n", "en_dfs={}\n", "fsm_dfs={}\n", "for year, files in year_files.items():\n", " for file in files:\n", " if 'post' in file:\n", " xls = pd.ExcelFile(f'data/education-ni/{year}/{file}')\n", " df = parse_reference_table(xls)\n", " if df is not None:\n", " print(f'Got reference data for {year}')\n", " re_dfs[year]=df\n", " df = parse_enrolments_table(xls)\n", " if df is not None:\n", " print(f'Got enrolment data for {year}')\n", " en_dfs[year]=df\n", " df = parse_available_table(xls)\n", " if df is not None:\n", " print(f'Got available data for {year}')\n", " av_dfs[year]=df\n", " df = parse_fsm_table(xls)\n", " if df is not None:\n", " print(f'Got fsm data for {year}')\n", " fsm_dfs[year]=df\n", " \n", "reference = pd.Panel(re_dfs).sort_index().rename_axis('year').rename_axis('metric', axis=2)\n", "available = pd.Panel(av_dfs).sort_index().rename_axis('year').rename_axis('metric', axis=2)\n", "enrolment = pd.Panel(en_dfs).sort_index().rename_axis('year').rename_axis('yeargroup', axis=2)\n", "fsm = pd.Panel(fsm_dfs).sort_index().rename_axis('year').rename_axis('metric', axis=2)\n", "\n", "reference.to_hdf('data.h5', 'reference')\n", "available.to_hdf('data.h5', 'available')\n", "enrolment.to_hdf('data.h5', 'enrolment')\n", "fsm.to_hdf('data.h5','fsm')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-09T19:14:16.772070Z", "start_time": "2018-11-09T19:14:16.760223Z" } }, "outputs": [], "source": [ "reference" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2018-10-12T13:41:59.584195Z", "start_time": "2018-10-12T13:41:59.577650Z" }, "slideshow": { "slide_type": "slide" } }, "source": [ "# Population Data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-09T14:43:16.240767Z", "start_time": "2018-11-09T14:43:15.708076Z" }, "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [ "if not Path('data/cons_pop.csv').exists():\n", " odni = build_odni_connection()\n", " for dataset in odni.action.package_show(id='population-estimates-for-northern-ireland')['resources']:\n", " if dataset['name'] == \"Parliamentary Constituencies by single year of age and gender (mid-2001 to mid-2017)\":\n", " cons_pop = pd.read_csv(dataset['url'])\n", " cons_pop.to_csv(\"data/cons_pop.csv\", index=False)\n", " cons_pop.head()\n", "\n", "cons_pop = pd.read_csv('data/cons_pop.csv')\n", "cons_pop['Mid_Year_Ending'] = cons_pop.Mid_Year_Ending.astype(int)\n", "cons_pop['Population_Estimate'] = cons_pop.Population_Estimate.astype(float)\n", "cons_pop['Age'] = cons_pop.Age.astype(int)\n", "cons_pop.rename(columns={'Geo_Name':'constituency'}, inplace=True)\n", "cons_pop['constituency']= cons_pop.constituency.str.strip().str.lower()\n", "cons_pop[(cons_pop.Gender == 'All Persons') & (cons_pop.Mid_Year_Ending == 2016)].head()\n", "cons_pop.to_hdf('data.h5','cons_pop')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Constituency Maps" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T11:26:08.524378Z", "start_time": "2018-11-08T11:26:08.268398Z" } }, "outputs": [], "source": [ "cons_gdf_zip=\"http://osni-spatial-ni.opendata.arcgis.com/datasets/563dc2ec3d9943428e3fe68966d40deb_3.zip\"\n", "cons_gdf_shp = \"OSNI_Open_Data_Largescale_Boundaries__Parliamentary_Constituencies_2008.shp\"\n", "if not Path('data/'+cons_gdf_shp).exists():\n", " urllib.request.urlretrieve(cons_gdf_zip, 'data/_tmp.zip')\n", " with zipfile.ZipFile('data/_tmp.zip') as z:\n", " z.extractall('data/')\n", " Path('data/_tmp.zip').unlink()\n", " \n", "cons_gdf=gp.GeoDataFrame.from_file('data/'+cons_gdf_shp)\n", "cons_gdf.rename(columns={'PC_NAME':'constituency'}, inplace=True)\n", "cons_gdf.drop(['OBJECTID','PC_ID'], axis=1, inplace=True)\n", "cons_gdf['constituency'] = cons_gdf['constituency'].str.lower().str.strip()\n", "cons_gdf.set_index('constituency', inplace=True)\n", "cons_gdf.plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-09T14:44:11.583304Z", "start_time": "2018-11-09T14:44:11.574442Z" } }, "outputs": [], "source": [ "import fiona; fiona.supported_drivers" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-09T14:43:21.565867Z", "start_time": "2018-11-09T14:43:20.628912Z" } }, "outputs": [], "source": [ "# Enrolment sum of year group for all schools in constituencey per consitituency\n", "en_df= pd.DataFrame.from_dict({\n", " con:\n", " enrolment[:,reference.major_axis[(reference.minor_xs('parliamentary constituency')==con).any(axis=1)],:].sum().sum()\n", " for con in cons\n", "}).T\n", "\n", "# Available places sum of year group for all schools in constituencey per consitituency\n", "av_df= pd.DataFrame.from_dict({\n", " con:\n", " available[:,reference.major_axis[(reference.minor_xs('parliamentary constituency')==con).any(axis=1)],'available places'].sum()\n", " for con in cons\n", "}).T\n", "\n", "# Free School Meals\n", "fsm_df= pd.DataFrame.from_dict({\n", " con:\n", " fsm[:,reference.major_axis[(reference.minor_xs('parliamentary constituency')==con).any(axis=1)],'fsme'].sum()\n", " for con in cons\n", "}).T\n", "\n", "for c in av_df:\n", " cons_gdf[f\"av_{c}\"] = av_df[c]\n", " cons_gdf[f\"av_{c}_rat\"] = (av_df[c]/en_df[c])\n", "for c in en_df:\n", " cons_gdf[f\"en_{c}\"] = en_df[c]\n", " cons_gdf[f\"en_{c}_pk\"] = en_df[c]/cons_gdf[\"Area_sqkm\"]\n", "for c in fsm_df:\n", " cons_gdf[f\"fsm_{c}\"] = fsm_df[c]\n", " cons_gdf[f\"fsm_{c}_rat\"] = (fsm_df[c]/en_df[c])\n", " \n", " \n", "def get_winning_quartile(df, age_quartiles):\n", " age_counts = df.groupby(\"Age\")['Population_Estimate'].sum()\n", " quartile_counts = pd.Series({\n", " f\"{lower}-{upper}\":age_counts.loc[lower:upper].sum()\n", " for lower,upper in zip([0]+list(age_quartiles), list(age_quartiles)+[99])\n", " })\n", " return quartile_counts.idxmax()\n", "\n", "pop_years = set(cons_pop.Mid_Year_Ending.unique())\n", "for c in tqdm(reference.axes[0].astype(int), desc=\"Year\"):\n", " # Preserve edu data and use closest population year\n", " if c not in pop_years:\n", " yr = pop_years[np.abs(pop_years-c).argmin()]\n", " else:\n", " yr=c\n", " \n", " age_quartiles=annual_quartiles.loc[yr].astype(int)\n", " \n", " _cons_pop = cons_pop[(cons_pop.Gender != 'All Persons') \\\n", " & (cons_pop.Mid_Year_Ending == yr) \\\n", " ].groupby(['constituency', 'Gender'])['Population_Estimate'].sum()\\\n", " .unstack().rename(columns=lambda c: c.lower())\n", " _cons_pop['total'] = _cons_pop.sum(axis=1)\n", " _cons_pop['m_per_f'] = _cons_pop['males']/_cons_pop['females']\n", " \n", " cons_gdf[f\"pop_{c}\"] = _cons_pop['total']\n", " cons_gdf[f\"pop_{c}_males\"] = _cons_pop['males']\n", " cons_gdf[f\"pop_{c}_females\"] = _cons_pop['females']\n", " cons_gdf[f\"pop_{c}_m_per_f\"] = _cons_pop['m_per_f']\n", " \n", " \n", "\n", " cons_gdf[f\"topqt_{c}\"]=cons_pop[\n", " (cons_pop.Gender != 'All Persons') & (cons_pop.Mid_Year_Ending == yr)\n", " ].groupby('constituency').apply(get_winning_quartile, age_quartiles=annual_quartiles.loc[yr].values) \n", " \n", " cons_gdf[f\"en_{c}_pc\"]=cons_gdf[f\"en_{c}\"]/cons_gdf[f\"pop_{c}\"]\n", " cons_gdf[f\"fsm_{c}_pc\"]=cons_gdf[f\"fsm_{c}\"]/cons_gdf[f\"pop_{c}\"]\n", " \n", " cons_pop_mean=cons_pop[(cons_pop.Gender.isin([\"All Persons\"])) & (cons_pop.Mid_Year_Ending == yr)]\n", " cons_pop_mean['popprod'] = cons_pop_mean[['Age','Population_Estimate']].product(axis=1)\n", " cons_pop_mean=cons_pop_mean.groupby('constituency')[['popprod','Population_Estimate']].sum(axis=0)\n", " cons_gdf[f'age_{c}_avg'] = cons_pop_mean['popprod']/cons_pop_mean['Population_Estimate']\n", " try:\n", " cons_gdf[f\"av_{c}_pc\"]=cons_gdf[f\"av_{c}\"]/cons_gdf[f\"pop_{c}\"]\n", " except:\n", " print(f\"No data for av in {c}\")\n", " \n", "cons_gdf.to_file('data.h5', 'cons_gdf')" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "### That's not a great way of summarising age distributions...." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T16:34:14.126450Z", "start_time": "2018-11-08T16:34:07.276361Z" }, "code_folding": [] }, "outputs": [], "source": [ "def flatten_ages(s):\n", " return np.asarray(\n", " list(chain.from_iterable(\n", " ([age]*int(n) for age,n in s.iteritems())\n", " ))\n", " )\n", "\n", "def get_age_quantiles(df, q=[0.25,0.5,0.75]):\n", " pop_sum=df.groupby('Age')['Population_Estimate'].sum()\n", " age_quartiles = np.quantile(flatten_ages(pop_sum),q)\n", " return pd.Series(dict(zip(q,age_quartiles)))\n", "\n", "annual_cons_quartiles = cons_pop[(cons_pop.Gender == \"All Persons\")].groupby([\"Mid_Year_Ending\",\"constituency\"]).apply(get_age_quantiles)\n", "annual_quartiles = cons_pop[(cons_pop.Gender == \"All Persons\")].groupby(\"Mid_Year_Ending\").apply(get_age_quantiles)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-08T16:39:44.347542Z", "start_time": "2018-11-08T16:39:43.594953Z" }, "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [ "pop_years = cons_pop.Mid_Year_Ending.unique()\n", "for c in tqdm(reference.axes[0].astype(int), desc=\"Year\"):\n", " # Preserve edu data and use closest population year\n", " if c not in pop_years:\n", " yr = pop_years[np.abs(pop_years-c).argmin()]\n", " else:\n", " yr=c\n", " \n", " age_quartiles=annual_quartiles.loc[yr].astype(int)\n", " \n", " _cons_pop = cons_pop[(cons_pop.Gender != 'All Persons') \\\n", " & (cons_pop.Mid_Year_Ending == yr) \\\n", " ].groupby(['constituency', 'Gender'])['Population_Estimate'].sum()\\\n", " .unstack().rename(columns=lambda c: c.lower())\n", " _cons_pop['total'] = _cons_pop.sum(axis=1)\n", " _cons_pop['m_per_f'] = _cons_pop['males']/_cons_pop['females']\n", " \n", " cons_pop_qilted=pd.DataFrame.from_dict(\n", " {\n", " f\"{lotile}-{qtile}\":cons_pop[(cons_pop.Gender == 'All Persons') \\\n", " & (cons_pop.Mid_Year_Ending == yr) \\\n", " & (lotile<=cons_pop.Age) &(cons_pop.Age1 else c\n", "allowed_list = ['grampct']\n", "cons_stats_2016 = cons_stats[[c for c in cons_stats.columns if '2016' in c or c in allowed_list ]].rename(columns=unyearify)\n", "\n", "sns.pairplot(data=cons_stats_2016, hue='topqt',\n", " vars=['fsm_rat','grampct'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-06T17:55:17.541304Z", "start_time": "2018-11-06T17:55:17.504062Z" } }, "outputs": [], "source": [ "cons_stats_2016" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-11-07T11:53:31.523680Z", "start_time": "2018-11-07T11:53:31.515670Z" } }, "outputs": [], "source": [ "enrolment.sum().sum()" ] } ], "metadata": { "celltoolbar": "Slideshow", "kernelspec": { "display_name": "Python [conda env:pyconie]", "language": "python", "name": "conda-env-pyconie-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.0" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }