{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Python Tools for Record Linking and Fuzzy Matching\n", "\n", "This notebook accompanies the [article](https://pbpython.com/record-linking.html) on Practical Business Python\n", "\n", "This notebook relies on [fuzzymatcher](https://github.com/RobinL/fuzzymatcher) and the [Python Record Linkage Toolkit](https://recordlinkage.readthedocs.io/en/latest/about.html)\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from pathlib import Path\n", "import fuzzymatcher\n", "import recordlinkage" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Example using fuzzymatcher" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "hospital_accounts = pd.read_csv(\n", " 'https://github.com/chris1610/pbpython/raw/master/data/hospital_account_info.csv'\n", ")\n", "hospital_reimbursement = pd.read_csv(\n", " 'https://raw.githubusercontent.com/chris1610/pbpython/master/data/hospital_reimbursement.csv'\n", ")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Account_NumFacility NameAddressCityStateZIP CodeCounty NamePhone NumberHospital TypeHospital Ownership
010605SAGE MEMORIAL HOSPITALSTATE ROUTE 264 SOUTH 191GANADOAZ86505APACHE(928) 755-4541Critical Access HospitalsVoluntary non-profit - Private
124250WOODRIDGE BEHAVIORAL CENTER600 NORTH 7TH STREETWEST MEMPHISAR72301CRITTENDEN(870) 394-4113PsychiatricProprietary
210341DOUGLAS GARDENS HOSPITAL5200 NE 2ND AVEMIAMIFL33137MIAMI-DADE(305) 751-8626Acute Care HospitalsVoluntary non-profit - Private
381101SUNCOAST BEHAVIORAL HEALTH CENTER4480 51ST ST WBRADENTONFL34210MANATEE(941) 792-2222PsychiatricProprietary
439835TREASURE VALLEY HOSPITAL8800 WEST EMERALD STREETBOISEID83704ADA(208) 373-5000Acute Care HospitalsProprietary
\n", "
" ], "text/plain": [ " Account_Num Facility Name Address \\\n", "0 10605 SAGE MEMORIAL HOSPITAL STATE ROUTE 264 SOUTH 191 \n", "1 24250 WOODRIDGE BEHAVIORAL CENTER 600 NORTH 7TH STREET \n", "2 10341 DOUGLAS GARDENS HOSPITAL 5200 NE 2ND AVE \n", "3 81101 SUNCOAST BEHAVIORAL HEALTH CENTER 4480 51ST ST W \n", "4 39835 TREASURE VALLEY HOSPITAL 8800 WEST EMERALD STREET \n", "\n", " City State ZIP Code County Name Phone Number \\\n", "0 GANADO AZ 86505 APACHE (928) 755-4541 \n", "1 WEST MEMPHIS AR 72301 CRITTENDEN (870) 394-4113 \n", "2 MIAMI FL 33137 MIAMI-DADE (305) 751-8626 \n", "3 BRADENTON FL 34210 MANATEE (941) 792-2222 \n", "4 BOISE ID 83704 ADA (208) 373-5000 \n", "\n", " Hospital Type Hospital Ownership \n", "0 Critical Access Hospitals Voluntary non-profit - Private \n", "1 Psychiatric Proprietary \n", "2 Acute Care Hospitals Voluntary non-profit - Private \n", "3 Psychiatric Proprietary \n", "4 Acute Care Hospitals Proprietary " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hospital_accounts.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Provider_NumProvider NameProvider Street AddressProvider CityProvider StateProvider Zip CodeTotal DischargesAverage Covered ChargesAverage Total PaymentsAverage Medicare Payments
0839987SOUTHEAST ALABAMA MEDICAL CENTER1108 ROSS CLARK CIRCLEDOTHANAL3630111820855.615026.194115.52
1519118MARSHALL MEDICAL CENTER SOUTH2505 U S HIGHWAY 431 NORTHBOAZAL359574313289.095413.634490.93
2733073ELIZA COFFEE MEMORIAL HOSPITAL205 MARENGO STREETFLORENCEAL356317322261.604922.184021.79
3201752MIZELL MEMORIAL HOSPITAL702 N MAIN STOPPAL364671210901.335343.504284.17
4678488ST VINCENT'S EAST50 MEDICAL PARK EAST DRIVEBIRMINGHAMAL352357428117.955947.124819.53
\n", "
" ], "text/plain": [ " Provider_Num Provider Name Provider Street Address \\\n", "0 839987 SOUTHEAST ALABAMA MEDICAL CENTER 1108 ROSS CLARK CIRCLE \n", "1 519118 MARSHALL MEDICAL CENTER SOUTH 2505 U S HIGHWAY 431 NORTH \n", "2 733073 ELIZA COFFEE MEMORIAL HOSPITAL 205 MARENGO STREET \n", "3 201752 MIZELL MEMORIAL HOSPITAL 702 N MAIN ST \n", "4 678488 ST VINCENT'S EAST 50 MEDICAL PARK EAST DRIVE \n", "\n", " Provider City Provider State Provider Zip Code Total Discharges \\\n", "0 DOTHAN AL 36301 118 \n", "1 BOAZ AL 35957 43 \n", "2 FLORENCE AL 35631 73 \n", "3 OPP AL 36467 12 \n", "4 BIRMINGHAM AL 35235 74 \n", "\n", " Average Covered Charges Average Total Payments Average Medicare Payments \n", "0 20855.61 5026.19 4115.52 \n", "1 13289.09 5413.63 4490.93 \n", "2 22261.60 4922.18 4021.79 \n", "3 10901.33 5343.50 4284.17 \n", "4 28117.95 5947.12 4819.53 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hospital_reimbursement.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Columns to match on from df_left\n", "left_on = [\"Facility Name\", \"Address\", \"City\", \"State\"]\n", "\n", "# Columns to match on from df_right\n", "right_on = [\n", " \"Provider Name\", \"Provider Street Address\", \"Provider City\",\n", " \"Provider State\"\n", "]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Now perform the match\n", "# It will take several minutes to run on this data set\n", "matched_results = fuzzymatcher.fuzzy_left_join(hospital_accounts,\n", " hospital_reimbursement,\n", " left_on,\n", " right_on,\n", " left_id_col='Account_Num',\n", " right_id_col='Provider_Num')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
best_match_score__id_left__id_rightAccount_NumFacility NameAddressCityStateZIP CodeCounty Name...Provider_NumProvider NameProvider Street AddressProvider CityProvider StateProvider Zip CodeTotal DischargesAverage Covered ChargesAverage Total PaymentsAverage Medicare Payments
0-0.7466131060564359510605SAGE MEMORIAL HOSPITALSTATE ROUTE 264 SOUTH 191GANADOAZ86505APACHE...643595TYLER MEMORIAL HOSPITAL5950 STATE ROUTE 6 WESTTUNKHANNOCKPA186571820482.945783.224929.22
234-0.6098732425042676724250WOODRIDGE BEHAVIORAL CENTER600 NORTH 7TH STREETWEST MEMPHISAR72301CRITTENDEN...426767CRISP REGIONAL HOSPITAL902 7TH STREET NORTHCORDELEGA310151814655.945680.284899.39
446-0.4044731034173031110341DOUGLAS GARDENS HOSPITAL5200 NE 2ND AVEMIAMIFL33137MIAMI-DADE...730311JACKSON MEMORIAL HOSPITAL1611 NW 12TH AVEMIAMIFL3313611330021.9114448.7511043.58
675-0.3330208110130217381101SUNCOAST BEHAVIORAL HEALTH CENTER4480 51ST ST WBRADENTONFL34210MANATEE...302173BLAKE MEDICAL CENTER2020 59TH ST WBRADENTONFL3420913546792.595651.224534.90
893-0.6292613983541606039835TREASURE VALLEY HOSPITAL8800 WEST EMERALD STREETBOISEID83704ADA...416060ST LUKE'S REGIONAL MEDICAL CENTER190 EAST BANNOCK STREETBOISEID837126826564.976514.515631.87
\n", "

5 rows × 23 columns

\n", "
" ], "text/plain": [ " best_match_score __id_left __id_right Account_Num \\\n", "0 -0.746613 10605 643595 10605 \n", "234 -0.609873 24250 426767 24250 \n", "446 -0.404473 10341 730311 10341 \n", "675 -0.333020 81101 302173 81101 \n", "893 -0.629261 39835 416060 39835 \n", "\n", " Facility Name Address \\\n", "0 SAGE MEMORIAL HOSPITAL STATE ROUTE 264 SOUTH 191 \n", "234 WOODRIDGE BEHAVIORAL CENTER 600 NORTH 7TH STREET \n", "446 DOUGLAS GARDENS HOSPITAL 5200 NE 2ND AVE \n", "675 SUNCOAST BEHAVIORAL HEALTH CENTER 4480 51ST ST W \n", "893 TREASURE VALLEY HOSPITAL 8800 WEST EMERALD STREET \n", "\n", " City State ZIP Code County Name ... Provider_Num \\\n", "0 GANADO AZ 86505 APACHE ... 643595 \n", "234 WEST MEMPHIS AR 72301 CRITTENDEN ... 426767 \n", "446 MIAMI FL 33137 MIAMI-DADE ... 730311 \n", "675 BRADENTON FL 34210 MANATEE ... 302173 \n", "893 BOISE ID 83704 ADA ... 416060 \n", "\n", " Provider Name Provider Street Address \\\n", "0 TYLER MEMORIAL HOSPITAL 5950 STATE ROUTE 6 WEST \n", "234 CRISP REGIONAL HOSPITAL 902 7TH STREET NORTH \n", "446 JACKSON MEMORIAL HOSPITAL 1611 NW 12TH AVE \n", "675 BLAKE MEDICAL CENTER 2020 59TH ST W \n", "893 ST LUKE'S REGIONAL MEDICAL CENTER 190 EAST BANNOCK STREET \n", "\n", " Provider City Provider State Provider Zip Code Total Discharges \\\n", "0 TUNKHANNOCK PA 18657 18 \n", "234 CORDELE GA 31015 18 \n", "446 MIAMI FL 33136 113 \n", "675 BRADENTON FL 34209 135 \n", "893 BOISE ID 83712 68 \n", "\n", " Average Covered Charges Average Total Payments Average Medicare Payments \n", "0 20482.94 5783.22 4929.22 \n", "234 14655.94 5680.28 4899.39 \n", "446 30021.91 14448.75 11043.58 \n", "675 46792.59 5651.22 4534.90 \n", "893 26564.97 6514.51 5631.87 \n", "\n", "[5 rows x 23 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "matched_results.head()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Reorder the columns to make viewing easier\n", "cols = [\n", " \"best_match_score\", \"Facility Name\", \"Provider Name\", \"Address\", \"Provider Street Address\",\n", " \"Provider City\", \"City\", \"Provider State\", \"State\"\n", "]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
best_match_scoreFacility NameProvider NameAddressProvider Street AddressProvider CityCityProvider StateState
778463.090931RARITAN BAY MEDICAL CENTER PERTH AMBOY DIVISIONRARITAN BAY MEDICAL CENTER PERTH AMBOY DIVISION530 NEW BRUNSWICK AVE530 NEW BRUNSWICK AVEPERTH AMBOYPERTH AMBOYNJNJ
5324662.799072ROBERT WOOD JOHNSON UNIVERSITY HOSPITALROBERT WOOD JOHNSON UNIVERSITY HOSPITALONE ROBERT WOOD JOHNSON PLACEONE ROBERT WOOD JOHNSON PLACENEW BRUNSWICKNEW BRUNSWICKNJNJ
784352.785132AVERA MCKENNAN HOSPITAL & UNIVERSITY HEALTH CE...AVERA MCKENNAN HOSPITAL & UNIVERSITY HEALTH CE...1325 S CLIFF AVE POST OFFICE BOX 50451325 S CLIFF AVE POST OFFICE BOX 5045SIOUX FALLSSIOUX FALLSSDSD
2426972.778860JOHN T MATHER MEMORIAL HOSPITAL OF PORT JEFFE...JOHN T MATHER MEMORIAL HOSPITAL OF PORT JEFFE...75 NORTH COUNTRY ROAD75 NORTH COUNTRY ROADPORT JEFFERSONPORT JEFFERSONNYNY
4473472.721425MAYO CLINIC HEALTH SYSTEM - RED WINGMAYO CLINIC HEALTH SYSTEM IN RED WING701 HEWITT BOULEVARD, PO BOX 95701 HEWITT BOULEVARD, PO BOX 95RED WINGRED WINGMNMN
\n", "
" ], "text/plain": [ " best_match_score Facility Name \\\n", "77846 3.090931 RARITAN BAY MEDICAL CENTER PERTH AMBOY DIVISION \n", "532466 2.799072 ROBERT WOOD JOHNSON UNIVERSITY HOSPITAL \n", "78435 2.785132 AVERA MCKENNAN HOSPITAL & UNIVERSITY HEALTH CE... \n", "242697 2.778860 JOHN T MATHER MEMORIAL HOSPITAL OF PORT JEFFE... \n", "447347 2.721425 MAYO CLINIC HEALTH SYSTEM - RED WING \n", "\n", " Provider Name \\\n", "77846 RARITAN BAY MEDICAL CENTER PERTH AMBOY DIVISION \n", "532466 ROBERT WOOD JOHNSON UNIVERSITY HOSPITAL \n", "78435 AVERA MCKENNAN HOSPITAL & UNIVERSITY HEALTH CE... \n", "242697 JOHN T MATHER MEMORIAL HOSPITAL OF PORT JEFFE... \n", "447347 MAYO CLINIC HEALTH SYSTEM IN RED WING \n", "\n", " Address \\\n", "77846 530 NEW BRUNSWICK AVE \n", "532466 ONE ROBERT WOOD JOHNSON PLACE \n", "78435 1325 S CLIFF AVE POST OFFICE BOX 5045 \n", "242697 75 NORTH COUNTRY ROAD \n", "447347 701 HEWITT BOULEVARD, PO BOX 95 \n", "\n", " Provider Street Address Provider City \\\n", "77846 530 NEW BRUNSWICK AVE PERTH AMBOY \n", "532466 ONE ROBERT WOOD JOHNSON PLACE NEW BRUNSWICK \n", "78435 1325 S CLIFF AVE POST OFFICE BOX 5045 SIOUX FALLS \n", "242697 75 NORTH COUNTRY ROAD PORT JEFFERSON \n", "447347 701 HEWITT BOULEVARD, PO BOX 95 RED WING \n", "\n", " City Provider State State \n", "77846 PERTH AMBOY NJ NJ \n", "532466 NEW BRUNSWICK NJ NJ \n", "78435 SIOUX FALLS SD SD \n", "242697 PORT JEFFERSON NY NY \n", "447347 RED WING MN MN " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Let's see the best matches\n", "matched_results[cols].sort_values(by=['best_match_score'], ascending=False).head(5)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
best_match_scoreFacility NameProvider NameAddressProvider Street AddressProvider CityCityProvider StateState
426256-2.268231CENTRO MEDICO WILMA N VAZQUEZBAPTIST MEDICAL CENTER EASTCARR. 2 KM 39.5 ROAD NUMBER 2 BO ALGARROBO400 TAYLOR ROADMONTGOMERYVEGA BAJAALPR
83051-2.124071DOCTOR CENTER HOSPITAL SAN FERNANDO DE LA CARO...OVERLAKE HOSPITAL MEDICAL CENTEREDIF JESUS T PINEIRO AVE FERNANDEZ JUNCOS BO P...1035-116TH AVE NEBELLEVUECAROLINAWAPR
42613-2.106746HOSPITAL ONCOLOGICO DR ISAAC GONZALEZ MARTINEZSCRIPPS MERCY HOSPITALBO. MONACILLOS CARR 22 CENTRO MEDICO DE PUERTO...4077 5TH AVESAN DIEGOSAN JUANCAPR
450232-2.050888CENTRO DE SALUD CONDUCTUAL MENONITA-CIMAMILFORD REGIONAL MEDICAL CENTERCARR ESTATAL 14 INTERIOR SARGENTO GERARDO SANT...14 PROSPECT STREETMILFORDAIBONITOMAPR
476086-1.996508ADMIN DE SERVICIOS MEDICOS PUERTO RICMAINE MEDICAL CENTERBO MONACILLO CARR NUM 2222 BRAMHALL STPORTLANDSAN JUANMEPR
\n", "
" ], "text/plain": [ " best_match_score Facility Name \\\n", "426256 -2.268231 CENTRO MEDICO WILMA N VAZQUEZ \n", "83051 -2.124071 DOCTOR CENTER HOSPITAL SAN FERNANDO DE LA CARO... \n", "42613 -2.106746 HOSPITAL ONCOLOGICO DR ISAAC GONZALEZ MARTINEZ \n", "450232 -2.050888 CENTRO DE SALUD CONDUCTUAL MENONITA-CIMA \n", "476086 -1.996508 ADMIN DE SERVICIOS MEDICOS PUERTO RIC \n", "\n", " Provider Name \\\n", "426256 BAPTIST MEDICAL CENTER EAST \n", "83051 OVERLAKE HOSPITAL MEDICAL CENTER \n", "42613 SCRIPPS MERCY HOSPITAL \n", "450232 MILFORD REGIONAL MEDICAL CENTER \n", "476086 MAINE MEDICAL CENTER \n", "\n", " Address \\\n", "426256 CARR. 2 KM 39.5 ROAD NUMBER 2 BO ALGARROBO \n", "83051 EDIF JESUS T PINEIRO AVE FERNANDEZ JUNCOS BO P... \n", "42613 BO. MONACILLOS CARR 22 CENTRO MEDICO DE PUERTO... \n", "450232 CARR ESTATAL 14 INTERIOR SARGENTO GERARDO SANT... \n", "476086 BO MONACILLO CARR NUM 22 \n", "\n", " Provider Street Address Provider City City Provider State State \n", "426256 400 TAYLOR ROAD MONTGOMERY VEGA BAJA AL PR \n", "83051 1035-116TH AVE NE BELLEVUE CAROLINA WA PR \n", "42613 4077 5TH AVE SAN DIEGO SAN JUAN CA PR \n", "450232 14 PROSPECT STREET MILFORD AIBONITO MA PR \n", "476086 22 BRAMHALL ST PORTLAND SAN JUAN ME PR " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Now the worst matches\n", "matched_results[cols].sort_values(by=['best_match_score'],\n", " ascending=True).head(5)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
best_match_scoreFacility NameProvider NameAddressProvider Street AddressProvider CityCityProvider StateState
4137790.999843PRISMA HEALTH BAPTISTPALMETTO HEALTH BAPTIST1330 TAYLOR AT MARION STTAYLOR AT MARION STCOLUMBIACOLUMBIASCSC
1488130.999724CHRIST HOSPITALCHRIST HOSPITAL2139 AUBURN AVENUE2139 AUBURN AVENUECINCINNATICINCINNATIOHOH
3877960.997759RUSSELL COUNTY HOSPITALRUSSELL COUNTY MEDICAL CENTER58 CARROLL STREET58 CARROLL STREETLEBANONLEBANONVAVA
3776110.993306SKY RIDGE MEDICAL CENTERSKY RIDGE MEDICAL CENTER10101 RIDGEGATE PARKWAY10101 RIDGE GATE PARKWAYLONE TREELONE TREECOCO
3033560.991168HIGHLAND HOSPITALHIGHLAND HOSPITAL1000 SOUTH AVENUE1000 SOUTH AVENUEROCHESTERROCHESTERNYNY
5381050.990481PAOLI HOSPITALMAIN LINE HOSPITAL PAOLI255 WEST LANCASTER AVENUE255 WEST LANCASTER AVENUEPAOLIPAOLIPAPA
2515020.986695NYU WINTHROP HOSPITALWINTHROP-UNIVERSITY HOSPITAL259 FIRST STREET259 FIRST STREETMINEOLAMINEOLANYNY
3106740.986440ADVENTHEALTH GORDONGORDON HOSPITAL1035 RED BUD ROAD1035 RED BUD ROADCALHOUNCALHOUNGAGA
2039310.985763MEMORIAL MEDICAL CENTERMEMORIAL MEDICAL CENTER701 N FIRST ST701 N FIRST STSPRINGFIELDSPRINGFIELDILIL
1021870.984978HENDRICK MEDICAL CENTERHENDRICK MEDICAL CENTER1900 PINE1900 PINEABILENEABILENETXTX
\n", "
" ], "text/plain": [ " best_match_score Facility Name \\\n", "413779 0.999843 PRISMA HEALTH BAPTIST \n", "148813 0.999724 CHRIST HOSPITAL \n", "387796 0.997759 RUSSELL COUNTY HOSPITAL \n", "377611 0.993306 SKY RIDGE MEDICAL CENTER \n", "303356 0.991168 HIGHLAND HOSPITAL \n", "538105 0.990481 PAOLI HOSPITAL \n", "251502 0.986695 NYU WINTHROP HOSPITAL \n", "310674 0.986440 ADVENTHEALTH GORDON \n", "203931 0.985763 MEMORIAL MEDICAL CENTER \n", "102187 0.984978 HENDRICK MEDICAL CENTER \n", "\n", " Provider Name Address \\\n", "413779 PALMETTO HEALTH BAPTIST 1330 TAYLOR AT MARION ST \n", "148813 CHRIST HOSPITAL 2139 AUBURN AVENUE \n", "387796 RUSSELL COUNTY MEDICAL CENTER 58 CARROLL STREET \n", "377611 SKY RIDGE MEDICAL CENTER 10101 RIDGEGATE PARKWAY \n", "303356 HIGHLAND HOSPITAL 1000 SOUTH AVENUE \n", "538105 MAIN LINE HOSPITAL PAOLI 255 WEST LANCASTER AVENUE \n", "251502 WINTHROP-UNIVERSITY HOSPITAL 259 FIRST STREET \n", "310674 GORDON HOSPITAL 1035 RED BUD ROAD \n", "203931 MEMORIAL MEDICAL CENTER 701 N FIRST ST \n", "102187 HENDRICK MEDICAL CENTER 1900 PINE \n", "\n", " Provider Street Address Provider City City Provider State \\\n", "413779 TAYLOR AT MARION ST COLUMBIA COLUMBIA SC \n", "148813 2139 AUBURN AVENUE CINCINNATI CINCINNATI OH \n", "387796 58 CARROLL STREET LEBANON LEBANON VA \n", "377611 10101 RIDGE GATE PARKWAY LONE TREE LONE TREE CO \n", "303356 1000 SOUTH AVENUE ROCHESTER ROCHESTER NY \n", "538105 255 WEST LANCASTER AVENUE PAOLI PAOLI PA \n", "251502 259 FIRST STREET MINEOLA MINEOLA NY \n", "310674 1035 RED BUD ROAD CALHOUN CALHOUN GA \n", "203931 701 N FIRST ST SPRINGFIELD SPRINGFIELD IL \n", "102187 1900 PINE ABILENE ABILENE TX \n", "\n", " State \n", "413779 SC \n", "148813 OH \n", "387796 VA \n", "377611 CO \n", "303356 NY \n", "538105 PA \n", "251502 NY \n", "310674 GA \n", "203931 IL \n", "102187 TX " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Look at the matches around 1\n", "matched_results[cols].query(\"best_match_score <= 1\").sort_values(\n", " by=['best_match_score'], ascending=False).head(10)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
best_match_scoreFacility NameProvider NameAddressProvider Street AddressProvider CityCityProvider StateState
5180660.792471METHODIST HOSPITAL SOUTHSOUTH TEXAS REGIONAL MEDICAL CENTER1905 HWY 97 EAST1905 HWY 97 EASTJOURDANTONJOURDANTONTXTX
4169050.791668ADVENTIST HEALTH UKIAH VALLEYUKIAH VALLEY MEDICAL CENTER275 HOSPITAL DRIVE275 HOSPITAL DRIVEUKIAHUKIAHCACA
3031100.787163MADISON HEALTHMADISON COUNTY HOSPITAL INC210 NORTH MAIN STREET210 NORTH MAIN STREETLONDONLONDONOHOH
3881400.776632PENN HIGHLANDS CLEARFIELDCLEARFIELD HOSPITAL809 TURNPIKE AVE809 TURNPIKE AVECLEARFIELDCLEARFIELDPAPA
4928110.775573MEMORIAL HOSPITAL AT GULFPORTMEMORIAL HOSPITAL AT GULFPORT4500 13TH STREET4500 13TH ST-P O BOX 1810GULFPORTGULFPORTMSMS
\n", "
" ], "text/plain": [ " best_match_score Facility Name \\\n", "518066 0.792471 METHODIST HOSPITAL SOUTH \n", "416905 0.791668 ADVENTIST HEALTH UKIAH VALLEY \n", "303110 0.787163 MADISON HEALTH \n", "388140 0.776632 PENN HIGHLANDS CLEARFIELD \n", "492811 0.775573 MEMORIAL HOSPITAL AT GULFPORT \n", "\n", " Provider Name Address \\\n", "518066 SOUTH TEXAS REGIONAL MEDICAL CENTER 1905 HWY 97 EAST \n", "416905 UKIAH VALLEY MEDICAL CENTER 275 HOSPITAL DRIVE \n", "303110 MADISON COUNTY HOSPITAL INC 210 NORTH MAIN STREET \n", "388140 CLEARFIELD HOSPITAL 809 TURNPIKE AVE \n", "492811 MEMORIAL HOSPITAL AT GULFPORT 4500 13TH STREET \n", "\n", " Provider Street Address Provider City City Provider State \\\n", "518066 1905 HWY 97 EAST JOURDANTON JOURDANTON TX \n", "416905 275 HOSPITAL DRIVE UKIAH UKIAH CA \n", "303110 210 NORTH MAIN STREET LONDON LONDON OH \n", "388140 809 TURNPIKE AVE CLEARFIELD CLEARFIELD PA \n", "492811 4500 13TH ST-P O BOX 1810 GULFPORT GULFPORT MS \n", "\n", " State \n", "518066 TX \n", "416905 CA \n", "303110 OH \n", "388140 PA \n", "492811 MS " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "matched_results[cols].query(\"best_match_score <= .80\").sort_values(\n", " by=['best_match_score'], ascending=False).head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Example using Python Record Linkage Toolkit" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# Re-read in the data using the index_col\n", "hospital_accounts = pd.read_csv(\n", " 'https://github.com/chris1610/pbpython/raw/master/data/hospital_account_info.csv',\n", " index_col='Account_Num'\n", ")\n", "hospital_reimbursement = pd.read_csv(\n", " 'https://raw.githubusercontent.com/chris1610/pbpython/master/data/hospital_reimbursement.csv',\n", " index_col='Provider_Num'\n", ")" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Facility NameAddressCityStateZIP CodeCounty NamePhone NumberHospital TypeHospital Ownership
Account_Num
10605SAGE MEMORIAL HOSPITALSTATE ROUTE 264 SOUTH 191GANADOAZ86505APACHE(928) 755-4541Critical Access HospitalsVoluntary non-profit - Private
24250WOODRIDGE BEHAVIORAL CENTER600 NORTH 7TH STREETWEST MEMPHISAR72301CRITTENDEN(870) 394-4113PsychiatricProprietary
10341DOUGLAS GARDENS HOSPITAL5200 NE 2ND AVEMIAMIFL33137MIAMI-DADE(305) 751-8626Acute Care HospitalsVoluntary non-profit - Private
81101SUNCOAST BEHAVIORAL HEALTH CENTER4480 51ST ST WBRADENTONFL34210MANATEE(941) 792-2222PsychiatricProprietary
39835TREASURE VALLEY HOSPITAL8800 WEST EMERALD STREETBOISEID83704ADA(208) 373-5000Acute Care HospitalsProprietary
\n", "
" ], "text/plain": [ " Facility Name Address \\\n", "Account_Num \n", "10605 SAGE MEMORIAL HOSPITAL STATE ROUTE 264 SOUTH 191 \n", "24250 WOODRIDGE BEHAVIORAL CENTER 600 NORTH 7TH STREET \n", "10341 DOUGLAS GARDENS HOSPITAL 5200 NE 2ND AVE \n", "81101 SUNCOAST BEHAVIORAL HEALTH CENTER 4480 51ST ST W \n", "39835 TREASURE VALLEY HOSPITAL 8800 WEST EMERALD STREET \n", "\n", " City State ZIP Code County Name Phone Number \\\n", "Account_Num \n", "10605 GANADO AZ 86505 APACHE (928) 755-4541 \n", "24250 WEST MEMPHIS AR 72301 CRITTENDEN (870) 394-4113 \n", "10341 MIAMI FL 33137 MIAMI-DADE (305) 751-8626 \n", "81101 BRADENTON FL 34210 MANATEE (941) 792-2222 \n", "39835 BOISE ID 83704 ADA (208) 373-5000 \n", "\n", " Hospital Type Hospital Ownership \n", "Account_Num \n", "10605 Critical Access Hospitals Voluntary non-profit - Private \n", "24250 Psychiatric Proprietary \n", "10341 Acute Care Hospitals Voluntary non-profit - Private \n", "81101 Psychiatric Proprietary \n", "39835 Acute Care Hospitals Proprietary " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hospital_accounts.head()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Provider NameProvider Street AddressProvider CityProvider StateProvider Zip CodeTotal DischargesAverage Covered ChargesAverage Total PaymentsAverage Medicare Payments
Provider_Num
839987SOUTHEAST ALABAMA MEDICAL CENTER1108 ROSS CLARK CIRCLEDOTHANAL3630111820855.615026.194115.52
519118MARSHALL MEDICAL CENTER SOUTH2505 U S HIGHWAY 431 NORTHBOAZAL359574313289.095413.634490.93
733073ELIZA COFFEE MEMORIAL HOSPITAL205 MARENGO STREETFLORENCEAL356317322261.604922.184021.79
201752MIZELL MEMORIAL HOSPITAL702 N MAIN STOPPAL364671210901.335343.504284.17
678488ST VINCENT'S EAST50 MEDICAL PARK EAST DRIVEBIRMINGHAMAL352357428117.955947.124819.53
\n", "
" ], "text/plain": [ " Provider Name Provider Street Address \\\n", "Provider_Num \n", "839987 SOUTHEAST ALABAMA MEDICAL CENTER 1108 ROSS CLARK CIRCLE \n", "519118 MARSHALL MEDICAL CENTER SOUTH 2505 U S HIGHWAY 431 NORTH \n", "733073 ELIZA COFFEE MEMORIAL HOSPITAL 205 MARENGO STREET \n", "201752 MIZELL MEMORIAL HOSPITAL 702 N MAIN ST \n", "678488 ST VINCENT'S EAST 50 MEDICAL PARK EAST DRIVE \n", "\n", " Provider City Provider State Provider Zip Code \\\n", "Provider_Num \n", "839987 DOTHAN AL 36301 \n", "519118 BOAZ AL 35957 \n", "733073 FLORENCE AL 35631 \n", "201752 OPP AL 36467 \n", "678488 BIRMINGHAM AL 35235 \n", "\n", " Total Discharges Average Covered Charges \\\n", "Provider_Num \n", "839987 118 20855.61 \n", "519118 43 13289.09 \n", "733073 73 22261.60 \n", "201752 12 10901.33 \n", "678488 74 28117.95 \n", "\n", " Average Total Payments Average Medicare Payments \n", "Provider_Num \n", "839987 5026.19 4115.52 \n", "519118 5413.63 4490.93 \n", "733073 4922.18 4021.79 \n", "201752 5343.50 4284.17 \n", "678488 5947.12 4819.53 " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hospital_reimbursement.head()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Build the indexer\n", "indexer = recordlinkage.Index()\n", "# Can use full or block\n", "#indexer.full()\n", "#indexer.block(left_on='State', right_on='Provider State')\n", "\n", "# Use sortedneighbor as a good option if data is not clean\n", "indexer.sortedneighbourhood(left_on='State', right_on='Provider State')" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "candidates = indexer.index(hospital_accounts, hospital_reimbursement)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "998860\n" ] } ], "source": [ "# Let's see how many matches we want to do\n", "print(len(candidates))" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# Takes 3 minutes using the full index.\n", "# 14s using sorted neighbor\n", "# 7s using blocking\n", "compare = recordlinkage.Compare()\n", "compare.exact('City', 'Provider City', label='City')\n", "compare.string('Facility Name',\n", " 'Provider Name',\n", " threshold=0.85,\n", " label='Hosp_Name')\n", "compare.string('Address',\n", " 'Provider Street Address',\n", " method='jarowinkler',\n", " threshold=0.85,\n", " label='Hosp_Address')\n", "features = compare.compute(candidates, hospital_accounts,\n", " hospital_reimbursement)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CityHosp_NameHosp_Address
Account_NumProvider_Num
1060553718400.00.0
80318100.00.0
45061600.00.0
85437700.00.0
56036100.00.0
...............
7022681590400.00.0
74609000.00.0
19306200.00.0
83498400.00.0
36509500.00.0
\n", "

998860 rows × 3 columns

\n", "
" ], "text/plain": [ " City Hosp_Name Hosp_Address\n", "Account_Num Provider_Num \n", "10605 537184 0 0.0 0.0\n", " 803181 0 0.0 0.0\n", " 450616 0 0.0 0.0\n", " 854377 0 0.0 0.0\n", " 560361 0 0.0 0.0\n", "... ... ... ...\n", "70226 815904 0 0.0 0.0\n", " 746090 0 0.0 0.0\n", " 193062 0 0.0 0.0\n", " 834984 0 0.0 0.0\n", " 365095 0 0.0 0.0\n", "\n", "[998860 rows x 3 columns]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "features" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3.0 2285\n", "2.0 451\n", "1.0 7937\n", "0.0 988187\n", "dtype: int64" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# What are the score totals?\n", "features.sum(axis=1).value_counts().sort_index(ascending=False)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "# Get the potential matches\n", "potential_matches = features[features.sum(axis=1) > 1].reset_index()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Account_NumProvider_NumCityHosp_NameHosp_AddressScore
05121626878101.01.02.0
15527255691711.01.03.0
28780785463711.01.03.0
35115178314610.01.02.0
41174026037411.01.03.0
\n", "
" ], "text/plain": [ " Account_Num Provider_Num City Hosp_Name Hosp_Address Score\n", "0 51216 268781 0 1.0 1.0 2.0\n", "1 55272 556917 1 1.0 1.0 3.0\n", "2 87807 854637 1 1.0 1.0 3.0\n", "3 51151 783146 1 0.0 1.0 2.0\n", "4 11740 260374 1 1.0 1.0 3.0" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "potential_matches['Score'] = potential_matches.loc[:, 'City':'Hosp_Address'].sum(axis=1)\n", "potential_matches.head()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Facility Name ST FRANCIS MEDICAL CENTER\n", "Address 2400 ST FRANCIS DRIVE\n", "City BRECKENRIDGE\n", "State MN\n", "ZIP Code 56520\n", "County Name WILKIN\n", "Phone Number (218) 643-3000\n", "Hospital Type Critical Access Hospitals\n", "Hospital Ownership Voluntary non-profit - Church\n", "Name: 51216, dtype: object" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hospital_accounts.loc[51216,:]" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Provider Name SAINT FRANCIS MEDICAL CENTER\n", "Provider Street Address 211 ST FRANCIS DR\n", "Provider City CAPE GIRARDEAU\n", "Provider State MO\n", "Provider Zip Code 63703\n", "Total Discharges 141\n", "Average Covered Charges 42515.1\n", "Average Total Payments 5902.24\n", "Average Medicare Payments 4993.43\n", "Name: 268781, dtype: object" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hospital_reimbursement.loc[268781,:]" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "# Add some convenience columns for comparing data\n", "hospital_accounts['Acct_Name_Lookup'] = hospital_accounts[[\n", " 'Facility Name', 'Address', 'City', 'State'\n", "]].apply(lambda x: '_'.join(x), axis=1)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "hospital_reimbursement['Reimbursement_Name_Lookup'] = hospital_reimbursement[[\n", " 'Provider Name', 'Provider Street Address', 'Provider City',\n", " 'Provider State'\n", "]].apply(lambda x: '_'.join(x), axis=1)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "reimbursement_lookup = hospital_reimbursement[['Reimbursement_Name_Lookup']].reset_index()\n", "account_lookup = hospital_accounts[['Acct_Name_Lookup']].reset_index()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Account_NumAcct_Name_Lookup
010605SAGE MEMORIAL HOSPITAL_STATE ROUTE 264 SOUTH 1...
124250WOODRIDGE BEHAVIORAL CENTER_600 NORTH 7TH STRE...
210341DOUGLAS GARDENS HOSPITAL_5200 NE 2ND AVE_MIAMI_FL
381101SUNCOAST BEHAVIORAL HEALTH CENTER_4480 51ST ST...
439835TREASURE VALLEY HOSPITAL_8800 WEST EMERALD STR...
\n", "
" ], "text/plain": [ " Account_Num Acct_Name_Lookup\n", "0 10605 SAGE MEMORIAL HOSPITAL_STATE ROUTE 264 SOUTH 1...\n", "1 24250 WOODRIDGE BEHAVIORAL CENTER_600 NORTH 7TH STRE...\n", "2 10341 DOUGLAS GARDENS HOSPITAL_5200 NE 2ND AVE_MIAMI_FL\n", "3 81101 SUNCOAST BEHAVIORAL HEALTH CENTER_4480 51ST ST...\n", "4 39835 TREASURE VALLEY HOSPITAL_8800 WEST EMERALD STR..." ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "account_lookup.head()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Provider_NumReimbursement_Name_Lookup
0839987SOUTHEAST ALABAMA MEDICAL CENTER_1108 ROSS CLA...
1519118MARSHALL MEDICAL CENTER SOUTH_2505 U S HIGHWAY...
2733073ELIZA COFFEE MEMORIAL HOSPITAL_205 MARENGO STR...
3201752MIZELL MEMORIAL HOSPITAL_702 N MAIN ST_OPP_AL
4678488ST VINCENT'S EAST_50 MEDICAL PARK EAST DRIVE_B...
\n", "
" ], "text/plain": [ " Provider_Num Reimbursement_Name_Lookup\n", "0 839987 SOUTHEAST ALABAMA MEDICAL CENTER_1108 ROSS CLA...\n", "1 519118 MARSHALL MEDICAL CENTER SOUTH_2505 U S HIGHWAY...\n", "2 733073 ELIZA COFFEE MEMORIAL HOSPITAL_205 MARENGO STR...\n", "3 201752 MIZELL MEMORIAL HOSPITAL_702 N MAIN ST_OPP_AL\n", "4 678488 ST VINCENT'S EAST_50 MEDICAL PARK EAST DRIVE_B..." ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "reimbursement_lookup.head()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "account_merge = potential_matches.merge(account_lookup, how='left')" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Account_NumProvider_NumCityHosp_NameHosp_AddressScoreAcct_Name_Lookup
05121626878101.01.02.0ST FRANCIS MEDICAL CENTER_2400 ST FRANCIS DRIV...
15527255691711.01.03.0SCOTTSDALE OSBORN MEDICAL CENTER_7400 EAST OSB...
28780785463711.01.03.0ORO VALLEY HOSPITAL_1551 EAST TANGERINE ROAD_O...
35115178314610.01.02.0ST. LUKE'S BEHAVIORAL HOSPITAL, LP_1800 EAST V...
41174026037411.01.03.0SUMMIT HEALTHCARE REGIONAL MEDICAL CENTER_2200...
\n", "
" ], "text/plain": [ " Account_Num Provider_Num City Hosp_Name Hosp_Address Score \\\n", "0 51216 268781 0 1.0 1.0 2.0 \n", "1 55272 556917 1 1.0 1.0 3.0 \n", "2 87807 854637 1 1.0 1.0 3.0 \n", "3 51151 783146 1 0.0 1.0 2.0 \n", "4 11740 260374 1 1.0 1.0 3.0 \n", "\n", " Acct_Name_Lookup \n", "0 ST FRANCIS MEDICAL CENTER_2400 ST FRANCIS DRIV... \n", "1 SCOTTSDALE OSBORN MEDICAL CENTER_7400 EAST OSB... \n", "2 ORO VALLEY HOSPITAL_1551 EAST TANGERINE ROAD_O... \n", "3 ST. LUKE'S BEHAVIORAL HOSPITAL, LP_1800 EAST V... \n", "4 SUMMIT HEALTHCARE REGIONAL MEDICAL CENTER_2200... " ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "account_merge.head()" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Provider_NumReimbursement_Name_Lookup
0839987SOUTHEAST ALABAMA MEDICAL CENTER_1108 ROSS CLA...
1519118MARSHALL MEDICAL CENTER SOUTH_2505 U S HIGHWAY...
2733073ELIZA COFFEE MEMORIAL HOSPITAL_205 MARENGO STR...
3201752MIZELL MEMORIAL HOSPITAL_702 N MAIN ST_OPP_AL
4678488ST VINCENT'S EAST_50 MEDICAL PARK EAST DRIVE_B...
\n", "
" ], "text/plain": [ " Provider_Num Reimbursement_Name_Lookup\n", "0 839987 SOUTHEAST ALABAMA MEDICAL CENTER_1108 ROSS CLA...\n", "1 519118 MARSHALL MEDICAL CENTER SOUTH_2505 U S HIGHWAY...\n", "2 733073 ELIZA COFFEE MEMORIAL HOSPITAL_205 MARENGO STR...\n", "3 201752 MIZELL MEMORIAL HOSPITAL_702 N MAIN ST_OPP_AL\n", "4 678488 ST VINCENT'S EAST_50 MEDICAL PARK EAST DRIVE_B..." ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "reimbursement_lookup.head()" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "# Let's build a dataframe to compare\n", "final_merge = account_merge.merge(reimbursement_lookup, how='left')" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " 'Account_Num', 'Provider_Num', 'Score', 'Acct_Name_Lookup',\n", " 'Reimbursement_Name_Lookup'\n", "]" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Account_NumProvider_NumScoreAcct_Name_LookupReimbursement_Name_Lookup
2660949958259143.0CLAIBORNE MEDICAL CENTER_1850 OLD KNOXVILLE HI...CLAIBORNE MEDICAL CENTER_1850 OLD KNOXVILLE HI...
1975949538191813.0LAKE CHARLES MEMORIAL HOSPITAL_1701 OAK PARK B...LAKE CHARLES MEMORIAL HOSPITAL_1701 OAK PARK B...
1042949436805963.0VALLEY PRESBYTERIAN HOSPITAL_15107 VANOWEN ST_...VALLEY PRESBYTERIAN HOSPITAL_15107 VANOWEN ST_...
2305949234031513.0UNIVERSITY COLO HEALTH MEMORIAL HOSPITAL CENTR...UNIVERSITY COLO HEALTH MEMORIAL HOSPITAL CENTR...
2512948877522842.0NEW YORK-PRESBYTERIAN BROOKLYN METHODIST HOSPI...NEW YORK METHODIST HOSPITAL_506 SIXTH STREET_B...
..................
2080101651882473.0UTAH VALLEY HOSPITAL_1034 NORTH 500 WEST_PROVO_UTUTAH VALLEY HOSPITAL_1034 NORTH 500 WEST_PROVO_UT
1825100902120693.0CANONSBURG GENERAL HOSPITAL_100 MEDICAL BOULEV...CANONSBURG GENERAL HOSPITAL_100 MEDICAL BOULEV...
2424100431405353.0BETH ISRAEL DEACONESS HOSPITAL - PLYMOUTH_275 ...BETH ISRAEL DEACONESS HOSPITAL - PLYMOUTH_275 ...
1959100202106573.0ST FRANCIS MEDICAL CENTER_309 JACKSON STREET_M...ST FRANCIS MEDICAL CENTER_309 JACKSON STREET_M...
1958100201216702.0ST FRANCIS MEDICAL CENTER_309 JACKSON STREET_M...UNIVERSITY HEALTH CONWAY_4864 JACKSON STREET_M...
\n", "

2736 rows × 5 columns

\n", "
" ], "text/plain": [ " Account_Num Provider_Num Score \\\n", "2660 94995 825914 3.0 \n", "1975 94953 819181 3.0 \n", "1042 94943 680596 3.0 \n", "2305 94923 403151 3.0 \n", "2512 94887 752284 2.0 \n", "... ... ... ... \n", "2080 10165 188247 3.0 \n", "1825 10090 212069 3.0 \n", "2424 10043 140535 3.0 \n", "1959 10020 210657 3.0 \n", "1958 10020 121670 2.0 \n", "\n", " Acct_Name_Lookup \\\n", "2660 CLAIBORNE MEDICAL CENTER_1850 OLD KNOXVILLE HI... \n", "1975 LAKE CHARLES MEMORIAL HOSPITAL_1701 OAK PARK B... \n", "1042 VALLEY PRESBYTERIAN HOSPITAL_15107 VANOWEN ST_... \n", "2305 UNIVERSITY COLO HEALTH MEMORIAL HOSPITAL CENTR... \n", "2512 NEW YORK-PRESBYTERIAN BROOKLYN METHODIST HOSPI... \n", "... ... \n", "2080 UTAH VALLEY HOSPITAL_1034 NORTH 500 WEST_PROVO_UT \n", "1825 CANONSBURG GENERAL HOSPITAL_100 MEDICAL BOULEV... \n", "2424 BETH ISRAEL DEACONESS HOSPITAL - PLYMOUTH_275 ... \n", "1959 ST FRANCIS MEDICAL CENTER_309 JACKSON STREET_M... \n", "1958 ST FRANCIS MEDICAL CENTER_309 JACKSON STREET_M... \n", "\n", " Reimbursement_Name_Lookup \n", "2660 CLAIBORNE MEDICAL CENTER_1850 OLD KNOXVILLE HI... \n", "1975 LAKE CHARLES MEMORIAL HOSPITAL_1701 OAK PARK B... \n", "1042 VALLEY PRESBYTERIAN HOSPITAL_15107 VANOWEN ST_... \n", "2305 UNIVERSITY COLO HEALTH MEMORIAL HOSPITAL CENTR... \n", "2512 NEW YORK METHODIST HOSPITAL_506 SIXTH STREET_B... \n", "... ... \n", "2080 UTAH VALLEY HOSPITAL_1034 NORTH 500 WEST_PROVO_UT \n", "1825 CANONSBURG GENERAL HOSPITAL_100 MEDICAL BOULEV... \n", "2424 BETH ISRAEL DEACONESS HOSPITAL - PLYMOUTH_275 ... \n", "1959 ST FRANCIS MEDICAL CENTER_309 JACKSON STREET_M... \n", "1958 UNIVERSITY HEALTH CONWAY_4864 JACKSON STREET_M... \n", "\n", "[2736 rows x 5 columns]" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "final_merge[cols].sort_values(by=['Account_Num', 'Score'], ascending=False)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "# If you need to save it to Excel -\n", "#final_merge.sort_values(by=['Account_Num', 'Score'],\n", "# ascending=False).to_excel('merge_list.xlsx',\n", "# index=False)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Account_NumProvider_NumScoreAcct_Name_LookupReimbursement_Name_Lookup
2302110355509212.0SAINT JOSEPH HOSPITAL_1375 E 19TH AVE_DENVER_COPRESBYTERIAN ST LUKES MEDICAL CENTER_1719 E 19...
2303110357068073.0SAINT JOSEPH HOSPITAL_1375 E 19TH AVE_DENVER_COSAINT JOSEPH HOSPITAL_1375 EAST 19TH AVE_DENVE...
\n", "
" ], "text/plain": [ " Account_Num Provider_Num Score \\\n", "2302 11035 550921 2.0 \n", "2303 11035 706807 3.0 \n", "\n", " Acct_Name_Lookup \\\n", "2302 SAINT JOSEPH HOSPITAL_1375 E 19TH AVE_DENVER_CO \n", "2303 SAINT JOSEPH HOSPITAL_1375 E 19TH AVE_DENVER_CO \n", "\n", " Reimbursement_Name_Lookup \n", "2302 PRESBYTERIAN ST LUKES MEDICAL CENTER_1719 E 19... \n", "2303 SAINT JOSEPH HOSPITAL_1375 EAST 19TH AVE_DENVE... " ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "final_merge[final_merge['Account_Num']==11035][cols]" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Account_NumProvider_NumScoreAcct_Name_LookupReimbursement_Name_Lookup
155563753904022.0BROWARD HEALTH IMPERIAL POINT_6401 N FEDERAL H...HOLY CROSS HOSPITAL_4725 N FEDERAL HWY_FORT LA...
156563754512293.0BROWARD HEALTH IMPERIAL POINT_6401 N FEDERAL H...BROWARD HEALTH IMPERIAL POINT_6401 N FEDERAL H...
\n", "
" ], "text/plain": [ " Account_Num Provider_Num Score \\\n", "155 56375 390402 2.0 \n", "156 56375 451229 3.0 \n", "\n", " Acct_Name_Lookup \\\n", "155 BROWARD HEALTH IMPERIAL POINT_6401 N FEDERAL H... \n", "156 BROWARD HEALTH IMPERIAL POINT_6401 N FEDERAL H... \n", "\n", " Reimbursement_Name_Lookup \n", "155 HOLY CROSS HOSPITAL_4725 N FEDERAL HWY_FORT LA... \n", "156 BROWARD HEALTH IMPERIAL POINT_6401 N FEDERAL H... " ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "final_merge[final_merge['Account_Num']==56375][cols]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Dedupe the data" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "hospital_dupes = pd.read_csv(\n", " 'https://github.com/chris1610/pbpython/raw/master/data/hospital_account_dupes.csv',\n", " index_col='Account_Num')" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Facility NameAddressCityStateZIP CodeCounty NamePhone NumberHospital TypeHospital Ownership
Account_Num
71730SAGE MEMORIAL HOSPITALSTATE ROUTE 264 SOUTH 191GANADOAZ86505APACHE(928) 755-4541Critical Access HospitalsVoluntary non-profit - Private
70116WOODRIDGE BEHAVIORAL CENTER600 NORTH 7TH STREETWEST MEMPHISAR72301CRITTENDEN(870) 394-4113PsychiatricProprietary
87991DOUGLAS GARDENS HOSPITAL5200 NE 2ND AVEMIAMIFL33137MIAMI-DADE(305) 751-8626Acute Care HospitalsVoluntary non-profit - Private
22662SUNCOAST BEHAVIORAL HEALTH CENTER4480 51ST ST WBRADENTONFL34210MANATEE(941) 792-2222PsychiatricProprietary
63165TREASURE VALLEY HOSPITAL8800 WEST EMERALD STREETBOISEID83704ADA(208) 373-5000Acute Care HospitalsProprietary
\n", "
" ], "text/plain": [ " Facility Name Address \\\n", "Account_Num \n", "71730 SAGE MEMORIAL HOSPITAL STATE ROUTE 264 SOUTH 191 \n", "70116 WOODRIDGE BEHAVIORAL CENTER 600 NORTH 7TH STREET \n", "87991 DOUGLAS GARDENS HOSPITAL 5200 NE 2ND AVE \n", "22662 SUNCOAST BEHAVIORAL HEALTH CENTER 4480 51ST ST W \n", "63165 TREASURE VALLEY HOSPITAL 8800 WEST EMERALD STREET \n", "\n", " City State ZIP Code County Name Phone Number \\\n", "Account_Num \n", "71730 GANADO AZ 86505 APACHE (928) 755-4541 \n", "70116 WEST MEMPHIS AR 72301 CRITTENDEN (870) 394-4113 \n", "87991 MIAMI FL 33137 MIAMI-DADE (305) 751-8626 \n", "22662 BRADENTON FL 34210 MANATEE (941) 792-2222 \n", "63165 BOISE ID 83704 ADA (208) 373-5000 \n", "\n", " Hospital Type Hospital Ownership \n", "Account_Num \n", "71730 Critical Access Hospitals Voluntary non-profit - Private \n", "70116 Psychiatric Proprietary \n", "87991 Acute Care Hospitals Voluntary non-profit - Private \n", "22662 Psychiatric Proprietary \n", "63165 Acute Care Hospitals Proprietary " ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hospital_dupes.head()" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "# Deduping follows the same process, you just use 1 single dataframe\n", "dupe_indexer = recordlinkage.Index()\n", "dupe_indexer.sortedneighbourhood(left_on='State')\n", "dupe_candidate_links = dupe_indexer.index(hospital_dupes)\n" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "# Comparison step\n", "compare_dupes = recordlinkage.Compare()\n", "compare_dupes.string('City', 'City', threshold=0.85, label='City')\n", "compare_dupes.string('Phone Number',\n", " 'Phone Number',\n", " threshold=0.85,\n", " label='Phone_Num')\n", "compare_dupes.string('Facility Name',\n", " 'Facility Name',\n", " threshold=0.80,\n", " label='Hosp_Name')\n", "compare_dupes.string('Address',\n", " 'Address',\n", " threshold=0.85,\n", " label='Hosp_Address')\n", "dupe_features = compare_dupes.compute(dupe_candidate_links, hospital_dupes)" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CityPhone_NumHosp_NameHosp_Address
Account_Num_1Account_Num_2
26270284850.00.00.00.0
304300.00.00.00.0
436020.00.00.00.0
59585284850.00.00.00.0
304300.00.00.00.0
..................
64029386000.00.00.00.0
354130.00.00.00.0
815250.00.00.00.0
829160.00.00.00.0
189070.00.00.00.0
\n", "

981277 rows × 4 columns

\n", "
" ], "text/plain": [ " City Phone_Num Hosp_Name Hosp_Address\n", "Account_Num_1 Account_Num_2 \n", "26270 28485 0.0 0.0 0.0 0.0\n", " 30430 0.0 0.0 0.0 0.0\n", " 43602 0.0 0.0 0.0 0.0\n", "59585 28485 0.0 0.0 0.0 0.0\n", " 30430 0.0 0.0 0.0 0.0\n", "... ... ... ... ...\n", "64029 38600 0.0 0.0 0.0 0.0\n", " 35413 0.0 0.0 0.0 0.0\n", " 81525 0.0 0.0 0.0 0.0\n", " 82916 0.0 0.0 0.0 0.0\n", " 18907 0.0 0.0 0.0 0.0\n", "\n", "[981277 rows x 4 columns]" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dupe_features" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3.0 7\n", "2.0 206\n", "1.0 7859\n", "0.0 973205\n", "dtype: int64" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dupe_features.sum(axis=1).value_counts().sort_index(ascending=False)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "potential_dupes = dupe_features[dupe_features.sum(axis=1) > 2].reset_index()\n", "potential_dupes['Score'] = potential_dupes.loc[:, 'City':'Hosp_Address'].sum(axis=1)" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Account_Num_1Account_Num_2CityPhone_NumHosp_NameHosp_AddressScore
028494379491.01.00.01.03.0
174835770001.01.00.01.03.0
224549284851.01.00.01.03.0
370366526541.01.00.01.03.0
461685248491.01.00.01.03.0
551567411661.01.01.00.03.0
626495410791.01.00.01.03.0
\n", "
" ], "text/plain": [ " Account_Num_1 Account_Num_2 City Phone_Num Hosp_Name Hosp_Address \\\n", "0 28494 37949 1.0 1.0 0.0 1.0 \n", "1 74835 77000 1.0 1.0 0.0 1.0 \n", "2 24549 28485 1.0 1.0 0.0 1.0 \n", "3 70366 52654 1.0 1.0 0.0 1.0 \n", "4 61685 24849 1.0 1.0 0.0 1.0 \n", "5 51567 41166 1.0 1.0 1.0 0.0 \n", "6 26495 41079 1.0 1.0 0.0 1.0 \n", "\n", " Score \n", "0 3.0 \n", "1 3.0 \n", "2 3.0 \n", "3 3.0 \n", "4 3.0 \n", "5 3.0 \n", "6 3.0 " ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "potential_dupes.sort_values(by=['Score'], ascending=True)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Facility NameAddressCityStateZIP CodeCounty NamePhone NumberHospital TypeHospital Ownership
Account_Num
41166ST VINCENT HOSPITAL835 S VAN BUREN STGREEN BAYWI54301BROWN(920) 433-0111Acute Care HospitalsVoluntary non-profit - Church
51567SAINT VINCENT HOSPITAL835 SOUTH VAN BUREN STGREEN BAYWI54301BROWN(920) 433-0112Acute Care HospitalsVoluntary non-profit - Church
\n", "
" ], "text/plain": [ " Facility Name Address City State \\\n", "Account_Num \n", "41166 ST VINCENT HOSPITAL 835 S VAN BUREN ST GREEN BAY WI \n", "51567 SAINT VINCENT HOSPITAL 835 SOUTH VAN BUREN ST GREEN BAY WI \n", "\n", " ZIP Code County Name Phone Number Hospital Type \\\n", "Account_Num \n", "41166 54301 BROWN (920) 433-0111 Acute Care Hospitals \n", "51567 54301 BROWN (920) 433-0112 Acute Care Hospitals \n", "\n", " Hospital Ownership \n", "Account_Num \n", "41166 Voluntary non-profit - Church \n", "51567 Voluntary non-profit - Church " ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Take a look at one of the potential duplicates\n", "hospital_dupes[hospital_dupes.index.isin([51567, 41166])]" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.7.2 64-bit", "language": "python", "name": "python37264bit24e6adcd4e9b4d8bb6ff9239d4d45105" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.2" } }, "nbformat": 4, "nbformat_minor": 2 }