{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Python Tools for Record Linking and Fuzzy Matching\n",
"\n",
"This notebook accompanies the [article](https://pbpython.com/record-linking.html) on Practical Business Python\n",
"\n",
"This notebook relies on [fuzzymatcher](https://github.com/RobinL/fuzzymatcher) and the [Python Record Linkage Toolkit](https://recordlinkage.readthedocs.io/en/latest/about.html)\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from pathlib import Path\n",
"import fuzzymatcher\n",
"import recordlinkage"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Example using fuzzymatcher"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"hospital_accounts = pd.read_csv(\n",
" 'https://github.com/chris1610/pbpython/raw/master/data/hospital_account_info.csv'\n",
")\n",
"hospital_reimbursement = pd.read_csv(\n",
" 'https://raw.githubusercontent.com/chris1610/pbpython/master/data/hospital_reimbursement.csv'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Account_Num | \n",
" Facility Name | \n",
" Address | \n",
" City | \n",
" State | \n",
" ZIP Code | \n",
" County Name | \n",
" Phone Number | \n",
" Hospital Type | \n",
" Hospital Ownership | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 10605 | \n",
" SAGE MEMORIAL HOSPITAL | \n",
" STATE ROUTE 264 SOUTH 191 | \n",
" GANADO | \n",
" AZ | \n",
" 86505 | \n",
" APACHE | \n",
" (928) 755-4541 | \n",
" Critical Access Hospitals | \n",
" Voluntary non-profit - Private | \n",
"
\n",
" \n",
" 1 | \n",
" 24250 | \n",
" WOODRIDGE BEHAVIORAL CENTER | \n",
" 600 NORTH 7TH STREET | \n",
" WEST MEMPHIS | \n",
" AR | \n",
" 72301 | \n",
" CRITTENDEN | \n",
" (870) 394-4113 | \n",
" Psychiatric | \n",
" Proprietary | \n",
"
\n",
" \n",
" 2 | \n",
" 10341 | \n",
" DOUGLAS GARDENS HOSPITAL | \n",
" 5200 NE 2ND AVE | \n",
" MIAMI | \n",
" FL | \n",
" 33137 | \n",
" MIAMI-DADE | \n",
" (305) 751-8626 | \n",
" Acute Care Hospitals | \n",
" Voluntary non-profit - Private | \n",
"
\n",
" \n",
" 3 | \n",
" 81101 | \n",
" SUNCOAST BEHAVIORAL HEALTH CENTER | \n",
" 4480 51ST ST W | \n",
" BRADENTON | \n",
" FL | \n",
" 34210 | \n",
" MANATEE | \n",
" (941) 792-2222 | \n",
" Psychiatric | \n",
" Proprietary | \n",
"
\n",
" \n",
" 4 | \n",
" 39835 | \n",
" TREASURE VALLEY HOSPITAL | \n",
" 8800 WEST EMERALD STREET | \n",
" BOISE | \n",
" ID | \n",
" 83704 | \n",
" ADA | \n",
" (208) 373-5000 | \n",
" Acute Care Hospitals | \n",
" Proprietary | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Account_Num Facility Name Address \\\n",
"0 10605 SAGE MEMORIAL HOSPITAL STATE ROUTE 264 SOUTH 191 \n",
"1 24250 WOODRIDGE BEHAVIORAL CENTER 600 NORTH 7TH STREET \n",
"2 10341 DOUGLAS GARDENS HOSPITAL 5200 NE 2ND AVE \n",
"3 81101 SUNCOAST BEHAVIORAL HEALTH CENTER 4480 51ST ST W \n",
"4 39835 TREASURE VALLEY HOSPITAL 8800 WEST EMERALD STREET \n",
"\n",
" City State ZIP Code County Name Phone Number \\\n",
"0 GANADO AZ 86505 APACHE (928) 755-4541 \n",
"1 WEST MEMPHIS AR 72301 CRITTENDEN (870) 394-4113 \n",
"2 MIAMI FL 33137 MIAMI-DADE (305) 751-8626 \n",
"3 BRADENTON FL 34210 MANATEE (941) 792-2222 \n",
"4 BOISE ID 83704 ADA (208) 373-5000 \n",
"\n",
" Hospital Type Hospital Ownership \n",
"0 Critical Access Hospitals Voluntary non-profit - Private \n",
"1 Psychiatric Proprietary \n",
"2 Acute Care Hospitals Voluntary non-profit - Private \n",
"3 Psychiatric Proprietary \n",
"4 Acute Care Hospitals Proprietary "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hospital_accounts.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Provider_Num | \n",
" Provider Name | \n",
" Provider Street Address | \n",
" Provider City | \n",
" Provider State | \n",
" Provider Zip Code | \n",
" Total Discharges | \n",
" Average Covered Charges | \n",
" Average Total Payments | \n",
" Average Medicare Payments | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 839987 | \n",
" SOUTHEAST ALABAMA MEDICAL CENTER | \n",
" 1108 ROSS CLARK CIRCLE | \n",
" DOTHAN | \n",
" AL | \n",
" 36301 | \n",
" 118 | \n",
" 20855.61 | \n",
" 5026.19 | \n",
" 4115.52 | \n",
"
\n",
" \n",
" 1 | \n",
" 519118 | \n",
" MARSHALL MEDICAL CENTER SOUTH | \n",
" 2505 U S HIGHWAY 431 NORTH | \n",
" BOAZ | \n",
" AL | \n",
" 35957 | \n",
" 43 | \n",
" 13289.09 | \n",
" 5413.63 | \n",
" 4490.93 | \n",
"
\n",
" \n",
" 2 | \n",
" 733073 | \n",
" ELIZA COFFEE MEMORIAL HOSPITAL | \n",
" 205 MARENGO STREET | \n",
" FLORENCE | \n",
" AL | \n",
" 35631 | \n",
" 73 | \n",
" 22261.60 | \n",
" 4922.18 | \n",
" 4021.79 | \n",
"
\n",
" \n",
" 3 | \n",
" 201752 | \n",
" MIZELL MEMORIAL HOSPITAL | \n",
" 702 N MAIN ST | \n",
" OPP | \n",
" AL | \n",
" 36467 | \n",
" 12 | \n",
" 10901.33 | \n",
" 5343.50 | \n",
" 4284.17 | \n",
"
\n",
" \n",
" 4 | \n",
" 678488 | \n",
" ST VINCENT'S EAST | \n",
" 50 MEDICAL PARK EAST DRIVE | \n",
" BIRMINGHAM | \n",
" AL | \n",
" 35235 | \n",
" 74 | \n",
" 28117.95 | \n",
" 5947.12 | \n",
" 4819.53 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Provider_Num Provider Name Provider Street Address \\\n",
"0 839987 SOUTHEAST ALABAMA MEDICAL CENTER 1108 ROSS CLARK CIRCLE \n",
"1 519118 MARSHALL MEDICAL CENTER SOUTH 2505 U S HIGHWAY 431 NORTH \n",
"2 733073 ELIZA COFFEE MEMORIAL HOSPITAL 205 MARENGO STREET \n",
"3 201752 MIZELL MEMORIAL HOSPITAL 702 N MAIN ST \n",
"4 678488 ST VINCENT'S EAST 50 MEDICAL PARK EAST DRIVE \n",
"\n",
" Provider City Provider State Provider Zip Code Total Discharges \\\n",
"0 DOTHAN AL 36301 118 \n",
"1 BOAZ AL 35957 43 \n",
"2 FLORENCE AL 35631 73 \n",
"3 OPP AL 36467 12 \n",
"4 BIRMINGHAM AL 35235 74 \n",
"\n",
" Average Covered Charges Average Total Payments Average Medicare Payments \n",
"0 20855.61 5026.19 4115.52 \n",
"1 13289.09 5413.63 4490.93 \n",
"2 22261.60 4922.18 4021.79 \n",
"3 10901.33 5343.50 4284.17 \n",
"4 28117.95 5947.12 4819.53 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hospital_reimbursement.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Columns to match on from df_left\n",
"left_on = [\"Facility Name\", \"Address\", \"City\", \"State\"]\n",
"\n",
"# Columns to match on from df_right\n",
"right_on = [\n",
" \"Provider Name\", \"Provider Street Address\", \"Provider City\",\n",
" \"Provider State\"\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Now perform the match\n",
"# It will take several minutes to run on this data set\n",
"matched_results = fuzzymatcher.fuzzy_left_join(hospital_accounts,\n",
" hospital_reimbursement,\n",
" left_on,\n",
" right_on,\n",
" left_id_col='Account_Num',\n",
" right_id_col='Provider_Num')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" best_match_score | \n",
" __id_left | \n",
" __id_right | \n",
" Account_Num | \n",
" Facility Name | \n",
" Address | \n",
" City | \n",
" State | \n",
" ZIP Code | \n",
" County Name | \n",
" ... | \n",
" Provider_Num | \n",
" Provider Name | \n",
" Provider Street Address | \n",
" Provider City | \n",
" Provider State | \n",
" Provider Zip Code | \n",
" Total Discharges | \n",
" Average Covered Charges | \n",
" Average Total Payments | \n",
" Average Medicare Payments | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" -0.746613 | \n",
" 10605 | \n",
" 643595 | \n",
" 10605 | \n",
" SAGE MEMORIAL HOSPITAL | \n",
" STATE ROUTE 264 SOUTH 191 | \n",
" GANADO | \n",
" AZ | \n",
" 86505 | \n",
" APACHE | \n",
" ... | \n",
" 643595 | \n",
" TYLER MEMORIAL HOSPITAL | \n",
" 5950 STATE ROUTE 6 WEST | \n",
" TUNKHANNOCK | \n",
" PA | \n",
" 18657 | \n",
" 18 | \n",
" 20482.94 | \n",
" 5783.22 | \n",
" 4929.22 | \n",
"
\n",
" \n",
" 234 | \n",
" -0.609873 | \n",
" 24250 | \n",
" 426767 | \n",
" 24250 | \n",
" WOODRIDGE BEHAVIORAL CENTER | \n",
" 600 NORTH 7TH STREET | \n",
" WEST MEMPHIS | \n",
" AR | \n",
" 72301 | \n",
" CRITTENDEN | \n",
" ... | \n",
" 426767 | \n",
" CRISP REGIONAL HOSPITAL | \n",
" 902 7TH STREET NORTH | \n",
" CORDELE | \n",
" GA | \n",
" 31015 | \n",
" 18 | \n",
" 14655.94 | \n",
" 5680.28 | \n",
" 4899.39 | \n",
"
\n",
" \n",
" 446 | \n",
" -0.404473 | \n",
" 10341 | \n",
" 730311 | \n",
" 10341 | \n",
" DOUGLAS GARDENS HOSPITAL | \n",
" 5200 NE 2ND AVE | \n",
" MIAMI | \n",
" FL | \n",
" 33137 | \n",
" MIAMI-DADE | \n",
" ... | \n",
" 730311 | \n",
" JACKSON MEMORIAL HOSPITAL | \n",
" 1611 NW 12TH AVE | \n",
" MIAMI | \n",
" FL | \n",
" 33136 | \n",
" 113 | \n",
" 30021.91 | \n",
" 14448.75 | \n",
" 11043.58 | \n",
"
\n",
" \n",
" 675 | \n",
" -0.333020 | \n",
" 81101 | \n",
" 302173 | \n",
" 81101 | \n",
" SUNCOAST BEHAVIORAL HEALTH CENTER | \n",
" 4480 51ST ST W | \n",
" BRADENTON | \n",
" FL | \n",
" 34210 | \n",
" MANATEE | \n",
" ... | \n",
" 302173 | \n",
" BLAKE MEDICAL CENTER | \n",
" 2020 59TH ST W | \n",
" BRADENTON | \n",
" FL | \n",
" 34209 | \n",
" 135 | \n",
" 46792.59 | \n",
" 5651.22 | \n",
" 4534.90 | \n",
"
\n",
" \n",
" 893 | \n",
" -0.629261 | \n",
" 39835 | \n",
" 416060 | \n",
" 39835 | \n",
" TREASURE VALLEY HOSPITAL | \n",
" 8800 WEST EMERALD STREET | \n",
" BOISE | \n",
" ID | \n",
" 83704 | \n",
" ADA | \n",
" ... | \n",
" 416060 | \n",
" ST LUKE'S REGIONAL MEDICAL CENTER | \n",
" 190 EAST BANNOCK STREET | \n",
" BOISE | \n",
" ID | \n",
" 83712 | \n",
" 68 | \n",
" 26564.97 | \n",
" 6514.51 | \n",
" 5631.87 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 23 columns
\n",
"
"
],
"text/plain": [
" best_match_score __id_left __id_right Account_Num \\\n",
"0 -0.746613 10605 643595 10605 \n",
"234 -0.609873 24250 426767 24250 \n",
"446 -0.404473 10341 730311 10341 \n",
"675 -0.333020 81101 302173 81101 \n",
"893 -0.629261 39835 416060 39835 \n",
"\n",
" Facility Name Address \\\n",
"0 SAGE MEMORIAL HOSPITAL STATE ROUTE 264 SOUTH 191 \n",
"234 WOODRIDGE BEHAVIORAL CENTER 600 NORTH 7TH STREET \n",
"446 DOUGLAS GARDENS HOSPITAL 5200 NE 2ND AVE \n",
"675 SUNCOAST BEHAVIORAL HEALTH CENTER 4480 51ST ST W \n",
"893 TREASURE VALLEY HOSPITAL 8800 WEST EMERALD STREET \n",
"\n",
" City State ZIP Code County Name ... Provider_Num \\\n",
"0 GANADO AZ 86505 APACHE ... 643595 \n",
"234 WEST MEMPHIS AR 72301 CRITTENDEN ... 426767 \n",
"446 MIAMI FL 33137 MIAMI-DADE ... 730311 \n",
"675 BRADENTON FL 34210 MANATEE ... 302173 \n",
"893 BOISE ID 83704 ADA ... 416060 \n",
"\n",
" Provider Name Provider Street Address \\\n",
"0 TYLER MEMORIAL HOSPITAL 5950 STATE ROUTE 6 WEST \n",
"234 CRISP REGIONAL HOSPITAL 902 7TH STREET NORTH \n",
"446 JACKSON MEMORIAL HOSPITAL 1611 NW 12TH AVE \n",
"675 BLAKE MEDICAL CENTER 2020 59TH ST W \n",
"893 ST LUKE'S REGIONAL MEDICAL CENTER 190 EAST BANNOCK STREET \n",
"\n",
" Provider City Provider State Provider Zip Code Total Discharges \\\n",
"0 TUNKHANNOCK PA 18657 18 \n",
"234 CORDELE GA 31015 18 \n",
"446 MIAMI FL 33136 113 \n",
"675 BRADENTON FL 34209 135 \n",
"893 BOISE ID 83712 68 \n",
"\n",
" Average Covered Charges Average Total Payments Average Medicare Payments \n",
"0 20482.94 5783.22 4929.22 \n",
"234 14655.94 5680.28 4899.39 \n",
"446 30021.91 14448.75 11043.58 \n",
"675 46792.59 5651.22 4534.90 \n",
"893 26564.97 6514.51 5631.87 \n",
"\n",
"[5 rows x 23 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"matched_results.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Reorder the columns to make viewing easier\n",
"cols = [\n",
" \"best_match_score\", \"Facility Name\", \"Provider Name\", \"Address\", \"Provider Street Address\",\n",
" \"Provider City\", \"City\", \"Provider State\", \"State\"\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" best_match_score | \n",
" Facility Name | \n",
" Provider Name | \n",
" Address | \n",
" Provider Street Address | \n",
" Provider City | \n",
" City | \n",
" Provider State | \n",
" State | \n",
"
\n",
" \n",
" \n",
" \n",
" 77846 | \n",
" 3.090931 | \n",
" RARITAN BAY MEDICAL CENTER PERTH AMBOY DIVISION | \n",
" RARITAN BAY MEDICAL CENTER PERTH AMBOY DIVISION | \n",
" 530 NEW BRUNSWICK AVE | \n",
" 530 NEW BRUNSWICK AVE | \n",
" PERTH AMBOY | \n",
" PERTH AMBOY | \n",
" NJ | \n",
" NJ | \n",
"
\n",
" \n",
" 532466 | \n",
" 2.799072 | \n",
" ROBERT WOOD JOHNSON UNIVERSITY HOSPITAL | \n",
" ROBERT WOOD JOHNSON UNIVERSITY HOSPITAL | \n",
" ONE ROBERT WOOD JOHNSON PLACE | \n",
" ONE ROBERT WOOD JOHNSON PLACE | \n",
" NEW BRUNSWICK | \n",
" NEW BRUNSWICK | \n",
" NJ | \n",
" NJ | \n",
"
\n",
" \n",
" 78435 | \n",
" 2.785132 | \n",
" AVERA MCKENNAN HOSPITAL & UNIVERSITY HEALTH CE... | \n",
" AVERA MCKENNAN HOSPITAL & UNIVERSITY HEALTH CE... | \n",
" 1325 S CLIFF AVE POST OFFICE BOX 5045 | \n",
" 1325 S CLIFF AVE POST OFFICE BOX 5045 | \n",
" SIOUX FALLS | \n",
" SIOUX FALLS | \n",
" SD | \n",
" SD | \n",
"
\n",
" \n",
" 242697 | \n",
" 2.778860 | \n",
" JOHN T MATHER MEMORIAL HOSPITAL OF PORT JEFFE... | \n",
" JOHN T MATHER MEMORIAL HOSPITAL OF PORT JEFFE... | \n",
" 75 NORTH COUNTRY ROAD | \n",
" 75 NORTH COUNTRY ROAD | \n",
" PORT JEFFERSON | \n",
" PORT JEFFERSON | \n",
" NY | \n",
" NY | \n",
"
\n",
" \n",
" 447347 | \n",
" 2.721425 | \n",
" MAYO CLINIC HEALTH SYSTEM - RED WING | \n",
" MAYO CLINIC HEALTH SYSTEM IN RED WING | \n",
" 701 HEWITT BOULEVARD, PO BOX 95 | \n",
" 701 HEWITT BOULEVARD, PO BOX 95 | \n",
" RED WING | \n",
" RED WING | \n",
" MN | \n",
" MN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" best_match_score Facility Name \\\n",
"77846 3.090931 RARITAN BAY MEDICAL CENTER PERTH AMBOY DIVISION \n",
"532466 2.799072 ROBERT WOOD JOHNSON UNIVERSITY HOSPITAL \n",
"78435 2.785132 AVERA MCKENNAN HOSPITAL & UNIVERSITY HEALTH CE... \n",
"242697 2.778860 JOHN T MATHER MEMORIAL HOSPITAL OF PORT JEFFE... \n",
"447347 2.721425 MAYO CLINIC HEALTH SYSTEM - RED WING \n",
"\n",
" Provider Name \\\n",
"77846 RARITAN BAY MEDICAL CENTER PERTH AMBOY DIVISION \n",
"532466 ROBERT WOOD JOHNSON UNIVERSITY HOSPITAL \n",
"78435 AVERA MCKENNAN HOSPITAL & UNIVERSITY HEALTH CE... \n",
"242697 JOHN T MATHER MEMORIAL HOSPITAL OF PORT JEFFE... \n",
"447347 MAYO CLINIC HEALTH SYSTEM IN RED WING \n",
"\n",
" Address \\\n",
"77846 530 NEW BRUNSWICK AVE \n",
"532466 ONE ROBERT WOOD JOHNSON PLACE \n",
"78435 1325 S CLIFF AVE POST OFFICE BOX 5045 \n",
"242697 75 NORTH COUNTRY ROAD \n",
"447347 701 HEWITT BOULEVARD, PO BOX 95 \n",
"\n",
" Provider Street Address Provider City \\\n",
"77846 530 NEW BRUNSWICK AVE PERTH AMBOY \n",
"532466 ONE ROBERT WOOD JOHNSON PLACE NEW BRUNSWICK \n",
"78435 1325 S CLIFF AVE POST OFFICE BOX 5045 SIOUX FALLS \n",
"242697 75 NORTH COUNTRY ROAD PORT JEFFERSON \n",
"447347 701 HEWITT BOULEVARD, PO BOX 95 RED WING \n",
"\n",
" City Provider State State \n",
"77846 PERTH AMBOY NJ NJ \n",
"532466 NEW BRUNSWICK NJ NJ \n",
"78435 SIOUX FALLS SD SD \n",
"242697 PORT JEFFERSON NY NY \n",
"447347 RED WING MN MN "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Let's see the best matches\n",
"matched_results[cols].sort_values(by=['best_match_score'], ascending=False).head(5)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" best_match_score | \n",
" Facility Name | \n",
" Provider Name | \n",
" Address | \n",
" Provider Street Address | \n",
" Provider City | \n",
" City | \n",
" Provider State | \n",
" State | \n",
"
\n",
" \n",
" \n",
" \n",
" 426256 | \n",
" -2.268231 | \n",
" CENTRO MEDICO WILMA N VAZQUEZ | \n",
" BAPTIST MEDICAL CENTER EAST | \n",
" CARR. 2 KM 39.5 ROAD NUMBER 2 BO ALGARROBO | \n",
" 400 TAYLOR ROAD | \n",
" MONTGOMERY | \n",
" VEGA BAJA | \n",
" AL | \n",
" PR | \n",
"
\n",
" \n",
" 83051 | \n",
" -2.124071 | \n",
" DOCTOR CENTER HOSPITAL SAN FERNANDO DE LA CARO... | \n",
" OVERLAKE HOSPITAL MEDICAL CENTER | \n",
" EDIF JESUS T PINEIRO AVE FERNANDEZ JUNCOS BO P... | \n",
" 1035-116TH AVE NE | \n",
" BELLEVUE | \n",
" CAROLINA | \n",
" WA | \n",
" PR | \n",
"
\n",
" \n",
" 42613 | \n",
" -2.106746 | \n",
" HOSPITAL ONCOLOGICO DR ISAAC GONZALEZ MARTINEZ | \n",
" SCRIPPS MERCY HOSPITAL | \n",
" BO. MONACILLOS CARR 22 CENTRO MEDICO DE PUERTO... | \n",
" 4077 5TH AVE | \n",
" SAN DIEGO | \n",
" SAN JUAN | \n",
" CA | \n",
" PR | \n",
"
\n",
" \n",
" 450232 | \n",
" -2.050888 | \n",
" CENTRO DE SALUD CONDUCTUAL MENONITA-CIMA | \n",
" MILFORD REGIONAL MEDICAL CENTER | \n",
" CARR ESTATAL 14 INTERIOR SARGENTO GERARDO SANT... | \n",
" 14 PROSPECT STREET | \n",
" MILFORD | \n",
" AIBONITO | \n",
" MA | \n",
" PR | \n",
"
\n",
" \n",
" 476086 | \n",
" -1.996508 | \n",
" ADMIN DE SERVICIOS MEDICOS PUERTO RIC | \n",
" MAINE MEDICAL CENTER | \n",
" BO MONACILLO CARR NUM 22 | \n",
" 22 BRAMHALL ST | \n",
" PORTLAND | \n",
" SAN JUAN | \n",
" ME | \n",
" PR | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" best_match_score Facility Name \\\n",
"426256 -2.268231 CENTRO MEDICO WILMA N VAZQUEZ \n",
"83051 -2.124071 DOCTOR CENTER HOSPITAL SAN FERNANDO DE LA CARO... \n",
"42613 -2.106746 HOSPITAL ONCOLOGICO DR ISAAC GONZALEZ MARTINEZ \n",
"450232 -2.050888 CENTRO DE SALUD CONDUCTUAL MENONITA-CIMA \n",
"476086 -1.996508 ADMIN DE SERVICIOS MEDICOS PUERTO RIC \n",
"\n",
" Provider Name \\\n",
"426256 BAPTIST MEDICAL CENTER EAST \n",
"83051 OVERLAKE HOSPITAL MEDICAL CENTER \n",
"42613 SCRIPPS MERCY HOSPITAL \n",
"450232 MILFORD REGIONAL MEDICAL CENTER \n",
"476086 MAINE MEDICAL CENTER \n",
"\n",
" Address \\\n",
"426256 CARR. 2 KM 39.5 ROAD NUMBER 2 BO ALGARROBO \n",
"83051 EDIF JESUS T PINEIRO AVE FERNANDEZ JUNCOS BO P... \n",
"42613 BO. MONACILLOS CARR 22 CENTRO MEDICO DE PUERTO... \n",
"450232 CARR ESTATAL 14 INTERIOR SARGENTO GERARDO SANT... \n",
"476086 BO MONACILLO CARR NUM 22 \n",
"\n",
" Provider Street Address Provider City City Provider State State \n",
"426256 400 TAYLOR ROAD MONTGOMERY VEGA BAJA AL PR \n",
"83051 1035-116TH AVE NE BELLEVUE CAROLINA WA PR \n",
"42613 4077 5TH AVE SAN DIEGO SAN JUAN CA PR \n",
"450232 14 PROSPECT STREET MILFORD AIBONITO MA PR \n",
"476086 22 BRAMHALL ST PORTLAND SAN JUAN ME PR "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Now the worst matches\n",
"matched_results[cols].sort_values(by=['best_match_score'],\n",
" ascending=True).head(5)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" best_match_score | \n",
" Facility Name | \n",
" Provider Name | \n",
" Address | \n",
" Provider Street Address | \n",
" Provider City | \n",
" City | \n",
" Provider State | \n",
" State | \n",
"
\n",
" \n",
" \n",
" \n",
" 413779 | \n",
" 0.999843 | \n",
" PRISMA HEALTH BAPTIST | \n",
" PALMETTO HEALTH BAPTIST | \n",
" 1330 TAYLOR AT MARION ST | \n",
" TAYLOR AT MARION ST | \n",
" COLUMBIA | \n",
" COLUMBIA | \n",
" SC | \n",
" SC | \n",
"
\n",
" \n",
" 148813 | \n",
" 0.999724 | \n",
" CHRIST HOSPITAL | \n",
" CHRIST HOSPITAL | \n",
" 2139 AUBURN AVENUE | \n",
" 2139 AUBURN AVENUE | \n",
" CINCINNATI | \n",
" CINCINNATI | \n",
" OH | \n",
" OH | \n",
"
\n",
" \n",
" 387796 | \n",
" 0.997759 | \n",
" RUSSELL COUNTY HOSPITAL | \n",
" RUSSELL COUNTY MEDICAL CENTER | \n",
" 58 CARROLL STREET | \n",
" 58 CARROLL STREET | \n",
" LEBANON | \n",
" LEBANON | \n",
" VA | \n",
" VA | \n",
"
\n",
" \n",
" 377611 | \n",
" 0.993306 | \n",
" SKY RIDGE MEDICAL CENTER | \n",
" SKY RIDGE MEDICAL CENTER | \n",
" 10101 RIDGEGATE PARKWAY | \n",
" 10101 RIDGE GATE PARKWAY | \n",
" LONE TREE | \n",
" LONE TREE | \n",
" CO | \n",
" CO | \n",
"
\n",
" \n",
" 303356 | \n",
" 0.991168 | \n",
" HIGHLAND HOSPITAL | \n",
" HIGHLAND HOSPITAL | \n",
" 1000 SOUTH AVENUE | \n",
" 1000 SOUTH AVENUE | \n",
" ROCHESTER | \n",
" ROCHESTER | \n",
" NY | \n",
" NY | \n",
"
\n",
" \n",
" 538105 | \n",
" 0.990481 | \n",
" PAOLI HOSPITAL | \n",
" MAIN LINE HOSPITAL PAOLI | \n",
" 255 WEST LANCASTER AVENUE | \n",
" 255 WEST LANCASTER AVENUE | \n",
" PAOLI | \n",
" PAOLI | \n",
" PA | \n",
" PA | \n",
"
\n",
" \n",
" 251502 | \n",
" 0.986695 | \n",
" NYU WINTHROP HOSPITAL | \n",
" WINTHROP-UNIVERSITY HOSPITAL | \n",
" 259 FIRST STREET | \n",
" 259 FIRST STREET | \n",
" MINEOLA | \n",
" MINEOLA | \n",
" NY | \n",
" NY | \n",
"
\n",
" \n",
" 310674 | \n",
" 0.986440 | \n",
" ADVENTHEALTH GORDON | \n",
" GORDON HOSPITAL | \n",
" 1035 RED BUD ROAD | \n",
" 1035 RED BUD ROAD | \n",
" CALHOUN | \n",
" CALHOUN | \n",
" GA | \n",
" GA | \n",
"
\n",
" \n",
" 203931 | \n",
" 0.985763 | \n",
" MEMORIAL MEDICAL CENTER | \n",
" MEMORIAL MEDICAL CENTER | \n",
" 701 N FIRST ST | \n",
" 701 N FIRST ST | \n",
" SPRINGFIELD | \n",
" SPRINGFIELD | \n",
" IL | \n",
" IL | \n",
"
\n",
" \n",
" 102187 | \n",
" 0.984978 | \n",
" HENDRICK MEDICAL CENTER | \n",
" HENDRICK MEDICAL CENTER | \n",
" 1900 PINE | \n",
" 1900 PINE | \n",
" ABILENE | \n",
" ABILENE | \n",
" TX | \n",
" TX | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" best_match_score Facility Name \\\n",
"413779 0.999843 PRISMA HEALTH BAPTIST \n",
"148813 0.999724 CHRIST HOSPITAL \n",
"387796 0.997759 RUSSELL COUNTY HOSPITAL \n",
"377611 0.993306 SKY RIDGE MEDICAL CENTER \n",
"303356 0.991168 HIGHLAND HOSPITAL \n",
"538105 0.990481 PAOLI HOSPITAL \n",
"251502 0.986695 NYU WINTHROP HOSPITAL \n",
"310674 0.986440 ADVENTHEALTH GORDON \n",
"203931 0.985763 MEMORIAL MEDICAL CENTER \n",
"102187 0.984978 HENDRICK MEDICAL CENTER \n",
"\n",
" Provider Name Address \\\n",
"413779 PALMETTO HEALTH BAPTIST 1330 TAYLOR AT MARION ST \n",
"148813 CHRIST HOSPITAL 2139 AUBURN AVENUE \n",
"387796 RUSSELL COUNTY MEDICAL CENTER 58 CARROLL STREET \n",
"377611 SKY RIDGE MEDICAL CENTER 10101 RIDGEGATE PARKWAY \n",
"303356 HIGHLAND HOSPITAL 1000 SOUTH AVENUE \n",
"538105 MAIN LINE HOSPITAL PAOLI 255 WEST LANCASTER AVENUE \n",
"251502 WINTHROP-UNIVERSITY HOSPITAL 259 FIRST STREET \n",
"310674 GORDON HOSPITAL 1035 RED BUD ROAD \n",
"203931 MEMORIAL MEDICAL CENTER 701 N FIRST ST \n",
"102187 HENDRICK MEDICAL CENTER 1900 PINE \n",
"\n",
" Provider Street Address Provider City City Provider State \\\n",
"413779 TAYLOR AT MARION ST COLUMBIA COLUMBIA SC \n",
"148813 2139 AUBURN AVENUE CINCINNATI CINCINNATI OH \n",
"387796 58 CARROLL STREET LEBANON LEBANON VA \n",
"377611 10101 RIDGE GATE PARKWAY LONE TREE LONE TREE CO \n",
"303356 1000 SOUTH AVENUE ROCHESTER ROCHESTER NY \n",
"538105 255 WEST LANCASTER AVENUE PAOLI PAOLI PA \n",
"251502 259 FIRST STREET MINEOLA MINEOLA NY \n",
"310674 1035 RED BUD ROAD CALHOUN CALHOUN GA \n",
"203931 701 N FIRST ST SPRINGFIELD SPRINGFIELD IL \n",
"102187 1900 PINE ABILENE ABILENE TX \n",
"\n",
" State \n",
"413779 SC \n",
"148813 OH \n",
"387796 VA \n",
"377611 CO \n",
"303356 NY \n",
"538105 PA \n",
"251502 NY \n",
"310674 GA \n",
"203931 IL \n",
"102187 TX "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Look at the matches around 1\n",
"matched_results[cols].query(\"best_match_score <= 1\").sort_values(\n",
" by=['best_match_score'], ascending=False).head(10)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" best_match_score | \n",
" Facility Name | \n",
" Provider Name | \n",
" Address | \n",
" Provider Street Address | \n",
" Provider City | \n",
" City | \n",
" Provider State | \n",
" State | \n",
"
\n",
" \n",
" \n",
" \n",
" 518066 | \n",
" 0.792471 | \n",
" METHODIST HOSPITAL SOUTH | \n",
" SOUTH TEXAS REGIONAL MEDICAL CENTER | \n",
" 1905 HWY 97 EAST | \n",
" 1905 HWY 97 EAST | \n",
" JOURDANTON | \n",
" JOURDANTON | \n",
" TX | \n",
" TX | \n",
"
\n",
" \n",
" 416905 | \n",
" 0.791668 | \n",
" ADVENTIST HEALTH UKIAH VALLEY | \n",
" UKIAH VALLEY MEDICAL CENTER | \n",
" 275 HOSPITAL DRIVE | \n",
" 275 HOSPITAL DRIVE | \n",
" UKIAH | \n",
" UKIAH | \n",
" CA | \n",
" CA | \n",
"
\n",
" \n",
" 303110 | \n",
" 0.787163 | \n",
" MADISON HEALTH | \n",
" MADISON COUNTY HOSPITAL INC | \n",
" 210 NORTH MAIN STREET | \n",
" 210 NORTH MAIN STREET | \n",
" LONDON | \n",
" LONDON | \n",
" OH | \n",
" OH | \n",
"
\n",
" \n",
" 388140 | \n",
" 0.776632 | \n",
" PENN HIGHLANDS CLEARFIELD | \n",
" CLEARFIELD HOSPITAL | \n",
" 809 TURNPIKE AVE | \n",
" 809 TURNPIKE AVE | \n",
" CLEARFIELD | \n",
" CLEARFIELD | \n",
" PA | \n",
" PA | \n",
"
\n",
" \n",
" 492811 | \n",
" 0.775573 | \n",
" MEMORIAL HOSPITAL AT GULFPORT | \n",
" MEMORIAL HOSPITAL AT GULFPORT | \n",
" 4500 13TH STREET | \n",
" 4500 13TH ST-P O BOX 1810 | \n",
" GULFPORT | \n",
" GULFPORT | \n",
" MS | \n",
" MS | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" best_match_score Facility Name \\\n",
"518066 0.792471 METHODIST HOSPITAL SOUTH \n",
"416905 0.791668 ADVENTIST HEALTH UKIAH VALLEY \n",
"303110 0.787163 MADISON HEALTH \n",
"388140 0.776632 PENN HIGHLANDS CLEARFIELD \n",
"492811 0.775573 MEMORIAL HOSPITAL AT GULFPORT \n",
"\n",
" Provider Name Address \\\n",
"518066 SOUTH TEXAS REGIONAL MEDICAL CENTER 1905 HWY 97 EAST \n",
"416905 UKIAH VALLEY MEDICAL CENTER 275 HOSPITAL DRIVE \n",
"303110 MADISON COUNTY HOSPITAL INC 210 NORTH MAIN STREET \n",
"388140 CLEARFIELD HOSPITAL 809 TURNPIKE AVE \n",
"492811 MEMORIAL HOSPITAL AT GULFPORT 4500 13TH STREET \n",
"\n",
" Provider Street Address Provider City City Provider State \\\n",
"518066 1905 HWY 97 EAST JOURDANTON JOURDANTON TX \n",
"416905 275 HOSPITAL DRIVE UKIAH UKIAH CA \n",
"303110 210 NORTH MAIN STREET LONDON LONDON OH \n",
"388140 809 TURNPIKE AVE CLEARFIELD CLEARFIELD PA \n",
"492811 4500 13TH ST-P O BOX 1810 GULFPORT GULFPORT MS \n",
"\n",
" State \n",
"518066 TX \n",
"416905 CA \n",
"303110 OH \n",
"388140 PA \n",
"492811 MS "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"matched_results[cols].query(\"best_match_score <= .80\").sort_values(\n",
" by=['best_match_score'], ascending=False).head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Example using Python Record Linkage Toolkit"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Re-read in the data using the index_col\n",
"hospital_accounts = pd.read_csv(\n",
" 'https://github.com/chris1610/pbpython/raw/master/data/hospital_account_info.csv',\n",
" index_col='Account_Num'\n",
")\n",
"hospital_reimbursement = pd.read_csv(\n",
" 'https://raw.githubusercontent.com/chris1610/pbpython/master/data/hospital_reimbursement.csv',\n",
" index_col='Provider_Num'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Facility Name | \n",
" Address | \n",
" City | \n",
" State | \n",
" ZIP Code | \n",
" County Name | \n",
" Phone Number | \n",
" Hospital Type | \n",
" Hospital Ownership | \n",
"
\n",
" \n",
" Account_Num | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 10605 | \n",
" SAGE MEMORIAL HOSPITAL | \n",
" STATE ROUTE 264 SOUTH 191 | \n",
" GANADO | \n",
" AZ | \n",
" 86505 | \n",
" APACHE | \n",
" (928) 755-4541 | \n",
" Critical Access Hospitals | \n",
" Voluntary non-profit - Private | \n",
"
\n",
" \n",
" 24250 | \n",
" WOODRIDGE BEHAVIORAL CENTER | \n",
" 600 NORTH 7TH STREET | \n",
" WEST MEMPHIS | \n",
" AR | \n",
" 72301 | \n",
" CRITTENDEN | \n",
" (870) 394-4113 | \n",
" Psychiatric | \n",
" Proprietary | \n",
"
\n",
" \n",
" 10341 | \n",
" DOUGLAS GARDENS HOSPITAL | \n",
" 5200 NE 2ND AVE | \n",
" MIAMI | \n",
" FL | \n",
" 33137 | \n",
" MIAMI-DADE | \n",
" (305) 751-8626 | \n",
" Acute Care Hospitals | \n",
" Voluntary non-profit - Private | \n",
"
\n",
" \n",
" 81101 | \n",
" SUNCOAST BEHAVIORAL HEALTH CENTER | \n",
" 4480 51ST ST W | \n",
" BRADENTON | \n",
" FL | \n",
" 34210 | \n",
" MANATEE | \n",
" (941) 792-2222 | \n",
" Psychiatric | \n",
" Proprietary | \n",
"
\n",
" \n",
" 39835 | \n",
" TREASURE VALLEY HOSPITAL | \n",
" 8800 WEST EMERALD STREET | \n",
" BOISE | \n",
" ID | \n",
" 83704 | \n",
" ADA | \n",
" (208) 373-5000 | \n",
" Acute Care Hospitals | \n",
" Proprietary | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Facility Name Address \\\n",
"Account_Num \n",
"10605 SAGE MEMORIAL HOSPITAL STATE ROUTE 264 SOUTH 191 \n",
"24250 WOODRIDGE BEHAVIORAL CENTER 600 NORTH 7TH STREET \n",
"10341 DOUGLAS GARDENS HOSPITAL 5200 NE 2ND AVE \n",
"81101 SUNCOAST BEHAVIORAL HEALTH CENTER 4480 51ST ST W \n",
"39835 TREASURE VALLEY HOSPITAL 8800 WEST EMERALD STREET \n",
"\n",
" City State ZIP Code County Name Phone Number \\\n",
"Account_Num \n",
"10605 GANADO AZ 86505 APACHE (928) 755-4541 \n",
"24250 WEST MEMPHIS AR 72301 CRITTENDEN (870) 394-4113 \n",
"10341 MIAMI FL 33137 MIAMI-DADE (305) 751-8626 \n",
"81101 BRADENTON FL 34210 MANATEE (941) 792-2222 \n",
"39835 BOISE ID 83704 ADA (208) 373-5000 \n",
"\n",
" Hospital Type Hospital Ownership \n",
"Account_Num \n",
"10605 Critical Access Hospitals Voluntary non-profit - Private \n",
"24250 Psychiatric Proprietary \n",
"10341 Acute Care Hospitals Voluntary non-profit - Private \n",
"81101 Psychiatric Proprietary \n",
"39835 Acute Care Hospitals Proprietary "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hospital_accounts.head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Provider Name | \n",
" Provider Street Address | \n",
" Provider City | \n",
" Provider State | \n",
" Provider Zip Code | \n",
" Total Discharges | \n",
" Average Covered Charges | \n",
" Average Total Payments | \n",
" Average Medicare Payments | \n",
"
\n",
" \n",
" Provider_Num | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 839987 | \n",
" SOUTHEAST ALABAMA MEDICAL CENTER | \n",
" 1108 ROSS CLARK CIRCLE | \n",
" DOTHAN | \n",
" AL | \n",
" 36301 | \n",
" 118 | \n",
" 20855.61 | \n",
" 5026.19 | \n",
" 4115.52 | \n",
"
\n",
" \n",
" 519118 | \n",
" MARSHALL MEDICAL CENTER SOUTH | \n",
" 2505 U S HIGHWAY 431 NORTH | \n",
" BOAZ | \n",
" AL | \n",
" 35957 | \n",
" 43 | \n",
" 13289.09 | \n",
" 5413.63 | \n",
" 4490.93 | \n",
"
\n",
" \n",
" 733073 | \n",
" ELIZA COFFEE MEMORIAL HOSPITAL | \n",
" 205 MARENGO STREET | \n",
" FLORENCE | \n",
" AL | \n",
" 35631 | \n",
" 73 | \n",
" 22261.60 | \n",
" 4922.18 | \n",
" 4021.79 | \n",
"
\n",
" \n",
" 201752 | \n",
" MIZELL MEMORIAL HOSPITAL | \n",
" 702 N MAIN ST | \n",
" OPP | \n",
" AL | \n",
" 36467 | \n",
" 12 | \n",
" 10901.33 | \n",
" 5343.50 | \n",
" 4284.17 | \n",
"
\n",
" \n",
" 678488 | \n",
" ST VINCENT'S EAST | \n",
" 50 MEDICAL PARK EAST DRIVE | \n",
" BIRMINGHAM | \n",
" AL | \n",
" 35235 | \n",
" 74 | \n",
" 28117.95 | \n",
" 5947.12 | \n",
" 4819.53 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Provider Name Provider Street Address \\\n",
"Provider_Num \n",
"839987 SOUTHEAST ALABAMA MEDICAL CENTER 1108 ROSS CLARK CIRCLE \n",
"519118 MARSHALL MEDICAL CENTER SOUTH 2505 U S HIGHWAY 431 NORTH \n",
"733073 ELIZA COFFEE MEMORIAL HOSPITAL 205 MARENGO STREET \n",
"201752 MIZELL MEMORIAL HOSPITAL 702 N MAIN ST \n",
"678488 ST VINCENT'S EAST 50 MEDICAL PARK EAST DRIVE \n",
"\n",
" Provider City Provider State Provider Zip Code \\\n",
"Provider_Num \n",
"839987 DOTHAN AL 36301 \n",
"519118 BOAZ AL 35957 \n",
"733073 FLORENCE AL 35631 \n",
"201752 OPP AL 36467 \n",
"678488 BIRMINGHAM AL 35235 \n",
"\n",
" Total Discharges Average Covered Charges \\\n",
"Provider_Num \n",
"839987 118 20855.61 \n",
"519118 43 13289.09 \n",
"733073 73 22261.60 \n",
"201752 12 10901.33 \n",
"678488 74 28117.95 \n",
"\n",
" Average Total Payments Average Medicare Payments \n",
"Provider_Num \n",
"839987 5026.19 4115.52 \n",
"519118 5413.63 4490.93 \n",
"733073 4922.18 4021.79 \n",
"201752 5343.50 4284.17 \n",
"678488 5947.12 4819.53 "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hospital_reimbursement.head()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Build the indexer\n",
"indexer = recordlinkage.Index()\n",
"# Can use full or block\n",
"#indexer.full()\n",
"#indexer.block(left_on='State', right_on='Provider State')\n",
"\n",
"# Use sortedneighbor as a good option if data is not clean\n",
"indexer.sortedneighbourhood(left_on='State', right_on='Provider State')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"candidates = indexer.index(hospital_accounts, hospital_reimbursement)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"998860\n"
]
}
],
"source": [
"# Let's see how many matches we want to do\n",
"print(len(candidates))"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Takes 3 minutes using the full index.\n",
"# 14s using sorted neighbor\n",
"# 7s using blocking\n",
"compare = recordlinkage.Compare()\n",
"compare.exact('City', 'Provider City', label='City')\n",
"compare.string('Facility Name',\n",
" 'Provider Name',\n",
" threshold=0.85,\n",
" label='Hosp_Name')\n",
"compare.string('Address',\n",
" 'Provider Street Address',\n",
" method='jarowinkler',\n",
" threshold=0.85,\n",
" label='Hosp_Address')\n",
"features = compare.compute(candidates, hospital_accounts,\n",
" hospital_reimbursement)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" City | \n",
" Hosp_Name | \n",
" Hosp_Address | \n",
"
\n",
" \n",
" Account_Num | \n",
" Provider_Num | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 10605 | \n",
" 537184 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 803181 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 450616 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 854377 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 560361 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 70226 | \n",
" 815904 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 746090 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 193062 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 834984 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 365095 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
998860 rows × 3 columns
\n",
"
"
],
"text/plain": [
" City Hosp_Name Hosp_Address\n",
"Account_Num Provider_Num \n",
"10605 537184 0 0.0 0.0\n",
" 803181 0 0.0 0.0\n",
" 450616 0 0.0 0.0\n",
" 854377 0 0.0 0.0\n",
" 560361 0 0.0 0.0\n",
"... ... ... ...\n",
"70226 815904 0 0.0 0.0\n",
" 746090 0 0.0 0.0\n",
" 193062 0 0.0 0.0\n",
" 834984 0 0.0 0.0\n",
" 365095 0 0.0 0.0\n",
"\n",
"[998860 rows x 3 columns]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"features"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3.0 2285\n",
"2.0 451\n",
"1.0 7937\n",
"0.0 988187\n",
"dtype: int64"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# What are the score totals?\n",
"features.sum(axis=1).value_counts().sort_index(ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# Get the potential matches\n",
"potential_matches = features[features.sum(axis=1) > 1].reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Account_Num | \n",
" Provider_Num | \n",
" City | \n",
" Hosp_Name | \n",
" Hosp_Address | \n",
" Score | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 51216 | \n",
" 268781 | \n",
" 0 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 2.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 55272 | \n",
" 556917 | \n",
" 1 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 3.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 87807 | \n",
" 854637 | \n",
" 1 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 3.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 51151 | \n",
" 783146 | \n",
" 1 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 2.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 11740 | \n",
" 260374 | \n",
" 1 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 3.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Account_Num Provider_Num City Hosp_Name Hosp_Address Score\n",
"0 51216 268781 0 1.0 1.0 2.0\n",
"1 55272 556917 1 1.0 1.0 3.0\n",
"2 87807 854637 1 1.0 1.0 3.0\n",
"3 51151 783146 1 0.0 1.0 2.0\n",
"4 11740 260374 1 1.0 1.0 3.0"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"potential_matches['Score'] = potential_matches.loc[:, 'City':'Hosp_Address'].sum(axis=1)\n",
"potential_matches.head()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Facility Name ST FRANCIS MEDICAL CENTER\n",
"Address 2400 ST FRANCIS DRIVE\n",
"City BRECKENRIDGE\n",
"State MN\n",
"ZIP Code 56520\n",
"County Name WILKIN\n",
"Phone Number (218) 643-3000\n",
"Hospital Type Critical Access Hospitals\n",
"Hospital Ownership Voluntary non-profit - Church\n",
"Name: 51216, dtype: object"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hospital_accounts.loc[51216,:]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Provider Name SAINT FRANCIS MEDICAL CENTER\n",
"Provider Street Address 211 ST FRANCIS DR\n",
"Provider City CAPE GIRARDEAU\n",
"Provider State MO\n",
"Provider Zip Code 63703\n",
"Total Discharges 141\n",
"Average Covered Charges 42515.1\n",
"Average Total Payments 5902.24\n",
"Average Medicare Payments 4993.43\n",
"Name: 268781, dtype: object"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hospital_reimbursement.loc[268781,:]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"# Add some convenience columns for comparing data\n",
"hospital_accounts['Acct_Name_Lookup'] = hospital_accounts[[\n",
" 'Facility Name', 'Address', 'City', 'State'\n",
"]].apply(lambda x: '_'.join(x), axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"hospital_reimbursement['Reimbursement_Name_Lookup'] = hospital_reimbursement[[\n",
" 'Provider Name', 'Provider Street Address', 'Provider City',\n",
" 'Provider State'\n",
"]].apply(lambda x: '_'.join(x), axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"reimbursement_lookup = hospital_reimbursement[['Reimbursement_Name_Lookup']].reset_index()\n",
"account_lookup = hospital_accounts[['Acct_Name_Lookup']].reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Account_Num | \n",
" Acct_Name_Lookup | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 10605 | \n",
" SAGE MEMORIAL HOSPITAL_STATE ROUTE 264 SOUTH 1... | \n",
"
\n",
" \n",
" 1 | \n",
" 24250 | \n",
" WOODRIDGE BEHAVIORAL CENTER_600 NORTH 7TH STRE... | \n",
"
\n",
" \n",
" 2 | \n",
" 10341 | \n",
" DOUGLAS GARDENS HOSPITAL_5200 NE 2ND AVE_MIAMI_FL | \n",
"
\n",
" \n",
" 3 | \n",
" 81101 | \n",
" SUNCOAST BEHAVIORAL HEALTH CENTER_4480 51ST ST... | \n",
"
\n",
" \n",
" 4 | \n",
" 39835 | \n",
" TREASURE VALLEY HOSPITAL_8800 WEST EMERALD STR... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Account_Num Acct_Name_Lookup\n",
"0 10605 SAGE MEMORIAL HOSPITAL_STATE ROUTE 264 SOUTH 1...\n",
"1 24250 WOODRIDGE BEHAVIORAL CENTER_600 NORTH 7TH STRE...\n",
"2 10341 DOUGLAS GARDENS HOSPITAL_5200 NE 2ND AVE_MIAMI_FL\n",
"3 81101 SUNCOAST BEHAVIORAL HEALTH CENTER_4480 51ST ST...\n",
"4 39835 TREASURE VALLEY HOSPITAL_8800 WEST EMERALD STR..."
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"account_lookup.head()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Provider_Num | \n",
" Reimbursement_Name_Lookup | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 839987 | \n",
" SOUTHEAST ALABAMA MEDICAL CENTER_1108 ROSS CLA... | \n",
"
\n",
" \n",
" 1 | \n",
" 519118 | \n",
" MARSHALL MEDICAL CENTER SOUTH_2505 U S HIGHWAY... | \n",
"
\n",
" \n",
" 2 | \n",
" 733073 | \n",
" ELIZA COFFEE MEMORIAL HOSPITAL_205 MARENGO STR... | \n",
"
\n",
" \n",
" 3 | \n",
" 201752 | \n",
" MIZELL MEMORIAL HOSPITAL_702 N MAIN ST_OPP_AL | \n",
"
\n",
" \n",
" 4 | \n",
" 678488 | \n",
" ST VINCENT'S EAST_50 MEDICAL PARK EAST DRIVE_B... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Provider_Num Reimbursement_Name_Lookup\n",
"0 839987 SOUTHEAST ALABAMA MEDICAL CENTER_1108 ROSS CLA...\n",
"1 519118 MARSHALL MEDICAL CENTER SOUTH_2505 U S HIGHWAY...\n",
"2 733073 ELIZA COFFEE MEMORIAL HOSPITAL_205 MARENGO STR...\n",
"3 201752 MIZELL MEMORIAL HOSPITAL_702 N MAIN ST_OPP_AL\n",
"4 678488 ST VINCENT'S EAST_50 MEDICAL PARK EAST DRIVE_B..."
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"reimbursement_lookup.head()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"account_merge = potential_matches.merge(account_lookup, how='left')"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Account_Num | \n",
" Provider_Num | \n",
" City | \n",
" Hosp_Name | \n",
" Hosp_Address | \n",
" Score | \n",
" Acct_Name_Lookup | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 51216 | \n",
" 268781 | \n",
" 0 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 2.0 | \n",
" ST FRANCIS MEDICAL CENTER_2400 ST FRANCIS DRIV... | \n",
"
\n",
" \n",
" 1 | \n",
" 55272 | \n",
" 556917 | \n",
" 1 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 3.0 | \n",
" SCOTTSDALE OSBORN MEDICAL CENTER_7400 EAST OSB... | \n",
"
\n",
" \n",
" 2 | \n",
" 87807 | \n",
" 854637 | \n",
" 1 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 3.0 | \n",
" ORO VALLEY HOSPITAL_1551 EAST TANGERINE ROAD_O... | \n",
"
\n",
" \n",
" 3 | \n",
" 51151 | \n",
" 783146 | \n",
" 1 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 2.0 | \n",
" ST. LUKE'S BEHAVIORAL HOSPITAL, LP_1800 EAST V... | \n",
"
\n",
" \n",
" 4 | \n",
" 11740 | \n",
" 260374 | \n",
" 1 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 3.0 | \n",
" SUMMIT HEALTHCARE REGIONAL MEDICAL CENTER_2200... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Account_Num Provider_Num City Hosp_Name Hosp_Address Score \\\n",
"0 51216 268781 0 1.0 1.0 2.0 \n",
"1 55272 556917 1 1.0 1.0 3.0 \n",
"2 87807 854637 1 1.0 1.0 3.0 \n",
"3 51151 783146 1 0.0 1.0 2.0 \n",
"4 11740 260374 1 1.0 1.0 3.0 \n",
"\n",
" Acct_Name_Lookup \n",
"0 ST FRANCIS MEDICAL CENTER_2400 ST FRANCIS DRIV... \n",
"1 SCOTTSDALE OSBORN MEDICAL CENTER_7400 EAST OSB... \n",
"2 ORO VALLEY HOSPITAL_1551 EAST TANGERINE ROAD_O... \n",
"3 ST. LUKE'S BEHAVIORAL HOSPITAL, LP_1800 EAST V... \n",
"4 SUMMIT HEALTHCARE REGIONAL MEDICAL CENTER_2200... "
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"account_merge.head()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Provider_Num | \n",
" Reimbursement_Name_Lookup | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 839987 | \n",
" SOUTHEAST ALABAMA MEDICAL CENTER_1108 ROSS CLA... | \n",
"
\n",
" \n",
" 1 | \n",
" 519118 | \n",
" MARSHALL MEDICAL CENTER SOUTH_2505 U S HIGHWAY... | \n",
"
\n",
" \n",
" 2 | \n",
" 733073 | \n",
" ELIZA COFFEE MEMORIAL HOSPITAL_205 MARENGO STR... | \n",
"
\n",
" \n",
" 3 | \n",
" 201752 | \n",
" MIZELL MEMORIAL HOSPITAL_702 N MAIN ST_OPP_AL | \n",
"
\n",
" \n",
" 4 | \n",
" 678488 | \n",
" ST VINCENT'S EAST_50 MEDICAL PARK EAST DRIVE_B... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Provider_Num Reimbursement_Name_Lookup\n",
"0 839987 SOUTHEAST ALABAMA MEDICAL CENTER_1108 ROSS CLA...\n",
"1 519118 MARSHALL MEDICAL CENTER SOUTH_2505 U S HIGHWAY...\n",
"2 733073 ELIZA COFFEE MEMORIAL HOSPITAL_205 MARENGO STR...\n",
"3 201752 MIZELL MEMORIAL HOSPITAL_702 N MAIN ST_OPP_AL\n",
"4 678488 ST VINCENT'S EAST_50 MEDICAL PARK EAST DRIVE_B..."
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"reimbursement_lookup.head()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"# Let's build a dataframe to compare\n",
"final_merge = account_merge.merge(reimbursement_lookup, how='left')"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"cols = [\n",
" 'Account_Num', 'Provider_Num', 'Score', 'Acct_Name_Lookup',\n",
" 'Reimbursement_Name_Lookup'\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Account_Num | \n",
" Provider_Num | \n",
" Score | \n",
" Acct_Name_Lookup | \n",
" Reimbursement_Name_Lookup | \n",
"
\n",
" \n",
" \n",
" \n",
" 2660 | \n",
" 94995 | \n",
" 825914 | \n",
" 3.0 | \n",
" CLAIBORNE MEDICAL CENTER_1850 OLD KNOXVILLE HI... | \n",
" CLAIBORNE MEDICAL CENTER_1850 OLD KNOXVILLE HI... | \n",
"
\n",
" \n",
" 1975 | \n",
" 94953 | \n",
" 819181 | \n",
" 3.0 | \n",
" LAKE CHARLES MEMORIAL HOSPITAL_1701 OAK PARK B... | \n",
" LAKE CHARLES MEMORIAL HOSPITAL_1701 OAK PARK B... | \n",
"
\n",
" \n",
" 1042 | \n",
" 94943 | \n",
" 680596 | \n",
" 3.0 | \n",
" VALLEY PRESBYTERIAN HOSPITAL_15107 VANOWEN ST_... | \n",
" VALLEY PRESBYTERIAN HOSPITAL_15107 VANOWEN ST_... | \n",
"
\n",
" \n",
" 2305 | \n",
" 94923 | \n",
" 403151 | \n",
" 3.0 | \n",
" UNIVERSITY COLO HEALTH MEMORIAL HOSPITAL CENTR... | \n",
" UNIVERSITY COLO HEALTH MEMORIAL HOSPITAL CENTR... | \n",
"
\n",
" \n",
" 2512 | \n",
" 94887 | \n",
" 752284 | \n",
" 2.0 | \n",
" NEW YORK-PRESBYTERIAN BROOKLYN METHODIST HOSPI... | \n",
" NEW YORK METHODIST HOSPITAL_506 SIXTH STREET_B... | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 2080 | \n",
" 10165 | \n",
" 188247 | \n",
" 3.0 | \n",
" UTAH VALLEY HOSPITAL_1034 NORTH 500 WEST_PROVO_UT | \n",
" UTAH VALLEY HOSPITAL_1034 NORTH 500 WEST_PROVO_UT | \n",
"
\n",
" \n",
" 1825 | \n",
" 10090 | \n",
" 212069 | \n",
" 3.0 | \n",
" CANONSBURG GENERAL HOSPITAL_100 MEDICAL BOULEV... | \n",
" CANONSBURG GENERAL HOSPITAL_100 MEDICAL BOULEV... | \n",
"
\n",
" \n",
" 2424 | \n",
" 10043 | \n",
" 140535 | \n",
" 3.0 | \n",
" BETH ISRAEL DEACONESS HOSPITAL - PLYMOUTH_275 ... | \n",
" BETH ISRAEL DEACONESS HOSPITAL - PLYMOUTH_275 ... | \n",
"
\n",
" \n",
" 1959 | \n",
" 10020 | \n",
" 210657 | \n",
" 3.0 | \n",
" ST FRANCIS MEDICAL CENTER_309 JACKSON STREET_M... | \n",
" ST FRANCIS MEDICAL CENTER_309 JACKSON STREET_M... | \n",
"
\n",
" \n",
" 1958 | \n",
" 10020 | \n",
" 121670 | \n",
" 2.0 | \n",
" ST FRANCIS MEDICAL CENTER_309 JACKSON STREET_M... | \n",
" UNIVERSITY HEALTH CONWAY_4864 JACKSON STREET_M... | \n",
"
\n",
" \n",
"
\n",
"
2736 rows × 5 columns
\n",
"
"
],
"text/plain": [
" Account_Num Provider_Num Score \\\n",
"2660 94995 825914 3.0 \n",
"1975 94953 819181 3.0 \n",
"1042 94943 680596 3.0 \n",
"2305 94923 403151 3.0 \n",
"2512 94887 752284 2.0 \n",
"... ... ... ... \n",
"2080 10165 188247 3.0 \n",
"1825 10090 212069 3.0 \n",
"2424 10043 140535 3.0 \n",
"1959 10020 210657 3.0 \n",
"1958 10020 121670 2.0 \n",
"\n",
" Acct_Name_Lookup \\\n",
"2660 CLAIBORNE MEDICAL CENTER_1850 OLD KNOXVILLE HI... \n",
"1975 LAKE CHARLES MEMORIAL HOSPITAL_1701 OAK PARK B... \n",
"1042 VALLEY PRESBYTERIAN HOSPITAL_15107 VANOWEN ST_... \n",
"2305 UNIVERSITY COLO HEALTH MEMORIAL HOSPITAL CENTR... \n",
"2512 NEW YORK-PRESBYTERIAN BROOKLYN METHODIST HOSPI... \n",
"... ... \n",
"2080 UTAH VALLEY HOSPITAL_1034 NORTH 500 WEST_PROVO_UT \n",
"1825 CANONSBURG GENERAL HOSPITAL_100 MEDICAL BOULEV... \n",
"2424 BETH ISRAEL DEACONESS HOSPITAL - PLYMOUTH_275 ... \n",
"1959 ST FRANCIS MEDICAL CENTER_309 JACKSON STREET_M... \n",
"1958 ST FRANCIS MEDICAL CENTER_309 JACKSON STREET_M... \n",
"\n",
" Reimbursement_Name_Lookup \n",
"2660 CLAIBORNE MEDICAL CENTER_1850 OLD KNOXVILLE HI... \n",
"1975 LAKE CHARLES MEMORIAL HOSPITAL_1701 OAK PARK B... \n",
"1042 VALLEY PRESBYTERIAN HOSPITAL_15107 VANOWEN ST_... \n",
"2305 UNIVERSITY COLO HEALTH MEMORIAL HOSPITAL CENTR... \n",
"2512 NEW YORK METHODIST HOSPITAL_506 SIXTH STREET_B... \n",
"... ... \n",
"2080 UTAH VALLEY HOSPITAL_1034 NORTH 500 WEST_PROVO_UT \n",
"1825 CANONSBURG GENERAL HOSPITAL_100 MEDICAL BOULEV... \n",
"2424 BETH ISRAEL DEACONESS HOSPITAL - PLYMOUTH_275 ... \n",
"1959 ST FRANCIS MEDICAL CENTER_309 JACKSON STREET_M... \n",
"1958 UNIVERSITY HEALTH CONWAY_4864 JACKSON STREET_M... \n",
"\n",
"[2736 rows x 5 columns]"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final_merge[cols].sort_values(by=['Account_Num', 'Score'], ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"# If you need to save it to Excel -\n",
"#final_merge.sort_values(by=['Account_Num', 'Score'],\n",
"# ascending=False).to_excel('merge_list.xlsx',\n",
"# index=False)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Account_Num | \n",
" Provider_Num | \n",
" Score | \n",
" Acct_Name_Lookup | \n",
" Reimbursement_Name_Lookup | \n",
"
\n",
" \n",
" \n",
" \n",
" 2302 | \n",
" 11035 | \n",
" 550921 | \n",
" 2.0 | \n",
" SAINT JOSEPH HOSPITAL_1375 E 19TH AVE_DENVER_CO | \n",
" PRESBYTERIAN ST LUKES MEDICAL CENTER_1719 E 19... | \n",
"
\n",
" \n",
" 2303 | \n",
" 11035 | \n",
" 706807 | \n",
" 3.0 | \n",
" SAINT JOSEPH HOSPITAL_1375 E 19TH AVE_DENVER_CO | \n",
" SAINT JOSEPH HOSPITAL_1375 EAST 19TH AVE_DENVE... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Account_Num Provider_Num Score \\\n",
"2302 11035 550921 2.0 \n",
"2303 11035 706807 3.0 \n",
"\n",
" Acct_Name_Lookup \\\n",
"2302 SAINT JOSEPH HOSPITAL_1375 E 19TH AVE_DENVER_CO \n",
"2303 SAINT JOSEPH HOSPITAL_1375 E 19TH AVE_DENVER_CO \n",
"\n",
" Reimbursement_Name_Lookup \n",
"2302 PRESBYTERIAN ST LUKES MEDICAL CENTER_1719 E 19... \n",
"2303 SAINT JOSEPH HOSPITAL_1375 EAST 19TH AVE_DENVE... "
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final_merge[final_merge['Account_Num']==11035][cols]"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Account_Num | \n",
" Provider_Num | \n",
" Score | \n",
" Acct_Name_Lookup | \n",
" Reimbursement_Name_Lookup | \n",
"
\n",
" \n",
" \n",
" \n",
" 155 | \n",
" 56375 | \n",
" 390402 | \n",
" 2.0 | \n",
" BROWARD HEALTH IMPERIAL POINT_6401 N FEDERAL H... | \n",
" HOLY CROSS HOSPITAL_4725 N FEDERAL HWY_FORT LA... | \n",
"
\n",
" \n",
" 156 | \n",
" 56375 | \n",
" 451229 | \n",
" 3.0 | \n",
" BROWARD HEALTH IMPERIAL POINT_6401 N FEDERAL H... | \n",
" BROWARD HEALTH IMPERIAL POINT_6401 N FEDERAL H... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Account_Num Provider_Num Score \\\n",
"155 56375 390402 2.0 \n",
"156 56375 451229 3.0 \n",
"\n",
" Acct_Name_Lookup \\\n",
"155 BROWARD HEALTH IMPERIAL POINT_6401 N FEDERAL H... \n",
"156 BROWARD HEALTH IMPERIAL POINT_6401 N FEDERAL H... \n",
"\n",
" Reimbursement_Name_Lookup \n",
"155 HOLY CROSS HOSPITAL_4725 N FEDERAL HWY_FORT LA... \n",
"156 BROWARD HEALTH IMPERIAL POINT_6401 N FEDERAL H... "
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final_merge[final_merge['Account_Num']==56375][cols]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Dedupe the data"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"hospital_dupes = pd.read_csv(\n",
" 'https://github.com/chris1610/pbpython/raw/master/data/hospital_account_dupes.csv',\n",
" index_col='Account_Num')"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Facility Name | \n",
" Address | \n",
" City | \n",
" State | \n",
" ZIP Code | \n",
" County Name | \n",
" Phone Number | \n",
" Hospital Type | \n",
" Hospital Ownership | \n",
"
\n",
" \n",
" Account_Num | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 71730 | \n",
" SAGE MEMORIAL HOSPITAL | \n",
" STATE ROUTE 264 SOUTH 191 | \n",
" GANADO | \n",
" AZ | \n",
" 86505 | \n",
" APACHE | \n",
" (928) 755-4541 | \n",
" Critical Access Hospitals | \n",
" Voluntary non-profit - Private | \n",
"
\n",
" \n",
" 70116 | \n",
" WOODRIDGE BEHAVIORAL CENTER | \n",
" 600 NORTH 7TH STREET | \n",
" WEST MEMPHIS | \n",
" AR | \n",
" 72301 | \n",
" CRITTENDEN | \n",
" (870) 394-4113 | \n",
" Psychiatric | \n",
" Proprietary | \n",
"
\n",
" \n",
" 87991 | \n",
" DOUGLAS GARDENS HOSPITAL | \n",
" 5200 NE 2ND AVE | \n",
" MIAMI | \n",
" FL | \n",
" 33137 | \n",
" MIAMI-DADE | \n",
" (305) 751-8626 | \n",
" Acute Care Hospitals | \n",
" Voluntary non-profit - Private | \n",
"
\n",
" \n",
" 22662 | \n",
" SUNCOAST BEHAVIORAL HEALTH CENTER | \n",
" 4480 51ST ST W | \n",
" BRADENTON | \n",
" FL | \n",
" 34210 | \n",
" MANATEE | \n",
" (941) 792-2222 | \n",
" Psychiatric | \n",
" Proprietary | \n",
"
\n",
" \n",
" 63165 | \n",
" TREASURE VALLEY HOSPITAL | \n",
" 8800 WEST EMERALD STREET | \n",
" BOISE | \n",
" ID | \n",
" 83704 | \n",
" ADA | \n",
" (208) 373-5000 | \n",
" Acute Care Hospitals | \n",
" Proprietary | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Facility Name Address \\\n",
"Account_Num \n",
"71730 SAGE MEMORIAL HOSPITAL STATE ROUTE 264 SOUTH 191 \n",
"70116 WOODRIDGE BEHAVIORAL CENTER 600 NORTH 7TH STREET \n",
"87991 DOUGLAS GARDENS HOSPITAL 5200 NE 2ND AVE \n",
"22662 SUNCOAST BEHAVIORAL HEALTH CENTER 4480 51ST ST W \n",
"63165 TREASURE VALLEY HOSPITAL 8800 WEST EMERALD STREET \n",
"\n",
" City State ZIP Code County Name Phone Number \\\n",
"Account_Num \n",
"71730 GANADO AZ 86505 APACHE (928) 755-4541 \n",
"70116 WEST MEMPHIS AR 72301 CRITTENDEN (870) 394-4113 \n",
"87991 MIAMI FL 33137 MIAMI-DADE (305) 751-8626 \n",
"22662 BRADENTON FL 34210 MANATEE (941) 792-2222 \n",
"63165 BOISE ID 83704 ADA (208) 373-5000 \n",
"\n",
" Hospital Type Hospital Ownership \n",
"Account_Num \n",
"71730 Critical Access Hospitals Voluntary non-profit - Private \n",
"70116 Psychiatric Proprietary \n",
"87991 Acute Care Hospitals Voluntary non-profit - Private \n",
"22662 Psychiatric Proprietary \n",
"63165 Acute Care Hospitals Proprietary "
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hospital_dupes.head()"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"# Deduping follows the same process, you just use 1 single dataframe\n",
"dupe_indexer = recordlinkage.Index()\n",
"dupe_indexer.sortedneighbourhood(left_on='State')\n",
"dupe_candidate_links = dupe_indexer.index(hospital_dupes)\n"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"# Comparison step\n",
"compare_dupes = recordlinkage.Compare()\n",
"compare_dupes.string('City', 'City', threshold=0.85, label='City')\n",
"compare_dupes.string('Phone Number',\n",
" 'Phone Number',\n",
" threshold=0.85,\n",
" label='Phone_Num')\n",
"compare_dupes.string('Facility Name',\n",
" 'Facility Name',\n",
" threshold=0.80,\n",
" label='Hosp_Name')\n",
"compare_dupes.string('Address',\n",
" 'Address',\n",
" threshold=0.85,\n",
" label='Hosp_Address')\n",
"dupe_features = compare_dupes.compute(dupe_candidate_links, hospital_dupes)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" City | \n",
" Phone_Num | \n",
" Hosp_Name | \n",
" Hosp_Address | \n",
"
\n",
" \n",
" Account_Num_1 | \n",
" Account_Num_2 | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 26270 | \n",
" 28485 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 30430 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 43602 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 59585 | \n",
" 28485 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 30430 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 64029 | \n",
" 38600 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 35413 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 81525 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 82916 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 18907 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
981277 rows × 4 columns
\n",
"
"
],
"text/plain": [
" City Phone_Num Hosp_Name Hosp_Address\n",
"Account_Num_1 Account_Num_2 \n",
"26270 28485 0.0 0.0 0.0 0.0\n",
" 30430 0.0 0.0 0.0 0.0\n",
" 43602 0.0 0.0 0.0 0.0\n",
"59585 28485 0.0 0.0 0.0 0.0\n",
" 30430 0.0 0.0 0.0 0.0\n",
"... ... ... ... ...\n",
"64029 38600 0.0 0.0 0.0 0.0\n",
" 35413 0.0 0.0 0.0 0.0\n",
" 81525 0.0 0.0 0.0 0.0\n",
" 82916 0.0 0.0 0.0 0.0\n",
" 18907 0.0 0.0 0.0 0.0\n",
"\n",
"[981277 rows x 4 columns]"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dupe_features"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3.0 7\n",
"2.0 206\n",
"1.0 7859\n",
"0.0 973205\n",
"dtype: int64"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dupe_features.sum(axis=1).value_counts().sort_index(ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"potential_dupes = dupe_features[dupe_features.sum(axis=1) > 2].reset_index()\n",
"potential_dupes['Score'] = potential_dupes.loc[:, 'City':'Hosp_Address'].sum(axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Account_Num_1 | \n",
" Account_Num_2 | \n",
" City | \n",
" Phone_Num | \n",
" Hosp_Name | \n",
" Hosp_Address | \n",
" Score | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 28494 | \n",
" 37949 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 3.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 74835 | \n",
" 77000 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 3.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 24549 | \n",
" 28485 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 3.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 70366 | \n",
" 52654 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 3.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 61685 | \n",
" 24849 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 3.0 | \n",
"
\n",
" \n",
" 5 | \n",
" 51567 | \n",
" 41166 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 3.0 | \n",
"
\n",
" \n",
" 6 | \n",
" 26495 | \n",
" 41079 | \n",
" 1.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 3.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Account_Num_1 Account_Num_2 City Phone_Num Hosp_Name Hosp_Address \\\n",
"0 28494 37949 1.0 1.0 0.0 1.0 \n",
"1 74835 77000 1.0 1.0 0.0 1.0 \n",
"2 24549 28485 1.0 1.0 0.0 1.0 \n",
"3 70366 52654 1.0 1.0 0.0 1.0 \n",
"4 61685 24849 1.0 1.0 0.0 1.0 \n",
"5 51567 41166 1.0 1.0 1.0 0.0 \n",
"6 26495 41079 1.0 1.0 0.0 1.0 \n",
"\n",
" Score \n",
"0 3.0 \n",
"1 3.0 \n",
"2 3.0 \n",
"3 3.0 \n",
"4 3.0 \n",
"5 3.0 \n",
"6 3.0 "
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"potential_dupes.sort_values(by=['Score'], ascending=True)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Facility Name | \n",
" Address | \n",
" City | \n",
" State | \n",
" ZIP Code | \n",
" County Name | \n",
" Phone Number | \n",
" Hospital Type | \n",
" Hospital Ownership | \n",
"
\n",
" \n",
" Account_Num | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 41166 | \n",
" ST VINCENT HOSPITAL | \n",
" 835 S VAN BUREN ST | \n",
" GREEN BAY | \n",
" WI | \n",
" 54301 | \n",
" BROWN | \n",
" (920) 433-0111 | \n",
" Acute Care Hospitals | \n",
" Voluntary non-profit - Church | \n",
"
\n",
" \n",
" 51567 | \n",
" SAINT VINCENT HOSPITAL | \n",
" 835 SOUTH VAN BUREN ST | \n",
" GREEN BAY | \n",
" WI | \n",
" 54301 | \n",
" BROWN | \n",
" (920) 433-0112 | \n",
" Acute Care Hospitals | \n",
" Voluntary non-profit - Church | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Facility Name Address City State \\\n",
"Account_Num \n",
"41166 ST VINCENT HOSPITAL 835 S VAN BUREN ST GREEN BAY WI \n",
"51567 SAINT VINCENT HOSPITAL 835 SOUTH VAN BUREN ST GREEN BAY WI \n",
"\n",
" ZIP Code County Name Phone Number Hospital Type \\\n",
"Account_Num \n",
"41166 54301 BROWN (920) 433-0111 Acute Care Hospitals \n",
"51567 54301 BROWN (920) 433-0112 Acute Care Hospitals \n",
"\n",
" Hospital Ownership \n",
"Account_Num \n",
"41166 Voluntary non-profit - Church \n",
"51567 Voluntary non-profit - Church "
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Take a look at one of the potential duplicates\n",
"hospital_dupes[hospital_dupes.index.isin([51567, 41166])]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7.2 64-bit",
"language": "python",
"name": "python37264bit24e6adcd4e9b4d8bb6ff9239d4d45105"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}