# %% [markdown]
# # COVID-19 Webscraper

# %%
# Vaccination clinic data: https://vaccinefinder.nyc.gov/

# %%
# import packages
from bs4 import BeautifulSoup
import pandas as pd
import codecs
from urllib.request import urlopen

# %%
# Parse a locally saved copy of the Vaccine Finder page.
# The `with` block closes the file handle as soon as BeautifulSoup has
# consumed it (the original left the handle open for the kernel's lifetime).
with codecs.open("NYC COVID-19 and Flu Vaccine Finder.html", 'r', 'utf-8') as f:
    # read in the html file
    soup = BeautifulSoup(f, 'html.parser')

# %%
# Each vaccination site is rendered as one <article> element.
# NOTE(review): the class strings used below look like build-generated
# CSS class names; presumably they change when the site is redeployed, so
# re-check them against a fresh download if the count drops to 0.
articles = soup.find_all('article', class_ = 'sc-kfzAmx jhOYBm')
print('Total ' + str(len(articles)) + ' vaccine stations as of Jan 11')

# %%
def _text_or_na(node):
    """Return ``node.text``, or ``'N/A'`` when the lookup found nothing.

    ``article.find(...)`` returns ``None`` for a missing element; mapping
    that to a placeholder keeps one incomplete listing from aborting the
    whole scrape with an ``AttributeError``.
    """
    return node.text if node is not None else 'N/A'

# %%
# collect data from the website
#
# Rows are accumulated in a plain list and turned into a DataFrame once.
# Growing a frame with ``df.append`` inside the loop copies the whole frame
# on every iteration, and ``DataFrame.append`` no longer exists in pandas 2.x.
records = []
for article in articles:
    # Look the note elements up once per article instead of once per note.
    note_nodes = article.find_all(class_ ="sc-jrAGrp gwlDDV")

    records.append({
        'Name': _text_or_na(
            article.find(class_ ="baseH-sc-1pkt4xd-4 H2-sc-1pkt4xd-6 dtATFP cQDSrG sc-gsTCUz bhdLno")),
        # Address and station type previously called ``.text`` unguarded;
        # they now fall back to 'N/A' like the other fields.
        'Address': _text_or_na(
            article.find(class_ ="CalciteP-sc-1pkt4xd-2 sc-bdfBwQ UHspd cIKpxU")),
        'Station_Type': _text_or_na(
            article.find(class_ ="StyledLabel-sc-19bfcv8-0 eXXBrL")),
        'Phone_Number': _text_or_na(
            article.find(class_ ="CalciteA-sc-1pkt4xd-3 jGotip")),
        'Vaccine_Type': _text_or_na(
            article.find(class_ ="sc-jrAGrp fHSJfL")),
        # Every note is followed by ' , ' (including the last one), which
        # matches the text the original loop produced.
        'Notes': ''.join(node.text + ' , ' for node in note_nodes),
    })

# Explicit ``columns`` fixes the column order and guarantees the columns
# exist even when no articles matched, so later column selection still works.
df = pd.DataFrame(
    records,
    columns=['Name', 'Address', 'Station_Type', 'Phone_Number', 'Vaccine_Type', 'Notes'],
)
# %%
# Put the columns in report order.
# 'Notes' used to be listed twice here, which duplicated the column in the
# frame and in the exported CSV, and made the cell non-idempotent: selecting
# a duplicated label returns every matching column, so each re-run multiplied
# the 'Notes' columns. The first occurrence is kept so the positions of the
# remaining columns are unchanged.
df = df[['Name', 'Address', 'Notes', 'Phone_Number', 'Station_Type', 'Vaccine_Type']]
df.head(2)

# %%
# Export the station table. With pandas' defaults the row index is written
# as the first (unnamed) CSV column; that is left as-is so the file layout
# stays compatible with anything already reading it.
df.to_csv('Vac_stations.csv')