{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# COVID-19 Webscraper"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Vaccination clinic data: <https://vaccinefinder.nyc.gov/>\n",
    "\n",
    "The cells below parse a local HTML file, `NYC COVID-19 and Flu Vaccine Finder.html` (presumably a saved copy of that page), into one table row per vaccine station."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import packages (standard library first, then third-party)\n",
    "import codecs\n",
    "from urllib.request import urlopen\n",
    "\n",
    "import pandas as pd\n",
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# file address\n",
    "HTML_PATH = 'NYC COVID-19 and Flu Vaccine Finder.html'\n",
    "\n",
    "# read in the html file; the with-block closes the handle once parsing is done\n",
    "# (BeautifulSoup reads the whole stream while it builds the tree)\n",
    "with codecs.open(HTML_PATH, 'r', 'utf-8') as f:\n",
    "    soup = BeautifulSoup(f, 'html.parser')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total 1136 vaccine stations as of Jan 11\n"
     ]
    }
   ],
   "source": [
    "# one <article> element with this class attribute is counted as one vaccine station\n",
    "articles = soup.find_all('article', class_='sc-kfzAmx jhOYBm')\n",
    "print(f'Total {len(articles)} vaccine stations as of Jan 11')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# class attribute values used to locate each field inside a station <article>\n",
    "NAME_CLASS = 'baseH-sc-1pkt4xd-4 H2-sc-1pkt4xd-6 dtATFP cQDSrG sc-gsTCUz bhdLno'\n",
    "ADDRESS_CLASS = 'CalciteP-sc-1pkt4xd-2 sc-bdfBwQ UHspd cIKpxU'\n",
    "STATION_TYPE_CLASS = 'StyledLabel-sc-19bfcv8-0 eXXBrL'\n",
    "PHONE_CLASS = 'CalciteA-sc-1pkt4xd-3 jGotip'\n",
    "VACCINE_TYPE_CLASS = 'sc-jrAGrp fHSJfL'\n",
    "NOTE_CLASS = 'sc-jrAGrp gwlDDV'\n",
    "\n",
    "# output columns, in the same order as the keys of each row dict\n",
    "COLUMNS = ['Name', 'Address', 'Station_Type', 'Phone_Number', 'Vaccine_Type', 'Notes']\n",
    "\n",
    "\n",
    "def text_or_na(parent, css_class):\n",
    "    \"\"\"Return the text of the first element under `parent` matching `css_class`, or 'N/A' if there is none.\"\"\"\n",
    "    tag = parent.find(class_=css_class)\n",
    "    return tag.text if tag is not None else 'N/A'\n",
    "\n",
    "\n",
    "def station_record(article):\n",
    "    \"\"\"Build the row dict (one value per entry of COLUMNS) for a single station <article>.\"\"\"\n",
    "    # look the note elements up once instead of once per note\n",
    "    note_tags = article.find_all(class_=NOTE_CLASS)\n",
    "    return {\n",
    "        'Name': text_or_na(article, NAME_CLASS),\n",
    "        'Address': text_or_na(article, ADDRESS_CLASS),\n",
    "        'Station_Type': text_or_na(article, STATION_TYPE_CLASS),\n",
    "        'Phone_Number': text_or_na(article, PHONE_CLASS),\n",
    "        'Vaccine_Type': text_or_na(article, VACCINE_TYPE_CLASS),\n",
    "        # every note is followed by ' , ' (including the last one), '' when there are no notes\n",
    "        'Notes': ''.join(tag.text + ' , ' for tag in note_tags),\n",
    "    }\n",
    "\n",
    "\n",
    "# collect data from the website: gather every row first and build the frame once;\n",
    "# growing a frame with DataFrame.append in a loop is quadratic and the method\n",
    "# no longer exists in pandas 2.0+\n",
    "df = pd.DataFrame([station_record(article) for article in articles], columns=COLUMNS)"
   ]
  },
  { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Name | \n", "Address | \n", "Notes | \n", "Phone_Number | \n", "Station_Type | \n", "Vaccine_Type | \n", "Notes | \n", "
---|---|---|---|---|---|---|---|
0 | \n", "Abyssinian Baptist Church- Pop Up | \n", "132 West 138th Street, Manhattan | \n", "$100 incentive available , Walk-up vaccination... | \n", "(877) 829-4692 | \n", "Pop Up - Van | \n", "Vaccines offered:Pfizer (12+)Johnson & Johnson... | \n", "$100 incentive available , Walk-up vaccination... | \n", "
1 | \n", "AMC Magic Johnson Harlem- Pop Up | \n", "2309 Frederick Douglass Blvd, Manhattan | \n", "$100 incentive available , Walk-up vaccination... | \n", "(877) 829-4692 | \n", "Pop Up - Bus | \n", "Vaccine offered:Pfizer (5-11) | \n", "$100 incentive available , Walk-up vaccination... | \n", "