{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# First GitHub Scraper\n",
    "\n",
    "This notebook is a simplified version of the IRE tutorial “Web scraping with Python.” Consult [the original material](https://github.com/ireapps/teaching-guide-python-scraping) to learn more about how to write a web scraper.\n",
    "\n",
    "It downloads the page at `URL`, extracts the rows of the first HTML table it finds, and writes them to `warn-data.csv`."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "\n",
    "import bs4\n",
    "import requests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "URL = 'http://www.dllr.state.md.us/employment/warn.shtml'\n",
    "\n",
    "# Seconds to wait for the server before giving up on the request.\n",
    "REQUEST_TIMEOUT = 30"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download and parse the page"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fail fast: the timeout stops this cell from hanging on an unresponsive\n",
    "# server, and raise_for_status() turns an HTTP error response into an\n",
    "# exception instead of letting an error page be parsed as the real one.\n",
    "warn_page = requests.get(URL, timeout=REQUEST_TIMEOUT)\n",
    "warn_page.raise_for_status()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "soup = bs4.BeautifulSoup(warn_page.text, 'html.parser')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# find() returns None when nothing matches; stop here with a clear message\n",
    "# rather than failing on the next cell with an AttributeError.\n",
    "table = soup.find('table')\n",
    "if table is None:\n",
    "    raise ValueError(f'No <table> element found at {URL}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rows = table.find_all('tr')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write the rows to CSV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column names written as the first line of the CSV.\n",
    "HEADERS = [\n",
    "    'warn_date',\n",
    "    'naics_code',\n",
    "    'biz',\n",
    "    'address',\n",
    "    'wia_code',\n",
    "    'total_employees',\n",
    "    'effective_date',\n",
    "    'type_code'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# newline='' leaves line endings to the csv module; the explicit encoding\n",
    "# avoids depending on the platform's default text encoding.\n",
    "with open('warn-data.csv', 'w', newline='', encoding='utf-8') as outfile:\n",
    "    writer = csv.writer(outfile)\n",
    "    writer.writerow(HEADERS)\n",
    "    # rows[0] is skipped (presumably the table's own header row -- confirm\n",
    "    # against the page) because HEADERS has been written in its place.\n",
    "    for row in rows[1:]:\n",
    "        cells = row.find_all('td')\n",
    "        values = [c.text.strip() for c in cells]\n",
    "        writer.writerow(values)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}