{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# **FBO.gov 조달서비스 크롤링**\n", "- https://www.fbo.gov/index.php?s=opportunity&=&pageID=3\n", "- https://www.fbo.gov/index.php?s=opportunity&mode=list&tab=list&tabmode=list&pp=100\n", "- https://www.fbo.gov/index.php?s=opportunity&mode=list&tab=list&tabmode=list&pp=100&pageID=2\n", "- https://www.fbo.gov/index.php?s=opportunity&mode=list&tab=list&tabmode=list" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## **1 전체 페이지에서 개별 목록 수집하기**\n", "https://github.com/SmugZombie/FBO_Parser/blob/master/fbo_parse.py\n", "\n", "```html\n", "
Janitorial Service
,\n", "
70FBR919Q00000072
,\n", "
S -- Utilities and housekeeping services
\n", "```" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# import requests\n", "# def html_download(url, method=\"get\", params=None, data=None):\n", "# userAgent = {\"user-agent\":\"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0\"}\n", "# return requests.request(method, url, params=params, data=data, headers= userAgent)\n", "\n", "# from bs4 import BeautifulSoup\n", "# url = \"https://www.fbo.gov/index.php?s=opportunity&mode=list&tab=list&tabmode=list&pp=100\"\n", "# html = html_download(url)\n", "# dom = BeautifulSoup(html.text, \"html.parser\") # 오류시 \n", "# type(html.text), type(dom)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(str, bs4.BeautifulSoup)" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# with open('fbo.html', \"w\") as f:\n", "# f.write(html.text)\n", "with open('fbo.html', \"r\") as f:\n", " html = f.read()\n", "\n", "from bs4 import BeautifulSoup\n", "dom = BeautifulSoup(html, \"html.parser\") # 오류시 \n", "type(html), type(dom)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Janitorial Service ',\n", " '40--SWEEP WIRE,FAIRED STBD ',\n", " '40--WIRE ROPE ASSEMBLY, ',\n", " '51--JACK,SCREW,MECHANIC ',\n", " 'Notice of intent to sole source: Azure Biosystems Inc. 
']" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 링크가 포함된 페이지만 추출하기\n", "link_pages = [_ for _ in dom.find(\"\", {\"class\":\"list\"}).find_all(\"a\") \n", " if str(_['href']).find(\"s=opportunity&mode=form&id\") != -1]\n", "\n", "# 추출 내용 중 제목만 추출\n", "[_.find(\"\", {\"class\":'solt'}).text for _ in link_pages][:5]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['70FBR919Q00000072',\n", " 'SPE4A619T38X9',\n", " 'SPE4A619T828X',\n", " 'SPE4A619T13F9',\n", " '12905B19R8027']" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 추출 내용 중 ID만 추출\n", "[_.find(\"\", {\"class\":'soln'}).text.strip() for _ in link_pages][:5]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['S -- Utilities and housekeeping services',\n", " '40 -- Rope, cable, chain & fittings',\n", " '40 -- Rope, cable, chain & fittings',\n", " '51 -- Hand tools',\n", " '99 -- Miscellaneous']" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 추출 내용 중 품목명 추출\n", "[_.find(\"\", {\"class\":'solcc'}).text.strip() for _ in link_pages][:5]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'?s=opportunity&mode=form&id=8901b7da304242cf60579b4a94870a6d&tab=core&_cview=0'" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 개별 페이지 링크 주소만 추출\n", "href_links = [_['href'] for _ in link_pages][:5]\n", "href_links[0]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idsolnsolcchref
070FBR919Q00000072Janitorial ServiceS -- Utilities and housekeeping serviceshttps://www.fbo.gov/index.php?s=opportunity&mo...
1SPE4A619T38X940--SWEEP WIRE,FAIRED STBD40 -- Rope, cable, chain & fittingshttps://www.fbo.gov/index.php?s=opportunity&mo...
2SPE4A619T828X40--WIRE ROPE ASSEMBLY,40 -- Rope, cable, chain & fittingshttps://www.fbo.gov/index.php?s=opportunity&mo...
3SPE4A619T13F951--JACK,SCREW,MECHANIC51 -- Hand toolshttps://www.fbo.gov/index.php?s=opportunity&mo...
412905B19R8027Notice of intent to sole source: Azure Biosyst...99 -- Miscellaneoushttps://www.fbo.gov/index.php?s=opportunity&mo...
\n", "
" ], "text/plain": [ " id soln \\\n", "0 70FBR919Q00000072 Janitorial Service \n", "1 SPE4A619T38X9 40--SWEEP WIRE,FAIRED STBD \n", "2 SPE4A619T828X 40--WIRE ROPE ASSEMBLY, \n", "3 SPE4A619T13F9 51--JACK,SCREW,MECHANIC \n", "4 12905B19R8027 Notice of intent to sole source: Azure Biosyst... \n", "\n", " solcc \\\n", "0 S -- Utilities and housekeeping services \n", "1 40 -- Rope, cable, chain & fittings \n", "2 40 -- Rope, cable, chain & fittings \n", "3 51 -- Hand tools \n", "4 99 -- Miscellaneous \n", "\n", " href \n", "0 https://www.fbo.gov/index.php?s=opportunity&mo... \n", "1 https://www.fbo.gov/index.php?s=opportunity&mo... \n", "2 https://www.fbo.gov/index.php?s=opportunity&mo... \n", "3 https://www.fbo.gov/index.php?s=opportunity&mo... \n", "4 https://www.fbo.gov/index.php?s=opportunity&mo... " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "url = \"https://www.fbo.gov/index.php?s=opportunity&mode=list&tab=list&tabmode=list&pp=100\"\n", "\n", "import pandas as pd\n", "import requests\n", "fbo_links = pd.DataFrame()\n", "fbo_links['id'] = [_.find(\"\", {\"class\":'soln'}).text.strip() for _ in link_pages]\n", "fbo_links['soln'] = [_.find(\"\", {\"class\":'solt'}).text.strip() for _ in link_pages]\n", "fbo_links['solcc'] = [_.find(\"\", {\"class\":'solcc'}).text.strip() for _ in link_pages]\n", "# fbo_links['href'] = [_['href'] for _ in link_pages]\n", "fbo_links['href'] = [requests.compat.urljoin(url, _['href']) for _ in link_pages]\n", "fbo_links.head()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 4 }