import urllib2
from bs4 import BeautifulSoup
from IPython.display import display_html, HTML

HTML('')
# the webpage we would like to crawl

# Index page that lists one link per annual government work report.
url = "http://www.hprc.org.cn/wxzl/wxysl/lczf/"

# urllib2 responses are not context managers, so the connection is released
# explicitly, even when reading or decoding raises.
response = urllib2.urlopen(url)
try:
    # The raw bytes are decoded as GB18030 before being handed to the parser.
    content = response.read().decode('gb18030')
finally:
    response.close()

# The parser is named explicitly so the result does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(content, 'html.parser')

# CSS selector: every <a> nested under an element with class "bl".
# Each element of `links` is therefore an <a> tag itself.
links = soup.select('.bl a')
links[0]['href']
# Inspect the anchors collected from the index page.
# print(...) with a single argument behaves the same under Python 2 and 3.

# Number of report links found.
print(len(links))

# `links` holds <a> tags (selected with '.bl a'), so they are indexed directly.
# Going through `.a` would look for an <a> nested inside the <a>, yield None,
# and make the ['href'] lookup raise TypeError on a fresh kernel.
print(links[0])
print(links[0]['href'])

# The href is relative ('./...'); drop the leading './' ...
print(links[0]['href'].split('./')[1])

# ... and append the remainder to the index URL to get an absolute address.
print(url + links[0]['href'].split('./')[1])
# urljoin lives in different modules on Python 2 and Python 3.
try:
    from urlparse import urljoin  # Python 2
except ImportError:
    from urllib.parse import urljoin  # Python 3

# Resolve every relative href against the index URL.
# urljoin handles './x', '../x' and already-absolute links alike, whereas
# splitting on './' raises IndexError when the prefix is missing and produces
# a wrong URL for '../x'.
hyperlinks = [urljoin(url, anchor['href']) for anchor in links]
hyperlinks[:5]
u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27756.html',\n", " u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27753.html',\n", " u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_2/200908/t20090818_27744.html',\n", " u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_2/200908/t20090818_27741.html',\n", " u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_2/200908/t20090818_27738.html',\n", " u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_2/200908/t20090818_27737.html',\n", " u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_2/200908/t20090818_27736.html',\n", " u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_3/200908/t20090818_27709.html',\n", " u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_3/200908/t20090818_27708.html',\n", " u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_3/200908/t20090818_27707.html',\n", " u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_3/200908/t20090818_27706.html',\n", " u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_3/200908/t20090818_27705.html',\n", " u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_4/200908/t20090818_27702.html',\n", " u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_4/200908/t20090818_27700.html',\n", " u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_4/200908/t20090818_27699.html',\n", " u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_4/200908/t20090818_27678.html',\n", " u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_4/200908/t20090818_27644.html',\n", " u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_5/200908/t20090818_27642.html',\n", " u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_5/200908/t20090818_27640.html',\n", " u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_5/200908/t20090818_27616.html',\n", " u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_5/200908/t20090818_27615.html',\n", " u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_5/200908/t20090818_27614.html',\n", " 
# Show the complete list of report URLs.
hyperlinks

# The tenth link; per the original note, the 2007 report is paginated.
hyperlinks[9] # the 2007 report is paginated

# NOTE(review): the same import already appears in an earlier cell.
from IPython.display import display_html, HTML
HTML('')
# the 2007 report is paginated

# First page of the paginated report used to work out how pagination is encoded.
url_i = 'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27775.html'

# Close the response explicitly; urllib2 responses are not context managers.
response = urllib2.urlopen(url_i)
try:
    content = response.read().decode('gb18030')
finally:
    response.close()
soup = BeautifulSoup(content, 'html.parser')

# First <script> nested under a <td>. Its text declares `countPage`,
# the number of pages the report is split into.
# NOTE: despite the plural name this is a single tag; the name is kept because
# later cells refer to it.
scripts = soup.select('td script')[0]
scripts

# Show the JavaScript that renders the page navigation.
print(scripts.text)

# The page count is written in the script as `countPage = N//...`.
# Joining the tag concatenates its child strings, giving the script source.
countPage = int(''.join(scripts).split('countPage = ')[1].split('//')[0])
countPage


def _fetch_soup(page_url):
    """Download page_url, decode it as GB18030 and parse it with html.parser."""
    response = urllib2.urlopen(page_url)
    try:
        html = response.read().decode('gb18030')
    finally:
        # urllib2 responses are not context managers; close explicitly.
        response.close()
    return BeautifulSoup(html, 'html.parser')


def _count_pages(soup):
    """Return the page count declared as `countPage = N` in one of the page's scripts."""
    for script in soup.find_all('script'):
        source = script.string or ''  # external scripts have no inline text
        if 'countPage = ' in source:
            return int(source.split('countPage = ')[1].split('//')[0])
    raise ValueError('no script declaring countPage was found on the page')


def _page_text(soup):
    """Concatenate the text of every <p> element on the page."""
    return ''.join(p.text for p in soup.find_all('p'))


def crawler(url_i):
    """Fetch one government work report, following its pagination.

    Parameters
    ----------
    url_i : str
        URL of the first page of the report; it must contain '.html'.
        Page k (k >= 1) is fetched from the same URL with '_k' inserted
        before '.html'.

    Returns
    -------
    (int, unicode)
        The year, taken from the first four characters of the
        <span class="huang16c"> element, and the text of all <p> elements
        from every page, concatenated in page order.

    Raises
    ------
    AttributeError
        If the first page has no <span class="huang16c"> element.
    ValueError
        If the year is not numeric or no script declares `countPage`.
    """
    first_page = _fetch_soup(url_i)

    # attrs must be a dict; a set literal only worked because bs4 treats any
    # non-dict attrs value as a class filter.
    year = int(first_page.find('span', {'class': 'huang16c'}).text[:4])

    parts = [_page_text(first_page)]
    base = url_i.split('.html')[0]
    # range(1, n) is empty for single-page reports, so no special case is needed.
    for i in range(1, _count_pages(first_page)):
        # Child pages use the same explicit parser as the first page.
        parts.append(_page_text(_fetch_soup(base + '_' + str(i) + '.html')))
    return year, ''.join(parts)
# Crawl the text of all 47 government work reports, keyed by year.
reports = {}
for link in hyperlinks:
    year, report = crawler(link)
    print(year)  # progress indicator; same output under Python 2 and 3
    reports[year] = report

# The 2016 report as published by Xinhua.
# `report2016` is not referenced again in the visible notebook.
url2016 = 'http://news.xinhuanet.com/fortune/2016-03/05/c_128775704.htm'
response = urllib2.urlopen(url2016)
try:
    # Raw bytes are passed on undecoded; BeautifulSoup detects the encoding.
    content = response.read()
finally:
    response.close()
soup = BeautifulSoup(content, 'html.parser')
report2016 = ''.join(s.text for s in soup('p'))

# Write one tab-separated line per report: year, then the text with its
# newlines turned into tabs so each report stays on a single line.
# NOTE(review): the path is absolute and machine-specific; consider making it
# relative to a configurable data directory.
with open('/Users/chengjun/github/cjc2016/data/gov_reports1954-2016.txt', 'wb') as f:
    # Iterate in year order so the file is deterministic and chronological
    # instead of following dict iteration order.
    for r in sorted(reports):
        line = str(r)+'\t'+reports[r].replace('\n', '\t')+'\n'
        f.write(line.encode('utf-8'))