{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"\n",
"\n",
"***\n",
"***\n",
"# 数据抓取:\n",
" > # 抓取47年政府工作报告\n",
"***\n",
"***\n",
"\n",
"王成军 \n",
"\n",
"wangchengjun@nju.edu.cn\n",
"\n",
"计算传播网 http://computational-communication.com"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [],
"source": [
"import urllib2\n",
"from urlparse import urljoin\n",
"\n",
"from bs4 import BeautifulSoup"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {
"collapsed": false,
"scrolled": true,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"execution_count": 111,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from IPython.display import display_html, HTML\n",
"HTML('')\n",
"# the webpage we would like to crawl"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Inspect\n",
"\n",
"# · 2016年政府工作报告 | \n",
"\n",
" · 2016年政府工作报告 | \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"data": {
"text/plain": [
"u'./d12qgrdzfbg/201603/t20160318_369509.html'"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Collect the link to each year's report from the index page.\n",
"url = \"http://www.hprc.org.cn/wxzl/wxysl/lczf/\"\n",
"raw_index = urllib2.urlopen(url).read()\n",
"content = raw_index.decode('gb18030')\n",
"soup = BeautifulSoup(content, 'html.parser')\n",
"# Each report link is an anchor nested inside an element with class 'bl'.\n",
"links = soup.select('.bl a')\n",
"first_link = links[0]\n",
"first_link['href']"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# decode\n",
"`urllib2.urlopen(url).read().decode('gb18030')`\n",
"\n",
"# html.parser\n",
"`BeautifulSoup(content, 'html.parser')`"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"47\n"
]
}
],
"source": [
"print len(links)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"./d12qgrdzfbg/201603/t20160318_369509.html\n"
]
}
],
"source": [
"print links[0]['href']"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "subslide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2016年政府工作报告\n"
]
}
],
"source": [
"# links already holds the anchor tags themselves, so index it directly;\n",
"# an anchor has no nested anchor, which makes links[0].a None.\n",
"print links[0]"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "subslide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"./d12qgrdzfbg/201603/t20160318_369509.html\n"
]
}
],
"source": [
"# Read href from the anchor itself; links[0].a is None for an anchor tag,\n",
"# so subscripting it would raise a TypeError.\n",
"print links[0]['href']"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "subslide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"d12qgrdzfbg/201603/t20160318_369509.html\n"
]
}
],
"source": [
"print links[0]['href'].split('./')[1]"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "subslide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"d12qgrdzfbg/201603/t20160318_369509.html\n"
]
}
],
"source": [
"print links[0]['href'].split('./')[1]"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "subslide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201603/t20160318_369509.html\n"
]
}
],
"source": [
"print url + links[0]['href'].split('./')[1]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "subslide"
}
},
"outputs": [
{
"data": {
"text/plain": [
"[u'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201603/t20160318_369509.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201503/t20150318_319434.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201403/t20140315_270863.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201402/t20140214_266528.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie/201402/t20140214_266527.html']"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# urljoin resolves each relative href (e.g. './d12qgrdzfbg/...') against the\n",
"# index URL and also copes with hrefs that do not start with './'.\n",
"hyperlinks = [urljoin(url, link['href']) for link in links]\n",
"hyperlinks[:5]"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"data": {
"text/plain": [
"[u'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201603/t20160318_369509.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201503/t20150318_319434.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201403/t20140315_270863.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/d12qgrdzfbg/201402/t20140214_266528.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie/201402/t20140214_266527.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie/201103/t20110315_153641.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie/201003/t20100315_44772.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie/200908/t20090817_27504.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie/200908/t20090817_27495.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27775.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27765.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27757.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27756.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27753.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_2/200908/t20090818_27744.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_2/200908/t20090818_27741.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_2/200908/t20090818_27738.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_2/200908/t20090818_27737.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_2/200908/t20090818_27736.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_3/200908/t20090818_27709.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_3/200908/t20090818_27708.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_3/200908/t20090818_27707.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_3/200908/t20090818_27706.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_3/200908/t20090818_27705.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_4/200908/t20090818_27702.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_4/200908/t20090818_27700.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_4/200908/t20090818_27699.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_4/200908/t20090818_27678.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_4/200908/t20090818_27644.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_5/200908/t20090818_27642.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_5/200908/t20090818_27640.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_5/200908/t20090818_27616.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_5/200908/t20090818_27615.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_5/200908/t20090818_27614.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_6/200908/t20090818_27613.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_6/200908/t20090818_27612.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_6/200908/t20090818_27611.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_6/200908/t20090818_27567.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_6/200908/t20090818_27566.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_7/200908/t20090818_27565.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_8/200908/t20090818_27564.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_9/200908/t20090818_27562.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_9/200908/t20090818_27563.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_10/200908/t20090818_27561.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_10/200908/t20090818_27560.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_10/200908/t20090818_27559.html',\n",
" u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_10/200908/t20090818_27558.html']"
]
},
"execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hyperlinks"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"data": {
"text/plain": [
"u'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27775.html'"
]
},
"execution_count": 116,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hyperlinks[9] # 2007年有分页"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"execution_count": 113,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from IPython.display import display_html, HTML\n",
"HTML('')\n",
"# 2007年有分页"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "subslide"
}
},
"source": [
"# Inspect 下一页\n",
"\n",
"下一页\n",
"\n",
" 下一页\n",
" \n",
"- a\n",
" - script\n",
" - td"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "subslide"
}
},
"outputs": [],
"source": [
"# First page of a report that is split across several pages.\n",
"url_i = 'http://www.hprc.org.cn/wxzl/wxysl/lczf/dishiyijie_1/200908/t20090818_27775.html'\n",
"raw_page = urllib2.urlopen(url_i).read()\n",
"content = raw_page.decode('gb18030')\n",
"soup = BeautifulSoup(content, 'html.parser')\n",
"\n",
"# Keep the first script tag that is nested inside a table cell.\n",
"td_scripts = soup.select('td script')\n",
"scripts = td_scripts[0]"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scripts"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "subslide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\tvar currentPage = 0;//所在页从0开始\n",
"\tvar prevPage = currentPage-1//上一页\n",
"\tvar 下一页Page = currentPage+1//下一页\n",
"\tvar countPage = 4//共多少页\n",
"\t//document.write(\"共\"+countPage+\"页 \");\n",
"\t\n",
"\t//循环\n",
"\tvar num = 17;\n",
"\tfor(var i=0+(currentPage-1-(currentPage-1)%num) ; i<=(num+(currentPage-1-(currentPage-1)%num))&&(i1){\n",
"\t\t\tif(currentPage==i)\n",
"\t\t\t\tdocument.write(\"【\"+(i+1)+\"】 \");\n",
"\t\t\telse if(i==0)\n",
"\t\t\t\tdocument.write(\"【\"+(i+1)+\"】 \");\n",
"\t\t\telse\n",
"\t\t\t\tdocument.write(\"【\"+(i+1)+\"】 \");\n",
"\t\t}\t\n",
"\t}\n",
"\t\n",
"\tdocument.write(\"
\");\n",
"\t//设置上一页代码\n",
"\tif(countPage>1&¤tPage!=0&¤tPage!=1)\n",
"\t\tdocument.write(\"上一页 \");\n",
"\telse if(countPage>1&¤tPage!=0&¤tPage==1)\n",
"\t\tdocument.write(\"上一页 \");\n",
"\t//else\n",
"\t//\tdocument.write(\"上一页 \");\n",
"\t\n",
"\t\n",
"\t//设置下一页代码 \n",
"\tif(countPage>1&¤tPage!=(countPage-1))\n",
"\t\tdocument.write(\"下一页 \");\n",
"\t//else\n",
"\t//\tdocument.write(\"下一页 \");\n",
"\t\t\t\t\t \n",
"\t\n"
]
}
],
"source": [
"print scripts.text"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "subslide"
}
},
"outputs": [
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The pager script declares e.g. 'var countPage = 4//...': take what sits\n",
"# between 'countPage = ' and the next '//' and convert it to an int.\n",
"pager_js = ''.join(scripts)\n",
"after_marker = pager_js.split('countPage = ')[1]\n",
"countPage = int(after_marker.split('//')[0])\n",
"countPage"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": true,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [],
"source": [
"def crawler(url_i):\n",
"    \"\"\"Download one government work report and return (year, report).\n",
"\n",
"    url_i is the URL of the report's first page; every page is decoded as\n",
"    gb18030. The year is the int made from the first four characters of the\n",
"    span with class 'huang16c'. The report is the concatenated text of all\n",
"    p elements, with the text of any continuation pages appended in order.\n",
"    \"\"\"\n",
"    def fetch_soup(page_url):\n",
"        # Name the parser explicitly so every page is parsed the same way,\n",
"        # regardless of which optional parsers (e.g. lxml) are installed.\n",
"        html = urllib2.urlopen(page_url).read().decode('gb18030')\n",
"        return BeautifulSoup(html, 'html.parser')\n",
"\n",
"    def paragraph_text(page_soup):\n",
"        return ''.join(p.text for p in page_soup('p'))\n",
"\n",
"    soup = fetch_soup(url_i)\n",
"    # attrs is a dict mapping the attribute name to the wanted value.\n",
"    year = int(soup.find('span', {'class': 'huang16c'}).text[:4])\n",
"    parts = [paragraph_text(soup)]\n",
"    # Pagination info: the second script tag is assumed to hold the pager,\n",
"    # which declares e.g. 'var countPage = 4//...' (TODO confirm the index\n",
"    # holds for every report page).\n",
"    scripts = soup.find_all('script')\n",
"    countPage = int(''.join(scripts[1]).split('countPage = ')[1].split('//')[0])\n",
"    # Continuation pages are addressed as first-page name + '_1.html' up to\n",
"    # '_(countPage-1).html'; the range is empty for a single-page report.\n",
"    base = url_i.split('.html')[0]\n",
"    for i in range(1, countPage):\n",
"        parts.append(paragraph_text(fetch_soup(base + '_' + str(i) + '.html')))\n",
"    return year, ''.join(parts)\n"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2016\n",
"2015\n",
"2014\n",
"2013\n",
"2012\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/chengjun/anaconda/lib/python2.7/site-packages/bs4/__init__.py:166: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"lxml\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n",
"\n",
"To get rid of this warning, change this:\n",
"\n",
" BeautifulSoup([your markup])\n",
"\n",
"to this:\n",
"\n",
" BeautifulSoup([your markup], \"lxml\")\n",
"\n",
" markup_type=markup_type))\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"2011\n",
"2010\n",
"2009\n",
"2008\n",
"2007\n",
"2006\n",
"2005\n",
"2004\n",
"2003\n",
"2002\n",
"2001\n",
"2000\n",
"1999\n",
"1998\n",
"1997\n",
"1996\n",
"1995\n",
"1994\n",
"1993\n",
"1992\n",
"1991\n",
"1990\n",
"1989\n",
"1988\n",
"1987\n",
"1986\n",
"1985\n",
"1984\n",
"1983\n",
"1982\n",
"1981\n",
"1980\n",
"1979\n",
"1978\n",
"1975\n",
"1964\n",
"1959\n",
"1960\n",
"1957\n",
"1956\n",
"1955\n",
"1954\n"
]
}
],
"source": [
"# Crawl every report and key its text by year; each year is printed as progress.\n",
"reports = {}\n",
"for report_url in hyperlinks:\n",
"    report_year, report_text = crawler(report_url)\n",
"    print report_year\n",
"    reports[report_year] = report_text"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [],
"source": [
"# Alternative copy of the 2016 report on news.xinhuanet.com; the raw bytes are\n",
"# handed to BeautifulSoup without an explicit decode.\n",
"url2016 = 'http://news.xinhuanet.com/fortune/2016-03/05/c_128775704.htm'\n",
"content = urllib2.urlopen(url2016).read()\n",
"soup = BeautifulSoup(content, 'html.parser')\n",
"paragraphs2016 = [p.text for p in soup('p')]\n",
"report2016 = ''.join(paragraphs2016)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [],
"source": [
"# One line per report: the year, a tab, then the text with its newlines turned into tabs.\n",
"# Years are written in ascending order so the file is identical from run to run\n",
"# (plain dict iteration order is arbitrary).\n",
"# NOTE(review): absolute, machine-specific path; consider a configurable data directory.\n",
"with open('/Users/chengjun/github/cjc2016/data/gov_reports1954-2016.txt', 'wb') as f:\n",
"    for r in sorted(reports):\n",
"        line = str(r)+'\\t'+reports[r].replace('\\n', '\\t') +'\\n'\n",
"        f.write(line.encode('utf-8'))"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true,
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# This is the end.\n",
"> ## Thank you for your attention."
]
}
],
"metadata": {
"celltoolbar": "Slideshow",
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}