"
]
},
"execution_count": 251,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.form"
]
},
{
"cell_type": "code",
"execution_count": 254,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"false\n"
]
}
],
"source": [
"# Print 'true' when b.form is truthy, otherwise 'false'.\n",
"print 'true' if b.form else 'false'"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [],
"source": [
"import random\n",
"import time\n",
"\n",
"def _load_soup(page_url):\n",
"    \"\"\"Download page_url and return it parsed by BeautifulSoup with the lxml parser.\"\"\"\n",
"    response = urllib2.urlopen(page_url)\n",
"    try:\n",
"        content = response.read()  # raw HTML text of the page\n",
"    finally:\n",
"        response.close()  # release the connection even if read() fails\n",
"    return BeautifulSoup(content, 'lxml')\n",
"\n",
"\n",
"def _append_page_records(page_soup, page_num, url, file_name):\n",
"    \"\"\"Parse the 'atl-item' divs of one page and append one line per record.\n",
"\n",
"    Every line written is: page number, thread url, record text, joined by tabs.\n",
"    The file is opened in append mode, so crawling the same thread twice\n",
"    duplicates its lines.\n",
"    \"\"\"\n",
"    # attrs has to be a dict; the former set literal {'class', 'atl-item'} was not one.\n",
"    items = page_soup.find_all('div', {'class': 'atl-item'})\n",
"    # NOTE(review): parsePage is defined in another cell; its records are\n",
"    # presumably unicode strings, since each one is utf-8 encoded here.\n",
"    records = parsePage(items)\n",
"    with open(file_name, 'a') as out:\n",
"        for record in records:\n",
"            out.write(str(page_num) + '\\t' + url + '\\t' + record.encode('utf-8') + '\\n')\n",
"\n",
"\n",
"def crawler(url, file_name):\n",
"    \"\"\"Crawl every page of one thread and append its records to file_name.\n",
"\n",
"    url       -- thread path that is appended to 'http://bbs.tianya.cn'\n",
"    file_name -- output file, written in append mode\n",
"\n",
"    Best effort: any exception is caught and reported together with the\n",
"    thread url, and the function returns None, so a batch run keeps going.\n",
"    \"\"\"\n",
"    try:\n",
"        url_1 = 'http://bbs.tianya.cn' + url  # absolute address of page 1\n",
"        post_soup = _load_soup(url_1)\n",
"        # How many pages does the thread have? The count is the last argument\n",
"        # of the pager form's onsubmit handler.\n",
"        post_form = post_soup.find('div', {'class': 'atl-pages'})\n",
"        # A page without a pager div or without a form is a single-page thread.\n",
"        if post_form is not None and post_form.form:\n",
"            post_pages = int(post_form.form['onsubmit'].split(',')[-1].split(')')[0])\n",
"            # swap the trailing page number of the url for a %d placeholder\n",
"            url_base = '-'.join(url_1.split('-')[:-1]) + '-%d.shtml'\n",
"        else:\n",
"            post_pages = 1\n",
"        # first page (already downloaded)\n",
"        _append_page_records(post_soup, 1, url, file_name)\n",
"        # remaining pages; the range is empty for single-page threads\n",
"        for page_num in range(2, post_pages + 1):\n",
"            time.sleep(random.random())  # random pause of up to 1s between requests\n",
"            flushPrint(page_num)\n",
"            page_soup = _load_soup(url_base % page_num)\n",
"            _append_page_records(page_soup, page_num, url, file_name)\n",
"    except Exception as e:\n",
"        # keep the batch alive, but say which thread failed\n",
"        print 'crawler failed for', url, ':', e"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# 测试"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"7"
]
}
],
"source": [
"# Trial run: crawl a single thread into a separate test file.\n",
"file_name = '/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_2test.txt'\n",
"url = df.link[2]\n",
"crawler(url, file_name)"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# 正式抓取!"
]
},
{
"cell_type": "code",
"execution_count": 417,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/post-free-2849477-1.shtmlThis it the post of : 0\n",
"/post-free-2842180-1.shtmlThis it the post of : 10\n",
"/post-free-3316698-1.shtmlThis it the post of : 20\n",
"/post-free-923387-1.shtmlThis it the post of : 30\n",
"/post-free-4236026-1.shtmlThis it the post of : 40\n",
"/post-free-2850721-1.shtmlThis it the post of : 50\n",
"/post-free-5054821-1.shtmlThis it the post of : 60\n",
"/post-free-3326274-1.shtmlThis it the post of : 70\n",
"/post-free-4236793-1.shtmlThis it the post of : 80\n",
"/post-free-4239792-1.shtmlThis it the post of : 90\n",
"/post-free-5042110-1.shtmlThis it the post of : 100\n",
"/post-free-2241144-1.shtmlThis it the post of : 110\n",
"/post-free-3324561-1.shtmlThis it the post of : 120\n",
"/post-free-3835452-1.shtmlThis it the post of : 130\n",
"/post-free-5045950-1.shtmlThis it the post of : 140\n",
"/post-free-2848818-1.shtmlThis it the post of : 150\n",
"/post-free-3281916-1.shtmlThis it the post of : 160\n",
"/post-free-949151-1.shtmlThis it the post of : 170\n",
"/post-free-2848839-1.shtmlThis it the post of : 180\n",
"/post-free-3228423-1.shtmlThis it the post of : 190\n",
"/post-free-2852970-1.shtmlThis it the post of : 200\n",
"/post-free-3325388-1.shtmlThis it the post of : 210\n",
"/post-free-3835748-1.shtmlThis it the post of : 220\n",
"/post-free-3833431-1.shtmlThis it the post of : 230\n",
"/post-free-3378998-1.shtmlThis it the post of : 240\n",
"/post-free-3359022-1.shtmlThis it the post of : 250\n",
"/post-free-3365791-1.shtmlThis it the post of : 260\n",
"/post-free-3396378-1.shtmlThis it the post of : 270\n",
"/post-free-3835212-1.shtmlThis it the post of : 280\n",
"/post-free-4248593-1.shtmlThis it the post of : 290\n",
"/post-free-3833373-1.shtmlThis it the post of : 300\n",
"/post-free-3847600-1.shtmlThis it the post of : 310\n",
"/post-free-3832970-1.shtmlThis it the post of : 320\n",
"/post-free-4076130-1.shtmlThis it the post of : 330\n",
"/post-free-3835673-1.shtmlThis it the post of : 340\n",
"/post-free-3835434-1.shtmlThis it the post of : 350\n",
"/post-free-3368554-1.shtmlThis it the post of : 360\n",
"/post-free-3832938-1.shtmlThis it the post of : 370\n",
"/post-free-3835075-1.shtmlThis it the post of : 380\n",
"/post-free-3832963-1.shtmlThis it the post of : 390\n",
"/post-free-4250604-1.shtmlThis it the post of : 400\n",
"/post-free-3834828-1.shtmlThis it the post of : 410\n",
"/post-free-3835007-1.shtmlThis it the post of : 420\n",
"/post-free-3838253-1.shtmlThis it the post of : 430\n",
"/post-free-3835167-1.shtmlThis it the post of : 440\n",
"/post-free-3835898-1.shtmlThis it the post of : 450\n",
"/post-free-3835123-1.shtmlThis it the post of : 460\n",
"/post-free-3835031-1.shtml"
]
}
],
"source": [
"# Every thread goes to the same output file, so the path is set once before the loop.\n",
"# crawler() appends to it: running this cell again duplicates the records.\n",
"file_name = '/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_network.txt'\n",
"for k, link in enumerate(df.link):\n",
"    flushPrint(link)\n",
"    if k % 10 == 0:\n",
"        # progress marker every 10 threads\n",
"        print 'This it the post of : ' + str(k)\n",
"    crawler(link, file_name)"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# 读取数据"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [
{
"data": {
"text/plain": [
"8079"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the crawled records back from the tab-separated file.\n",
"dtt = []\n",
"with open('/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_network.txt', 'r') as f:\n",
"    for line in f:\n",
"        # Each line must hold exactly six tab-separated fields (ValueError otherwise).\n",
"        # The third field is unpacked as post_time, not time, so the imported\n",
"        # time module that crawler() calls is not overwritten in the notebook namespace.\n",
"        pnum, link, post_time, author_id, author, content = line.replace('\\n', '').split('\\t')\n",
"        dtt.append([pnum, link, post_time, author_id, author, content])\n",
"len(dtt)"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"data": {
"text/html": [
"