华东理工大学《金融计算》

网络爬虫实战

蒋志强

In [1]:
import numpy as np
import pandas as pd
import time
import requests
import re

1. 实验报告第二题

In [2]:
# Regex for one table row of 600618 quarterly trade data: a date
# (YYYY-MM-DD) followed by six value columns, each wrapped in a
# <div ... "...ter"> ... </div> cell.
p1 = r'<tr.+?ter">.+?(\d{4}-\d{2}-\d{2}).+?</div.+?ter">(.+?)</div.+?ter">(.+?)</div.+?ter">(.+?)</div>.+?ter">(.+?)</div.+?ter">(.+?)</div.+?ter">(.+?)</div>'

data_path = '../6Python网络爬虫实战/Data_600618/'
# Context manager guarantees the CSV is closed even if a read/parse fails.
with open('data.csv', 'w') as fw:
    for nian in range(1999, 2019):
        for jidu in range(1, 5):
            fname = data_path + 'DataHTML_600618_Year_' + str(nian) + '_Jidu_' + str(jidu) + '.txt'
            with open(fname, 'r', encoding='utf-8') as file:
                html = file.read()
            # re.S lets '.' cross newlines: one record spans several lines.
            # findall returns [] when nothing matches, so no guard needed.
            for line in re.findall(p1, html, re.S):
                fw.write(', '.join(line) + '\n')

2. 爬取新浪新闻标题、链接、时间

In [3]:
def myfun_crawl_sina_news(num_pages=5, max_retries=5):
    """Crawl Sina finance rolling-news list pages and save (url, title, time).

    Parameters
    ----------
    num_pages : int, default 5
        Number of listing pages to fetch (pages 1..num_pages), matching the
        original hard-coded range(1, 6).
    max_retries : int, default 5
        Maximum request attempts per page. The original retried forever,
        which loops infinitely when the page layout changes and the
        check pattern never matches.

    Writes one tab-separated record per news item to 'Data_SinaNews.txt'.
    """
    url = 'https://finance.sina.com.cn/roll/index.d.html?cid=56592&page='
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0'}

    # Narrows the page to the news-list section between two 'hs01' divs.
    pcheck = r'<div\sclass="hs01">.+?<div\sclass="hs01">'
    # Captures (link, title, time) for each <li> news entry.
    p = r'<li><a\shref="(.+?shtml).+?"\starget="_blank">(.+?)</a><span>\((.+?)\)</span></li>'
    objp = re.compile(p, re.S)

    # utf-8 avoids UnicodeEncodeError for Chinese titles on non-UTF-8 locales.
    with open('Data_SinaNews.txt', 'w', encoding='utf-8') as fw:
        for i in range(1, num_pages + 1):
            html = None
            for _ in range(max_retries):
                try:
                    res = requests.get(url + str(i), headers=headers, timeout=10)
                    res.encoding = 'utf-8'
                    # re.search returns None when the layout check fails;
                    # handle it explicitly instead of letting a bare except
                    # mask the AttributeError from .group().
                    mcheck = re.search(pcheck, res.text, re.S)
                    if mcheck:
                        html = mcheck.group()
                        break
                except requests.RequestException:
                    print('failing to crawl the data because of timeout')
                # Random pause between attempts to avoid hammering the server.
                time.sleep(np.random.randint(10, 30))
            if html is None:
                continue  # give up on this page after max_retries attempts
            for line in objp.findall(html):
                fw.write('{:s}\t{:s}\t{:s}\n'.format(line[0], line[1], line[2]))
myfun_crawl_sina_news()

3. 读取新浪接口数据

In [5]:
# Sina K-line API: 'scale' is the bar size in minutes (240 = daily bars),
# 'ma' is the moving-average option ('no' disables it), and 'datalen'
# caps the number of records returned. Response covers open/high/low/
# close/volume per day.
res = requests.get('http://money.finance.sina.com.cn/quotes_service/api/json_v2.php/'
                   'CN_MarketData.getKLineData?symbol=sh601633&scale=240&ma=no&datalen=10000')
data_json = res.json()

# Context manager closes the file even if a record is malformed; header
# written without stray spaces so column names parse cleanly as CSV.
with open('data_sina_api.txt', 'w') as fw:
    fw.write('day,open,high,low,close,volume\n')
    for dj in data_json:
        fw.write('{:s},{:s},{:s},{:s},{:s},{:s}\n'.format(
            dj['day'], dj['open'], dj['high'],
            dj['low'], dj['close'], dj['volume']))

stock_data = pd.DataFrame(data_json)
print(stock_data)
             day    open    high     low   close     volume
0     2011-09-28  12.800  12.800  11.810  11.850   68260880
1     2011-09-29  11.600  12.700  11.510  12.380   55975244
2     2011-09-30  12.280  12.680  12.020  12.460   38487768
3     2011-10-10  12.460  12.470  11.210  11.320   34180024
4     2011-10-11  11.600  12.450  11.530  12.330   55751612
...          ...     ...     ...     ...     ...        ...
2547  2022-05-06  23.950  24.060  23.660  23.810   22797849
2548  2022-05-09  23.500  23.870  23.070  23.330   16721101
2549  2022-05-10  22.500  23.220  22.200  23.050   25281828
2550  2022-05-11  23.050  25.360  23.050  25.360   31291235
2551  2022-05-12  25.800  27.580  25.450  26.880  104939534

[2552 rows x 6 columns]