import numpy as np
import pandas as pd
import time
import requests
import re
# Regex for one table row (<tr ...>) of the saved quote pages. It captures a
# YYYY-MM-DD date followed by the contents of the next six cells whose opening
# markup ends in `ter">` and which are closed by `</div`. Seven groups in total.
p1 = r'<tr.+?ter">.+?(\d{4}-\d{2}-\d{2}).+?</div.+?ter">(.+?)</div.+?ter">(.+?)</div.+?ter">(.+?)</div>.+?ter">(.+?)</div.+?ter">(.+?)</div.+?ter">(.+?)</div>'
data_path = '../6Python网络爬虫实战/Data_600618/'

# Compile once instead of looking the pattern up for every file.
# re.S lets '.' match newlines so a row spread over several lines still matches.
_row_pattern = re.compile(p1, re.S)

# The 'with' block guarantees data.csv is flushed and closed even when one of
# the input files is missing or unreadable (the exception still propagates).
with open('data.csv', 'w') as fw:
    for nian in range(1999, 2019):  # "Year" part of the file name: 1999..2018
        for jidu in range(1, 5):  # "Jidu" part of the file name: 1..4 (presumably the quarter)
            html_path = (data_path + 'DataHTML_600618_Year_' + str(nian)
                         + '_Jidu_' + str(jidu) + '.txt')
            with open(html_path, 'r', encoding='utf-8') as file:
                html = file.read()
            # findall returns one 7-tuple per matched row; an empty list simply
            # writes nothing for that file.
            match = _row_pattern.findall(html)
            for line in match:
                fw.write('{:s}, {:s}, {:s}, {:s}, {:s}, {:s}, {:s}\n'.format(*line))
def myfun_crawl_sina_news():
    """Download pages 1-5 of a Sina Finance rolling-news list and save the links.

    For every page the HTML is requested until a usable response arrives.
    "Usable" means the text contains a span that starts at one
    ``<div class="hs01">`` marker and ends at the next one. Each ``<li>`` link
    entry found in that span is written to ``Data_SinaNews.txt`` as one
    tab-separated line: the link URL (up to and including ``shtml``), the link
    text, and the text inside the parentheses that follow the link.

    Takes no arguments and returns ``None``. Failed attempts are reported on
    stdout and retried after a random pause; there is no retry limit.
    """
    url = 'https://finance.sina.com.cn/roll/index.d.html?cid=56592&page='
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0'}
    pcheck = r'<div\sclass="hs01">.+?<div\sclass="hs01">'
    p = r'<li><a\shref="(.+?shtml).+?"\starget="_blank">(.+?)</a><span>\((.+?)\)</span></li>'
    objcheck = re.compile(pcheck, re.S)
    objp = re.compile(p, re.S)

    # NOTE(review): the output file uses the platform default encoding, as
    # before; confirm that this is what downstream readers expect.
    with open('Data_SinaNews.txt', 'w') as fw:
        for i in range(1, 6):
            while True:
                try:
                    res = requests.get(url + str(i), headers=headers, timeout=10)
                except requests.RequestException:
                    # Only network-level failures are retried here, so
                    # KeyboardInterrupt / SystemExit can still stop the crawl.
                    print('failing to crawl the data because of timeout')
                else:
                    res.encoding = 'utf-8'
                    mcheck = objcheck.search(res.text)
                    if mcheck is not None:
                        html = mcheck.group()
                        break
                    # The page was fetched but the marker pair is missing;
                    # retry as before, but say what actually went wrong.
                    print('failing to crawl the data because the news list block was not found')
                # Random 10-29 second pause before the next attempt.
                time.sleep(np.random.randint(10, 30))
            for line in objp.findall(html):
                fw.write('{:s}\t{:s}\t{:s}\n'.format(line[0], line[1], line[2]))
# Run the news crawl defined above.
myfun_crawl_sina_news()

# Fetch K-line records for sh601633 from the Sina quotes API.
# scale: unit is minutes. This endpoint is quite complete: open, close, high,
#        low and volume.
# ma: moving-average parameter.
# datalen: amount-of-data parameter.
# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error into a clear exception instead of a
# confusing JSON decoding failure.
res = requests.get('http://money.finance.sina.com.cn/quotes_service/api/json_v2.php/CN_MarketData.getKLineData?symbol=sh601633&scale=240&ma=no&datalen=10000', timeout=10)
res.raise_for_status()
data_json = res.json()

# Dump the records as text; 'with' closes the file even if a record is malformed.
# NOTE(review): the header separates names with ', ' while the rows use ','.
# Kept unchanged because consumers of this file are not visible from here.
with open('data_sina_api.txt', 'w') as fw:
    fw.write('day, open, high, low, close, volume\n')
    for dj in data_json:
        fw.write('{:s},{:s},{:s},{:s},{:s},{:s}\n'.format(dj['day'], dj['open'], dj['high'],
                                                          dj['low'], dj['close'], dj['volume']))

# Load the same records into a DataFrame and show them.
stock_data = pd.DataFrame(data_json)
print(stock_data)