{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests  # third-party package; must be installed separately\n",
    "import re\n",
    "import pandas as pd\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "14238\n",
      "14245\n",
      "14252\n",
      "14259\n",
      "14266\n",
      "14273\n",
      "14280\n",
      "14287\n",
      "14294\n",
      "14301\n",
      "14308\n",
      "14315\n",
      "14322\n",
      "14329\n",
      "14336\n",
      "14343\n",
      "14350\n",
      "14357\n",
      "14364\n",
      "14371\n",
      "14378\n",
      "14385\n",
      "14392\n",
      "14399\n",
      "14406\n",
      "14413\n",
      "14420\n",
      "14427\n",
      "14434\n",
      "14441\n"
     ]
    }
   ],
   "source": [
    "# Scrape the lottery.gov.cn kjpls draw pages into `data` as [date, amount, result] rows.\n",
    "header={\n",
    "    'User-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'\n",
    "}\n",
    "base_url='https://www.lottery.gov.cn/kjpls/{}.html'\n",
    "TIMEOUT=10  # seconds; without a timeout a stalled connection blocks the notebook forever\n",
    "\n",
    "# r.text is not HTML-entity-decoded, so the gap next to a value may be a literal\n",
    "# '&nbsp;' or real whitespace (\\s also matches U+00A0); accept either form.\n",
    "SEP=r'(?:&nbsp;|\\s)+'\n",
    "# Compile each pattern once instead of re-scanning the page with duplicated literals.\n",
    "DATE_RE=re.compile(r'期于(.+?)开奖')\n",
    "AMOUNT_RE=re.compile(r'投注总额(.+?)'+SEP+r'元。')\n",
    "RESULT_RE=re.compile(r'开奖结果:'+SEP+r'(.+?)<.+?>')\n",
    "\n",
    "\n",
    "def parse_draw_page(html):\n",
    "    '''Return [date, amount, result] extracted from one draw page's HTML.\n",
    "\n",
    "    Each field is the first capture of its pattern, kept as the raw string.\n",
    "    A missing amount is recorded as -1 (the existing missing-value marker);\n",
    "    a missing date or result is recorded as None instead of raising IndexError.\n",
    "    '''\n",
    "    def first_group(pattern, default):\n",
    "        match=pattern.search(html)\n",
    "        return match.group(1) if match else default\n",
    "\n",
    "    return [\n",
    "        first_group(DATE_RE,None),\n",
    "        first_group(AMOUNT_RE,-1),\n",
    "        first_group(RESULT_RE,None),\n",
    "    ]\n",
    "\n",
    "\n",
    "data=[]\n",
    "# Change the range bounds to control which draw pages are fetched.\n",
    "for i in range(14233,14445):\n",
    "    new_url=base_url.format(i)\n",
    "\n",
    "    # Fetch the page; fail loudly on HTTP errors rather than regex-parsing an error page.\n",
    "    r=requests.get(new_url,headers=header,timeout=TIMEOUT)\n",
    "    r.raise_for_status()\n",
    "    r.encoding=r.apparent_encoding\n",
    "\n",
    "    # The date is stored as the raw page text here; it is converted to a real\n",
    "    # datetime when the CSV is loaded back.\n",
    "    date,amount,result=parse_draw_page(r.text)\n",
    "    data.append([date,amount,result])\n",
    "\n",
    "    # Throttle requests: pause 3 s (and print progress) on every id divisible by 7,\n",
    "    # otherwise pause 2 s on even ids.\n",
    "    if i%7==0:\n",
    "        print(i)\n",
    "        time.sleep(3)\n",
    "    elif i%2==0:\n",
    "        time.sleep(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "14444"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "i"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the scraped rows to CSV, indexed by the draw date.\n",
    "title=['date','amount','result']\n",
    "df=pd.DataFrame(data,columns=title).set_index('date')\n",
    "df.to_csv('data14233--14445-1--na.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the saved rows with usable dtypes.\n",
    "# parse_dates=['date'] cannot parse 'YYYY 年MM 月DD 日' and silently leaves the column\n",
    "# as object, so convert explicitly: collapse every non-digit run to '-' first.\n",
    "# thousands=',' makes amounts such as '17,874,118' load as integers (-1 = missing).\n",
    "df=pd.read_csv('data14233--14445-1--na.csv',thousands=',')\n",
    "df['date']=pd.to_datetime(\n",
    "    df['date'].str.replace(r'\\D+','-',regex=True).str.strip('-'),\n",
    "    format='%Y-%m-%d'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dateamountresult
02014 年01 月14 日-15 3 5
12014 年01 月15 日17,874,1181 7 5
22014 年01 月16 日-19 8 2
32014 年01 月17 日-10 7 1
42014 年01 月18 日-10 2 1
............
1972014 年08 月07 日14,826,2022 0 5
1982014 年08 月08 日15,116,9140 5 6
1992014 年08 月09 日14,274,6667 7 5
2002014 年08 月10 日14,198,1402 1 4
2012014 年08 月11 日15,260,9766 0 9
\n", "

202 rows × 3 columns

\n", "
" ], "text/plain": [ " date amount result\n", "0 2014 年01 月14 日 -1 5 3 5\n", "1 2014 年01 月15 日 17,874,118 1 7 5\n", "2 2014 年01 月16 日 -1 9 8 2\n", "3 2014 年01 月17 日 -1 0 7 1\n", "4 2014 年01 月18 日 -1 0 2 1\n", ".. ... ... ...\n", "197 2014 年08 月07 日 14,826,202 2 0 5\n", "198 2014 年08 月08 日 15,116,914 0 5 6\n", "199 2014 年08 月09 日 14,274,666 7 7 5\n", "200 2014 年08 月10 日 14,198,140 2 1 4\n", "201 2014 年08 月11 日 15,260,976 6 0 9\n", "\n", "[202 rows x 3 columns]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head(-10)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "#data" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "#title=['date','amount','result']\n", "#df1=pd.DataFrame(data,columns=title)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "scrolled": false }, "outputs": [], "source": [ "#df1.info()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 212 entries, 0 to 211\n", "Data columns (total 3 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 date 212 non-null object\n", " 1 amount 212 non-null object\n", " 2 result 212 non-null object\n", "dtypes: object(3)\n", "memory usage: 5.1+ KB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }