# --- Cell: imports ------------------------------------------------------------
import requests, pandas as pd, numpy as np
from requests import session
from bs4 import BeautifulSoup

# --- Cell: connectivity / cookie probe ----------------------------------------
# Fetch one ranking page and list which cookies the site sets.
url = 'http://www.omnibus.ro/index.php/hu/szekelyfoldi-top-listak/arbevetel/also-haromszek-2015-2018#oldal'

# A timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() surfaces HTTP errors here instead of further down.
r = requests.get(url, timeout=30)
r.raise_for_status()

# Only the cookie *names* are printed. The values (e.g. PHPSESSID) are live
# session identifiers and would otherwise be stored in the saved notebook output.
for c in r.cookies:
    print(c.name)

# --- Cell: echo the probed URL ------------------------------------------------
url
# --- Cell: scraping helpers ---------------------------------------------------
# The two ranking categories used to be scraped by two copy-pasted cells that
# differed only in the URL segment; both now go through scrape_ranking().
import re                  # local to this cell: strips leftover tags from scraped fields
from io import StringIO    # local to this cell: read_html() expects a file-like object

BASE_URL = 'http://www.omnibus.ro/index.php/hu/szekelyfoldi-top-listak/'
PERIOD_SUFFIX = '-2015-2018#oldal'
REQUEST_TIMEOUT = 30  # seconds, applied to every HTTP request

regions = ['also-haromszek', 'felso-haromszek', 'csikszek',
           'udvarhelyszek', 'marosszek', 'gyergyoszek']

# Markers used to cut fields out of each company detail page.
COORD_MARKER = 'GLatLng'   # coordinates are read from the first "GLatLng(...)" call
ADDRESS_LABEL = 'Cím:'
# NOTE(review): in the copy of this notebook that was reviewed, the HTML-tag
# string literals had been stripped out of the source. The three markers below
# are reconstructions - confirm them against a real detail page before trusting
# the 'kws' and 'cms' columns.
KEYWORDS_START = '<meta name="keywords" content="'   # presumably the keywords meta tag
KEYWORDS_END = '"'
ADDRESS_END = '<br'


def _between(text, start_marker, end_marker):
    """Return the text between the first `start_marker` and the next `end_marker`.

    Returns None when either marker is missing, instead of letting the -1 from
    str.find() leak into a slice and silently return an arbitrary chunk of page.
    """
    start = text.find(start_marker)
    if start == -1:
        return None
    start += len(start_marker)
    end = text.find(end_marker, start)
    if end == -1:
        return None
    return text[start:end]


def _strip_tags(fragment):
    """Remove HTML tags from `fragment` and trim surrounding whitespace."""
    return re.sub(r'<[^>]*>', '', fragment).strip()


def _parse_detail_page(html):
    """Extract (coord, keywords, address) from the decoded HTML of a detail page.

    coord is the comma-split argument list of the first GLatLng(...) call.
    Any field whose markers are not found is returned as None.
    """
    raw_coord = _between(html, COORD_MARKER, ')')
    # raw_coord still starts with the character right after the marker (the
    # opening parenthesis), which the original code skipped with a +1 offset.
    coord = raw_coord[1:].split(',') if raw_coord else None
    keywords = _between(html, KEYWORDS_START, KEYWORDS_END)
    raw_address = _between(html, ADDRESS_LABEL, ADDRESS_END)
    address = _strip_tags(raw_address) if raw_address is not None else None
    return coord, keywords, address


def _fetch_detail(http, href):
    """Download one detail page and parse it; returns (None, None, None) on HTTP failure.

    A single unreachable page is reported and skipped so that it does not abort
    a scrape that takes several minutes per category.
    """
    try:
        page = http.get(href, timeout=REQUEST_TIMEOUT)
        page.raise_for_status()
    except requests.RequestException as err:
        print('skipping', href, '->', err)
        return None, None, None
    # The page is searched as decoded text. repr() of the raw bytes escapes every
    # non-ASCII character, so a label such as 'Cím:' could never be found in it.
    if 'charset' not in page.headers.get('content-type', '').lower():
        page.encoding = page.apparent_encoding
    return _parse_detail_page(page.text)


def _clean_ranking_table(raw, region):
    """Turn the raw listing table into the frame that gets enriched and exported.

    Row 0 supplies the column labels, shifted one position to the right (the
    first column is named 0 and becomes the index). Rows 0-1 and the last row
    are dropped, as are the first and last of the remaining columns. 'region'
    and 'nr' (a copy of the index) are appended.
    """
    table = raw.copy()
    table.columns = [0] + list(table.loc[0])[:-1]
    table = table.loc[2:].set_index(0)
    table = table[table.columns[1:-1]]
    table = table.iloc[:-1].copy()   # explicit copy: columns are assigned below
    table['region'] = region
    table['nr'] = table.index
    return table


def scrape_ranking(category, region_names=regions):
    """Scrape one top-list category for every region.

    category     -- URL segment of the list: 'alkalmazott' or 'arbevetel'
                    (presumably the employee-count and revenue rankings).
    region_names -- region slugs to visit; defaults to the module-level `regions`.

    Returns a list with one DataFrame per region, in the order given. Each frame
    holds the cleaned ranking table plus 'coords', 'kws' and 'cms' columns read
    from the company detail pages linked from the listing.
    """
    frames = []
    # One session for the listing and all detail pages, so connections are reused.
    with session() as http:
        for region in region_names:
            listing_url = BASE_URL + category + '/' + region + PERIOD_SUFFIX
            response = http.get(listing_url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()

            raw_table = pd.read_html(StringIO(response.text))[1]
            table = _clean_ranking_table(raw_table, region)

            # NOTE(review): no parser is named, so bs4 uses whichever one is
            # installed. Pin one once table index 3 is confirmed stable under it.
            soup = BeautifulSoup(response.content)
            links = soup.find_all('table')[3].find_all('a')

            # Keep rows and links aligned even if their counts differ.
            n = min(len(table), len(links))
            table = table.iloc[:n].copy()
            details = [_fetch_detail(http, link['href']) for link in links[:n]]

            table['coords'] = [d[0] for d in details]
            table['kws'] = [d[1] for d in details]
            table['cms'] = [d[2] for d in details]
            frames.append(table)
            print(f'{region}: {n} companies')
    return frames


# --- Cell: scrape both categories (slow: several minutes each) ------------------
dfs = scrape_ranking('alkalmazott')
dfs2 = scrape_ranking('arbevetel')

# --- Cell: align, combine and export ------------------------------------------
# The sixth region's frame gets the first region's column labels so the frames
# stack cleanly. Presumably its header row differs on the site - TODO confirm
# that the columns really are in the same order.
dfs[5].columns = dfs[0].columns

dfsi = pd.concat(dfs).reset_index()
dfsi2 = pd.concat(dfs2).reset_index()

dfsi.to_csv('dfsi.csv', sep=';')
dfsi2.to_csv('dfsi2.csv', sep=';')
{}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }