{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Metro line map scraper\n",
    "\n",
    "Downloads every city page linked from https://www.metrolinemap.com/, pulls the lines and stations out of each page's inline script, attaches the continent and country listed on the home page, and writes `metros.json`, `metrosg.json` and `metrosg.zip`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import zipfile\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "from requests import session"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration and helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "url = 'https://www.metrolinemap.com/'\n",
    "\n",
    "# Seconds to wait for the server; without a timeout requests.get can block forever.\n",
    "REQUEST_TIMEOUT = 30\n",
    "\n",
    "# Which of the home page's <a> tags are followed as city pages.\n",
    "# NOTE(review): hand-tuned to the page layout at scraping time - re-check if the site changes.\n",
    "CITY_LINK_SLICE = slice(5, -4)\n",
    "\n",
    "# Which of the home page's '\\n\\n\\n'-separated text chunks are read as the\n",
    "# continent / country / city listing. NOTE(review): hand-tuned as well.\n",
    "GEO_CHUNK_SLICE = slice(35, 226)\n",
    "\n",
    "# Optional per-link choice of how line ids are paired with the station panels\n",
    "# (see scrape_metro). Links that are not listed use 'buttons'.\n",
    "LINE_ORDER_OVERRIDES = {}\n",
    "\n",
    "# City-page title -> home-page label, for metros whose two names differ.\n",
    "# The earlier version also assigned geo['Cleveland RTA Rapid Transit'] to itself,\n",
    "# which had no effect. NOTE(review): if the home page labels Cleveland\n",
    "# differently, add that pair here.\n",
    "GEO_ALIASES = {'London Underground and DLR': 'London Underground'}\n",
    "\n",
    "\n",
    "def fetch_soup(page_url):\n",
    "    \"\"\"Download page_url and return it parsed with BeautifulSoup.\n",
    "\n",
    "    Raises requests.HTTPError on an error status, so an error page is never\n",
    "    parsed as if it were real content.\n",
    "    \"\"\"\n",
    "    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)\n",
    "    response.raise_for_status()\n",
    "    # NOTE(review): no parser is named, so bs4 picks the best one installed.\n",
    "    # Pinning one would make runs reproducible across machines, but may change\n",
    "    # the text layout the hand-tuned slices rely on - verify before pinning.\n",
    "    return BeautifulSoup(response.content)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Collect the city links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Kept under its own name: the per-city scraper uses separate local variables,\n",
    "# so the home page does not have to be downloaded a second time for the geo step.\n",
    "home_soup = fetch_soup(url)\n",
    "links = [anchor['href'] for anchor in home_soup.find_all('a')[CITY_LINK_SLICE]]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Scrape every city page\n",
    "\n",
    "`scrape_metro` reads the page's inline script as plain text, so it depends on the exact wording of that script.\n",
    "\n",
    "The stations of each line are collected by pairing a list of line ids, by position, with the page's `panel` divs. Earlier versions of this notebook kept two commented-out variants of that pairing, annotated \"good for London, branches\" and \"good for Santiago, non-ABC\". They are available as `line_order='parsed'` and `line_order='sorted'`.\n",
    "\n",
    "**TODO:** which pages were actually scraped with those variants is not recorded, so `LINE_ORDER_OVERRIDES` starts empty."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def scrape_metro(link, line_order='buttons'):\n",
    "    \"\"\"Scrape one city page and return its record.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    link : str\n",
    "        URL of the city page.\n",
    "    line_order : {'buttons', 'parsed', 'sorted'}\n",
    "        Which list of line ids is paired, by position, with the page's\n",
    "        'panel' divs when collecting the stations of each line:\n",
    "        'buttons' uses the chkMetroLine checkbox ids (the default),\n",
    "        'parsed' uses the ids in the order they were parsed from the script,\n",
    "        'sorted' uses the ids sorted by line name.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    dict\n",
    "        Keys 'name', 'url', 'desc', 'lines' and 'stations'.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If line_order is unknown or the stations array cannot be located.\n",
    "    \"\"\"\n",
    "    if line_order not in ('buttons', 'parsed', 'sorted'):\n",
    "        raise ValueError('unknown line_order: ' + repr(line_order))\n",
    "\n",
    "    soup = fetch_soup(link)\n",
    "    page_text = soup.text  # .text walks the whole tree, so compute it once\n",
    "\n",
    "    name = soup.find('h1').text\n",
    "    print(name)  # printed before the fragile parsing so a failing page is identifiable\n",
    "    desc = soup.find('div', {'class': 'callout-card-content'}).text.replace('\\n', '')\n",
    "\n",
    "    # Pieces of the page text that follow each 'pathCoordinates' and contain '='.\n",
    "    segments = [seg for seg in page_text.split('pathCoordinates')[1:] if '=' in seg]\n",
    "\n",
    "    # When the text before the first '=' contains '#', it is read as\n",
    "    # '<id>, ... #rrggbb': id up to the first comma, colour = 7 characters from '#'.\n",
    "    colors = {}\n",
    "    for seg in segments:\n",
    "        head = seg[:seg.find('=')].strip()\n",
    "        if '#' in head:\n",
    "            hash_at = head.find('#')\n",
    "            colors[head[:head.find(',')]] = head[hash_at:hash_at + 7]\n",
    "\n",
    "    # Line name: the first parenthesised text after 'highlightedPoly<id> = poly<id>;'.\n",
    "    lines = {}\n",
    "    for line_id in colors:\n",
    "        after = page_text.split('highlightedPoly' + line_id + ' = poly' + line_id + ';')[1]\n",
    "        lines[line_id] = after[after.find('(') + 1:after.find(')')].replace('\"', '').strip()\n",
    "    line_ids = list(lines)\n",
    "\n",
    "    # Text after the first '(' of every button label, minus its last character,\n",
    "    # paired with the line ids by position.\n",
    "    ends = []\n",
    "    for button in soup.find_all('button'):\n",
    "        label = button.text\n",
    "        ends.append(label[label.find('(') + 1:-1])\n",
    "    branches = {line_ids[idx]: end for idx, end in enumerate(ends)}\n",
    "\n",
    "    # Coordinate arrays: segments without any '#'. The text from '[' up to the\n",
    "    # first ']' is kept and a trailing comma before the bracket is dropped.\n",
    "    spaths = {}\n",
    "    for seg in segments:\n",
    "        if '#' not in seg:\n",
    "            array_src = seg[seg.find('['):seg.find(']')].strip() + ']'\n",
    "            spaths[seg[:seg.find('=')].strip()] = array_src.replace(',]', ']')\n",
    "\n",
    "    # The bare lat / lng keys are quoted so that the array parses as JSON.\n",
    "    line_records = [{'path': json.loads(src.replace('lat', '\"lat\"').replace('lng', '\"lng\"')),\n",
    "                     'color': colors[path_id],\n",
    "                     'name': lines[path_id],\n",
    "                     'branch': branches[path_id],\n",
    "                     'id': path_id} for path_id, src in spaths.items()]\n",
    "\n",
    "    # The stations array sits between 'var stations =' and the first ']\\r\\n]'.\n",
    "    start_marker, stop_marker = 'var stations =', ']\\r\\n]'\n",
    "    start = page_text.find(start_marker)\n",
    "    stop = page_text.find(stop_marker)\n",
    "    if start == -1 or stop == -1:\n",
    "        raise ValueError('stations array not found in ' + link)\n",
    "    # +1 skips the character right after the '='; the closing marker is kept.\n",
    "    stations_src = page_text[start + len(start_marker) + 1:stop + len(stop_marker)]\n",
    "    stations = json.loads(stations_src.replace('\\r', '').replace('\\n', '').replace(\"'\", ''))\n",
    "\n",
    "    button_ids = [box['id'].replace('chkMetroLine', '')\n",
    "                  for box in soup.find_all('input', {'name': 'chkMetroLine'})]\n",
    "    orderings = {'buttons': button_ids,\n",
    "                 'parsed': line_ids,\n",
    "                 'sorted': sorted(lines, key=lines.get)}\n",
    "    panels = soup.find_all('div', {'class': 'panel'})  # looked up once, not per line\n",
    "    button_stations = {}\n",
    "    for idx, line_id in enumerate(orderings[line_order]):\n",
    "        for anchor in panels[idx].find_all('a'):\n",
    "            button_stations.setdefault(anchor['href'], set()).add(line_id)\n",
    "\n",
    "    # sorted() instead of list(set(...)) so that the output is identical between runs.\n",
    "    station_records = [{'name': st[0], 'lat': st[1], 'lon': st[2], 'url': st[3],\n",
    "                        'lines': sorted({part[1:part.find('class=color') - 1].strip()\n",
    "                                         for part in st[4].split('title')[1:]}),\n",
    "                        'branches': sorted(button_stations[st[3]])} for st in stations]\n",
    "\n",
    "    return {'name': name, 'url': link, 'desc': desc,\n",
    "            'lines': line_records, 'stations': station_records}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Results keyed by link, plus the links that were scraped completely.\n",
    "# Re-running this cell discards the progress made so far.\n",
    "metros = {}\n",
    "good = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Safe to re-run after a failure: links already in `good` are skipped, and a\n",
    "# record is stored only once its page has been parsed completely.\n",
    "for link in links:\n",
    "    if link in good:\n",
    "        continue\n",
    "    metros[link] = scrape_metro(link, LINE_ORDER_OVERRIDES.get(link, 'buttons'))\n",
    "    good.append(link)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('metros.json', 'w') as out_file:\n",
    "    json.dump(metros, out_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Continent and country\n",
    "\n",
    "The home page lists the cities grouped by continent and country. The listing is read from the page text, so it depends on the blank-line layout of that text."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_geo(chunks):\n",
    "    \"\"\"Map every city label in chunks to its continent and country.\n",
    "\n",
    "    chunks is a list of text pieces. A piece either opens a new continent\n",
    "    (it starts with a newline, or the piece before it is empty), opens a new\n",
    "    country (it starts with a space), or holds just one more city of the\n",
    "    current country. Empty pieces are skipped.\n",
    "\n",
    "    Returns a dict: city label -> {'continent': ..., 'country': ...}.\n",
    "    \"\"\"\n",
    "    continent = ''  # initialised like country, so no branch can hit an unbound name\n",
    "    country = ''\n",
    "    geo = {}\n",
    "    for k, chunk in enumerate(chunks):\n",
    "        if not chunk:\n",
    "            continue\n",
    "        rows = chunk.split('\\n')\n",
    "        if chunk[0] == '\\n':\n",
    "            continent = rows[1].strip()\n",
    "            country = rows[3].strip()\n",
    "            city = rows[-1]\n",
    "        # NOTE(review): for k == 0 this looks at the LAST chunk (negative index);\n",
    "        # kept as it was - confirm whether that is intended.\n",
    "        elif not chunks[k - 1]:\n",
    "            continent = rows[0].strip()  # stripped like in the branch above\n",
    "            country = rows[2].strip()\n",
    "            city = rows[-1]\n",
    "        elif chunk[0] == ' ':\n",
    "            country = rows[0].strip()\n",
    "            city = rows[-1]\n",
    "        else:\n",
    "            city = chunk\n",
    "        city = city.replace('Map', '').strip()\n",
    "        geo[city] = {'continent': continent, 'country': country}\n",
    "    return geo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "geo = parse_geo(home_soup.text.split('\\n\\n\\n')[GEO_CHUNK_SLICE])\n",
    "\n",
    "# Some city pages carry a different title than their home-page label.\n",
    "# A label that is missing is left alone; the next cell prints every metro\n",
    "# that ends up without geo data.\n",
    "for page_name, home_name in GEO_ALIASES.items():\n",
    "    if home_name in geo:\n",
    "        geo[page_name] = geo[home_name]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach the geo data; any name printed here has no match on the home page.\n",
    "for metro in metros:\n",
    "    name = metros[metro]['name']\n",
    "    if name in geo:\n",
    "        metros[metro]['geo'] = geo[name]\n",
    "    else:\n",
    "        print(name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('metrosg.json', 'w') as out_file:\n",
    "    json.dump(metros, out_file)\n",
    "\n",
    "# The archive is closed explicitly so that it is complete on disk when the cell ends.\n",
    "with zipfile.ZipFile('metrosg.zip', 'w', zipfile.ZIP_DEFLATED) as archive:\n",
    "    archive.write('metrosg.json')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}