{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Download headline images from the Yahoo! News photo page\n",
    "\n",
    "Fetches the page at `url`, collects the `src` of every `<img>` inside the first element with class `headline`, and saves each image into `download_dir`, sleeping `sleep_time_sec` seconds before each download."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import time\n",
    "import urllib.error\n",
    "import urllib.parse\n",
    "import urllib.request\n",
    "\n",
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Page to scrape and the User-Agent header sent with the page request.\n",
    "url = 'https://news.yahoo.co.jp/photo/'\n",
    "ua = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) '\n",
    "      'AppleWebKit/537.36 (KHTML, like Gecko) '\n",
    "      'Chrome/55.0.2883.95 Safari/537.36 ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "req = urllib.request.Request(url, headers={'User-Agent': ua})\n",
    "# Read the body inside `with` so the HTTP response is closed as soon as it is consumed.\n",
    "with urllib.request.urlopen(req) as response:\n",
    "    html = response.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "soup = BeautifulSoup(html, 'html.parser')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "headline = soup.find(class_='headline')\n",
    "if headline is None:\n",
    "    # Fail with a clear message instead of an AttributeError on None.\n",
    "    raise RuntimeError('no element with class headline found at ' + url)\n",
    "\n",
    "img_list = headline.find_all('img')\n",
    "# Skip <img> tags without a src, and resolve relative paths against the page URL\n",
    "# so that every entry can be opened by urllib.\n",
    "url_list = [\n",
    "    urllib.parse.urljoin(url, img.get('src'))\n",
    "    for img in img_list\n",
    "    if img.get('src')\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def download_image(url, dst_path):\n",
    "    \"\"\"Download `url` and write the response body to `dst_path`.\n",
    "\n",
    "    Args:\n",
    "        url: Address to fetch; passed straight to `urllib.request.urlopen`.\n",
    "        dst_path: Destination file, opened in binary write mode.\n",
    "\n",
    "    A `urllib.error.URLError` raised by the request is printed instead of\n",
    "    propagated, and no file is written in that case.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        # The context manager closes the HTTP response even if read() fails.\n",
    "        with urllib.request.urlopen(url) as response:\n",
    "            data = response.read()\n",
    "    except urllib.error.URLError as e:\n",
    "        print(e)\n",
    "        return\n",
    "\n",
    "    with open(dst_path, mode='wb') as f:\n",
    "        f.write(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://giwiz-newstop.c.yimg.jp/im_siggecJNKV0BleHsiAnBMXsewQ---x223-y122-n0-pril-exp30d/q/amd/20171121-00010002-esquire-000-1-view.jpg data/temp/20171121-00010002-esquire-000-1-view.jpg\n",
      "https://giwiz-newstop.c.yimg.jp/im_siggric32PEeS_iH0vtb0LCUZQ---x192-y122-n0-pril-exp30d/q/amd/20171121-00010006-mensplus-000-1-view.jpg data/temp/20171121-00010006-mensplus-000-1-view.jpg\n",
      "https://giwiz-newstop.c.yimg.jp/im_siggdu75GDeqZ7B_Eh0pxdKKeg---x161-y122-n0-pril-exp30d/q/amd/20171121-00000023-mnet-000-0-thumb.jpg data/temp/20171121-00000023-mnet-000-0-thumb.jpg\n",
      "https://giwiz-newstop.c.yimg.jp/im_siggWLXlL2dWHrPr.mrrLLkL9w---x192-y108-n0-pril-exp30d/q/amd/20171121-00000006-ignjapan-000-1-view.jpg data/temp/20171121-00000006-ignjapan-000-1-view.jpg\n",
      "https://giwiz-newstop.c.yimg.jp/im_siggeyzG3wHHU0NxUvrp0xPvPg---x192-y108-n0-pril-exp30d/q/amd/20171121-00010006-giz-000-1-view.jpg data/temp/20171121-00010006-giz-000-1-view.jpg\n",
      "https://giwiz-newstop.c.yimg.jp/im_siggXn_6JufQa4Orbdr3IEWONA---x192-y108-n0-pril-exp30d/q/amd/20171121-00010003-biz_lifeh-000-1-view.jpg data/temp/20171121-00010003-biz_lifeh-000-1-view.jpg\n",
      "https://giwiz-newstop.c.yimg.jp/im_siggSGvaG83y7pmaqZg2Bsxl4Q---x213-y120-n0-pril-exp30d/q/amd/20171121-00010021-abema-000-1-view.jpg data/temp/20171121-00010021-abema-000-1-view.jpg\n",
      "https://giwiz-newstop.c.yimg.jp/im_sigg7rXGpJA7JHwJHGSJzWPBRA---x180-y120-n0-pril-exp30d/q/amd/20171121-00010003-binsider-000-1-view.jpg data/temp/20171121-00010003-binsider-000-1-view.jpg\n",
      "https://giwiz-newstop.c.yimg.jp/im_siggbZPC5gZWE944mVVJ7w2xEw---x179-y119-n0-pril-exp30d/q/amd/20171121-00085892-esse-000-1-view.jpg data/temp/20171121-00085892-esse-000-1-view.jpg\n"
     ]
    }
   ],
   "source": [
    "download_dir = 'data/temp'\n",
    "sleep_time_sec = 1\n",
    "\n",
    "# open(..., 'wb') does not create missing directories, so make sure the target exists.\n",
    "os.makedirs(download_dir, exist_ok=True)\n",
    "\n",
    "# Use a dedicated loop variable so the page `url` defined earlier is not overwritten.\n",
    "for img_url in url_list:\n",
    "    # Name the file after the last path segment only, ignoring any query string.\n",
    "    filename = os.path.basename(urllib.parse.urlsplit(img_url).path)\n",
    "    dst_path = os.path.join(download_dir, filename)\n",
    "    time.sleep(sleep_time_sec)\n",
    "    print(img_url, dst_path)\n",
    "    download_image(img_url, dst_path)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}