{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "070ce222",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from scrapy.spider import BaseSpider"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "4803dd7e",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "class MySpider(BaseSpider):\n",
    "    \"\"\"Spider that downloads the uni-kl.de start page and saves it to disk.\"\"\"\n",
    "\n",
    "    name = \"uni-kl.de\"\n",
    "    allowed_domains = [\"uni-kl.de\"]\n",
    "    start_urls = [\n",
    "        \"http://www.uni-kl.de/\",\n",
    "    ]\n",
    "\n",
    "    def parse(self, response):\n",
    "        \"\"\"Write the raw body of ``response`` to a file in the working directory.\n",
    "\n",
    "        The file name is the second-to-last \"/\"-separated piece of\n",
    "        ``response.url``, e.g. \"www.uni-kl.de\" for \"http://www.uni-kl.de/\".\n",
    "        \"\"\"\n",
    "        filename = response.url.split(\"/\")[-2]\n",
    "        # A context manager closes the handle deterministically; the bare\n",
    "        # open(...).write(...) form left that to the garbage collector.\n",
    "        with open(filename, 'wb') as page_file:\n",
    "            page_file.write(response.body)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "6a9b5c8b",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# see: http://stackoverflow.com/questions/7993680/running-scrapy-tasks-in-python\n",
    "\n",
    "from scrapy import project, signals\n",
    "from scrapy.conf import settings\n",
    "from scrapy.crawler import CrawlerProcess\n",
    "from scrapy.xlib.pydispatch import dispatcher\n",
    "# Use the documented multiprocessing.Queue factory rather than the internal\n",
    "# multiprocessing.queues.Queue class (which needs a ctx argument on Python 3).\n",
    "from multiprocessing import Process, Queue\n",
    "\n",
    "class CrawlerWorker(Process):\n",
    "    \"\"\"Run one spider in a child process and report the items it produced.\n",
    "\n",
    "    Items are gathered through the ``signals.item_passed`` signal and are put\n",
    "    on the ``results`` queue as a single list when ``run()`` finishes.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, spider, results):\n",
    "        \"\"\"Configure the crawler and subscribe to passed items.\n",
    "\n",
    "        spider  -- spider instance handed to ``crawler.crawl()`` in ``run()``\n",
    "        results -- queue on which ``run()`` puts the list of collected items\n",
    "        \"\"\"\n",
    "        Process.__init__(self)\n",
    "        self.results = results\n",
    "\n",
    "        self.crawler = CrawlerProcess(settings)\n",
    "        # Install the crawler only if the scrapy project has none yet, so the\n",
    "        # worker can be instantiated repeatedly.\n",
    "        if not hasattr(project, 'crawler'):\n",
    "            self.crawler.install()\n",
    "        self.crawler.configure()\n",
    "\n",
    "        self.items = []\n",
    "        self.spider = spider\n",
    "        dispatcher.connect(self._item_passed, signals.item_passed)\n",
    "\n",
    "    def _item_passed(self, item):\n",
    "        \"\"\"Signal handler: remember every item reported via item_passed.\"\"\"\n",
    "        self.items.append(item)\n",
    "\n",
    "    def run(self):\n",
    "        \"\"\"Crawl with the configured spider, then publish the collected items.\"\"\"\n",
    "        try:\n",
    "            self.crawler.crawl(self.spider)\n",
    "            self.crawler.start()\n",
    "            self.crawler.stop()\n",
    "        finally:\n",
    "            # Always publish, even when the crawl raised: the caller blocks\n",
    "            # on results.get() and would otherwise wait forever.\n",
    "            self.results.put(self.items)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "7d601d0c",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# The part below can be called as often as you want\n",
    "results = Queue()\n",
    "crawler = CrawlerWorker(MySpider(), results)\n",
    "crawler.start()\n",
    "for item in results.get():\n",
    "    # Parenthesised form prints the same on Python 2 and is valid on Python 3.\n",
    "    print(item)\n",
    "# Join only after the queue has been drained; joining first can deadlock\n",
    "# while the child is still flushing its queued data.\n",
    "crawler.join()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cd6b067d",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 5
}