{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Make a new `get_XXX_url`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```python\n", "@staticmethod\n", "def get_soup_url(url):\n", " return url\n", "\n", "@staticmethod\n", "def get_pdf_url(url):\n", " return url\n", "\n", "@staticmethod\n", "def get_tex_url(url):\n", " return url\n", "```" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m[success]\u001b[0m local driver can be built.\n", "\u001b[31m[failure]\u001b[0m remote driver can't be built.\n", "DRIVER_TYPE: \u001b[32mlocal\u001b[0m\n" ] } ], "source": [ "import sys\n", "sys.path.append(\"../tests/\")\n", "import inspect\n", "from data import JournalData\n", "from gummy import journals\n", "from gummy.journals import *" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Genetics\n" ] } ], "source": [ "journal = input().lower()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Current Method" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "crawler = journals.get(journal)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " @staticmethod\n", " def get_soup_url(url):\n", " return url\n", "\n", " @staticmethod\n", " def get_pdf_url(url):\n", " return url\n", "\n", " @staticmethod\n", " def get_tex_url(url):\n", " return url\n", "\n" ] } ], "source": [ "print(inspect.getsource(crawler.get_soup_url))\n", "print(inspect.getsource(crawler.get_pdf_url))\n", "print(inspect.getsource(crawler.get_tex_url))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Visit web page and Collect various url" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "https://www.genetics.org/content/176/4/2177\n" ] } ], "source": [ "url = JournalData.get(journal)\n", "print(url)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "urls = [\n", " \"https://www.genetics.org/content/176/4/2177\",\n", " \"https://www.genetics.org/content/genetics/176/4/2177.full.pdf\",\n", " \"https://www.genetics.org/content/176/4/2177.full.pdf\",\n", "]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create URL conversion function (staticmethod)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### `get_soup_url`" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "https://www.genetics.org/content/176/4/2177\n", "https://www.genetics.org/content/176/4/2177\n", "https://www.genetics.org/content/176/4/2177\n" ] } ], "source": [ "for url in urls:\n", " print(url.rstrip(\".full.pdf\").replace(\"/content/genetics/\", \"/content/\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### `get_pdf_url`" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "https://www.genetics.org/content/genetics/176/4/2177.full.pdf\n", "https://www.genetics.org/content/genetics/176/4/2177.full.pdf\n", "https://www.genetics.org/content/genetics/176/4/2177.full.pdf\n" ] } ], "source": [ "for url in urls:\n", " print(url.rstrip(\".full.pdf\").replace(\"/content/genetics/\", \"/content/\").replace(\"/content/\", \"/content/genetics/\")+\".full.pdf\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### `get_tex_url`" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Created Conversion Method" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " @staticmethod\n", " def get_soup_url(url):\n", " return url.rstrip(\".full.pdf\").replace(\"/content/genetics/\", \"/content/\")\n", "\n", " @staticmethod\n", " def get_pdf_url(url):\n", " return GeneticsCrawler.get_soup_url(url).replace(\"/content/\", \"/content/genetics/\")+\".full.pdf\" \n", "\n", " @staticmethod\n", " def get_tex_url(url):\n", " return url\n", "\n" ] } ], "source": [ "print(inspect.getsource(crawler.get_soup_url))\n", "print(inspect.getsource(crawler.get_pdf_url))\n", "print(inspect.getsource(crawler.get_tex_url))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "hide_input": false, "kernelspec": { "display_name": "Python 3 (/usr/local/bin/)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }