{ "cells": [ { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "wx0Sg_fRM5Kh" }, "source": [ "# Notebook [2]: Using the PDF converter" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "Y5PrFhdQMeBF" }, "source": [ "\n", "\n", "This notebook shows how to use the PDF converter to create an input dataframe for the cdQA pipeline from a directory of PDF files.\n" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "f58-FXmbMfjz" }, "source": [ "***Note:*** *To run this notebook you will need to have access to a GPU. If you are using Colab, you will need to install `cdQA` by executing `!pip install cdqa` in a cell.* " ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2019-07-20T13:41:40.814076Z", "start_time": "2019-07-20T13:41:39.440654Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 76 }, "colab_type": "code", "collapsed": true, "id": "7UMrjUJ2EGmu", "outputId": "97fb0bd8-8a73-4cd0-cd43-eb326067a03d" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/andre.farias/python3.7.0/lib/python3.7/site-packages/tqdm/autonotebook/__init__.py:18: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", " \" (e.g. 
in jupyter console)\", TqdmExperimentalWarning)\n" ] } ], "source": [ "import os\n", "from ast import literal_eval\n", "\n", "import pandas as pd\n", "import wget\n", "\n", "from cdqa.utils.converters import pdf_converter\n", "from cdqa.utils.filters import filter_paragraphs\n", "from cdqa.pipeline import QAPipeline\n", "from cdqa.utils.download import download_model" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "V1fV_dquOrx0" }, "source": [ "### Download pre-trained reader model and PDF files" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2019-07-20T13:42:54.139892Z", "start_time": "2019-07-20T13:41:41.869993Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Downloading trained model...\n" ] } ], "source": [ "# Download model\n", "download_model(model='bert-squad_1.1', dir='./models')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2019-07-20T13:43:21.153039Z", "start_time": "2019-07-20T13:43:20.228398Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 94 }, "colab_type": "code", "id": "yhg8jFjbERzv", "outputId": "3c5414b9-979b-4342-c76d-ab3a05520d3e" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Downloading PDF files...\n" ] } ], "source": [ "# Download pdf files from BNP Paribas public news\n", "def download_pdf(directory='./data/pdf/', urls=None):\n", "    \"\"\"Download the BNP Paribas press-release PDFs into `directory`.\n", "\n", "    Each document is saved as '<last URL segment>.pdf'. A file that is\n", "    already present is skipped, which makes the cell safe to re-run: wget\n", "    does not overwrite an existing file but saves a numbered copy, and such\n", "    copies would be converted as duplicate documents later on.\n", "\n", "    Parameters\n", "    ----------\n", "    directory : str\n", "        Destination folder; created when it does not exist yet.\n", "    urls : list of str, optional\n", "        Documents to fetch. Defaults to three BNP Paribas press releases.\n", "    \"\"\"\n", "    if urls is None:\n", "        urls = [\n", "            'https://invest.bnpparibas.com/documents/1q19-pr-12648',\n", "            'https://invest.bnpparibas.com/documents/4q18-pr-18000',\n", "            'https://invest.bnpparibas.com/documents/4q17-pr'\n", "        ]\n", "\n", "    print('\\nDownloading PDF files...')\n", "\n", "    # exist_ok replaces the separate os.path.exists() check\n", "    os.makedirs(directory, exist_ok=True)\n", "    for url in urls:\n", "        name = url.rstrip('/').rsplit('/', 1)[-1]\n", "        if not name.lower().endswith('.pdf'):\n", "            name += '.pdf'\n", "        target = os.path.join(directory, name)\n", "        if os.path.exists(target):\n", "            continue  # already fetched by a previous run\n", "        wget.download(url=url, out=target)\n", "\n", "download_pdf()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": 
"QqPK6BV2O-RO" }, "source": [ "### Convert the PDF files into a DataFrame for cdQA pipeline" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2019-07-20T13:44:01.821890Z", "start_time": "2019-07-20T13:43:22.685954Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 143 }, "colab_type": "code", "id": "czafu4-aEXXm", "outputId": "d1c13305-b4a3-4dff-f0ec-6bf277ca3b2a" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2019-07-20 15:43:22,713 [MainThread ] [INFO ] Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.19/tika-server-1.19.jar to /var/folders/fy/3wb1p_ms5r3g97jm4y93pqd40000gn/T/tika-server.jar.\n", "2019-07-20 15:43:34,191 [MainThread ] [INFO ] Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.19/tika-server-1.19.jar.md5 to /var/folders/fy/3wb1p_ms5r3g97jm4y93pqd40000gn/T/tika-server.jar.md5.\n", "2019-07-20 15:43:34,617 [MainThread ] [WARNI] Failed to see startup log message; retrying...\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titleparagraphs
04q17-pr.pdf[GOOD START OF THE 2020 PLAN * COST OF RISK...
14q18-pr2.pdf[SIGNIFICANT PROGRESS IN THE DIGITAL TRANSFORM...
21q19-pr-12648.pdf[The business of BNP Paribas was up this quart...
\n", "
" ], "text/plain": [ " title paragraphs\n", "0 4q17-pr.pdf [GOOD START OF THE 2020 PLAN * COST OF RISK...\n", "1 4q18-pr2.pdf [SIGNIFICANT PROGRESS IN THE DIGITAL TRANSFORM...\n", "2 1q19-pr-12648.pdf [The business of BNP Paribas was up this quart..." ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# One row per PDF: `title` holds the file name, `paragraphs` its list of text passages\n", "pdf_dir = './data/pdf/'\n", "df = pdf_converter(directory_path=pdf_dir)\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "FLZd4H_vPJuU" }, "source": [ "### Instantiate the cdQA pipeline from a pre-trained reader model" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2019-07-20T13:44:46.283172Z", "start_time": "2019-07-20T13:44:45.317024Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 211 }, "colab_type": "code", "id": "8OOqnkNyEaFe", "outputId": "1182284f-abf7-46dc-829c-09b2758e7bcd" }, "outputs": [ { "data": { "text/plain": [ "QAPipeline(reader=BertQA(bert_model='bert-base-uncased', do_lower_case=True,\n", " fp16=False, gradient_accumulation_steps=1,\n", " learning_rate=3e-05, local_rank=-1, loss_scale=0,\n", " max_answer_length=30, n_best_size=20, no_cuda=False,\n", " null_score_diff_threshold=0.0, num_train_epochs=2,\n", " output_dir=None, predict_batch_size=8, seed=42,\n", " server_ip='', server_port='', train_batch_size=12,\n", " verbose_logging=False, version_2_with_negative=False,\n", " warmup_proportion=0.1))" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Reader model file expected in ./models after the download step above\n", "reader_path = './models/bert_qa.joblib'\n", "cdqa_pipeline = QAPipeline(reader=reader_path, max_df=1.0)\n", "\n", "# Fit the retriever on the converted PDF documents\n", "cdqa_pipeline.fit_retriever(df=df)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "40nBTa4UPrO2" }, "source": [ " ### Execute a query" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2019-07-20T13:54:57.200016Z", "start_time": 
"2019-07-20T13:44:49.005187Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 56 }, "colab_type": "code", "id": "POH2gjywEcNb", "outputId": "d454f0af-d64e-4d32-8da1-d1694a3e787d" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "3it [00:00, 170.06it/s]\n", "The pre-trained model you are loading is an uncased model but you have set `do_lower_case` to False. We are setting `do_lower_case=True` for you but you may want to check this behavior.\n" ] } ], "source": [ "# Question to answer from the fitted PDF documents\n", "query = 'How many contracts did BNP Paribas Cardif sell in 2019?'\n", "prediction = cdqa_pipeline.predict(query)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "PgdnYmW3P3d8" }, "source": [ "### Explore predictions" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2019-07-20T13:54:57.336337Z", "start_time": "2019-07-20T13:54:57.318676Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 114 }, "colab_type": "code", "id": "ThCffJekEdiC", "outputId": "0ca8870f-aabc-4ef9-8fad-751974e70284" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "query: How many contracts did BNP Paribas Cardif sell in 2019?\n", "answer: 140,000\n", "title: 1q19-pr-12648.pdf\n", "paragraph: Insurance recorded a good level of activity with in particular the good performance of the international Savings and Protection Insurance businesses and the good development of the new property and casualty insurance offering in the FRB network via Cardif IARD4 (close to 140,000 contracts sold at the end of March 2019). The business committed to energy transition with a target of 3.5 billion euros in green investments by the end of 2020. 
\n" ] } ], "source": [ "# `prediction` is read by position: answer (0), source PDF title (1), supporting paragraph (2)\n", "print(f'query: {query}')\n", "print(f'answer: {prediction[0]}')\n", "print(f'title: {prediction[1]}')\n", "print(f'paragraph: {prediction[2]}')" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "name": "Untitled0.ipynb", "provenance": [], "version": "0.3.2" }, "hide_input": false, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 1 }