{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Demo for SciPy 2021, part 0: Setup\n",
    "\n",
    "Based on `Market_Intelligence_Part1.ipynb`.\n",
    "\n",
    "This notebook precomputes the model results used in parts 1 and 2 so that we don't need network access to run those parts."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Import Python libraries\n",
    "from typing import *\n",
    "import json\n",
    "import os\n",
    "import ibm_watson\n",
    "import ibm_watson.natural_language_understanding_v1 as nlu\n",
    "import ibm_cloud_sdk_core\n",
    "import pandas as pd\n",
    "import text_extensions_for_pandas as tp\n",
    "import transformers\n",
    "\n",
    "if \"IBM_API_KEY\" not in os.environ:\n",
    "    raise ValueError(\"IBM_API_KEY environment variable not set. Please create \"\n",
    "                     \"a free instance of IBM Watson Natural Language Understanding \"\n",
    "                     \"(see https://www.ibm.com/cloud/watson-natural-language-understanding) \"\n",
    "                     \"and set the IBM_API_KEY environment variable to your instance's \"\n",
    "                     \"API key value.\")\n",
    "api_key = os.environ.get(\"IBM_API_KEY\")\n",
    "service_url = os.environ.get(\"IBM_SERVICE_URL\")  \n",
    "\n",
    "\n",
    "# Initialize the Watson NLU Python API\n",
    "natural_language_understanding = ibm_watson.NaturalLanguageUnderstandingV1(\n",
    "    version=\"2021-01-01\",\n",
    "    authenticator=ibm_cloud_sdk_core.authenticators.IAMAuthenticator(api_key)\n",
    ")\n",
    "natural_language_understanding.set_service_url(service_url)\n",
    "\n",
    "# Github notebook gists will be this wide: ------------------>\n",
    "# Screenshots of this notebook should be this wide: ----------------------------->"
   ]
  },
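  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If the setup cell above raised a `ValueError`, you can export the two environment variables in your shell before starting Jupyter, or populate them from a local credentials file and re-run the setup cell. The sketch below assumes a hypothetical `ibm_credentials.json` with `apikey` and `url` keys; adjust it to match however you store your credentials."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: populate IBM_API_KEY and IBM_SERVICE_URL from a local JSON file.\n",
    "# Hypothetical file format: {\"apikey\": \"...\", \"url\": \"...\"}\n",
    "import json\n",
    "import os\n",
    "\n",
    "if os.path.exists(\"ibm_credentials.json\"):\n",
    "    with open(\"ibm_credentials.json\") as f:\n",
    "        creds = json.load(f)\n",
    "    os.environ[\"IBM_API_KEY\"] = creds[\"apikey\"]\n",
    "    os.environ[\"IBM_SERVICE_URL\"] = creds[\"url\"]"
   ]
  },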
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://newsroom.ibm.com/2020-02-19-IBM-Power-Systems-Certified-for-SAP-HANA-R-Enterprise-Cloud-as-a-provider-for-large-SAP-HANA-systems'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the document URL used\n",
    "doc_url = \"https://newsroom.ibm.com/2020-02-19-IBM-Power-Systems-Certified-for-SAP-HANA-R-Enterprise-Cloud-as-a-provider-for-large-SAP-HANA-systems\"\n",
    "doc_url"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rerun NLU, capture all results, and convert to DataFrames\n",
    "nlu_results = natural_language_understanding.analyze(\n",
    "    url=doc_url,\n",
    "    return_analyzed_text=True,\n",
    "    features=nlu.Features(\n",
    "        entities=nlu.EntitiesOptions(mentions=True),\n",
    "        semantic_roles=nlu.SemanticRolesOptions())).get_result()\n",
    "\n",
    "dataframes = tp.io.watson.nlu.parse_response(nlu_results)"
   ]
  },
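  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Each call to `analyze()` goes over the network. Since the whole point of this notebook is to let parts 1 and 2 run offline, here is a minimal sketch of caching the raw response locally so that reruns of *this* notebook can skip the network call too. The cache file name is a placeholder, not part of the demo."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: cache the raw NLU response between runs of this notebook.\n",
    "# \"nlu_response_cache.json\" is a hypothetical file name.\n",
    "cache_path = \"nlu_response_cache.json\"\n",
    "if os.path.exists(cache_path):\n",
    "    with open(cache_path) as f:\n",
    "        nlu_results = json.load(f)\n",
    "else:\n",
    "    with open(cache_path, \"w\") as f:\n",
    "        json.dump(nlu_results, f)\n",
    "dataframes = tp.io.watson.nlu.parse_response(nlu_results)"
   ]
  },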
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['usage', 'semantic_roles', 'retrieved_url', 'language', 'entities', 'analyzed_text'])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nlu_results.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'type': 'Person',\n",
       "  'text': 'Christoph Herman',\n",
       "  'relevance': 0.217154,\n",
       "  'mentions': [{'text': 'Christoph Herman',\n",
       "    'location': [1213, 1229],\n",
       "    'confidence': 0.94435}],\n",
       "  'count': 1,\n",
       "  'confidence': 0.94435},\n",
       " {'type': 'Person',\n",
       "  'text': 'Stephen Leonard',\n",
       "  'relevance': 0.136166,\n",
       "  'mentions': [{'text': 'Stephen Leonard',\n",
       "    'location': [2227, 2242],\n",
       "    'confidence': 0.989177}],\n",
       "  'disambiguation': {'name': 'Steve_Leonard',\n",
       "   'dbpedia_resource': 'http://dbpedia.org/resource/Steve_Leonard'},\n",
       "  'count': 1,\n",
       "  'confidence': 0.989177},\n",
       " {'type': 'Person',\n",
       "  'text': 'Sam Ponedal',\n",
       "  'relevance': 0.020711,\n",
       "  'mentions': [{'text': 'Sam Ponedal',\n",
       "    'location': [3574, 3585],\n",
       "    'confidence': 0.894298}],\n",
       "  'count': 1,\n",
       "  'confidence': 0.894298}]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Generate the Watson version of person mentions as raw JSON\n",
    "person_mentions_watson_json = [\n",
    "    e for e in nlu_results[\"entities\"] if e[\"type\"] == \"Person\"]\n",
    "\n",
    "person_mentions_watson_json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>person</th>\n",
       "      <th>confidence</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>[1213, 1229): 'Christoph Herman'</td>\n",
       "      <td>0.944350</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>[2227, 2242): 'Stephen Leonard'</td>\n",
       "      <td>0.989177</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>[3574, 3585): 'Sam Ponedal'</td>\n",
       "      <td>0.894298</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                              person  confidence\n",
       "38  [1213, 1229): 'Christoph Herman'    0.944350\n",
       "41   [2227, 2242): 'Stephen Leonard'    0.989177\n",
       "48       [3574, 3585): 'Sam Ponedal'    0.894298"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Generate the Watson Person output as a DataFrame\n",
    "entity_mentions = dataframes[\"entity_mentions\"]\n",
    "\n",
    "person_mentions_watson = (\n",
    "     entity_mentions[entity_mentions[\"type\"] == \"Person\"]\n",
    "     [[\"span\", \"confidence\"]].rename(columns={\"span\": \"person\"}))\n",
    "person_mentions_watson"
   ]
  },
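  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `person` column above is backed by a Text Extensions for Pandas `SpanArray`, so the raw character offsets stay available alongside the pretty-printed spans. A quick sketch, assuming the `begin`, `end`, and `covered_text` array properties:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Peek at the character offsets behind the span column\n",
    "spans = person_mentions_watson[\"person\"].array\n",
    "pd.DataFrame({\n",
    "    \"begin\": spans.begin,\n",
    "    \"end\": spans.end,\n",
    "    \"text\": spans.covered_text,\n",
    "})"
   ]
  },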
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>subject</th>\n",
       "      <th>verb</th>\n",
       "      <th>object</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[1213, 1281): 'Christoph Herman, SVP and Head ...</td>\n",
       "      <td>said</td>\n",
       "      <td>[937, 1205): 'SAP HANA Enterprise Cloud on IBM...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[2227, 2519): 'Stephen Leonard, General Manage...</td>\n",
       "      <td>said</td>\n",
       "      <td>[2028, 2219): 'In June, IBM announced the avai...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                             subject  verb  \\\n",
       "0  [1213, 1281): 'Christoph Herman, SVP and Head ...  said   \n",
       "1  [2227, 2519): 'Stephen Leonard, General Manage...  said   \n",
       "\n",
       "                                              object  \n",
       "0  [937, 1205): 'SAP HANA Enterprise Cloud on IBM...  \n",
       "1  [2028, 2219): 'In June, IBM announced the avai...  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Generate the filtered SRL output as a DataFrame and add offset info\n",
    "semantic_roles = dataframes[\"semantic_roles\"]\n",
    "doc_text = entity_mentions[\"span\"].array.document_text\n",
    "\n",
    "quotes = semantic_roles[semantic_roles[\"action.normalized\"] == \"say\"].reset_index(drop=True)\n",
    "\n",
    "# Add location info that isn't present in the output of Watson NLU\n",
    "for colname in (\"subject\", \"object\"):\n",
    "    begins = pd.Series([doc_text.index(s) for s in quotes[f\"{colname}.text\"]], dtype=int)\n",
    "    ends = begins + quotes[f\"{colname}.text\"].str.len()\n",
    "    quotes[colname] = tp.SpanArray(doc_text, begins, ends)\n",
    "\n",
    "quotes = (quotes[[\"subject\", \"action.text\", \"object\"]]\n",
    "          .rename(columns={\"action.text\": \"verb\"}))\n",
    "someone_said_something_df = quotes\n",
    "someone_said_something_df"
   ]
  },
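  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Parts 1 and 2 combine outputs like these. As a preview, here is a minimal sketch of matching each person mention to the quote whose subject span contains it, using nothing but begin/end offsets. This is a plain-pandas illustration; Text Extensions for Pandas also ships dedicated span operations for joins like this."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Match person mentions to quotes by span containment (offsets only)\n",
    "people = person_mentions_watson[\"person\"].array\n",
    "subjects = quotes[\"subject\"].array\n",
    "[(p, q)\n",
    " for p in range(len(people))\n",
    " for q in range(len(subjects))\n",
    " if subjects.begin[q] <= people.begin[p] and people.end[p] <= subjects.end[q]]"
   ]
  },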
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'subject': {'text': 'Christoph Herman, SVP and Head of SAP HANA Enterprise Cloud Delivery',\n",
       "   'begin': 1213,\n",
       "   'end': 1281},\n",
       "  'sentence': ' \"SAP HANA Enterprise Cloud on IBM Power Systems will help clients unlock the full value of SAP HANA in the cloud, with the possibility of enhancing the scalability and availability of mission critical SAP applications while moving workloads to SAP HANA and lowering TCO,\" said Christoph Herman, SVP and Head of SAP HANA Enterprise Cloud Delivery.',\n",
       "  'object': {'text': 'SAP HANA Enterprise Cloud on IBM Power Systems will help clients unlock the full value of SAP HANA in the cloud, with the possibility of enhancing the scalability and availability of mission critical SAP applications while moving workloads to SAP HANA and lowering TCO'},\n",
       "  'action': {'verb': {'text': 'say', 'tense': 'past'},\n",
       "   'text': 'said',\n",
       "   'normalized': 'say'}},\n",
       " {'subject': {'text': 'Stephen Leonard, General Manager, IBM Cognitive Systems, \"With the addition of IBM Power Systems in SAP HANA Enterprise Cloud, we\\'re giving our clients more choices and greater flexibility to run their workloads where they want to across the hybrid cloud and accelerate digital transformation',\n",
       "   'begin': 2227,\n",
       "   'end': 2519},\n",
       "  'sentence': ' \"In June, IBM announced the availability of POWER9 in the IBM Cloud, taking the first step toward our goal of bringing IBM Cognitive Systems technology to our clients, no matter where they are,\" said Stephen Leonard, General Manager, IBM Cognitive Systems, \"With the addition of IBM Power Systems in SAP HANA Enterprise Cloud, we\\'re giving our clients more choices and greater flexibility to run their workloads where they want to across the hybrid cloud and accelerate digital transformation.\"',\n",
       "  'object': {'text': 'In June, IBM announced the availability of POWER9 in the IBM Cloud, taking the first step toward our goal of bringing IBM Cognitive Systems technology to our clients, no matter where they are'},\n",
       "  'action': {'verb': {'text': 'say', 'tense': 'past'},\n",
       "   'text': 'said',\n",
       "   'normalized': 'say'}}]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Code for slides: Run the Watson NLU semantic_roles model\n",
    "\n",
    "semantic_roles_results = (\n",
    "    natural_language_understanding\n",
    "        .analyze(url=doc_url, features=nlu.Features(\n",
    "            semantic_roles=nlu.SemanticRolesOptions()))\n",
    "    .get_result()[\"semantic_roles\"])\n",
    "someone_said_something = [r for r in semantic_roles_results \n",
    "                          if r[\"action\"][\"normalized\"] == \"say\"]\n",
    "for s in someone_said_something:\n",
    "    s[\"subject\"][\"begin\"] = doc_text.find(s[\"subject\"][\"text\"])\n",
    "    s[\"subject\"][\"end\"] = s[\"subject\"][\"begin\"] + len(s[\"subject\"][\"text\"])\n",
    "    \n",
    "\n",
    "someone_said_something_json = someone_said_something\n",
    "someone_said_something_json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'entity_group': 'PER',\n",
       "  'score': 0.9996308088302612,\n",
       "  'word': 'Christoph Herman',\n",
       "  'start': 1213,\n",
       "  'end': 1229}]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Code for slides: Run the document text through the Hugging Face NER model\n",
    "\n",
    "ner = transformers.pipeline(\"ner\")\n",
    "tagged_tokens = ner(doc_text)\n",
    "model_results = ner.group_entities(tagged_tokens)\n",
    "person_mentions = [d for d in model_results if d[\"entity_group\"] == \"PER\"]\n",
    "person_mentions"
   ]
  },
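  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`transformers.pipeline(\"ner\")` downloads whatever the library's default NER checkpoint happens to be, so results can drift between environments. For reproducibility you can pin the model explicitly; the sketch below also uses the `aggregation_strategy` argument that newer transformers releases offer in place of calling `group_entities()` by hand."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same NER run with the checkpoint pinned. This model has historically\n",
    "# been the pipeline's default for the \"ner\" task.\n",
    "ner_pinned = transformers.pipeline(\n",
    "    \"ner\",\n",
    "    model=\"dbmdz/bert-large-cased-finetuned-conll03-english\",\n",
    "    aggregation_strategy=\"simple\")  # merges subword tokens into entities\n",
    "[d for d in ner_pinned(doc_text) if d[\"entity_group\"] == \"PER\"]"
   ]
  },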
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>person</th>\n",
       "      <th>score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[1213, 1229): 'Christoph Herman'</td>\n",
       "      <td>0.999631</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                             person     score\n",
       "0  [1213, 1229): 'Christoph Herman'  0.999631"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Convert the Hugging Face model outputs to a DataFrame\n",
    "person_mentions_df = pd.DataFrame({\n",
    "    \"person\": tp.SpanArray(doc_text, [m[\"start\"] for m in person_mentions],\n",
    "                           [m[\"end\"] for m in person_mentions]),\n",
    "    \"score\": [m[\"score\"] for m in person_mentions]\n",
    "})\n",
    "person_mentions_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write out all the data we've generated\n",
    "output_dir = \"./scipy_demo_data\"\n",
    "\n",
    "###################\n",
    "# Inputs to Part 1\n",
    "\n",
    "# Tweak the output of the Hugging Face model to be JSON-serializable\n",
    "for m in person_mentions:\n",
    "    m[\"start\"] = int(m[\"start\"])\n",
    "    m[\"end\"] = int(m[\"end\"])\n",
    "with open(f\"{output_dir}/person_mentions.json\", \"w\") as f:\n",
    "    json.dump(person_mentions, f)\n",
    "    \n",
    "with open(f\"{output_dir}/person_mentions_watson.json\", \"w\") as f:\n",
    "    json.dump(person_mentions_watson_json, f)\n",
    "    \n",
    "with open(f\"{output_dir}/someone_said_something.json\", \"w\") as f:\n",
    "    json.dump(someone_said_something_json, f)\n",
    "    \n",
    "###################\n",
    "# Inputs to Part 2\n",
    "\n",
    "person_mentions_df.to_parquet(f\"{output_dir}/person_mentions.parquet\")\n",
    "person_mentions_watson.to_parquet(f\"{output_dir}/person_mentions_watson.parquet\")\n",
    "someone_said_something_df.to_parquet(f\"{output_dir}/someone_said_something.parquet\")"
   ]
  },
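  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For reference, a minimal sketch of how parts 1 and 2 can read these files back. Text Extensions for Pandas stores span columns via Arrow extension types, so they should round-trip through Parquet once the library is imported."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch of reloading the precomputed results in parts 1 and 2\n",
    "with open(f\"{output_dir}/person_mentions.json\") as f:\n",
    "    person_mentions_reloaded = json.load(f)\n",
    "\n",
    "person_mentions_df_reloaded = pd.read_parquet(\n",
    "    f\"{output_dir}/person_mentions.parquet\")\n",
    "person_mentions_df_reloaded"
   ]
  },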
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}