# Demo for PyData Montreal 2021, part 1

Based on `Market_Intelligence_Part1.ipynb`.

In [1]:
# Import Python libraries
from typing import *
import os

# Credentials for IBM Watson Natural Language Understanding are read from the
# environment so that they never appear in the notebook itself. Both values are
# validated up front: a missing IBM_SERVICE_URL would otherwise leave
# `service_url` set to None and only surface later, when the value is handed to
# the Watson client.
if "IBM_API_KEY" not in os.environ:
    raise ValueError("IBM_API_KEY environment variable not set. Please create "
                     "a free instance of IBM Watson Natural Language Understanding "
                     "(see https://www.ibm.com/cloud/watson-natural-language-understanding) "
                     "and set the IBM_API_KEY environment variable to your instance's "
                     "API key value.")
if "IBM_SERVICE_URL" not in os.environ:
    raise ValueError("IBM_SERVICE_URL environment variable not set. Please set "
                     "the IBM_SERVICE_URL environment variable to the service URL "
                     "of your IBM Watson Natural Language Understanding instance.")
api_key = os.environ["IBM_API_KEY"]
service_url = os.environ["IBM_SERVICE_URL"]

# GitHub notebook gists will be this wide: ------------------>
# Screenshots of this notebook should be this wide: ----------------------------->

In [2]:
# Address of the document that the rest of the notebook analyzes. The URL is
# assembled from adjacent string literals purely to keep the lines short; the
# resulting value is a single string, displayed as the cell's output.
doc_url = (
    "https://newsroom.ibm.com/"
    "2020-02-19-IBM-Power-Systems-Certified-for-SAP-HANA-R-Enterprise-Cloud"
    "-as-a-provider-for-large-SAP-HANA-systems"
)
doc_url

'https://newsroom.ibm.com/2020-02-19-IBM-Power-Systems-Certified-for-SAP-HANA-R-Enterprise-Cloud-as-a-provider-for-large-SAP-HANA-systems'

In [3]:
import ibm_watson
import ibm_watson.natural_language_understanding_v1 as nlu
import ibm_cloud_sdk_core

# Build a Watson Natural Language Understanding client that authenticates with
# the API key, then point it at the configured service URL.
iam_authenticator = ibm_cloud_sdk_core.authenticators.IAMAuthenticator(api_key)
natural_language_understanding = ibm_watson.NaturalLanguageUnderstandingV1(
    version="2021-01-01",
    authenticator=iam_authenticator,
)
natural_language_understanding.set_service_url(service_url)

# Ask for two analyses of the document at `doc_url`: entities (with
# `mentions=True`) and semantic roles. `return_analyzed_text=True` is passed so
# that the response also carries the text that was analyzed.
requested_features = nlu.Features(
    entities=nlu.EntitiesOptions(mentions=True),
    semantic_roles=nlu.SemanticRolesOptions(),
)
nlu_response = natural_language_understanding.analyze(
    url=doc_url,
    return_analyzed_text=True,
    features=requested_features,
)

# Unwrap the response into its result payload and display it.
nlu_results = nlu_response.get_result()
nlu_results

{'usage': {'text_units': 1, 'text_characters': 3810, 'features': 2},
 'semantic_roles': [{'subject': {'text': 'IBM'},
   'sentence': 'ARMONK, N.Y., Feb. 19, 2020 /PRNewswire/ --\xa0IBM (NYSE: IBM) today announced that IBM Power Systems has been certified for the SAP HANA® Enterprise Cloud as a critical infrastructure platform provider for large SAP HANA systems, aiming to simplify the IT infrastructure for the managed, private cloud environment.',
   'object': {'text': 'that IBM Power Systems has been certified for the SAP HANA® Enterprise Cloud as a critical infrastructure platform provider for large SAP HANA systems, aiming to simplify the IT infrastructure for the managed, private cloud environment'},
   'action': {'verb': {'text': 'announce', 'tense': 'past'},
    'text': 'announced',
    'normalized': 'announce'}},
  {'subject': {'text': 'IBM Power Systems'},
   'sentence': 'ARMONK, N.Y., Feb. 19, 2020 /PRNewswire/ --\xa0IBM (NYSE: IBM) today announced that IBM Power Systems has b

In [17]:
import pandas as pd
import text_extensions_for_pandas as tp

# Convert the raw output of Watson Natural Language Understanding into
# DataFrames. The result is keyed by name; display the keys to see which
# tables are available.
dataframes = tp.io.watson.nlu.parse_response(nlu_results)
dataframes.keys()

dict_keys(['syntax', 'entities', 'entity_mentions', 'keywords', 'relations', 'semantic_roles'])

In [18]:
# Select the "entity_mentions" table from the parsed results and display it.
entity_mentions = dataframes["entity_mentions"]
entity_mentions

Unnamed: 0,type,text,span,confidence
0,Organization,SAP,"[126, 129): 'SAP'",0.706408
1,Organization,SAP,"[210, 213): 'SAP'",0.625238
2,Organization,SAP,"[565, 568): 'SAP'",0.70457
3,Organization,SAP,"[937, 940): 'SAP'",0.738066
4,Organization,SAP,"[1137, 1140): 'SAP'",0.644183
5,Organization,SAP,"[1294, 1297): 'SAP'",0.224388
6,Organization,SAP,"[1509, 1512): 'SAP'",0.746624
7,Organization,SAP,"[1709, 1712): 'SAP'",0.616656
8,Organization,SAP,"[1968, 1971): 'SAP'",0.876088
9,Organization,SAP,"[2720, 2723): 'SAP'",0.807853


In [19]:
# Keep only the entity mentions whose type is "Person".
is_person = entity_mentions["type"] == "Person"
person_mentions = entity_mentions[is_person]
person_mentions

Unnamed: 0,type,text,span,confidence
38,Person,Christoph Herman,"[1213, 1229): 'Christoph Herman'",0.94435
41,Person,Stephen Leonard,"[2227, 2242): 'Stephen Leonard'",0.989177
48,Person,Sam Ponedal,"[3574, 3585): 'Sam Ponedal'",0.894298


In [20]:
# Show person mentions in context: displaying the array behind the "span"
# column renders each mention with its begin/end offsets and covered text.
person_mentions["span"].array

Unnamed: 0,begin,end,covered_text
0,1213,1229,Christoph Herman
1,2227,2242,Stephen Leonard
2,3574,3585,Sam Ponedal


In [21]:
# Go back to the Watson output and retrieve the results from the semantic_roles
# model. This model finds subject-verb-object triples.
semantic_roles = dataframes["semantic_roles"]
semantic_roles

Unnamed: 0,subject.text,sentence,object.text,action.verb.text,action.verb.tense,action.text,action.normalized
0,IBM,"ARMONK, N.Y., Feb. 19, 2020 /PRNewswire/ -- IB...",that IBM Power Systems has been certified for ...,announce,past,announced,announce
1,IBM Power Systems,"ARMONK, N.Y., Feb. 19, 2020 /PRNewswire/ -- IB...",certified for the SAP HANA® Enterprise Cloud a...,be,past,been,be
2,to simplify the IT infrastructure,"ARMONK, N.Y., Feb. 19, 2020 /PRNewswire/ -- IB...","for the managed, private cloud environment",aim,present,aiming,aim
3,the IT infrastructure,"ARMONK, N.Y., Feb. 19, 2020 /PRNewswire/ -- IB...","for the managed, private cloud environment",simplify,future,aiming to simplify,aim to simplify
4,IBM POWER9-based IBM Power Systems E980 servers,The service will run on IBM POWER9-based IBM ...,the industry's largest virtualized server scal...,have,present,have,have
5,the IT infrastructure,Providing the IT infrastructure for a managed...,for a managed,Providing,present,Providing,Providing
6,the SAP HANA Enterprise Cloud,Providing the IT infrastructure for a managed...,a scalable,be,present,is,be
7,a scalable and secured service,Providing the IT infrastructure for a managed...,a user's evolution,accelerate,future,is designed to accelerate,be design to accelerate
8,It,It provides capabilities that span the softwa...,capabilities that span the software and hardwa...,provide,present,provides,provide
9,capabilities,It provides capabilities that span the softwa...,the software and hardware stack,span,present,span,span


In [22]:
# Rows whose normalized action is "say" are the ones in which the subject
# made a statement; keep only those.
is_statement = semantic_roles["action.normalized"] == "say"
quotes = semantic_roles[is_statement]
quotes

Unnamed: 0,subject.text,sentence,object.text,action.verb.text,action.verb.tense,action.text,action.normalized
11,"Christoph Herman, SVP and Head of SAP HANA Ent...","""SAP HANA Enterprise Cloud on IBM Power Syste...",SAP HANA Enterprise Cloud on IBM Power Systems...,say,past,said,say
23,"Stephen Leonard, General Manager, IBM Cognitiv...","""In June, IBM announced the availability of P...","In June, IBM announced the availability of POW...",say,past,said,say


In [23]:
# Narrow the quotes down to the subject text alone (still as a DataFrame),
# then renumber the rows from zero.
subject_text_only = quotes[["subject.text"]]
subjects = subject_text_only.reset_index(drop=True)
subjects

Unnamed: 0,subject.text
0,"Christoph Herman, SVP and Head of SAP HANA Ent..."
1,"Stephen Leonard, General Manager, IBM Cognitiv..."


In [24]:
# subjects["subject.text"] holds plain strings, but spans are needed. The first
# step towards building spans is finding where each string begins.

# The original document had HTML tags. The span column carries the detagged
# document text, so fetch it from there.
doc_text = entity_mentions["span"].array.document_text

# Look up each subject string with doc_text.index(). Assuming doc_text is a
# str, this yields the offset of the first occurrence and raises ValueError
# when a string is not found.
begin_offsets = list(map(doc_text.index, subjects["subject.text"]))
subjects["begin"] = pd.Series(begin_offsets, dtype=int)
subjects

Unnamed: 0,subject.text,begin
0,"Christoph Herman, SVP and Head of SAP HANA Ent...",1213
1,"Stephen Leonard, General Manager, IBM Cognitiv...",2227


In [25]:
# Each end offset is the begin offset plus the length of the subject string.
subject_lengths = subjects["subject.text"].str.len()
subjects["end"] = subjects["begin"] + subject_lengths
subjects

Unnamed: 0,subject.text,begin,end
0,"Christoph Herman, SVP and Head of SAP HANA Ent...",1213,1281
1,"Stephen Leonard, General Manager, IBM Cognitiv...",2227,2519


In [26]:
# Combine the document text with the begin and end offsets into a SpanArray
# and store it as a new "span" column.
subject_span_array = tp.SpanArray(doc_text, subjects["begin"], subjects["end"])
subjects["span"] = subject_span_array
subjects

Unnamed: 0,subject.text,begin,end,span
0,"Christoph Herman, SVP and Head of SAP HANA Ent...",1213,1281,"[1213, 1281): 'Christoph Herman, SVP and Head ..."
1,"Stephen Leonard, General Manager, IBM Cognitiv...",2227,2519,"[2227, 2519): 'Stephen Leonard, General Manage..."


In [27]:
# Align subjects with person names: join the subject spans against the
# person-mention spans, labelling the two output columns "subject" and "person".
outer_spans = subjects["span"]
inner_spans = person_mentions["span"]
executives = tp.spanner.contain_join(outer_spans, inner_spans,
                                     "subject", "person")
executives

Unnamed: 0,subject,person
0,"[1213, 1281): 'Christoph Herman, SVP and Head ...","[1213, 1229): 'Christoph Herman'"
1,"[2227, 2519): 'Stephen Leonard, General Manage...","[2227, 2242): 'Stephen Leonard'"


In [28]:
# Final table: discard the "subject" column and tag every remaining row with
# the URL of the document it came from.
final_result = (
    executives
    .drop(columns="subject")
    .assign(url=doc_url)
)
final_result

Unnamed: 0,person,url
0,"[1213, 1229): 'Christoph Herman'",https://newsroom.ibm.com/2020-02-19-IBM-Power-...
1,"[2227, 2242): 'Stephen Leonard'",https://newsroom.ibm.com/2020-02-19-IBM-Power-...
