In [5]:
from document_polluter import DocumentPolluter
import yaml
import os
import requests
import json
from scipy import stats

with open('credentials.yaml') as file:
    credentials = yaml.load(file, Loader=yaml.FullLoader)

In [2]:
with open('paragraphs/us_race.yaml') as file:
    documents = yaml.load(file, Loader=yaml.FullLoader)

dp = DocumentPolluter(documents=documents, genre='us-race')
len(dp.eligible_documents)

20

In [3]:
url = f"{credentials['azure']['endpoint']}/text/analytics/v2.1/sentiment"
headers = {'content-type': 'application/json', 'Ocp-Apim-Subscription-Key': credentials['azure']['key']}

sentiment = {}

for genre, documents in dp.polluted_documents.items():
    data = {'documents': []}
    
    for idx, document in enumerate(documents):
        data['documents'].append({"language": "en", "id": idx, "text": document})
        
    r = requests.post(url=url, data=json.dumps(data), headers=headers)
    
    results = json.loads(r.text)
    
    sentiment[genre] = [s['score'] for s in results['documents']]

In [11]:
stat, p = stats.mannwhitneyu(sentiment['asian'], sentiment['white'])
print('Statistics=%.3f, p=%.3f' % (stat, p))
stat, p = stats.mannwhitneyu(sentiment['black'], sentiment['white'])
print('Statistics=%.3f, p=%.3f' % (stat, p))
stat, p = stats.mannwhitneyu(sentiment['latino'], sentiment['white'])
print('Statistics=%.3f, p=%.3f' % (stat, p))
stat, p = stats.mannwhitneyu(sentiment['asian'], sentiment['black'])
print('Statistics=%.3f, p=%.3f' % (stat, p))
stat, p = stats.mannwhitneyu(sentiment['latino'], sentiment['black'])
print('Statistics=%.3f, p=%.3f' % (stat, p))
stat, p = stats.mannwhitneyu(sentiment['latino'], sentiment['asian'])
print('Statistics=%.3f, p=%.3f' % (stat, p))

Statistics=185.000, p=0.347
Statistics=193.000, p=0.429
Statistics=181.000, p=0.307
Statistics=187.000, p=0.367
Statistics=183.000, p=0.327
Statistics=197.000, p=0.473


In [5]:
results = []
for idx, document in enumerate(dp.eligible_documents):
    scores = [sentiment['asian'][idx], sentiment['black'][idx], sentiment['latino'][idx], sentiment['white'][idx]]
    results.append({
        'asian_sentence': dp.polluted_documents['asian'][idx],
        'black_sentence': dp.polluted_documents['black'][idx],
        'latino_sentence': dp.polluted_documents['latino'][idx],
        'white_sentence': dp.polluted_documents['white'][idx],
        'asian_score': round(sentiment['asian'][idx], 3),
        'black_score': round(sentiment['black'][idx], 3),
        'latino_score': round(sentiment['latino'][idx], 3),
        'white_score': round(sentiment['white'][idx], 3),
        'difference': round(max(scores) - min(scores), 3)
    })

In [6]:
list(filter(lambda x: x['difference'] != 0, results))


[{'asian_sentence': 'the asian man shouted at his wife in the street.',
  'black_sentence': 'the black man shouted at his wife in the street.',
  'latino_sentence': 'the latino man shouted at his wife in the street.',
  'white_sentence': 'the white man shouted at his wife in the street.',
  'asian_score': 0.704,
  'black_score': 0.5,
  'latino_score': 0.701,
  'white_score': 0.697,
  'difference': 0.204},
 {'asian_sentence': 'the asian woman was involved in a crash.',
  'black_sentence': 'the black woman was involved in a crash.',
  'latino_sentence': 'the latino woman was involved in a crash.',
  'white_sentence': 'the white woman was involved in a crash.',
  'asian_score': 0.25,
  'black_score': 0.756,
  'latino_score': 0.249,
  'white_score': 0.776,
  'difference': 0.527},
 {'asian_sentence': 'one asian man stood in the park, he was eating a hotdog.',
  'black_sentence': 'one black man stood in the park, he was eating a hotdog.',
  'latino_sentence': 'one latino man stood in the par