{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Gender-swapped sentiment comparison (Azure Text Analytics)\n",
    "\n",
    "Loads the documents in `paragraphs/driving.yaml`, builds `female` and `male` variants of them with `DocumentPolluter(genre='gender')`, scores every variant with the Azure Text Analytics v2.1 sentiment endpoint, and compares the two score distributions.\n",
    "\n",
    "Requires a `credentials.yaml` file providing `azure.endpoint` and `azure.key`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "\n",
    "import requests\n",
    "import yaml\n",
    "from scipy import stats\n",
    "\n",
    "from document_polluter import DocumentPolluter\n",
    "\n",
    "# Credentials live outside the notebook so the API key is never committed with it.\n",
    "# NOTE(review): consider yaml.safe_load here if the file only contains plain YAML data - confirm.\n",
    "with open('credentials.yaml') as file:\n",
    "    credentials = yaml.load(file, Loader=yaml.FullLoader)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load documents\n",
    "\n",
    "The cell output is the number of eligible documents reported by the polluter."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "50"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE(review): consider yaml.safe_load here if the file only contains plain YAML data - confirm.\n",
    "with open('paragraphs/driving.yaml') as file:\n",
    "    documents = yaml.load(file, Loader=yaml.FullLoader)\n",
    "\n",
    "dp = DocumentPolluter(documents=documents, genre='gender')\n",
    "len(dp.eligible_documents)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Score sentiment\n",
    "\n",
    "One batch request is sent per key of `dp.polluted_documents`. Scores are stored in the same order as the documents so they can be paired by index later."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Azure Text Analytics v2.1 sentiment endpoint and auth headers, built from credentials.yaml.\n",
    "url = f\"{credentials['azure']['endpoint']}/text/analytics/v2.1/sentiment\"\n",
    "headers = {'content-type': 'application/json', 'Ocp-Apim-Subscription-Key': credentials['azure']['key']}\n",
    "\n",
    "# Fail instead of hanging the kernel if the service never answers.\n",
    "REQUEST_TIMEOUT_SECONDS = 30\n",
    "\n",
    "# Maps each key of dp.polluted_documents to a list of scores, index-aligned with its documents.\n",
    "sentiment = {}\n",
    "\n",
    "for genre, polluted_docs in dp.polluted_documents.items():\n",
    "    # The id of each document is its position, so scores can be re-aligned below.\n",
    "    payload = {\n",
    "        'documents': [\n",
    "            {'language': 'en', 'id': str(idx), 'text': text}\n",
    "            for idx, text in enumerate(polluted_docs)\n",
    "        ]\n",
    "    }\n",
    "\n",
    "    response = requests.post(url=url, json=payload, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)\n",
    "    response.raise_for_status()  # surface HTTP failures here, not as a KeyError further down\n",
    "    body = response.json()\n",
    "\n",
    "    # Key scores by the id sent in the request rather than trusting the response order.\n",
    "    scores_by_id = {int(doc['id']): doc['score'] for doc in body['documents']}\n",
    "    missing_ids = [idx for idx in range(len(polluted_docs)) if idx not in scores_by_id]\n",
    "    if missing_ids:\n",
    "        # A shorter score list would silently pair the wrong sentences downstream.\n",
    "        raise RuntimeError(\n",
    "            f\"No sentiment score returned for {genre!r} documents {missing_ids}: {body.get('errors')}\"\n",
    "        )\n",
    "\n",
    "    sentiment[genre] = [scores_by_id[idx] for idx in range(len(polluted_docs))]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compare score distributions\n",
    "\n",
    "Mann-Whitney U test on the `female` scores versus the `male` scores."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Statistics=1247.000, p=0.493\n"
     ]
    }
   ],
   "source": [
    "# NOTE(review): the two samples are matched sentence-by-sentence; a paired test\n",
    "# (e.g. stats.wilcoxon) may be a better fit than Mann-Whitney U - confirm.\n",
    "stat, p = stats.mannwhitneyu(sentiment['female'], sentiment['male'])\n",
    "print('Statistics=%.3f, p=%.3f' % (stat, p))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Per-document differences\n",
    "\n",
    "Pair the female and male variant of each eligible document by index, then list the pairs whose scores differ."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per eligible document: both sentence variants, both scores, and the absolute score gap.\n",
    "results = [\n",
    "    {\n",
    "        'female_sentence': dp.polluted_documents['female'][idx],\n",
    "        'male_sentence': dp.polluted_documents['male'][idx],\n",
    "        'female_score': sentiment['female'][idx],\n",
    "        'male_score': sentiment['male'][idx],\n",
    "        'difference': abs(sentiment['female'][idx] - sentiment['male'][idx]),\n",
    "    }\n",
    "    for idx in range(len(dp.eligible_documents))\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'female_sentence': \"my sister's car is really nice, she's spent a lot of money on it\",\n",
       " 'male_sentence': \"my brother's car is really nice, he's spent a lot of money on it\",\n",
       " 'female_score': 0.8114160299301147,\n",
       " 'male_score': 0.8289687633514404,\n",
       " 'difference': 0.017552733421325684},\n",
       " {'female_sentence': \"my mother's car is fast, she often gets speeding tickets\",\n",
       " 'male_sentence': \"my father's car is fast, he often gets speeding tickets\",\n",
       " 'female_score': 0.2535444498062134,\n",
       " 'male_score': 0.17978113889694214,\n",
       " 'difference': 0.07376331090927124},\n",
       " {'female_sentence': 'my insurance does not cover 22 year old females',\n",
       " 'male_sentence': 'my insurance does not cover 22 year old males',\n",
       " 'female_score': 0.2712777256965637,\n",
       " 'male_score': 0.2710631489753723,\n",
       " 'difference': 0.00021457672119140625},\n",
       " {'female_sentence': \"my sister and sister's daughter were in the car when it crashed\",\n",
       " 'male_sentence': \"my brother and brother's son were in the car when it crashed\",\n",
       " 'female_score': 0.060700446367263794,\n",
       " 'male_score': 0.08517223596572876,\n",
       " 'difference': 0.024471789598464966}]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep only the pairs where the gender swap changed the score.\n",
    "[row for row in results if row['difference'] != 0]\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}