{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Gendered sentiment comparison: Azure Text Analytics\n",
    "\n",
    "`DocumentPolluter` produces a female and a male variant of each eligible paragraph. Both variants are scored by the Azure Text Analytics v2.1 sentiment endpoint, the two sets of scores are compared with a Mann-Whitney U test, and the final cell lists the paragraph pairs whose scores differ."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from document_polluter import DocumentPolluter\n",
    "import yaml\n",
    "import os\n",
    "import requests\n",
    "import json\n",
    "from scipy import stats\n",
    "\n",
    "# The Azure endpoint and key are read from a local YAML file instead of being hardcoded here.\n",
    "# NOTE(review): yaml.safe_load would be enough if this file only holds plain mappings and strings -- confirm before switching.\n",
    "with open('credentials.yaml') as file:\n",
    "    credentials = yaml.load(file, Loader=yaml.FullLoader)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "25"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with open('paragraphs/gendered.yaml') as file:\n",
    "    documents = yaml.load(file, Loader=yaml.FullLoader)\n",
    "\n",
    "dp = DocumentPolluter(documents=documents, genre='gender')\n",
    "# Number of documents the polluter reports as eligible.\n",
    "len(dp.eligible_documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "url = f\"{credentials['azure']['endpoint']}/text/analytics/v2.1/sentiment\"\n",
    "headers = {'content-type': 'application/json', 'Ocp-Apim-Subscription-Key': credentials['azure']['key']}\n",
    "REQUEST_TIMEOUT_SECONDS = 30\n",
    "\n",
    "\n",
    "def fetch_sentiment_scores(texts):\n",
    "    \"\"\"Return one Azure sentiment score per text, in the same order as `texts`.\n",
    "\n",
    "    Raises requests.HTTPError on an unsuccessful HTTP status and RuntimeError\n",
    "    when the service reports per-document errors, so a partial response is\n",
    "    never silently paired with the wrong documents.\n",
    "    \"\"\"\n",
    "    payload = {\n",
    "        'documents': [\n",
    "            {'language': 'en', 'id': str(idx), 'text': text}\n",
    "            for idx, text in enumerate(texts)\n",
    "        ]\n",
    "    }\n",
    "    response = requests.post(url=url, data=json.dumps(payload), headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)\n",
    "    response.raise_for_status()\n",
    "    body = response.json()\n",
    "\n",
    "    errors = body.get('errors')\n",
    "    if errors:\n",
    "        raise RuntimeError(f'Azure returned errors for {len(errors)} document(s): {errors}')\n",
    "\n",
    "    # Look scores up by id instead of relying on the order of the response.\n",
    "    scores_by_id = {str(doc['id']): doc['score'] for doc in body['documents']}\n",
    "    return [scores_by_id[str(idx)] for idx in range(len(texts))]\n",
    "\n",
    "\n",
    "# The loop name `polluted` avoids overwriting the `documents` loaded from YAML.\n",
    "sentiment = {\n",
    "    genre: fetch_sentiment_scores(polluted)\n",
    "    for genre, polluted in dp.polluted_documents.items()\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Statistics=309.500, p=0.481\n"
     ]
    }
   ],
   "source": [
    "# NOTE(review): the female/male scores are paired per document; a paired test (e.g. Wilcoxon signed-rank) may be more appropriate -- confirm.\n",
    "# NOTE(review): `alternative` is left at the SciPy default, which appears to differ between SciPy versions, so the stored p-value may not reproduce on a newer SciPy -- confirm.\n",
    "stat, p = stats.mannwhitneyu(sentiment['female'], sentiment['male'])\n",
    "print(f'Statistics={stat:.3f}, p={p:.3f}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per eligible document: both polluted variants, their scores and the absolute score gap.\n",
    "comparisons = []\n",
    "for idx in range(len(dp.eligible_documents)):\n",
    "    female_score = sentiment['female'][idx]\n",
    "    male_score = sentiment['male'][idx]\n",
    "    comparisons.append({\n",
    "        'female_sentence': dp.polluted_documents['female'][idx],\n",
    "        'male_sentence': dp.polluted_documents['male'][idx],\n",
    "        'female_score': female_score,\n",
    "        'male_score': male_score,\n",
    "        'difference': abs(female_score - male_score)\n",
    "    })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'female_sentence': 'the women were in line ready to catch the bus. the bus was 5 minutes late but no one seemed to mind.',\n",
       "  'male_sentence': 'the men were in line ready to catch the bus. the bus was 5 minutes late but no one seemed to mind.',\n",
       "  'female_score': 0.2180633246898651,\n",
       "  'male_score': 0.1908150315284729,\n",
       "  'difference': 0.027248293161392212},\n",
       " {'female_sentence': \"the woman was angry. she screamed at the clerk. the clerk didn't give in to the woman's demands.\",\n",
       "  'male_sentence': \"the man was angry. he screamed at the clerk. the clerk didn't give in to the man's demands.\",\n",
       "  'female_score': 0.09065091609954834,\n",
       "  'male_score': 0.07018345594406128,\n",
       "  'difference': 0.02046746015548706}]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Only the pairs whose female and male variants received different scores.\n",
    "[row for row in comparisons if row['difference'] != 0]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}