{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extraction des entités avec NER\n",
    "\n",
    "Ce notebook utilise [spacy.io](https://spacy.io) pour identifier et collecter certains types d'entités: Personnes, GPE, NORP, ...\n",
    "\n",
    "Voir https://spacy.io/docs/usage/entity-recognition pour la liste complète des entités identifiables par spacy\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "import csv, re, string\n",
    "\n",
    "import spacy\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "--Le dataset a 172191 rows\n",
      "\n",
      "-- columns\n",
      "Index(['post_id', 'reaction_count', 'share_count', 'comment_count',\n",
      " 'created_at', 'message'],\n",
      " dtype='object')\n",
      "\n",
      "-- head\n",
      " post_id reaction_count share_count comment_count \\\n",
      "73862 1156 0.0 0.0 0.0 \n",
      "49902 5390 1.0 0.0 0.0 \n",
      "81677 4803 2.0 0.0 0.0 \n",
      "156453 2704 1.0 0.0 0.0 \n",
      "7789 2818 0.0 0.0 0.0 \n",
      "\n",
      " created_at message \n",
      "73862 2016-07-24 15:46:46 Bien Paolo Vargas Well, your a nasty little Cu... \n",
      "49902 2016-10-23 21:17:37 \"and they are bringing those problems to us\" \n",
      "81677 2016-09-15 00:40:10 This still does not fill the void in my life. ... \n",
      "156453 2016-09-20 17:44:09 why's real life moonman a deplorable now? that... \n",
      "7789 2016-11-18 11:35:33 She's been cut off from her daily dose of adre... \n"
     ]
    }
   ],
   "source": [
    "# Load the data\n",
    "df = pd.read_csv('../data/facebook_god_emperor_trump.csv', low_memory=False)\n",
    "\n",
    "# Random sampling of the rows, which makes it possible\n",
    "# to run the notebook on a subset of the dataset:\n",
    "# frac = 0.5 => 50% of the original dataset\n",
    "\n",
    "frac = 1\n",
    "seed = 42  # fixed seed so the sample (and the row order) is the same on every run\n",
    "df = df.sample(frac=frac, random_state=seed)\n",
    "\n",
    "print(\"\\n-- Le dataset a {0} rows\".format(df.shape[0]))\n",
    "print(\"\\n-- columns\")\n",
    "print(df.columns)\n",
    "print(\"\\n-- head\")\n",
    "print(df.head())\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Identification des entités\n",
    "\n",
    "On s'intéresse surtout aux personnes, GPE (pays, villes, états), ORG (entreprises, marques) et NORP (nationalités, religions, ...).\n",
    "Le NER de spacy n'est pas 100% précis. Mais c'est cela qui nous sera utile pour identifier les détournements des noms de personnes, de pays (Murica pour America) ou d'autres entités (4chan, Fox News, ...)\n",
    "\n",
    "Voir https://spacy.io/docs/usage/entity-recognition pour d'autres exemples d'utilisation du NER de spacy.io \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Load the English spaCy model.\n",
    "# NOTE: the 'en' shortcut only exists in spaCy 1.x/2.x; with spaCy 3+ the full\n",
    "# package name (e.g. 'en_core_web_sm') has to be passed instead.\n",
    "nlp = spacy.load('en')\n",
    "\n",
    "# One list per entity type of interest\n",
    "persons = []\n",
    "GPE = []\n",
    "ORG = []\n",
    "NORP = []\n",
    "entities = []  # every other entity, except dates and numbers\n",
    "\n",
    "# Route each label of interest to its own list\n",
    "buckets = {'PERSON': persons, 'GPE': GPE, 'NORP': NORP, 'ORG': ORG}\n",
    "\n",
    "# Numeric / temporal labels that are dropped altogether\n",
    "ignored_labels = {'CARDINAL', 'ORDINAL', 'PERCENT', 'DATE', 'TIME'}\n",
    "\n",
    "# Empty CSV cells are read as NaN (a float), which spaCy cannot parse:\n",
    "# drop them and make sure every remaining message is a string\n",
    "messages = df['message'].dropna().astype(str)\n",
    "\n",
    "# nlp.pipe parses the messages as a stream, in batches, which is faster\n",
    "# than calling nlp() once per dataframe row\n",
    "for doc in nlp.pipe(messages):\n",
    "    for ent in doc.ents:\n",
    "        if ent.label_ in buckets:\n",
    "            buckets[ent.label_].append(str(ent))\n",
    "        elif ent.label_ not in ignored_labels:\n",
    "            # keep the label next to the text for all the other entity types\n",
    "            entities.append((str(ent), ent.label_))\n",
    "\n",
    "\n",
    "def write_lines(path, lines):\n",
    "    \"\"\"Write `lines` to `path`, separated by newlines (no trailing newline).\n",
    "\n",
    "    The file is opened in a `with` block so that it is flushed and closed\n",
    "    immediately, and it is encoded as UTF-8 whatever the platform default is.\n",
    "    \"\"\"\n",
    "    with open(path, 'w', encoding='utf-8') as out:\n",
    "        out.write(\"\\n\".join(lines))\n",
    "\n",
    "\n",
    "# Save each entity type, sorted, in its own file\n",
    "write_lines('../data/persons.txt', sorted(persons))\n",
    "write_lines('../data/gpe.txt', sorted(GPE))\n",
    "write_lines('../data/norp.txt', sorted(NORP))\n",
    "write_lines('../data/org.txt', sorted(ORG))\n",
    "\n",
    "# Save the other entities as \"label,text\" rows; commas inside the entity\n",
    "# text are replaced by spaces so that every row keeps exactly two fields\n",
    "str_entities = [ \"{0},{1}\".format(label, text.replace(',',' ')) for text, label in entities ]\n",
    "\n",
    "write_lines('../data/entities.csv', str_entities)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}