{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Import des modules et outils de lecture / écriture" ] }, { "cell_type": "code", "execution_count": 153, "metadata": {}, "outputs": [], "source": [ "import csv \n", "\n", "\n", "def lecture_csv(fichier, delimiteur, encodage):\n", " \"\"\"\n", " Paramètre : un chemin vers un fichier CSV\n", " Valeur renvoyée : un tableau de dictionnaires, extraction de la table contenue dans le fichier\n", " \"\"\"\n", " f = open(fichier, mode = 'r', encoding = encodage, newline='')\n", " reader = csv.DictReader(f, delimiter = delimiteur) #création de l'objet reader\n", " table = [dict(enregistrement) for enregistrement in reader]\n", " f.close()\n", " return table\n", "\n", "def ecriture_csv(table, fichier, delimiteur, encodage):\n", " \"\"\"\n", " Paramètre : \n", " un chemin vers un fichier CSV\n", " une table comme tableau de dictionnaires partageant les mêmes clefs, de valeurs str\n", " Valeur renvoyée :\n", " None, écrit table dans fichier avec Dictriter du module CSV \n", " \"\"\"\n", " g = open(fichier, mode = 'w', encoding = encodage, newline='')\n", " attributs = list(table[0].keys())\n", " writer = csv.DictWriter(g, delimiter = delimiteur, fieldnames = attributs) #création de l'objet writer\n", " writer.writeheader() #écriture des attibuts\n", " for enregistrement in table:\n", " writer.writerow(enregistrement) #écriture des enregistrements\n", " g.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Fusion de tables par réunion" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "prenom_11 = lecture_csv('Prénoms2011GF_Rennes.csv', ';', 'ISO8859')\n", "\n", "assert prenom_11[:2] == [{'ANNAISS': '2011',\n", " 'MNAISS': '',\n", " 'CODCOM': '35238',\n", " 'LBCOM': 'RENNES',\n", " 'SEXE': 'FEMME',\n", " 'PRN': 'Manon',\n", " 'NBR': '57'},\n", " {'ANNAISS': '2011',\n", " 'MNAISS': '',\n", " 'CODCOM': '35238',\n", " 'LBCOM': 'RENNES',\n", " 'SEXE': 'FEMME',\n", " 'PRN': 'Louise',\n", " 'NBR': '55'}] and len(prenom_11) == 248" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "prenom_10 = lecture_csv('Prénoms2010GF_Rennes.csv', ';', 'ISO8859')\n", "assert prenom_10[:2] == [{'ANNAISS': '2010',\n", " 'MNAISS': '',\n", " 'CODCOM': '35238',\n", " 'LBCOM': 'RENNES',\n", " 'SEXE': 'FEMME',\n", " 'PRN': 'Louise',\n", " 'NBR': '48'},\n", " {'ANNAISS': '2010',\n", " 'MNAISS': '',\n", " 'CODCOM': '35238',\n", " 'LBCOM': 'RENNES',\n", " 'SEXE': 'FEMME',\n", " 'PRN': 'Chloé',\n", " 'NBR': '44'}] and len(prenom_10) == 221" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "def reunion_table(table1, table2):\n", " \"\"\"Paramètres : deux tables table1 et table2\n", " Valeur renvoyée : nouvelle table obtenue par concaténation de table1 et table2\n", " \"\"\"\n", " return table1 + table2" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "prenom_10_11 = reunion_table(prenom_10, prenom_11)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'ANNAISS': '2010',\n", " 'MNAISS': '',\n", " 'CODCOM': '35238',\n", " 'LBCOM': 'RENNES',\n", " 'SEXE': 'FEMME',\n", " 'PRN': 'Louise',\n", " 'NBR': '48'}" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "prenom_10_11[0]" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'ANNAISS': '2011',\n", " 'MNAISS': '',\n", " 'CODCOM': '35238',\n", " 'LBCOM': 'RENNES',\n", " 'SEXE': 'FEMME',\n", " 'PRN': 'Manon',\n", " 'NBR': '57'}" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "prenom_10_11[len(prenom_10)]" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(prenom_10) + len(prenom_11) == len(prenom_10_11)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "prenom_f_10_11 = [enreg for enreg in prenom_10_11 if enreg['SEXE'] == 'FEMME']" ] }, { "cell_type": "code", "execution_count": 167, "metadata": {}, "outputs": [], "source": [ "def prenom_max(table):\n", " \"\"\"Paramètres : table de prénoms \n", " Valeur renvoyée : \n", " tuple constitué de l'effectif maximum\n", " et du tableau des prénoms les plus fréquents\n", " \"\"\"\n", " effectif_max = int(table[0]['NBR'])\n", " histo = {table[0]['PRN'] : effectif_max} \n", " tab_max = [table[0]['PRN']]\n", " for enreg in table[1:]:\n", " prenom = enreg['PRN']\n", " effectif = int(enreg['NBR'])\n", " if prenom not in histo:\n", " histo[prenom] = effectif\n", " else:\n", " histo[prenom] += effectif\n", " if histo[prenom] > effectif_max:\n", " tab_max = [prenom]\n", " effectif_max = histo[prenom]\n", " elif histo[prenom] == effectif_max:\n", " tab_max.append(prenom)\n", " return (effectif_max, tab_max) " ] }, { "cell_type": "code", "execution_count": 170, "metadata": {}, "outputs": [], "source": [ "assert prenom_max(prenom_f_10_11) == (103, ['Louise'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Premier problème : même nombre d'attibuts des tables fusionnées mais les noms des attibuts ont changé\n", "\n", "Le fichier 'Prénoms2008GF_Rennes.csv' est le seul où l'attribut d'année de naissance est nommé ANNEE_NAISSANCE au lieu de ANNAISS" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "prenom_08 = lecture_csv('Prénoms2008GF_Rennes.csv', ';', 'ISO8859')" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "prenom_09 = lecture_csv('Prénoms2009GF_Rennes.csv', ';', 'ISO8859')" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "prenom_08_09 = reunion_table(prenom_08, prenom_09)" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "def clef_prenom(enreg):\n", " return enreg['PRN']\n", "\n", "def clef_prenom_annee(enreg):\n", " return (enreg['PRN'], enreg['ANNAISS'])" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sorted(prenom_09, key = clef_prenom_annee) == sorted(prenom_09, key = clef_prenom)" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "ename": "KeyError", "evalue": "'ANNAISS'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msorted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprenom_08_09\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mclef_prenom_annee\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m\u001b[0m in \u001b[0;36mclef_prenom_annee\u001b[0;34m(enreg)\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mclef_prenom_annee\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0menreg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0menreg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'PRN'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0menreg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'ANNAISS'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mKeyError\u001b[0m: 'ANNAISS'" ] } ], "source": [ "sorted(prenom_08_09, key = clef_prenom_annee)" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "ename": "KeyError", "evalue": "'ANNAISS'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msorted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprenom_08\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mclef_prenom_annee\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0msorted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprenom_08\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mclef_prenom\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m\u001b[0m in \u001b[0;36mclef_prenom_annee\u001b[0;34m(enreg)\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mclef_prenom_annee\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0menreg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0menreg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'ANNAISS'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0menreg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'PRN'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mKeyError\u001b[0m: 'ANNAISS'" ] } ], "source": [ "sorted(prenom_08, key = clef_prenom_annee) == sorted(prenom_08, key = clef_prenom)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Second problème : le nombre d'attributs a changé\n", "\n", "Pour Prénoms2015GF_Rennes.csv l'attribut MNAISS a été supprimé, il n'était pas utilisé dans les autres fichiers. Comme les enregistrements sont des dictionnaires, cela ne pose pas de problème d'accès aux autres attributs (pas de décalage d'index)." ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "prenom_14 = lecture_csv('Prénoms2014GF_ Rennes.csv', ';', 'ISO8859')\n", "# noter aussi l'espace malencontreux dans le nom de fichier => encore une erreur humaine" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "mois_14 = [enreg['MNAISS'] for enreg in prenom_14]" ] }, { "cell_type": "code", "execution_count": 171, "metadata": {}, "outputs": [], "source": [ "assert mois_14[:4] == ['', '', '', '']" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [], "source": [ "prenom_15 = lecture_csv('Prénoms2015GF_Rennes.csv', ';', 'ISO8859')" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "ename": "KeyError", "evalue": "'MNAISS'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmois_15\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0menreg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'MNAISS'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0menreg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mprenom_15\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmois_15\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0menreg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'MNAISS'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0menreg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mprenom_15\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mKeyError\u001b[0m: 'MNAISS'" ] } ], "source": [ "mois_15 = [enreg['MNAISS'] for enreg in prenom_15]" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "prenom_14_15 = reunion_table(prenom_14, prenom_15)" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'ANNAISS': '2015',\n", " 'CODCOM': '35238',\n", " 'LBCOM': 'RENNES',\n", " 'SEXE': 'FEMME',\n", " 'PRN': 'Aala',\n", " 'NRB': '1'},\n", " {'ANNAISS': '2014',\n", " 'MNAISS': '',\n", " 'CODCOM': '35238',\n", " 'LBCOM': 'RENNES',\n", " 'SEXE': 'FEMME',\n", " 'PRN': 'Aaliyah',\n", " 'NRB': '2'},\n", " {'ANNAISS': '2015',\n", " 'CODCOM': '35238',\n", " 'LBCOM': 'RENNES',\n", " 'SEXE': 'FEMME',\n", " 'PRN': 'Aaliyah',\n", " 'NRB': '3'}]" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sorted(prenom_14_15, key = clef_prenom_annee)[:3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Fusion de tables par jointure" ] }, { "cell_type": "code", "execution_count": 115, "metadata": {}, "outputs": [], "source": [ "films = lecture_csv('films.csv', ',', 'utf8')\n", "realisateurs = lecture_csv('realisateurs.csv', ',', 'utf8')" ] }, { "cell_type": "code", "execution_count": 117, "metadata": {}, "outputs": [], "source": [ "assert films[:2] == [{'titre': 'La vie est belle', 'réalisateur': 'Capra', 'année': '1946'},\n", " {'titre': 'La vie est belle', 'réalisateur': 'Benigni', 'année': '1997'}] and len(films) == 36" ] }, { "cell_type": "code", "execution_count": 118, "metadata": {}, "outputs": [], "source": [ "assert len(realisateurs) == 21 and realisateurs[:2] == [{'nom': 'Coppola', 'prénom': 'Sofia', 'pays': 'USA'},\n", " {'nom': 'Coppola', 'prénom': 'Francis Ford', 'pays': 'USA'}]" ] }, { "cell_type": "code", "execution_count": 119, "metadata": {}, "outputs": [], "source": [ "def fusion_enregistrements(f, r):\n", " return {'titre' : f['titre'], 'année' : f['année'], 'nom réalisateur' : r['nom'],\n", " 'prénom réalisateur' : r['prénom'], 'pays réalisateur' : r['pays']}" ] }, { "cell_type": "code", "execution_count": 123, "metadata": {}, "outputs": [], "source": [ "def fusion_tables(films, realisateurs):\n", " fusion = []\n", " for f in films:\n", " for r in realisateurs:\n", " if f['réalisateur'] == r['nom']:\n", " fusion.append(fusion_enregistrements(f, r))\n", " return fusion\n", "\n", "def fusion_tables2(films, realisateurs): \n", " return [fusion_enregistrements(f, r) for f in films for r in realisateurs if f['réalisateur'] == r['nom']]\n" ] }, { "cell_type": "code", "execution_count": 124, "metadata": {}, "outputs": [], "source": [ "jointure_f_r = fusion_tables(films, realisateurs)" ] }, { "cell_type": "code", "execution_count": 125, "metadata": {}, "outputs": [], "source": [ "jointure_f_r2 = fusion_tables2(films, realisateurs)" ] }, { "cell_type": "code", "execution_count": 126, "metadata": {}, "outputs": [], "source": [ "assert jointure_f_r == jointure_f_r2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Premier problème d'incohérence : trois réalisateurs pour le même film ! Le nom du réalisateur ne permet pas de l'identitifier de façon unique, même si on rajoutait le prénom on pourrait peut être tomber sur des cas d'homonymie " ] }, { "cell_type": "code", "execution_count": 127, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'titre': 'Lost in translation',\n", " 'année': '2003',\n", " 'nom réalisateur': 'Coppola',\n", " 'prénom réalisateur': 'Sofia',\n", " 'pays réalisateur': 'USA'},\n", " {'titre': 'Lost in translation',\n", " 'année': '2003',\n", " 'nom réalisateur': 'Coppola',\n", " 'prénom réalisateur': 'Francis Ford',\n", " 'pays réalisateur': 'USA'},\n", " {'titre': 'Lost in translation',\n", " 'année': '2003',\n", " 'nom réalisateur': 'Coppola',\n", " 'prénom réalisateur': 'Roman',\n", " 'pays réalisateur': 'USA'}]" ] }, "execution_count": 127, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[enreg for enreg in jointure_f_r if enreg['titre'] == 'Lost in translation']" ] }, { "cell_type": "code", "execution_count": 128, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(43, 36)" ] }, "execution_count": 128, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(jointure_f_r), len(films)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Second problème d'incohérence : 'Autant en emporte le vent' n'a pas retrouvé son réalisateur à cause d'une erreur de saisie dans la table 'films.csv' où 'Flemming' a été écrit avec deux m au lieu de 1 !" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[enreg for enreg in jointure_f_r if enreg['titre'] == 'Autant en emporte le vent']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Une solution aux problèmes d'incohérences : utiliser un identifiant unique, on reprend les jointures " ] }, { "cell_type": "code", "execution_count": 149, "metadata": {}, "outputs": [], "source": [ "films_idr = lecture_csv('films_idr.csv', ',', 'utf8')\n", "realisateurs_id = lecture_csv('realisateurs_id.csv', ',', 'utf8')" ] }, { "cell_type": "code", "execution_count": 150, "metadata": {}, "outputs": [], "source": [ "def fusion_tables3(films, realisateurs): \n", " return [fusion_enregistrements(f, r) for f in films for r in realisateurs if f['idr'] == r['id']]\n", "\n", "\n", "jointure_f_r3 = fusion_tables3(films_idr, realisateurs_id)" ] }, { "cell_type": "code", "execution_count": 151, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'titre': 'Apocalypse Now',\n", " 'année': '1979',\n", " 'nom réalisateur': 'Coppola',\n", " 'prénom réalisateur': 'Francis Ford',\n", " 'pays réalisateur': 'USA'}]" ] }, "execution_count": 151, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[enreg for enreg in jointure_f_r3 if enreg['titre'] == 'Apocalypse Now']" ] }, { "cell_type": "code", "execution_count": 152, "metadata": {}, "outputs": [], "source": [ "assert len(jointure_f_r3) == len(films)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }