{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Riksdagen motioner \n", "\n", "* Denna [Jupyter Notebook](https://github.com/salgo60/open-data-examples/blob/master/Riksdagens%20dokument%20Motioner-python.ipynb) \n", " * [GITHUB Riksdagen](https://github.com/salgo60/Riksdagen)\n", "* [Skapa sökfråga](http://data.riksdagen.se/dokumentlista/) --> 245204 Motioner \n", " \n", "* Wikipedia [WikiProject_Sweden/Swedish_Riksdag_documents](https://www.wikidata.org/wiki/Wikidata:WikiProject_Sweden/Swedish_Riksdag_documents) \n", "\n", "* [Sida med prefix](https://data.riksdagen.se/dokumentation/sa-funkar-dokument-id/#:~:text=kod%20for%20riksmote%20eller%20ar)\n", "\n", "* [Buggar Riksdagens öppna api](https://github.com/salgo60/Wikidata_riksdagen-corpus/issues/50)\n", " * verkar vara en bug att man måste ange rm för att äldre dokument skall hämtas\n", " \n", "Tanken skapa Python som kollar alla Riksdagsdokument i WIkidata om det finns lagstiftandeförsamling hos Riksdagen\n", "* [video vad som gjordes](https://youtu.be/L3_VCtSMrfc)\n", "* [yta för att samla problem med Riksdagens data](https://github.com/salgo60/Wikidata_riksdagen-corpus/issues/50#issuecomment-1264662050)\n", "\n", "### Test H7023008 Motion 2019/20:3008 av Jonas Sjöstedt m.fl. (V)\n", "* [text](http://data.riksdagen.se/dokument/H7023008.text) / [html](http://data.riksdagen.se/dokument/H7023008.html) / [json](http://data.riksdagen.se/dokumentlista/?sok=H7023008&utformat=json) \n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Last run: 2022-10-02 19:36:35.977986\n" ] } ], "source": [ "from datetime import datetime\n", "start_time = datetime.now()\n", "print(\"Last run: \", start_time)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import sys\n", "from SPARQLWrapper import SPARQLWrapper, JSON\n", "from tqdm.auto import tqdm\n", "tqdm.pandas()\n", "import urllib3, json\n", "import pandas as pd\n", "\n", "endpoint_url = \"https://query.wikidata.org/sparql\"\n", "\n", "queryMotioner = \"\"\"#title: Missing lagstiftande\n", "SELECT (Replace(str(?doc),\".*Q\", \"Q\") AS ?WD) ?api\n", "WHERE\n", "{\n", " ?doc wdt:P8433 ?rid.\n", " ?doc wdt:P31 ?instance.\n", " ?doc wdt:P31/wdt:P279* wd:Q452237.\n", " BIND(URI(CONCAT(\"https://data.riksdagen.se/dokument/\",?rid,\".json\")) AS ?api)\n", " minus{?doc wdt:P7727 ?miss}\n", " SERVICE wikibase:label { bd:serviceParam wikibase:language \"sv,en\". }\n", "} \n", "\"\"\"\n", "\n", "def get_sparql_dataframe(endpoint_url, query):\n", " \"\"\"\n", " Helper function to convert SPARQL results into a Pandas data frame.\n", " \"\"\"\n", " user_agent = \"salgo60/%s.%s\" % (sys.version_info[0], sys.version_info[1])\n", " \n", " sparql = SPARQLWrapper(endpoint_url, agent=user_agent)\n", " sparql.setQuery(query)\n", " sparql.setReturnFormat(JSON)\n", " result = sparql.query()\n", "\n", " processed_results = json.load(result.response)\n", " cols = processed_results['head']['vars']\n", "\n", " out = []\n", " for row in processed_results['results']['bindings']:\n", " item = []\n", " for c in cols:\n", " item.append(row.get(c, {}).get('value'))\n", " out.append(item)\n", "\n", " return pd.DataFrame(out, columns=cols)\n", "\n", "WDMotioner = get_sparql_dataframe(endpoint_url, queryMotioner)\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 47802 entries, 0 to 47801\n", "Data columns (total 2 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 WD 47802 non-null object\n", " 1 api 47802 non-null object\n", "dtypes: object(2)\n", "memory usage: 747.0+ KB\n" ] } ], "source": [ "WDMotioner.info()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\tError in https://data.riksdagen.se/dokument/FY02957.json Expecting value: line 1 column 1 (char 0)\n", "\tError in https://data.riksdagen.se/dokument/H9024373-.json Expecting value: line 1 column 1 (char 0)\n", "\tError in https://data.riksdagen.se/dokument/H9024355-.json Expecting value: line 1 column 1 (char 0)\n", "\tError in https://data.riksdagen.se/dokument/H9023779-.json Expecting value: line 1 column 1 (char 0)\n", "\tError in https://data.riksdagen.se/dokument/GS02xv282-.json Expecting value: line 1 column 1 (char 0)\n" ] } ], "source": [ "http = urllib3.PoolManager() \n", "listP7277 = []\n", "dftot = pd.DataFrame()\n", "for index, row in WDMotioner.iterrows():\n", " url = row['api']\n", " #print(row['WD'],url)\n", " try:\n", " r = http.request('GET', url)\n", " #print(r.data)\n", " data = json.loads(r.data)\n", " organ = data[\"dokumentstatus\"][\"dokument\"][\"organ\"]\n", " listP7277.append([row[\"WD\"],organ])\n", " except Exception as e:\n", " print(\"\\tError in \",url, e) \n", "dftot = pd.DataFrame(listP7277)\n", "dftot.columns =['WD', 'lagstiftande']\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
WDlagstiftande
0Q98001273LU
1Q98001277
2Q98001287
3Q98001289JoU
4Q98001295BoU
.........
47792Q111682080LU
47793Q111682081LU
47794Q111682086LU
47795Q111682087LU
47796Q111682084LU
\n", "

47797 rows × 2 columns

\n", "
" ], "text/plain": [ " WD lagstiftande\n", "0 Q98001273 LU\n", "1 Q98001277 \n", "2 Q98001287 \n", "3 Q98001289 JoU\n", "4 Q98001295 BoU\n", "... ... ...\n", "47792 Q111682080 LU\n", "47793 Q111682081 LU\n", "47794 Q111682086 LU\n", "47795 Q111682087 LU\n", "47796 Q111682084 LU\n", "\n", "[47797 rows x 2 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dftot" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " 32985\n", "JoU 4116\n", "BoU 2355\n", "LU 2066\n", "SoU 538\n", "TU 537\n", "MJU 464\n", "JuU 401\n", "sku 377\n", "SkU 370\n", "UbU 359\n", "CU 305\n", "NU 299\n", "ubu 285\n", "KU 274\n", "SfU 228\n", "sfu 214\n", "tu 192\n", "UU 178\n", "juu 174\n", "nu 172\n", "KrU 166\n", "AU 136\n", "FiU 133\n", "kru 131\n", "uu 118\n", "lu 114\n", "FöU 95\n", "au 9\n", "U 2\n", "kamm 1\n", "MU 1\n", "MjU 1\n", "SKU 1\n", "Name: lagstiftande, dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dftot[\"lagstiftande\"].value_counts()\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "dfWD = dftot[dftot[\"lagstiftande\"] != \"\"]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 14812 entries, 0 to 47796\n", "Data columns (total 2 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 WD 14812 non-null object\n", " 1 lagstiftande 14812 non-null object\n", "dtypes: object(2)\n", "memory usage: 347.2+ KB\n" ] } ], "source": [ "dfWD.info()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "dfWD.to_csv(\"Lagstiftande.csv\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ended: 2022-10-02 20:20:21.565499\n", "Time elapsed (hh:mm:ss.ms) 0:43:45.588685\n" ] } ], "source": [ "end = datetime.now()\n", "print(\"Ended: \", end) \n", "print('Time elapsed (hh:mm:ss.ms) {}'.format(datetime.now() - start_time))" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# GE02Bo518\n", "#dftot[dftot[\"id\"] == \"GE02Bo518\"]\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.13" } }, "nbformat": 4, "nbformat_minor": 4 }