{ "cells": [ { "cell_type": "markdown", "id": "c2321978", "metadata": {}, "source": [ "Check Open Data portals for Swedish municipalities" ] }, { "cell_type": "code", "execution_count": 1, "id": "db9092dd", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Last runa: 2021-06-29 11:33:35.369172\n" ] } ], "source": [ "from datetime import datetime\n", "start_time = datetime.now()\n", "print(\"Last runa: \", start_time)" ] }, { "cell_type": "code", "execution_count": 2, "id": "0d8b5634", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(148, 4)" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# pip install sparqlwrapper\n", "# https://rdflib.github.io/sparqlwrapper/\n", "\n", "import sys,json\n", "import pandas as pd\n", "from SPARQLWrapper import SPARQLWrapper, JSON\n", "\n", "endpoint_url = \"https://query.wikidata.org/sparql\"\n", "\n", "queryOpenDataMiss = \"\"\"SELECT (REPLACE(STR(?org), \".*Q\", \"Q\") AS ?wikidata) ?org ?orgLabel ?www WHERE {\n", " ?org wdt:P31 wd:Q127448.\n", " ?org wdt:P856 ?www.\n", " ?org wdt:P361 ?lan.\n", " ?lan wdt:P31 wd:Q193556.\n", " ?org wdt:P276?/wdt:P625 ?coord.\n", " minus {?org wdt:P8402 ?portalValue}\n", " SERVICE wikibase:label { bd:serviceParam wikibase:language \"sv\". }\n", "} \n", "GROUP BY ?org ?orgLabel ?www\n", "ORDER BY DESC (?orgLabel) \"\"\"\n", "\n", "\n", "def get_sparql_dataframe(endpoint_url, query):\n", " \"\"\"\n", " Helper function to convert SPARQL results into a Pandas data frame.\n", " \"\"\"\n", " user_agent = \"salgo60/%s.%s\" % (sys.version_info[0], sys.version_info[1])\n", " \n", " sparql = SPARQLWrapper(endpoint_url, agent=user_agent)\n", " sparql.setQuery(query)\n", " sparql.setReturnFormat(JSON)\n", " result = sparql.query()\n", "\n", " processed_results = json.load(result.response)\n", " cols = processed_results['head']['vars']\n", "\n", " out = []\n", " for row in processed_results['results']['bindings']:\n", " item = []\n", " for c in cols:\n", " item.append(row.get(c, {}).get('value'))\n", " out.append(item)\n", "\n", " return pd.DataFrame(out, columns=cols)\n", "\n", "WDOpenDataCheck = get_sparql_dataframe(endpoint_url, queryOpenDataMiss)\n", "WDOpenDataCheck.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "0c6b7591", "metadata": {}, "outputs": [], "source": [ "pd.set_option('max_colwidth', 400)\n", "WDOpenDataCheck.tail(60)" ] }, { "cell_type": "code", "execution_count": 4, "id": "f8dbf6c2", "metadata": { "scrolled": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 22%|██▏ | 32/148 [00:10<00:45, 2.53it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error 404 http://www.torsas.se/psidata Q515551\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 35%|███▌ | 52/148 [00:17<00:34, 2.82it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error 404 http://www.skurup.se/psidata Q515266\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 50%|█████ | 74/148 [00:35<00:35, 2.07it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error 404 http://www.monsteras.se/psidata Q515250\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 66%|██████▌ | 97/148 [30:15<6:30:51, 459.84s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error 404 http://www.koping.se/psidata Q42009\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 148/148 [30:33<00:00, 12.39s/it] " ] }, { "name": "stdout", "output_type": "stream", "text": [ "148\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "import urllib3, json\n", "from tqdm import tqdm\n", "timeout = urllib3.util.Timeout(connect=2.0, read=7.0)\n", "http = urllib3.PoolManager()\n", "\n", "listOpenDataCheck = []\n", "for WD, row in tqdm(WDOpenDataCheck.iterrows(), total=WDOpenDataCheck.shape[0]):\n", " url = row[\"www\"] +\"/psidata\"\n", " \n", " new_item = dict()\n", " new_item['wikidata'] = row[\"wikidata\"] \n", " try:\n", " r = http.request('GET', url) \n", " new_item['status'] = r.status\n", "\n", " except:\n", " print (\"Error \", r.status, url, row[\"wikidata\"] )\n", " new_item['status'] = r.status\n", "# if r.status == 200:\n", "# print (row[\"wikidata\"] , r.status, url)\n", " new_item['url'] = url \n", "# new_item['country'] = row[\"country\"] \n", " \n", " listOpenDataCheck.append(new_item)\n", "print (len(listOpenDataCheck))" ] }, { "cell_type": "code", "execution_count": 5, "id": "18e2f8dc", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(148, 3)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "OpenDataNewtot = pd.DataFrame(listOpenDataCheck,\n", " columns=['wikidata','status','url'])\n", "OpenDataNewtot.shape\n" ] }, { "cell_type": "code", "execution_count": 8, "id": "697db5b8", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>wikidata</th>\n", " <th>status</th>\n", " <th>url</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>11</th>\n", " <td>Q271153</td>\n", " <td>200</td>\n", " <td>http://www.are.se/psidata</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>Q515477</td>\n", " <td>200</td>\n", " <td>http://www.vastervik.se/psidata</td>\n", " </tr>\n", " <tr>\n", " <th>36</th>\n", " <td>Q501432</td>\n", " <td>200</td>\n", " <td>http://www.tibro.se/psidata</td>\n", " </tr>\n", " <tr>\n", " <th>89</th>\n", " <td>Q504235</td>\n", " <td>200</td>\n", " <td>http://www.ljungby.se/psidata</td>\n", " </tr>\n", " <tr>\n", " <th>108</th>\n", " <td>Q510223</td>\n", " <td>200</td>\n", " <td>http://www.karlshamn.se/psidata</td>\n", " </tr>\n", " <tr>\n", " <th>109</th>\n", " <td>Q499435</td>\n", " <td>200</td>\n", " <td>https://www.karlsborg.se/psidata</td>\n", " </tr>\n", " <tr>\n", " <th>116</th>\n", " <td>Q428749</td>\n", " <td>200</td>\n", " <td>http://www.hjo.se/psidata</td>\n", " </tr>\n", " <tr>\n", " <th>121</th>\n", " <td>Q499359</td>\n", " <td>200</td>\n", " <td>http://www.hammaro.se/psidata</td>\n", " </tr>\n", " <tr>\n", " <th>142</th>\n", " <td>Q509476</td>\n", " <td>200</td>\n", " <td>http://www.boden.se/psidata</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " wikidata status url\n", "11 Q271153 200 http://www.are.se/psidata\n", "18 Q515477 200 http://www.vastervik.se/psidata\n", "36 Q501432 200 http://www.tibro.se/psidata\n", "89 Q504235 200 http://www.ljungby.se/psidata\n", "108 Q510223 200 http://www.karlshamn.se/psidata\n", "109 Q499435 200 https://www.karlsborg.se/psidata\n", "116 Q428749 200 http://www.hjo.se/psidata\n", "121 Q499359 200 http://www.hammaro.se/psidata\n", "142 Q509476 200 http://www.boden.se/psidata" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "OpenDataNewtot[OpenDataNewtot.status == 200]" ] }, { "cell_type": "code", "execution_count": null, "id": "a6e5d739", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.4" } }, "nbformat": 4, "nbformat_minor": 5 }