{ "cells": [ { "cell_type": "markdown", "id": "c2321978", "metadata": {}, "source": [ "Check Open Data portals for Swedish municipalities" ] }, { "cell_type": "code", "execution_count": 1, "id": "db9092dd", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Last runa: 2021-06-29 11:33:35.369172\n" ] } ], "source": [ "from datetime import datetime\n", "start_time = datetime.now()\n", "print(\"Last runa: \", start_time)" ] }, { "cell_type": "code", "execution_count": 2, "id": "0d8b5634", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(148, 4)" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# pip install sparqlwrapper\n", "# https://rdflib.github.io/sparqlwrapper/\n", "\n", "import sys,json\n", "import pandas as pd\n", "from SPARQLWrapper import SPARQLWrapper, JSON\n", "\n", "endpoint_url = \"https://query.wikidata.org/sparql\"\n", "\n", "queryOpenDataMiss = \"\"\"SELECT (REPLACE(STR(?org), \".*Q\", \"Q\") AS ?wikidata) ?org ?orgLabel ?www WHERE {\n", " ?org wdt:P31 wd:Q127448.\n", " ?org wdt:P856 ?www.\n", " ?org wdt:P361 ?lan.\n", " ?lan wdt:P31 wd:Q193556.\n", " ?org wdt:P276?/wdt:P625 ?coord.\n", " minus {?org wdt:P8402 ?portalValue}\n", " SERVICE wikibase:label { bd:serviceParam wikibase:language \"sv\". }\n", "} \n", "GROUP BY ?org ?orgLabel ?www\n", "ORDER BY DESC (?orgLabel) \"\"\"\n", "\n", "\n", "def get_sparql_dataframe(endpoint_url, query):\n", " \"\"\"\n", " Helper function to convert SPARQL results into a Pandas data frame.\n", " \"\"\"\n", " user_agent = \"salgo60/%s.%s\" % (sys.version_info[0], sys.version_info[1])\n", " \n", " sparql = SPARQLWrapper(endpoint_url, agent=user_agent)\n", " sparql.setQuery(query)\n", " sparql.setReturnFormat(JSON)\n", " result = sparql.query()\n", "\n", " processed_results = json.load(result.response)\n", " cols = processed_results['head']['vars']\n", "\n", " out = []\n", " for row in processed_results['results']['bindings']:\n", " item = []\n", " for c in cols:\n", " item.append(row.get(c, {}).get('value'))\n", " out.append(item)\n", "\n", " return pd.DataFrame(out, columns=cols)\n", "\n", "WDOpenDataCheck = get_sparql_dataframe(endpoint_url, queryOpenDataMiss)\n", "WDOpenDataCheck.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "0c6b7591", "metadata": {}, "outputs": [], "source": [ "pd.set_option('max_colwidth', 400)\n", "WDOpenDataCheck.tail(60)" ] }, { "cell_type": "code", "execution_count": 4, "id": "f8dbf6c2", "metadata": { "scrolled": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 22%|██▏ | 32/148 [00:10<00:45, 2.53it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error 404 http://www.torsas.se/psidata Q515551\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 35%|███▌ | 52/148 [00:17<00:34, 2.82it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error 404 http://www.skurup.se/psidata Q515266\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 50%|█████ | 74/148 [00:35<00:35, 2.07it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error 404 http://www.monsteras.se/psidata Q515250\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 66%|██████▌ | 97/148 [30:15<6:30:51, 459.84s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error 404 http://www.koping.se/psidata Q42009\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 148/148 [30:33<00:00, 12.39s/it] " ] }, { "name": "stdout", "output_type": "stream", "text": [ "148\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "import urllib3, json\n", "from tqdm import tqdm\n", "timeout = urllib3.util.Timeout(connect=2.0, read=7.0)\n", "http = urllib3.PoolManager()\n", "\n", "listOpenDataCheck = []\n", "for WD, row in tqdm(WDOpenDataCheck.iterrows(), total=WDOpenDataCheck.shape[0]):\n", " url = row[\"www\"] +\"/psidata\"\n", " \n", " new_item = dict()\n", " new_item['wikidata'] = row[\"wikidata\"] \n", " try:\n", " r = http.request('GET', url) \n", " new_item['status'] = r.status\n", "\n", " except:\n", " print (\"Error \", r.status, url, row[\"wikidata\"] )\n", " new_item['status'] = r.status\n", "# if r.status == 200:\n", "# print (row[\"wikidata\"] , r.status, url)\n", " new_item['url'] = url \n", "# new_item['country'] = row[\"country\"] \n", " \n", " listOpenDataCheck.append(new_item)\n", "print (len(listOpenDataCheck))" ] }, { "cell_type": "code", "execution_count": 5, "id": "18e2f8dc", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(148, 3)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "OpenDataNewtot = pd.DataFrame(listOpenDataCheck,\n", " columns=['wikidata','status','url'])\n", "OpenDataNewtot.shape\n" ] }, { "cell_type": "code", "execution_count": 8, "id": "697db5b8", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | wikidata | \n", "status | \n", "url | \n", "
|---|---|---|---|
| 11 | \n", "Q271153 | \n", "200 | \n", "http://www.are.se/psidata | \n", "
| 18 | \n", "Q515477 | \n", "200 | \n", "http://www.vastervik.se/psidata | \n", "
| 36 | \n", "Q501432 | \n", "200 | \n", "http://www.tibro.se/psidata | \n", "
| 89 | \n", "Q504235 | \n", "200 | \n", "http://www.ljungby.se/psidata | \n", "
| 108 | \n", "Q510223 | \n", "200 | \n", "http://www.karlshamn.se/psidata | \n", "
| 109 | \n", "Q499435 | \n", "200 | \n", "https://www.karlsborg.se/psidata | \n", "
| 116 | \n", "Q428749 | \n", "200 | \n", "http://www.hjo.se/psidata | \n", "
| 121 | \n", "Q499359 | \n", "200 | \n", "http://www.hammaro.se/psidata | \n", "
| 142 | \n", "Q509476 | \n", "200 | \n", "http://www.boden.se/psidata | \n", "