{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c2321978",
   "metadata": {},
   "source": [
    "Check Open Data portals for Swedish municipalities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "db9092dd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Last runa:  2021-06-29 11:33:35.369172\n"
     ]
    }
   ],
   "source": [
    "from datetime import datetime\n",
    "start_time  = datetime.now()\n",
    "print(\"Last runa: \", start_time)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "0d8b5634",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(148, 4)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# pip install sparqlwrapper\n",
    "# https://rdflib.github.io/sparqlwrapper/\n",
    "\n",
    "import sys,json\n",
    "import pandas as pd\n",
    "from SPARQLWrapper import SPARQLWrapper, JSON\n",
    "\n",
    "endpoint_url = \"https://query.wikidata.org/sparql\"\n",
    "\n",
    "queryOpenDataMiss = \"\"\"SELECT (REPLACE(STR(?org), \".*Q\", \"Q\") AS ?wikidata) ?org ?orgLabel ?www   WHERE {\n",
    "  ?org wdt:P31 wd:Q127448.\n",
    "  ?org wdt:P856 ?www.\n",
    "  ?org wdt:P361 ?lan.\n",
    "  ?lan wdt:P31 wd:Q193556.\n",
    "  ?org wdt:P276?/wdt:P625 ?coord.\n",
    "  minus {?org wdt:P8402 ?portalValue}\n",
    "  SERVICE wikibase:label { bd:serviceParam wikibase:language \"sv\". }\n",
    "} \n",
    "GROUP BY ?org ?orgLabel ?www\n",
    "ORDER BY DESC (?orgLabel) \"\"\"\n",
    "\n",
    "\n",
    "def get_sparql_dataframe(endpoint_url, query):\n",
    "    \"\"\"\n",
    "    Helper function to convert SPARQL results into a Pandas data frame.\n",
    "    \"\"\"\n",
    "    user_agent = \"salgo60/%s.%s\" % (sys.version_info[0], sys.version_info[1])\n",
    " \n",
    "    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)\n",
    "    sparql.setQuery(query)\n",
    "    sparql.setReturnFormat(JSON)\n",
    "    result = sparql.query()\n",
    "\n",
    "    processed_results = json.load(result.response)\n",
    "    cols = processed_results['head']['vars']\n",
    "\n",
    "    out = []\n",
    "    for row in processed_results['results']['bindings']:\n",
    "        item = []\n",
    "        for c in cols:\n",
    "            item.append(row.get(c, {}).get('value'))\n",
    "        out.append(item)\n",
    "\n",
    "    return pd.DataFrame(out, columns=cols)\n",
    "\n",
    "WDOpenDataCheck = get_sparql_dataframe(endpoint_url, queryOpenDataMiss)\n",
    "WDOpenDataCheck.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0c6b7591",
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.set_option('max_colwidth', 400)\n",
    "WDOpenDataCheck.tail(60)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "f8dbf6c2",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 22%|██▏       | 32/148 [00:10<00:45,  2.53it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Error  404 http://www.torsas.se/psidata Q515551\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 35%|███▌      | 52/148 [00:17<00:34,  2.82it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Error  404 http://www.skurup.se/psidata Q515266\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 50%|█████     | 74/148 [00:35<00:35,  2.07it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Error  404 http://www.monsteras.se/psidata Q515250\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 66%|██████▌   | 97/148 [30:15<6:30:51, 459.84s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Error  404 http://www.koping.se/psidata Q42009\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 148/148 [30:33<00:00, 12.39s/it]   "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "148\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "import urllib3, json\n",
    "from tqdm import tqdm\n",
    "timeout = urllib3.util.Timeout(connect=2.0, read=7.0)\n",
    "http = urllib3.PoolManager()\n",
    "\n",
    "listOpenDataCheck = []\n",
    "for WD, row in tqdm(WDOpenDataCheck.iterrows(), total=WDOpenDataCheck.shape[0]):\n",
    "    url = row[\"www\"] +\"/psidata\"\n",
    "    \n",
    "    new_item = dict()\n",
    "    new_item['wikidata'] = row[\"wikidata\"] \n",
    "    try:\n",
    "        r = http.request('GET', url)        \n",
    "        new_item['status'] = r.status\n",
    "\n",
    "    except:\n",
    "        print (\"Error \", r.status, url, row[\"wikidata\"] )\n",
    "        new_item['status'] = r.status\n",
    "#    if r.status == 200:\n",
    "#        print (row[\"wikidata\"] , r.status, url)\n",
    "    new_item['url'] = url \n",
    "#    new_item['country'] = row[\"country\"] \n",
    "    \n",
    "    listOpenDataCheck.append(new_item)\n",
    "print (len(listOpenDataCheck))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "18e2f8dc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(148, 3)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "OpenDataNewtot = pd.DataFrame(listOpenDataCheck,\n",
    "                  columns=['wikidata','status','url'])\n",
    "OpenDataNewtot.shape\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "697db5b8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>wikidata</th>\n",
       "      <th>status</th>\n",
       "      <th>url</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>Q271153</td>\n",
       "      <td>200</td>\n",
       "      <td>http://www.are.se/psidata</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>Q515477</td>\n",
       "      <td>200</td>\n",
       "      <td>http://www.vastervik.se/psidata</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>Q501432</td>\n",
       "      <td>200</td>\n",
       "      <td>http://www.tibro.se/psidata</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>89</th>\n",
       "      <td>Q504235</td>\n",
       "      <td>200</td>\n",
       "      <td>http://www.ljungby.se/psidata</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>108</th>\n",
       "      <td>Q510223</td>\n",
       "      <td>200</td>\n",
       "      <td>http://www.karlshamn.se/psidata</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>109</th>\n",
       "      <td>Q499435</td>\n",
       "      <td>200</td>\n",
       "      <td>https://www.karlsborg.se/psidata</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>116</th>\n",
       "      <td>Q428749</td>\n",
       "      <td>200</td>\n",
       "      <td>http://www.hjo.se/psidata</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>121</th>\n",
       "      <td>Q499359</td>\n",
       "      <td>200</td>\n",
       "      <td>http://www.hammaro.se/psidata</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>142</th>\n",
       "      <td>Q509476</td>\n",
       "      <td>200</td>\n",
       "      <td>http://www.boden.se/psidata</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    wikidata  status                               url\n",
       "11   Q271153     200         http://www.are.se/psidata\n",
       "18   Q515477     200   http://www.vastervik.se/psidata\n",
       "36   Q501432     200       http://www.tibro.se/psidata\n",
       "89   Q504235     200     http://www.ljungby.se/psidata\n",
       "108  Q510223     200   http://www.karlshamn.se/psidata\n",
       "109  Q499435     200  https://www.karlsborg.se/psidata\n",
       "116  Q428749     200         http://www.hjo.se/psidata\n",
       "121  Q499359     200     http://www.hammaro.se/psidata\n",
       "142  Q509476     200       http://www.boden.se/psidata"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "OpenDataNewtot[OpenDataNewtot.status == 200]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6e5d739",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}