{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If you run into any issues or have questions/concerns about the data catalog API, usage patterns, or anything else, please do not hesitate to email them to danf@usc.edu.\n",
    "\n",
    "Thanks,\n",
    "Dan Feldman"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### This notebook contains commands to browse available [CHIRPS](http://blog.chg.ucsb.edu/?m=201810) data using the MINT Data Catalog API"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prerequisites: python 3.6 or later\n",
    "import requests\n",
    "import json\n",
    "import uuid\n",
    "import pprint\n",
    "import datetime\n",
    "pp = pprint.PrettyPrinter(indent=2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Boilerplate setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data Catalog api endpoint url \n",
    "url = \"https://api.mint-data-catalog.org\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 1: Get a session token to use the API.\n",
    "# NOTE: the printed response includes the session token ('X-Api-Key'), so clear this\n",
    "# cell's output before sharing or committing the notebook.\n",
    "token_response = requests.get(f\"{url}/get_session_token\")\n",
    "# Fail fast with a clear HTTP error instead of a confusing JSON/KeyError further down\n",
    "token_response.raise_for_status()\n",
    "resp = token_response.json()\n",
    "print(resp)\n",
    "api_key = resp['X-Api-Key']\n",
    "\n",
    "# Headers sent with the API requests made in the rest of this notebook\n",
    "request_headers = {\n",
    "    'Content-Type': \"application/json\",\n",
    "    'X-Api-Key': api_key\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This is a convenience method to handle api responses. The main portion of the notebook starts in the the next cell\n",
    "def handle_api_response(response, print_response=True):\n",
    "    parsed_response = response.json()\n",
    "\n",
    "    if print_response:\n",
    "        pp.pprint(parsed_response)\n",
    "    \n",
    "    if response.status_code == 200:\n",
    "        return parsed_response\n",
    "    elif response.status_code == 400:\n",
    "        raise Exception(\"Bad request ^\")\n",
    "    elif response.status_code == 403:\n",
    "        msg = \"Please make sure your request headers include X-Api-Key and that you are using correct url\"\n",
    "        raise Exception(msg)\n",
    "    else:\n",
    "        msg = \"\"\"It seems our server encountered an error which it doesn't know how to handle yet. \n",
    "        This sometimes happens with unexpected input(s). In order to help us diagnose and resolve the issue, \n",
    "        please notify Dan Feldman (danf@usc.edu) of this error.\"\"\"\n",
    "    \n",
    "    return parsed_response"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Search queries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. Search for datasets whose names start with \"CHIRPS\" (or \"chirps\"; the match is case-insensitive).\n",
    "# Note the wildcard character '*' in the query: without it, the query searches for exact matches.\n",
    "\n",
    "search_query = {\"dataset_names__in\": [\"CHIRPS*\"]}\n",
    "\n",
    "resp = requests.post(\n",
    "    f\"{url}/find_datasets\",\n",
    "    headers=request_headers,\n",
    "    json=search_query,\n",
    ")\n",
    "\n",
    "# The result lists the datasets that match the query, each with its record_id, description, name, and metadata\n",
    "response = handle_api_response(resp, print_response=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Suppose we are interested in \"CHIRPS_accumulated_precipitation_one_month\".\n",
    "# We can pull it out of the response object above as follows (any other way of iterating through and\n",
    "# filtering the response works too, since it is a standard dict/list combination):\n",
    "\n",
    "target_dataset_name = \"CHIRPS_accumulated_precipitation_one_month\"\n",
    "desired_dataset = next(\n",
    "    (dataset for dataset in response[\"datasets\"] if dataset[\"dataset_name\"] == target_dataset_name),\n",
    "    None,\n",
    ")\n",
    "# Without a default, next() would raise a bare StopIteration with no hint about what was missing\n",
    "if desired_dataset is None:\n",
    "    raise LookupError(f\"Dataset {target_dataset_name!r} was not found in the search results\")\n",
    "pp.pprint(desired_dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2. With a dataset chosen, the data catalog API can be used to find more information about it.\n",
    "# Start by noting the dataset's id (its \"dataset_id\" field), which the dataset_variables\n",
    "# query below takes as input\n",
    "\n",
    "dataset_id = desired_dataset[\"dataset_id\"]\n",
    "print(dataset_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2.1 View dataset's variables\n",
    "search_query = {\"dataset_id\": dataset_id}\n",
    "\n",
    "resp = requests.post(\n",
    "    f\"{url}/datasets/dataset_variables\",\n",
    "    headers=request_headers,\n",
    "    json=search_query,\n",
    ")\n",
    "\n",
    "# The result lists the variables (along with variables' metadata) that our dataset contains\n",
    "dataset_variables = handle_api_response(resp, print_response=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2.2 Search dataset's resources based on temporal and spatial coverage\n",
    "\n",
    "# ----- WARNING -----------------------------------------------------------------------------\n",
    "# The API currently caps the number of records returned by a single call.\n",
    "# The default is 20, and up to 2000 records can be requested by specifying the\n",
    "# appropriate limit. Some datasets (like CHIRPS) contain ~100k resources, so it is\n",
    "# important to narrow the search with additional filtering criteria such as spatial\n",
    "# and temporal coverage\n",
    "# -------------------------------------------------------------------------------------------\n",
    "\n",
    "\n",
    "\n",
    "# 2.2.1 Spatial coverage is specified as a lat/lon bounding box:\n",
    "# a 4-element numeric array (in WGS84 coordinate system) [xmin, ymin, xmax, ymax],\n",
    "# where x is longitude and y is latitude\n",
    "\n",
    "# For example, the bounding box for Ethiopia+SNNPR+KAT (adm level 2) is\n",
    "# {\"xmax\": 38.062137603759766, \"xmin\": 37.3511962890625, \"ymax\": 7.4791812896728525, \"ymin\": 7.147633552551269}\n",
    "# The coordinates need not match exactly, as data catalog supports \"within\" and \"intersects\" geospatial queries\n",
    "bounding_box = [37.0, 7.0, 39.0, 8.0]\n",
    "\n",
    "\n",
    "\n",
    "# 2.2.2 Temporal coverage is specified as start/end times in ISO8601 format. Supported operators are:\n",
    "# gt (greater than),\n",
    "# gte (greater than or equal)\n",
    "# lt (less than),\n",
    "# lte (less than or equal)\n",
    "#\n",
    "# For example, to cover all of 2018, we write\n",
    "start_time = \"2018-01-01T00:00:00\"\n",
    "end_time = \"2018-12-31T23:59:59\"\n",
    "\n",
    "\n",
    "# Together, the complete search query becomes\n",
    "# NOTE(review): this query does not reference dataset_id, so the matches are presumably not\n",
    "# limited to the dataset chosen above - confirm whether a dataset filter is intended here\n",
    "search_query = {\n",
    "    \"spatial_coverage__within\": bounding_box,\n",
    "    \"start_time__gte\": start_time,\n",
    "    \"end_time__lte\": end_time,\n",
    "    \"limit\": 2000,\n",
    "}\n",
    "\n",
    "resp = requests.post(\n",
    "    f\"{url}/datasets/find\",\n",
    "    headers=request_headers,\n",
    "    json=search_query,\n",
    ")\n",
    "\n",
    "response = handle_api_response(resp, print_response=True)\n",
    "\n",
    "resources = response[\"resources\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This is an example of a returned resource record. It contains dataset's record_id and metadata,\n",
    "# resource's record_id, name, metadata, and data_url\n",
    "\n",
    "# An empty result would otherwise surface as an opaque IndexError on resources[0]\n",
    "if not resources:\n",
    "    raise LookupError(\"The search returned no resources; try a wider bounding box or time range\")\n",
    "resource_record = resources[0]\n",
    "pp.pprint(resource_record)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The \"resource_data_url\" field of a resource record holds the url from which the actual data can be downloaded:\n",
    "\n",
    "data_url = resource_record[\"resource_data_url\"]\n",
    "print(data_url)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}