{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "\n# Species distribution modeling\n\nModeling species' geographic distributions is an important\nproblem in conservation biology. In this example, we\nmodel the geographic distribution of two South American\nmammals given past observations and 14 environmental\nvariables. Since we have only positive examples (there are\nno unsuccessful observations), we cast this problem as a\ndensity estimation problem and use the :class:`~sklearn.svm.OneClassSVM`\nas our modeling tool. The dataset is provided by Phillips et. al. (2006).\nIf available, the example uses\n[basemap](https://matplotlib.org/basemap/)\nto plot the coast lines and national boundaries of South America.\n\nThe two species are:\n\n- [Bradypus variegatus](http://www.iucnredlist.org/details/3038/0),\n the brown-throated sloth.\n\n- [Microryzomys minutus](http://www.iucnredlist.org/details/13408/0),\n also known as the forest small rice rat, a rodent that lives in Peru,\n Colombia, Ecuador, Peru, and Venezuela.\n\n## References\n\n- [\"Maximum entropy modeling of species geographic distributions\"](http://rob.schapire.net/papers/ecolmod.pdf)\n S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling,\n 190:231-259, 2006.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Authors: The scikit-learn developers\n# SPDX-License-Identifier: BSD-3-Clause\n\nfrom time import time\n\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nfrom sklearn import metrics, svm\nfrom sklearn.datasets import fetch_species_distributions\nfrom sklearn.utils import Bunch\n\n# if basemap is available, we'll use it.\n# otherwise, we'll improvise later...\ntry:\n from mpl_toolkits.basemap import Basemap\n\n basemap = True\nexcept ImportError:\n basemap = False\n\n\ndef construct_grids(batch):\n \"\"\"Construct the map grid from the batch object\n\n Parameters\n ----------\n batch : Batch object\n The object returned by :func:`fetch_species_distributions`\n\n Returns\n -------\n (xgrid, ygrid) : 1-D arrays\n The grid corresponding to the values in batch.coverages\n \"\"\"\n # x,y coordinates for corner cells\n xmin = batch.x_left_lower_corner + batch.grid_size\n xmax = xmin + (batch.Nx * batch.grid_size)\n ymin = batch.y_left_lower_corner + batch.grid_size\n ymax = ymin + (batch.Ny * batch.grid_size)\n\n # x coordinates of the grid cells\n xgrid = np.arange(xmin, xmax, batch.grid_size)\n # y coordinates of the grid cells\n ygrid = np.arange(ymin, ymax, batch.grid_size)\n\n return (xgrid, ygrid)\n\n\ndef create_species_bunch(species_name, train, test, coverages, xgrid, ygrid):\n \"\"\"Create a bunch with information about a particular organism\n\n This will use the test/train record arrays to extract the\n data specific to the given species name.\n \"\"\"\n bunch = Bunch(name=\" \".join(species_name.split(\"_\")[:2]))\n species_name = species_name.encode(\"ascii\")\n points = dict(test=test, train=train)\n\n for label, pts in points.items():\n # choose points associated with the desired species\n pts = pts[pts[\"species\"] == species_name]\n bunch[\"pts_%s\" % label] = pts\n\n # determine coverage values for each of the training & testing points\n ix = np.searchsorted(xgrid, pts[\"dd long\"])\n iy = np.searchsorted(ygrid, pts[\"dd lat\"])\n bunch[\"cov_%s\" % label] = coverages[:, -iy, ix].T\n\n return bunch\n\n\ndef plot_species_distribution(\n species=(\"bradypus_variegatus_0\", 
\"microryzomys_minutus_0\")\n):\n \"\"\"\n Plot the species distribution.\n \"\"\"\n if len(species) > 2:\n print(\n \"Note: when more than two species are provided,\"\n \" only the first two will be used\"\n )\n\n t0 = time()\n\n # Load the compressed data\n data = fetch_species_distributions()\n\n # Set up the data grid\n xgrid, ygrid = construct_grids(data)\n\n # The grid in x,y coordinates\n X, Y = np.meshgrid(xgrid, ygrid[::-1])\n\n # create a bunch for each species\n BV_bunch = create_species_bunch(\n species[0], data.train, data.test, data.coverages, xgrid, ygrid\n )\n MM_bunch = create_species_bunch(\n species[1], data.train, data.test, data.coverages, xgrid, ygrid\n )\n\n # background points (grid coordinates) for evaluation\n np.random.seed(13)\n background_points = np.c_[\n np.random.randint(low=0, high=data.Ny, size=10000),\n np.random.randint(low=0, high=data.Nx, size=10000),\n ].T\n\n # We'll make use of the fact that coverages[6] has measurements at all\n # land points. This will help us decide between land and water.\n land_reference = data.coverages[6]\n\n # Fit, predict, and plot for each species.\n for i, species in enumerate([BV_bunch, MM_bunch]):\n print(\"_\" * 80)\n print(\"Modeling distribution of species '%s'\" % species.name)\n\n # Standardize features\n mean = species.cov_train.mean(axis=0)\n std = species.cov_train.std(axis=0)\n train_cover_std = (species.cov_train - mean) / std\n\n # Fit OneClassSVM\n print(\" - fit OneClassSVM ... \", end=\"\")\n clf = svm.OneClassSVM(nu=0.1, kernel=\"rbf\", gamma=0.5)\n clf.fit(train_cover_std)\n print(\"done.\")\n\n # Plot map of South America\n plt.subplot(1, 2, i + 1)\n if basemap:\n print(\" - plot coastlines using basemap\")\n m = Basemap(\n projection=\"cyl\",\n llcrnrlat=Y.min(),\n urcrnrlat=Y.max(),\n llcrnrlon=X.min(),\n urcrnrlon=X.max(),\n resolution=\"c\",\n )\n m.drawcoastlines()\n m.drawcountries()\n else:\n print(\" - plot coastlines from coverage\")\n plt.contour(\n X, Y, land_reference, levels=[-9998], colors=\"k\", linestyles=\"solid\"\n )\n plt.xticks([])\n plt.yticks([])\n\n print(\" - predict species distribution\")\n\n # Predict species distribution using the training data\n Z = np.ones((data.Ny, data.Nx), dtype=np.float64)\n\n # We'll predict only for the land points.\n idx = np.where(land_reference > -9999)\n coverages_land = data.coverages[:, idx[0], idx[1]].T\n\n pred = clf.decision_function((coverages_land - mean) / std)\n Z *= pred.min()\n Z[idx[0], idx[1]] = pred\n\n levels = np.linspace(Z.min(), Z.max(), 25)\n Z[land_reference == -9999] = -9999\n\n # plot contours of the prediction\n plt.contourf(X, Y, Z, levels=levels, cmap=plt.cm.Reds)\n plt.colorbar(format=\"%.2f\")\n\n # scatter training/testing points\n plt.scatter(\n species.pts_train[\"dd long\"],\n species.pts_train[\"dd lat\"],\n s=2**2,\n c=\"black\",\n marker=\"^\",\n label=\"train\",\n )\n plt.scatter(\n species.pts_test[\"dd long\"],\n species.pts_test[\"dd lat\"],\n s=2**2,\n c=\"black\",\n marker=\"x\",\n label=\"test\",\n )\n plt.legend()\n plt.title(species.name)\n plt.axis(\"equal\")\n\n # Compute AUC with regards to background points\n pred_background = Z[background_points[0], background_points[1]]\n pred_test = clf.decision_function((species.cov_test - mean) / std)\n scores = np.r_[pred_test, pred_background]\n y = np.r_[np.ones(pred_test.shape), np.zeros(pred_background.shape)]\n fpr, tpr, thresholds = metrics.roc_curve(y, scores)\n roc_auc = metrics.auc(fpr, tpr)\n plt.text(-35, -70, \"AUC: %.3f\" % roc_auc, ha=\"right\")\n 
print(\"\\n Area under the ROC curve : %f\" % roc_auc)\n\n print(\"\\ntime elapsed: %.2fs\" % (time() - t0))\n\n\nplot_species_distribution()\nplt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.21" } }, "nbformat": 4, "nbformat_minor": 0 }