{ "cells": [ { "cell_type": "markdown", "id": "eed73add-2b1c-4182-8aa1-4aeb86ab2392", "metadata": {}, "source": [ "# Demo Collect Rook Usage" ] }, { "cell_type": "code", "execution_count": null, "id": "102176b7-8dbd-41a8-b28f-b230cde45373", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import hvplot.pandas # noqa\n", "from rooki.client import Rooki" ] }, { "cell_type": "code", "execution_count": null, "id": "d88b5a2f-13d8-4673-b99b-a734af570c17", "metadata": {}, "outputs": [], "source": [ "# Available hosts\n", "hosts = {\n", " 'demo': 'rook.dkrz.de',\n", " 'dkrz': 'rook3.cloud.dkrz.de',\n", " 'ceda': 'rook-wps1.ceda.ac.uk',\n", "}" ] }, { "cell_type": "code", "execution_count": null, "id": "6d914230-b413-4d54-8e20-776c97b801c6", "metadata": {}, "outputs": [], "source": [ "# Use cache\n", "cache_id = {\n", " 'ceda': '1f8181bc-d351-11eb-9402-005056aba41c',\n", " 'dkrz': '34369610-d351-11eb-8f86-fa163e466023',\n", "}" ] }, { "cell_type": "code", "execution_count": null, "id": "48ed7c74-14ac-4313-962e-ed6b38ccbb1a", "metadata": {}, "outputs": [], "source": [ "# Collect usage from several nodes \n", "def collect_usage(sites, time=None, use_cache=True):\n", " df_wps_list = []\n", " df_downloads_list = []\n", " for site in sites:\n", " if use_cache:\n", " ref_wps = f\"http://{hosts[site]}/outputs/rook/{cache_id[site]}/wps_requests.csv\"\n", " ref_downloads = f\"http://{hosts[site]}/outputs/rook/{cache_id[site]}/downloads.csv\"\n", " else:\n", " url = f\"http://{hosts[site]}/wps\"\n", " rooki = Rooki(url, mode='sync')\n", " resp = rooki.usage(time=time)\n", " ref_wps = resp.response.processOutputs[0].reference\n", " print(ref_wps)\n", " ref_downloads = resp.response.processOutputs[1].reference\n", " print(ref_downloads)\n", " # load wps\n", " df_wps = pd.read_csv(ref_wps, parse_dates=[4, 5])\n", " df_wps['node'] = site\n", " df_wps_list.append(df_wps)\n", " # load downloads\n", " df_downloads = pd.read_csv(ref_downloads, parse_dates=[2])\n", " df_downloads['node'] = site\n", " df_downloads_list.append(df_downloads)\n", " df_wps_combined = pd.concat(df_wps_list, ignore_index=True)\n", " df_downloads_combined = pd.concat(df_downloads_list, ignore_index=True)\n", " return df_wps_combined, df_downloads_combined" ] }, { "cell_type": "markdown", "id": "340d4697-3d87-46b8-bc59-f9eb8593fd46", "metadata": {}, "source": [ "## collect usage " ] }, { "cell_type": "code", "execution_count": null, "id": "fc6e99b4-4ea4-4d8f-9d3a-ccacb9030d2a", "metadata": {}, "outputs": [], "source": [ "df, df_downloads = collect_usage(['ceda', 'dkrz'], time='2021-03-23/', use_cache=False)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "64aa3b54-6644-42e6-8f25-fbdb75d99eaf", "metadata": {}, "outputs": [], "source": [ "df.nunique()" ] }, { "cell_type": "markdown", "id": "4a18da34-8e19-46c0-89ac-8d9653834dac", "metadata": {}, "source": [ "## evaluate pywps stats" ] }, { "cell_type": "code", "execution_count": null, "id": "ce9795f0-c0a0-4c79-ad46-a33e1de1a802", "metadata": {}, "outputs": [], "source": [ "df.operation.value_counts()" ] }, { "cell_type": "code", "execution_count": null, "id": "961936ad-8b12-4d1a-b3cf-911abe44bb09", "metadata": {}, "outputs": [], "source": [ "df.loc[df['operation']=='execute'].loc[df['status']==4].identifier.value_counts()" ] }, { "cell_type": "code", "execution_count": null, "id": "86c2f49b-45f8-476f-846d-1b0e2c35cee1", "metadata": {}, "outputs": [], "source": [ "df.loc[df['operation']=='execute'].loc[df['status']==5].identifier.value_counts()" ] }, { "cell_type": "code", "execution_count": null, "id": "76722414-36f9-4744-9ca6-4447eeb775a7", "metadata": {}, "outputs": [], "source": [ "df.loc[df['operation']=='execute'].loc[df['identifier']=='orchestrate'].status.value_counts()" ] }, { "cell_type": "markdown", "id": "a0f7fcab-5676-4494-a9cb-700b5236e313", "metadata": {}, "source": [ "## duration" ] }, { "cell_type": "code", "execution_count": null, "id": "88b024f3-cd90-4044-af8d-23808bf05028", "metadata": {}, "outputs": [], "source": [ "df['duration'] = df['time_end'] - df['time_start']\n", "df.duration = df.duration.dt.seconds" ] }, { "cell_type": "code", "execution_count": null, "id": "a3491938-b60e-47ea-b8e1-b4edbdf57530", "metadata": {}, "outputs": [], "source": [ "df_skip_outlier = df.loc[df['operation']=='execute'].loc[df['identifier']=='orchestrate'].loc[df.duration<900]" ] }, { "cell_type": "code", "execution_count": null, "id": "53dcd6bf-0d46-4ce6-a947-b705762818b1", "metadata": {}, "outputs": [], "source": [ "df_skip_outlier.duration.mean()" ] }, { "cell_type": "code", "execution_count": null, "id": "676d5bb9-68e2-49f7-b454-467866bc0d00", "metadata": {}, "outputs": [], "source": [ "df_skip_outlier.hvplot.hist(y='duration', logx=False, bins=100)" ] }, { "cell_type": "markdown", "id": "c5a3fafe-b4c1-45e1-b67a-822df2d48a26", "metadata": {}, "source": [ "## jobs over days" ] }, { "cell_type": "code", "execution_count": null, "id": "5917883c-bf6e-42d5-a9f3-cb8cd9dc31c5", "metadata": {}, "outputs": [], "source": [ "days = (df.time_start.max() - df.time_start.min()).days\n", "days" ] }, { "cell_type": "code", "execution_count": null, "id": "689a5acd-18ae-47eb-a756-e4df9e374225", "metadata": {}, "outputs": [], "source": [ "len(df)/days" ] }, { "cell_type": "code", "execution_count": null, "id": "ac539d6c-c2b0-45b3-9e79-6a2b874e0350", "metadata": {}, "outputs": [], "source": [ "df.loc[df['operation']=='execute'].loc[df['identifier']=='orchestrate'].hvplot.hist(y='time_start', bins=days)" ] }, { "cell_type": "markdown", "id": "20d87fc7-cebc-4b9b-8534-406fb52c9821", "metadata": {}, "source": [ "## jobs over week days" ] }, { "cell_type": "code", "execution_count": null, "id": "1f38780d-8ddf-4864-83c0-584fb7688b97", "metadata": {}, "outputs": [], "source": [ "df['dayofweek'] = df['time_start'].dt.dayofweek\n", "df" ] }, { "cell_type": "code", "execution_count": null, "id": "adbe5cc0-ebea-4c00-8d82-e4962dab9501", "metadata": {}, "outputs": [], "source": [ "df.loc[df['operation']=='execute'].loc[df['identifier']=='orchestrate'].hvplot.hist(y='dayofweek', bins=7)" ] }, { "cell_type": "markdown", "id": "da99225b-6a52-48fd-9af3-a3d5e3718a92", "metadata": {}, "source": [ "## jobs over day time" ] }, { "cell_type": "code", "execution_count": null, "id": "eadced2e-7c34-4cac-8542-412b00f86841", "metadata": {}, "outputs": [], "source": [ "df['hour'] = df['time_start'].dt.hour\n", "df" ] }, { "cell_type": "code", "execution_count": null, "id": "9acad703-62e6-49de-9a88-f4ef3b48e4ea", "metadata": {}, "outputs": [], "source": [ "df.loc[df['operation']=='execute'].loc[df['identifier']=='orchestrate'].hvplot.hist(y='hour', bins=24)" ] }, { "cell_type": "markdown", "id": "3bca146a-3d30-47a7-a64f-c77452618642", "metadata": {}, "source": [ "## concurrent jobs" ] }, { "cell_type": "code", "execution_count": null, "id": "ec94e4f4-1295-4f03-ab87-9f2d1db74ce8", "metadata": {}, "outputs": [], "source": [ "# https://stackoverflow.com/questions/57804145/combining-rows-with-overlapping-time-periods-in-a-pandas-dataframe\n", "edf = df.loc[df['operation']=='execute'].loc[df['identifier']=='orchestrate'].loc[df['status'].isin([4, 5])]\n", "startdf = pd.DataFrame({'time':edf['time_start'], 'what':1})\n", "enddf = pd.DataFrame({'time':edf['time_end'], 'what':-1})\n", "mergdf = pd.concat([startdf, enddf]).sort_values('time')\n", "mergdf['running'] = mergdf['what'].cumsum()\n", "mergdf" ] }, { "cell_type": "code", "execution_count": null, "id": "e9840fcf-d5bd-48c4-9416-91382094e940", "metadata": {}, "outputs": [], "source": [ "mergdf.running.mean()" ] }, { "cell_type": "code", "execution_count": null, "id": "e68a8955-e0e2-4110-b651-3ec5fefa2172", "metadata": {}, "outputs": [], "source": [ "max_running = mergdf.running.max()\n", "max_running" ] }, { "cell_type": "code", "execution_count": null, "id": "590e581b-b869-4c75-af19-6950d42385bc", "metadata": {}, "outputs": [], "source": [ "mergdf.loc[mergdf['running']>0].hvplot.hist(y='running', bins=max_running)" ] }, { "cell_type": "markdown", "id": "958f24a1-f7fb-49b1-bd88-0cd5b824b8f9", "metadata": {}, "source": [ "## concurrent jobs over days" ] }, { "cell_type": "code", "execution_count": null, "id": "5312a09b-2a6f-4d6b-9b6c-e483cc1173cb", "metadata": {}, "outputs": [], "source": [ "mergdf.loc[mergdf['running']>0].hvplot.scatter(y='running', x='time')" ] }, { "cell_type": "code", "execution_count": null, "id": "462b30da-ff49-4a31-a70e-800d7703a043", "metadata": {}, "outputs": [], "source": [ "tmpdf = mergdf.groupby(pd.Grouper(key=\"time\", freq=\"1D\")).max()\n", "tmpdf" ] }, { "cell_type": "code", "execution_count": null, "id": "eafa6138-9202-4ece-bc13-f878ac5f8878", "metadata": {}, "outputs": [], "source": [ "tmp2df = pd.DataFrame()\n", "tmp2df['time'] = tmpdf.index.values\n", "tmp2df['running'] = tmpdf.running.values\n", "tmp2df" ] }, { "cell_type": "code", "execution_count": null, "id": "9f9402bc-8b58-44bb-aca5-75398e9d2e6d", "metadata": {}, "outputs": [], "source": [ "tmpdf.running.mean()" ] }, { "cell_type": "code", "execution_count": null, "id": "a9763705-681c-4f90-9601-dff22ac22a22", "metadata": {}, "outputs": [], "source": [ "tmp2df.hvplot.bar(x='time', y='running')" ] }, { "cell_type": "markdown", "id": "359b3545-d5c6-47cc-9431-2ff991551bbb", "metadata": {}, "source": [ "## Errors per day" ] }, { "cell_type": "code", "execution_count": null, "id": "8f25d4e7-d8b7-452d-919c-7ccc858ed4ac", "metadata": {}, "outputs": [], "source": [ "df_errors = df.loc[df['operation']=='execute'].loc[df['identifier']=='orchestrate'].loc[df['status']==5]\n", "df_errors" ] }, { "cell_type": "code", "execution_count": null, "id": "a68c71a0-d5d1-4493-a252-41e9de821a10", "metadata": {}, "outputs": [], "source": [ "df_errors.hvplot.hist(y='time_start')" ] }, { "cell_type": "markdown", "id": "b132a455-7ab1-40a0-a517-c019bfabaede", "metadata": {}, "source": [ "## Error messages" ] }, { "cell_type": "code", "execution_count": null, "id": "66ee1b4d-464b-4456-a3ac-bc6cf4b96820", "metadata": {}, "outputs": [], "source": [ "df_errors.message.value_counts()" ] }, { "cell_type": "markdown", "id": "6803c1c5-091e-44f7-99ec-86d4a44ff0f5", "metadata": {}, "source": [ "## Downloads" ] }, { "cell_type": "code", "execution_count": null, "id": "e523910e-742c-4374-85d4-3ed4f75ad226", "metadata": {}, "outputs": [], "source": [ "df_downloads.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "4a79ccc4-364b-4f6c-b5a1-66e314e6e58e", "metadata": {}, "outputs": [], "source": [ "df_downloads.nunique()" ] }, { "cell_type": "markdown", "id": "ae5ee8d7-e1ec-490b-9b1f-93d839da88d5", "metadata": {}, "source": [ "### Downloads size" ] }, { "cell_type": "code", "execution_count": null, "id": "dec46445-88c0-4663-8107-5bd352834348", "metadata": {}, "outputs": [], "source": [ "df_downloads['size'].sum() / 1024 ** 3" ] }, { "cell_type": "code", "execution_count": null, "id": "60631108-addb-4a51-a920-d690d1591b7a", "metadata": {}, "outputs": [], "source": [ "def size_mb(size):\n", " return size / 1024 ** 2" ] }, { "cell_type": "code", "execution_count": null, "id": "6b5ec0b1-0d5d-4f04-a3a6-102ed9f0bb45", "metadata": {}, "outputs": [], "source": [ "df_downloads['size_mb'] = df_downloads['size'].apply(size_mb)\n", "df_downloads" ] }, { "cell_type": "code", "execution_count": null, "id": "50baabf8-4b02-487f-9230-79c2d4091496", "metadata": {}, "outputs": [], "source": [ "df_downloads.hvplot.hist(y='size_mb')" ] }, { "cell_type": "markdown", "id": "c5a88f29-150a-401a-8352-2c98a5fcd926", "metadata": {}, "source": [ "### Download size per day" ] }, { "cell_type": "code", "execution_count": null, "id": "3d8be35b-0958-45dd-8d62-e813b6df2de7", "metadata": {}, "outputs": [], "source": [ "downloads_per_day = df_downloads.groupby(df_downloads.datetime.dt.date)[\"size_mb\"].sum()\n", "downloads_per_day" ] }, { "cell_type": "code", "execution_count": null, "id": "cab1c088-9813-464d-b23d-8edb1b98aacf", "metadata": {}, "outputs": [], "source": [ "downloads_per_day.mean()" ] }, { "cell_type": "code", "execution_count": null, "id": "d41436a9-2e4f-44cd-a24e-3b4a9c588f05", "metadata": {}, "outputs": [], "source": [ "downloads_per_day.hvplot.bar()" ] }, { "cell_type": "markdown", "id": "ae3a8490-9c6f-404d-be0d-3ce1b3fca792", "metadata": {}, "source": [ "### Download requests per day" ] }, { "cell_type": "code", "execution_count": null, "id": "8d55ccef-f51b-4ca2-b7b4-6c1011bfbc9b", "metadata": {}, "outputs": [], "source": [ "days = (df_downloads.datetime.max() - df_downloads.datetime.min()).days\n", "days" ] }, { "cell_type": "code", "execution_count": null, "id": "2d0768d3-1a9d-43c3-a289-fd9fa81f112f", "metadata": {}, "outputs": [], "source": [ "len(df_downloads)/days" ] }, { "cell_type": "code", "execution_count": null, "id": "c8df63e3-1dc0-4406-af11-33376bf5be6f", "metadata": { "tags": [] }, "outputs": [], "source": [ "df_downloads.hvplot.hist(y='datetime', bins=days)" ] }, { "cell_type": "markdown", "id": "05c88c8b-05f5-41e0-8110-9a04810d1c72", "metadata": {}, "source": [ "### Downloads by IP address" ] }, { "cell_type": "code", "execution_count": null, "id": "46513730-f6c6-4aa4-937c-ab78e8e2757e", "metadata": {}, "outputs": [], "source": [ "df_downloads.remote_host_ip.value_counts()" ] }, { "cell_type": "markdown", "id": "d1837426-c3ec-4ddc-9132-be7da03deb34", "metadata": {}, "source": [ "### Downloads GeoIP\n", "https://pypi.org/project/geoip2nation/" ] }, { "cell_type": "code", "execution_count": null, "id": "0a710753-9445-481f-bcd5-456ca9a2bb70", "metadata": {}, "outputs": [], "source": [ "from geoip import xgeoip\n", "r = xgeoip.GeoIp()\n", "r.load_memory()\n", "\n", "def lookup_ip(ip):\n", " return r.resolve(ip).country" ] }, { "cell_type": "code", "execution_count": null, "id": "b016a320-344e-41cf-b79b-7eb947054161", "metadata": {}, "outputs": [], "source": [ "df_downloads['geoip'] = df_downloads.remote_host_ip.apply(lookup_ip)\n", "df_downloads" ] }, { "cell_type": "code", "execution_count": null, "id": "37cf67f8-ce17-4ade-9b51-f074794f1c9c", "metadata": {}, "outputs": [], "source": [ "df_downloads.geoip.value_counts().hvplot.bar()" ] }, { "cell_type": "markdown", "id": "a8228e3b-60e9-4e94-aee9-1669cd5551f9", "metadata": {}, "source": [ "## GeoHealthCheck\n", "https://geohealthcheck.cloud.dkrz.de" ] }, { "cell_type": "code", "execution_count": null, "id": "6b925133-33a6-465c-bb13-116eb395a541", "metadata": {}, "outputs": [], "source": [ "import requests\n", "from io import StringIO\n", "ghc_url = \"https://geohealthcheck.cloud.dkrz.de/resource/45/history/csv\"\n", "req = requests.get(ghc_url, verify=False)\n", "df_ghc = pd.read_csv(StringIO(req.text), parse_dates=['checked_datetime'])\n", "df_ghc" ] }, { "cell_type": "code", "execution_count": null, "id": "6ce9a10c-4a59-47d1-9572-3bad785400d9", "metadata": {}, "outputs": [], "source": [ "df_ghc.status.value_counts()" ] }, { "cell_type": "code", "execution_count": null, "id": "0793b73b-f320-4dfe-9ed4-44276db2dc63", "metadata": {}, "outputs": [], "source": [ "def up(status):\n", " if status == True:\n", " return 1\n", " return 0\n", "df_ghc['up'] = df_ghc.status.apply(up)" ] }, { "cell_type": "code", "execution_count": null, "id": "50c9dc88-a94f-4237-b378-411f72b1a40a", "metadata": {}, "outputs": [], "source": [ "df_ghc.hvplot.line(x='checked_datetime', y='up')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.5" } }, "nbformat": 4, "nbformat_minor": 5 }