{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "gDocVcAVkuWQ"
      },
      "source": [
        "# Calculating the CD index using Dimensions on Google BigQuery\n",
        "\n",
        "This is a colab notebook that demonstrated how to calculate the CD index using Dimesnions on Google BigQuery.\n",
        "\n",
        "For more details, see the accompanying paper TBC."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "gJo_HRo7n___"
      },
      "source": [
        "## Set up libraries and GBQ"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "mncivLDYknuN",
        "outputId": "a379c159-f45b-42cf-e91d-23e69eaaec5f"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Collecting cdindex\n",
            "  Downloading cdindex-1.0.19.tar.gz (9.6 kB)\n",
            "  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "Requirement already satisfied: future in /usr/local/lib/python3.10/dist-packages (from cdindex) (0.18.3)\n",
            "Building wheels for collected packages: cdindex\n",
            "  Building wheel for cdindex (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Created wheel for cdindex: filename=cdindex-1.0.19-cp310-cp310-linux_x86_64.whl size=26233 sha256=959604541392dfcaadb8090b62b95fc4d5d5a49c40bd256262166176ee286052\n",
            "  Stored in directory: /root/.cache/pip/wheels/43/75/3c/d1af2e9aaab2040d9907be8ebc640b05912fee43cc98471689\n",
            "Successfully built cdindex\n",
            "Installing collected packages: cdindex\n",
            "Successfully installed cdindex-1.0.19\n"
          ]
        }
      ],
      "source": [
        "!pip install cdindex\n",
        "import cdindex\n",
        "\n",
        "import datetime\n",
        "from string import Template\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "import re\n",
        "import os\n",
        "import shutil\n",
        "import pandas as pd\n",
        "import itertools\n",
        "import tqdm\n",
        "import plotly.express as px\n",
        "from sklearn.linear_model import LinearRegression\n",
        "from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, accuracy_score\n",
        "from sklearn.metrics import classification_report\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "from google.colab import auth\n",
        "from google.auth import default\n",
        "from google.cloud import bigquery\n",
        "\n",
        "# Authenticate Google\n",
        "auth.authenticate_user()\n",
        "creds, _ = default()\n",
        "\n",
        "# Get the tables with all fields and nesting for the GBQ Dimensions tables\n",
        "def gbq(sql, verbose=False):\n",
        "  if verbose:\n",
        "    print(\"==========\\nQuerying:\")\n",
        "    print(sql)\n",
        "    print(\"...\")\n",
        "  df = pd.io.gbq.read_gbq(sql, project_id = gbq_project_id)\n",
        "  if verbose:\n",
        "    print(\"\\t...loaded \", len(df), \" records\")\n",
        "    display(df)\n",
        "  return df\n",
        "\n",
        "\n",
        "# You need to set these variables before running the script\n",
        "gbq_project_id = \"[your GBQ billing project ID]\"\n",
        "cdindex_gbq_table_name = \"[your GBQ project].[GBQ dataset].[prefix for tables with CD indices]\""
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "1YSk9g_TuXnL"
      },
      "source": [
        "## Sample calculation for a paper\n",
        "This is the query shown in our paper\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ieKZFqR8udNs"
      },
      "outputs": [],
      "source": [
        "sql = \"\"\"\n",
        "-- This is the focal publication f\n",
        "DECLARE focal_publication_id STRING DEFAULT \"pub.1019844293\";\n",
        "-- This is the impact span t\n",
        "DECLARE time_diff INT64 DEFAULT 5;\n",
        "\n",
        "\n",
        "WITH cd_raw_data AS\n",
        "(\n",
        "\t-- Calculating s’ for each citation to the focal publication\n",
        "\t-- All are assigned a score s’=-1. Any other publications appearing in\n",
        "\t-- the second SELECT and aren’t included here\n",
        "\t-- implicitly get a score s’= 0\n",
        "(\n",
        "SELECT\n",
        "DISTINCT -- make sure we list unique citations otherwise we may double count\n",
        "publications.id AS focal_id, -- focal publication\n",
        "citation.id AS citation_id, -- citing publication to focal publication\n",
        "-1 AS score -- s’\n",
        "-- the Dimensions GBQ table for publications\n",
        "FROM `dimensions-ai.data_analytics.publications` AS publications\n",
        "-- fetch all its citing publications: id and year\n",
        "LEFT JOIN UNNEST(publications.citations) AS citation\n",
        "-- for this experiment we only look at one publication\n",
        "WHERE publications.id = focal_publication_id\n",
        "-- we only consider citations that appear at most time_diff years after\n",
        "-- the focal publication has been published\n",
        "AND citation.year - publications.year BETWEEN 1 AND time_diff\n",
        ")\n",
        "UNION ALL\n",
        "-- Calculating s’’ for each citation to the references of\n",
        "-- the focal publication\n",
        "\t-- All are assigned a score s’’=-2. Any other publications appearing in\n",
        "\t-- the first SELECT and aren’t included here\n",
        "\t-- implicitly get a score s’’= 0\n",
        "(\n",
        "SELECT DISTINCT\n",
        "publications.id AS focal_id, -- focal publication\n",
        "reference_citation.id AS citation_id,-- citing publication to references\n",
        "-2 AS score -- s’’\n",
        "FROM `dimensions-ai.data_analytics.publications` AS publications\n",
        "-- get all the reference publication IDs of the focal publication\n",
        "LEFT JOIN UNNEST(publications.reference_ids) AS reference_id\n",
        "-- get the references’ meta data - mainly citations to it\n",
        "INNER JOIN `dimensions-ai.data_analytics.publications` AS references\n",
        "ON references.id = reference_id\n",
        "-- get the citations to the references\n",
        "LEFT JOIN UNNEST(references.citations) AS reference_citation\n",
        "WHERE publications.id = focal_publication_id\n",
        "AND reference_citation.year - publications.year BETWEEN 1 AND time_diff\n",
        ")\n",
        ")\n",
        "-- Now add up all scores, count the distinct ids of the citations in both SELECTs\n",
        "-- above and use that information to calculate the CD index\n",
        "SELECT focal_id,\n",
        "((SUM(score)/COUNT(DISTINCT citation_id))+2) AS sql_cd_5\n",
        "FROM cd_raw_data\n",
        "GROUP BY focal_id\n",
        "\"\"\"\n",
        "results = gbq(sql)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "BISyWqYVoLOd"
      },
      "source": [
        "## Sample calculation for 3 papers\n",
        "Here we calculate the CD index for 3 selected publications in the most basic fashion and compare it against calculations in *Park, Michael, Erin Leahey, and Russell J. Funk. 2023. “Papers and Patents Are Becoming Less Disruptive over Time.” Nature 613 (7942): 138–44.*\n",
        "\n",
        "Our result for the CD index will be `sql_cd_5` and the values of the Nature paper are in column `nature_cd_5`\n",
        "\n",
        "In order to make the SQL AS simple AS possible (and AS easy to understand AS possible) we do not introduce any limitations on the publication network (i.e. the citations and references).\n",
        "\n",
        "This is a bit more elaborate example where we also supply some metadata. We also explicitely exclude publications and citations with no valid publication year (which is very rare ca. 0.05% of all publictions so we are a bit pedantic here).\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 143
        },
        "id": "k5M7xPUrpGI6",
        "outputId": "a1a8a3e4-5090-4766-8c0f-a781e56d0c89"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "\n",
              "  <div id=\"df-a954d37b-0bb1-45cd-a061-a1dca2b492d3\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>focal_publication_id</th>\n",
              "      <th>doi</th>\n",
              "      <th>first_author</th>\n",
              "      <th>title</th>\n",
              "      <th>journal</th>\n",
              "      <th>publication_year</th>\n",
              "      <th>sql_cd_5</th>\n",
              "      <th>nature_cd_5</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>pub.1019844293</td>\n",
              "      <td>10.1038/2261209a0</td>\n",
              "      <td>BALTIMORE</td>\n",
              "      <td>Viral RNA-dependent DNA Polymerase: RNA-depend...</td>\n",
              "      <td>Nature</td>\n",
              "      <td>1970</td>\n",
              "      <td>-0.443149</td>\n",
              "      <td>-0.55</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>pub.1040343773</td>\n",
              "      <td>10.1038/171737a0</td>\n",
              "      <td>WATSON</td>\n",
              "      <td>Molecular Structure of Nucleic Acids: A Struct...</td>\n",
              "      <td>Nature</td>\n",
              "      <td>1953</td>\n",
              "      <td>0.600000</td>\n",
              "      <td>0.52</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>pub.1060431417</td>\n",
              "      <td>10.1103/physrev.140.a1133</td>\n",
              "      <td>Kohn</td>\n",
              "      <td>Self-Consistent Equations Including Exchange a...</td>\n",
              "      <td>Physical Review</td>\n",
              "      <td>1965</td>\n",
              "      <td>-0.258621</td>\n",
              "      <td>-0.22</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-a954d37b-0bb1-45cd-a061-a1dca2b492d3')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "\n",
              "\n",
              "\n",
              "    <div id=\"df-3167a235-2409-4a3c-8651-41cfb4421fd2\">\n",
              "      <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-3167a235-2409-4a3c-8651-41cfb4421fd2')\"\n",
              "              title=\"Suggest charts.\"\n",
              "              style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "      </button>\n",
              "    </div>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "    background-color: #E8F0FE;\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: #1967D2;\n",
              "    height: 32px;\n",
              "    padding: 0 0 0 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: #E2EBFA;\n",
              "    box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: #174EA6;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "    background-color: #3B4455;\n",
              "    fill: #D2E3FC;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart:hover {\n",
              "    background-color: #434B5C;\n",
              "    box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "    filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "    fill: #FFFFFF;\n",
              "  }\n",
              "</style>\n",
              "\n",
              "    <script>\n",
              "      async function quickchart(key) {\n",
              "        const containerElement = document.querySelector('#' + key);\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      }\n",
              "    </script>\n",
              "\n",
              "      <script>\n",
              "\n",
              "function displayQuickchartButton(domScope) {\n",
              "  let quickchartButtonEl =\n",
              "    domScope.querySelector('#df-3167a235-2409-4a3c-8651-41cfb4421fd2 button.colab-df-quickchart');\n",
              "  quickchartButtonEl.style.display =\n",
              "    google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "}\n",
              "\n",
              "        displayQuickchartButton(document);\n",
              "      </script>\n",
              "      <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-a954d37b-0bb1-45cd-a061-a1dca2b492d3 button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-a954d37b-0bb1-45cd-a061-a1dca2b492d3');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "text/plain": [
              "  focal_publication_id                        doi first_author  \\\n",
              "0       pub.1019844293          10.1038/2261209a0    BALTIMORE   \n",
              "1       pub.1040343773           10.1038/171737a0       WATSON   \n",
              "2       pub.1060431417  10.1103/physrev.140.a1133         Kohn   \n",
              "\n",
              "                                               title          journal  \\\n",
              "0  Viral RNA-dependent DNA Polymerase: RNA-depend...           Nature   \n",
              "1  Molecular Structure of Nucleic Acids: A Struct...           Nature   \n",
              "2  Self-Consistent Equations Including Exchange a...  Physical Review   \n",
              "\n",
              "   publication_year  sql_cd_5  nature_cd_5  \n",
              "0              1970 -0.443149        -0.55  \n",
              "1              1953  0.600000         0.52  \n",
              "2              1965 -0.258621        -0.22  "
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# The results of the journal\n",
        "nature_results = pd.DataFrame([\n",
        "    {\"focal_publication_id\" : \"pub.1040343773\", \"nature_cd_5\" : 0.52},\n",
        "    {\"focal_publication_id\" : \"pub.1060431417\", \"nature_cd_5\" : -0.22},\n",
        "    {\"focal_publication_id\" : \"pub.1019844293\", \"nature_cd_5\" : -0.55}\n",
        "])\n",
        "\n",
        "focal_publication_ids_text = \"'\" + \"','\".join(nature_results.focal_publication_id.unique())+\"'\"\n",
        "\n",
        "sql = f\"\"\"\n",
        "-- This is the focal publications f\n",
        "DECLARE focal_publication_ids ARRAY<STRING> DEFAULT [{focal_publication_ids_text}];\n",
        "-- This is the impact span t\n",
        "DECLARE time_diff INT64 DEFAULT 5;\n",
        "\n",
        "\n",
        "WITH cd_raw_data AS\n",
        "(\n",
        "\t-- Calculating s’ for each citation to the focal publication\n",
        "\t-- All are assigned a score s’=-1. Any other publications appearing in\n",
        "\t-- the second SELECT and aren’t included here\n",
        "\t-- implicitly get a score s’= 0\n",
        "  (\n",
        "    SELECT\n",
        "    DISTINCT  -- make sure we list unique citations otherwise we may double count\n",
        "    publications.id AS focal_id, -- focal publication\n",
        "    citation.id AS citation_id, -- citing publication to focal publication\n",
        "    -1 AS score -- s’\n",
        "    -- the Dimensions GBQ table for publications\n",
        "    FROM `dimensions-ai.data_analytics.publications` AS publications\n",
        "    -- fetch all its citing publications: id and year\n",
        "    LEFT JOIN UNNEST(publications.citations) AS citation\n",
        "    -- for this experiment we only look at one publication\n",
        "    WHERE publications.id IN UNNEST(focal_publication_ids)\n",
        "    -- we only consider citations that appear at most time_diff years after\n",
        "    -- the focal publication has been published\n",
        "    AND citation.year - publications.year BETWEEN 1 AND time_diff\n",
        "    -- exclude cases where there is no publication year...very rare\n",
        "    AND citation.year IS NOT NULL\n",
        "    AND publications.year IS NOT NULL\n",
        "  )\n",
        "  UNION ALL\n",
        "  -- Calculating s’’ for each citation to the references of\n",
        "  -- the focal publication\n",
        "  -- All are assigned a score s’’=-2. Any other publications appearing in\n",
        "  -- the first SELECT and aren’t included here\n",
        "  -- implicitly get a score s’’= 0\n",
        "  (\n",
        "    SELECT DISTINCT\n",
        "    publications.id AS focal_id, -- focal publication\n",
        "    reference_citation.id AS citation_id,-- citing publication to references\n",
        "    -2 AS score -- s’’\n",
        "    FROM `dimensions-ai.data_analytics.publications` AS publications\n",
        "    -- get all the reference publication IDs of the focal publication\n",
        "    LEFT JOIN UNNEST(publications.reference_ids) AS reference_id\n",
        "    -- get the references’ meta data - mainly citations to it\n",
        "    INNER JOIN `dimensions-ai.data_analytics.publications` AS references\n",
        "    ON references.id = reference_id\n",
        "    -- get the citations to the references\n",
        "    LEFT JOIN UNNEST(references.citations) AS reference_citation\n",
        "    WHERE publications.id IN UNNEST(focal_publication_ids)\n",
        "    AND reference_citation.year - publications.year BETWEEN 1 AND time_diff\n",
        "    -- exclude cases where there is no publication year...very rare\n",
        "    AND reference_citation.year IS NOT NULL\n",
        "    AND publications.year IS NOT NULL\n",
        "  )\n",
        ")\n",
        "-- Now add up all scores, count the distinct ids of the citations in both SELECTs\n",
        "-- above and use that information to calculate the CD index\n",
        "SELECT focal_id AS focal_publication_id,\n",
        "publications.doi AS doi,\n",
        "CONCAT(publications.authors[SAFE_OFFSET(0)].last_name) AS first_author,\n",
        "publications.title.preferred AS title,\n",
        "publications.journal.title AS journal,\n",
        "publications.year AS publication_year,\n",
        "((SUM(score)/COUNT(DISTINCT citation_id))+2) AS sql_cd_5\n",
        "FROM cd_raw_data\n",
        "LEFT JOIN `dimensions-ai.data_analytics.publications` AS publications ON publications.id = cd_raw_data.focal_id\n",
        "GROUP BY 1,2,3,4,5,6\n",
        "\"\"\"\n",
        "\n",
        "results = gbq(sql)\n",
        "\n",
        "# Add the results of the Nature paper for comparison\n",
        "results = results.merge(nature_results, on=\"focal_publication_id\")\n",
        "display(results)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "2V5xTPpQtYCO"
      },
      "source": [
        "## Comparing our calculation against cdindex\n",
        "We are now comparing the SQL-based calculations against the standard python implementation [cdindex](http://russellfunk.org/cdindex/code.html) by Russel Funk.\n",
        "\n",
        "The python package requires a graph to calculate the CD index. Therefore we will build a subgraph of the larger network with only those citations and references and citations to references needed to calculate the CD index.\n",
        "\n",
        "We will use the same publications as above. Our result for the CD index will be `sql_cd_5` and the ones from the Python package `py_cd_5`.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "MTEAhaKiuM85"
      },
      "outputs": [],
      "source": [
        "sql = f\"\"\"\n",
        "DECLARE focal_publication_ids ARRAY<STRING> DEFAULT [{focal_publication_ids_text}];\n",
        "\n",
        "# We now get all the vertices and edges of the directed graph of publications with a vertex attribute \"time\"\n",
        "# The relationship source->target means source cites target\n",
        "# We also don't impose a time limit i.e. we get potentially more publications that we really need\n",
        "WITH raw_data AS\n",
        "(\n",
        "  (\n",
        "    # Citations of the focal publications\n",
        "    SELECT DISTINCT\n",
        "    citation.id AS source,\n",
        "    citation.year AS source_year,\n",
        "    publications.id AS target,\n",
        "    publications.year AS target_year,\n",
        "    \"focal paper citations\" AS kind # not needed but maybe useful for debugging\n",
        "    FROM `dimensions-ai.data_analytics.publications` AS publications\n",
        "    LEFT JOIN UNNEST(publications.citations) AS citation\n",
        "    WHERE publications.id IN UNNEST(focal_publication_ids)\n",
        "    AND citation.year IS NOT NULL\n",
        "    AND publications.year IS NOT NULL\n",
        "  )\n",
        "  UNION ALL\n",
        "  (\n",
        "    # Citations of the references of the focal publications\n",
        "    SELECT DISTINCT\n",
        "    reference_citation.id AS source,\n",
        "    reference_citation.year AS source_year,\n",
        "    references.id AS target,\n",
        "    references.year AS target_year,\n",
        "    \"reference citations\" AS kind # not needed but maybe useful for debugging\n",
        "    FROM `dimensions-ai.data_analytics.publications` AS publications\n",
        "    LEFT JOIN UNNEST(publications.reference_ids) AS reference_id\n",
        "    INNER JOIN `dimensions-ai.data_analytics.publications` AS references\n",
        "    ON references.id = reference_id\n",
        "    LEFT JOIN UNNEST(references.citations) AS reference_citation\n",
        "    WHERE publications.id IN UNNEST(focal_publication_ids)\n",
        "    AND reference_citation.year IS NOT NULL\n",
        "    AND publications.year IS NOT NULL\n",
        "  )\n",
        ")\n",
        "# It is possible that we get a citation several times e.g. if focal publications\n",
        "# share citations hence we make sure each citation only happens once:\n",
        "SELECT DISTINCT *\n",
        "FROM raw_data\n",
        "\"\"\"\n",
        "\n",
        "raw_edge_list = gbq(sql)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 143
        },
        "id": "lHsyBjr-Ahx0",
        "outputId": "e08b0999-b093-4d72-f70c-56a2a3ff9231"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "\n",
              "  <div id=\"df-93c40af0-0a94-4747-9adf-a2d8dc466275\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>focal_publication_id</th>\n",
              "      <th>doi</th>\n",
              "      <th>first_author</th>\n",
              "      <th>title</th>\n",
              "      <th>journal</th>\n",
              "      <th>publication_year</th>\n",
              "      <th>sql_cd_5</th>\n",
              "      <th>nature_cd_5</th>\n",
              "      <th>py_cd_5</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>pub.1019844293</td>\n",
              "      <td>10.1038/2261209a0</td>\n",
              "      <td>BALTIMORE</td>\n",
              "      <td>Viral RNA-dependent DNA Polymerase: RNA-depend...</td>\n",
              "      <td>Nature</td>\n",
              "      <td>1970</td>\n",
              "      <td>-0.443149</td>\n",
              "      <td>-0.55</td>\n",
              "      <td>-0.443149</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>pub.1040343773</td>\n",
              "      <td>10.1038/171737a0</td>\n",
              "      <td>WATSON</td>\n",
              "      <td>Molecular Structure of Nucleic Acids: A Struct...</td>\n",
              "      <td>Nature</td>\n",
              "      <td>1953</td>\n",
              "      <td>0.600000</td>\n",
              "      <td>0.52</td>\n",
              "      <td>0.600000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>pub.1060431417</td>\n",
              "      <td>10.1103/physrev.140.a1133</td>\n",
              "      <td>Kohn</td>\n",
              "      <td>Self-Consistent Equations Including Exchange a...</td>\n",
              "      <td>Physical Review</td>\n",
              "      <td>1965</td>\n",
              "      <td>-0.258621</td>\n",
              "      <td>-0.22</td>\n",
              "      <td>-0.258621</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-93c40af0-0a94-4747-9adf-a2d8dc466275')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "\n",
              "\n",
              "\n",
              "    <div id=\"df-53a90486-0a86-44fc-a1e8-35982fdee682\">\n",
              "      <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-53a90486-0a86-44fc-a1e8-35982fdee682')\"\n",
              "              title=\"Suggest charts.\"\n",
              "              style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "      </button>\n",
              "    </div>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "    background-color: #E8F0FE;\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: #1967D2;\n",
              "    height: 32px;\n",
              "    padding: 0 0 0 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: #E2EBFA;\n",
              "    box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: #174EA6;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "    background-color: #3B4455;\n",
              "    fill: #D2E3FC;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart:hover {\n",
              "    background-color: #434B5C;\n",
              "    box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "    filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "    fill: #FFFFFF;\n",
              "  }\n",
              "</style>\n",
              "\n",
              "    <script>\n",
              "      async function quickchart(key) {\n",
              "        const containerElement = document.querySelector('#' + key);\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      }\n",
              "    </script>\n",
              "\n",
              "      <script>\n",
              "\n",
              "function displayQuickchartButton(domScope) {\n",
              "  let quickchartButtonEl =\n",
              "    domScope.querySelector('#df-53a90486-0a86-44fc-a1e8-35982fdee682 button.colab-df-quickchart');\n",
              "  quickchartButtonEl.style.display =\n",
              "    google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "}\n",
              "\n",
              "        displayQuickchartButton(document);\n",
              "      </script>\n",
              "      <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-93c40af0-0a94-4747-9adf-a2d8dc466275 button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-93c40af0-0a94-4747-9adf-a2d8dc466275');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "text/plain": [
              "  focal_publication_id                        doi first_author  \\\n",
              "0       pub.1019844293          10.1038/2261209a0    BALTIMORE   \n",
              "1       pub.1040343773           10.1038/171737a0       WATSON   \n",
              "2       pub.1060431417  10.1103/physrev.140.a1133         Kohn   \n",
              "\n",
              "                                               title          journal  \\\n",
              "0  Viral RNA-dependent DNA Polymerase: RNA-depend...           Nature   \n",
              "1  Molecular Structure of Nucleic Acids: A Struct...           Nature   \n",
              "2  Self-Consistent Equations Including Exchange a...  Physical Review   \n",
              "\n",
              "   publication_year  sql_cd_5  nature_cd_5   py_cd_5  \n",
              "0              1970 -0.443149        -0.55 -0.443149  \n",
              "1              1953  0.600000         0.52  0.600000  \n",
              "2              1965 -0.258621        -0.22 -0.258621  "
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Paranoia check\n",
        "assert len(raw_edge_list[(raw_edge_list.source_year.isna()) | (raw_edge_list.target_year.isna())])==0\n",
        "\n",
        "# Now create a list of all vertices for our mini publication network\n",
        "# for this we need to concatenate the target and source vertices of all edges and delete duplicates\n",
        "# We also need the \"time\" i.e. publication year for each of the publications aka nodes\n",
        "vertex_list = pd.concat([\n",
        "  raw_edge_list[[\"source\", \"source_year\"]].rename(columns={\"source\" : \"name\", \"source_year\" : \"time\"}),\n",
        "  raw_edge_list[[\"target\", \"target_year\"]].rename(columns={\"target\" : \"name\", \"target_year\" : \"time\"})\n",
        "], axis =0).drop_duplicates()\n",
        "\n",
        "vertex_list.time = vertex_list.time.astype(int)\n",
        "\n",
        "# Paranoia check: each vertex appears only once\n",
        "assert vertex_list.name.nunique() == len(vertex_list)\n",
        "\n",
        "# The final edge list only needs source and target ids\n",
        "edge_list = raw_edge_list[[\"source\", \"target\"]].drop_duplicates()\n",
        "\n",
        "# Now we feed this graph into cdindex\n",
        "graph = cdindex.Graph(vertices = vertex_list.to_dict(\"records\"), edges = edge_list.to_dict(\"records\"))\n",
        "\n",
        "results[\"py_cd_5\"] = results.focal_publication_id.apply(lambda id : graph.cdindex(id, 5))\n",
        "display(results)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "32TGhOHgtee6"
      },
      "source": [
        "## Calculating CD index for all journal articles\n",
        "\n",
        "This is the query that allows you to calculate the CD index for all or some publications on Dimensions GBQ. Be advised that this query can take up to 5h. **In order to track progress we strongly recommend to run this query in the GBQ console rather than in a notebook**. You will also need to provide a table name in variable `cdindex_gbq_table_name` where the script will write the results to. The amount is too large to make writing it to a pandas dataframe reasonable.\n",
        "\n",
        "This python script has 3 modes which depends on the value of `run_query`:\n",
        "\n",
        "- **all**: Calculates the CD index for all publications\n",
        "\n",
        "- **journals** : Only calculate the CD index for all journal articles with at least 10 references. Most of the literature calculates CD indices for journal articles only. The lower limit for references rules out publications with metadata issues and this can lead too many publications having a CD = 1 as any publication with at least one citation and no reference has automatically CD=1\n",
        "\n",
        "- **pubmed**: Only calculates CD index for pubmed"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "R0PZSh2gGUB7",
        "outputId": "710688c3-84e6-4fa6-d5ae-67176ba7fc96"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Running query...\n",
            "=========== The SQL ============\n",
            "\n",
            "CREATE OR REPLACE TABLE `reports-dimensions-ai.precomputed.publications_cd_index_pubmed`\n",
            "CLUSTER BY id\n",
            "AS\n",
            "(\n",
            "  WITH publications AS\n",
            "  (\n",
            "    SELECT id, year, citations, reference_ids\n",
            "    FROM `dimensions-ai.data_analytics.publications`\n",
            "    WHERE year IS NOT NULL AND pmid IS NOT NULL\n",
            "  )\n",
            "\n",
            "  SELECT focal_id AS id,\n",
            "  (SUM(score)/COUNT(DISTINCT citation_id))+2 AS cd_5,\n",
            "  COUNTIF(score = -1)*((SUM(score)/COUNT(DISTINCT citation_id))+2) AS mcd_5\n",
            "  FROM\n",
            "  (\n",
            "    (\n",
            "      SELECT DISTINCT publications.id AS focal_id,\n",
            "      citation.id AS citation_id,\n",
            "      -1 AS score\n",
            "      FROM publications\n",
            "      LEFT JOIN UNNEST(publications.citations) AS citation\n",
            "      WHERE citation.year - publications.year BETWEEN 1 AND 5\n",
            "    )\n",
            "    UNION ALL\n",
            "    (\n",
            "      SELECT DISTINCT publications.id AS focal_id,\n",
            "      reference_citation.id as citation_id,\n",
            "      -2 as score\n",
            "      FROM publications\n",
            "      LEFT JOIN UNNEST(publications.reference_ids) AS reference_id\n",
            "      INNER JOIN publications AS references\n",
            "      ON references.id = reference_id\n",
            "      LEFT JOIN UNNEST(references.citations) AS reference_citation\n",
            "      WHERE reference_citation.year - publications.year BETWEEN 1 AND 5\n",
            "    )\n",
            "  )\n",
            "  GROUP BY 1\n",
            ")\n",
            "\n",
            "================================\n",
            "Sure you want to run it in colab? (type 'yes' to do it)no\n",
            "...not running SQL\n"
          ]
        }
      ],
      "source": [
        "# As in most of the literature we calculate CD_5 but you can also compute for other time spans\n",
        "time_diff = 5\n",
        "\n",
        "# Mode\n",
        "run_query = \"pubmed\"\n",
        "\n",
        "if run_query == \"journals\":\n",
        "  sql_where = 'AND type =\"article\" AND ARRAY_LENGTH(reference_ids)>=10 AND journal.id IS NOT NULL'\n",
        "elif run_query == \"pubmed\":\n",
        "  sql_where = 'AND pmid IS NOT NULL'\n",
        "elif run_query == \"all\":\n",
        "  sql_where = \"\"\n",
        "else:\n",
        "  print(\"WRONG VALUE for run_query\")\n",
        "  1/0\n",
        "\n",
        "print(\"Running query...\")\n",
        "sql = f\"\"\"\n",
        "CREATE OR REPLACE TABLE `{cdindex_gbq_table_name}_{run_query}`\n",
        "CLUSTER BY id\n",
        "AS\n",
        "(\n",
        "  WITH publications AS\n",
        "  (\n",
        "    SELECT id, year, citations, reference_ids\n",
        "    FROM `dimensions-ai.data_analytics.publications`\n",
        "    WHERE year IS NOT NULL {sql_where}\n",
        "  )\n",
        "\n",
        "  SELECT focal_id AS id,\n",
        "  (SUM(score)/COUNT(DISTINCT citation_id))+2 AS cd_{time_diff},\n",
        "  COUNTIF(score = -1)*((SUM(score)/COUNT(DISTINCT citation_id))+2) AS mcd_{time_diff}\n",
        "  FROM\n",
        "  (\n",
        "    (\n",
        "      SELECT DISTINCT publications.id AS focal_id,\n",
        "      citation.id AS citation_id,\n",
        "      -1 AS score\n",
        "      FROM publications\n",
        "      LEFT JOIN UNNEST(publications.citations) AS citation\n",
        "      WHERE citation.year - publications.year BETWEEN 1 AND {time_diff}\n",
        "    )\n",
        "    UNION ALL\n",
        "    (\n",
        "      SELECT DISTINCT publications.id AS focal_id,\n",
        "      reference_citation.id as citation_id,\n",
        "      -2 as score\n",
        "      FROM publications\n",
        "      LEFT JOIN UNNEST(publications.reference_ids) AS reference_id\n",
        "      INNER JOIN publications AS references\n",
        "      ON references.id = reference_id\n",
        "      LEFT JOIN UNNEST(references.citations) AS reference_citation\n",
        "      WHERE reference_citation.year - publications.year BETWEEN 1 AND {time_diff}\n",
        "    )\n",
        "  )\n",
        "  GROUP BY 1\n",
        ")\n",
        "\"\"\"\n",
        "\n",
        "print(\"=========== The SQL ============\")\n",
        "print(sql)\n",
        "print(\"================================\")\n",
        "\n",
        "answer =input(\"Sure you want to run it in colab? (type 'yes' to do it)\")\n",
        "if answer == \"yes\":\n",
        "  start = datetime.datetime.now()\n",
        "  print(f\"Started {start}\")\n",
        "  tmp = gbq(sql)\n",
        "  finish = datetime.datetime.now()\n",
        "  print(f\"Finished {finish}\")\n",
        "  print(f\"Duration: {finish-start}\")\n",
        "else:\n",
        "  print(\"...not running SQL\")\n"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}