{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "wVeD4Ik7D43F" }, "source": [ "## Imports, Uploads, and Preprocessing\n" ] }, { "cell_type": "markdown", "source": [ "### Import Packages" ], "metadata": { "id": "n_BVwGof2vi9" } }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Tyh52tgEA_HA" }, "outputs": [], "source": [ "# Import spacy\n", "import spacy\n", "\n", "# Load spaCy visualizer\n", "from spacy import displacy\n", "\n", "# Import pandas DataFrame packages\n", "import pandas as pd\n", "\n", "# Import graphing package\n", "import plotly.graph_objects as go\n", "import plotly.express as px" ] }, { "cell_type": "markdown", "metadata": { "id": "IbNN7-PIBcW_" }, "source": [ "### Upload Text Files" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "-r9zTBIAlLsI" }, "outputs": [], "source": [ "# Import drive and files to facilitate file uploads\n", "from google.colab import files" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "XaVUPnFIE_kS" }, "outputs": [], "source": [ "# Selet multiple text files to upload from local folder\n", "uploaded_files = files.upload()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "N3f8cxLrgzUq" }, "outputs": [], "source": [ "type(uploaded_files)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "s2w09XuhKqOq" }, "outputs": [], "source": [ "# Add files into DataFrame\n", "paper_df = pd.DataFrame.from_dict(uploaded_files, orient='index')\n", "paper_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "BJJPgl5FL9qY" }, "outputs": [], "source": [ "# Reset index and add column names to make wrangling easier\n", "paper_df = paper_df.reset_index()\n", "paper_df.columns = [\"Filename\", \"Text\"]\n", "paper_df.head()" ] }, { "cell_type": "markdown", "source": [ "### Pre-process Text Files" ], "metadata": { "id": "uXI3nyVQ2-sf" } }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2QTKf4DV00Aa" }, "outputs": [], "source": [ "# Convert papers from bytes to strings\n", "paper_df['Text'] = paper_df['Text'].str.decode('utf-8')\n", "paper_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "k633V0JbBInq" }, "outputs": [], "source": [ "# Remove extra spaces from papers\n", "paper_df['Text'] = paper_df['Text'].str.replace('\\s+', ' ', regex=True).str.strip()\n", "paper_df.head()" ] }, { "cell_type": "markdown", "source": [ "### Upload and Merge Metadata Files" ], "metadata": { "id": "oBCoNFow2W4U" } }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ZCASvLyJcq7C" }, "outputs": [], "source": [ "# Upload csv with essay metadata\n", "metadata = files.upload()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "gby6n4lzcq-c" }, "outputs": [], "source": [ "metadata_df = pd.read_csv('metadata.csv')\n", "metadata_df = metadata_df.dropna(axis=1, how='all')\n", "metadata_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "T5s_ZEt-BcXP" }, "outputs": [], "source": [ "# Remove .txt from title of each paper\n", "paper_df['Filename'] = paper_df['Filename'].str.replace('.txt', '')\n", "\n", "# Rename column from paper ID to Title\n", "metadata_df.rename(columns={\"PAPER ID\": \"Filename\"}, inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": 
"s_Ta2d77BcXP" }, "outputs": [], "source": [ "# Merge metadata and papers into new DataFrame\n", "# Will only keep rows where both essay and metadata are present\n", "final_paper_df = metadata_df.merge(paper_df,on='Filename')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "zOMIXCyOMOKl" }, "outputs": [], "source": [ "# Print DataFrame\n", "final_paper_df.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "zQ8ve667EvxG" }, "source": [ "### Alternate Code: Installs, Imports and Preprocessing in Jupyter Notebook" ] }, { "cell_type": "code", "source": [ "# # Install and import spacy\n", "# !pip install spaCy\n", "\n", "# # Import spacy\n", "# import spacy\n", "\n", "# # Install English language model\n", "# !spacy download en_core_web_sm\n", "\n", "# # Import os to upload documents and metadata\n", "# import os\n", "\n", "# # Load spaCy visualizer\n", "# from spacy import displacy\n", "\n", "# # Import pandas DataFrame packages\n", "# import pandas as pd\n", "\n", "# # Import graphing package\n", "# import plotly.graph_objects as go\n", "# import plotly.express as px" ], "metadata": { "id": "KlvUa2oX1645" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ub8MflGfA_HB" }, "outputs": [], "source": [ "# # Create empty lists for file names and contents\n", "# texts = []\n", "# file_names = []\n", "# # Iterate through each file in the path\n", "# for _file_name in os.listdir('path_to_directory'):\n", "# # Look for only text files\n", "# if _file_name.endswith('.txt'):\n", "# # Append contents of each text file to text list\n", "# texts.append(open('path_to_directory' + '/' + _file_name, 'r').read())\n", "# # Append name of each file to file name list\n", "# file_names.append(_file_name)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Z7BmHGFBA_HB", "scrolled": true }, "outputs": [], "source": [ "# # Create dictionary object associating each file name with its text\n", "# d = {'Filename':file_names,'Text':texts}" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "yC5-sbOPA_HB" }, "outputs": [], "source": [ "# # Turn dictionary into a dataframe\n", "# paper_df = pd.DataFrame(d)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "cK3PvJkcA_HC" }, "outputs": [], "source": [ "# paper_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "kax8Ecu7A_HC" }, "outputs": [], "source": [ "# # Remove extra spaces from papers\n", "# paper_df['Text'] = paper_df['Text'].str.replace('\\s+', ' ', regex=True).str.strip()\n", "# paper_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "zGbZVIrFA_HC" }, "outputs": [], "source": [ "# metadata_df = pd.read_csv('path_to_directory/metadata.csv')\n", "# metadata_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "RO4lwuwJcrID" }, "outputs": [], "source": [ "# # Remove .txt from title of each paper\n", "# paper_df['Filename'] = paper_df['Filename'].str.replace('.txt', '', regex=True)\n", "\n", "# # Rename column from paper ID to Title\n", "# metadata_df.rename(columns={\"PAPER ID\": \"Filename\"}, inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2eCYbDExeuqM", "scrolled": true }, "outputs": [], "source": [ "# # Merge metadata and papers into new DataFrame\n", "# # Will only keep rows where both essay and metadata are present\n", "# final_paper_df = 
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "Z7BmHGFBA_HB", "scrolled": true }, "outputs": [], "source": [ "# # Create a dictionary object associating each file name with its text\n", "# d = {'Filename': file_names, 'Text': texts}" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "yC5-sbOPA_HB" }, "outputs": [], "source": [ "# # Turn the dictionary into a DataFrame\n", "# paper_df = pd.DataFrame(d)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "cK3PvJkcA_HC" }, "outputs": [], "source": [ "# paper_df.head()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "kax8Ecu7A_HC" }, "outputs": [], "source": [ "# # Remove extra spaces from papers\n", "# paper_df['Text'] = paper_df['Text'].str.replace(r'\\s+', ' ', regex=True).str.strip()\n", "# paper_df.head()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "zGbZVIrFA_HC" }, "outputs": [], "source": [ "# metadata_df = pd.read_csv('path_to_directory/metadata.csv')\n", "# metadata_df.head()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "RO4lwuwJcrID" }, "outputs": [], "source": [ "# # Remove .txt from the title of each paper\n", "# paper_df['Filename'] = paper_df['Filename'].str.replace('.txt', '', regex=False)\n", "\n", "# # Rename the PAPER ID column to Filename so it matches the papers DataFrame\n", "# metadata_df.rename(columns={\"PAPER ID\": \"Filename\"}, inplace=True)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "2eCYbDExeuqM", "scrolled": true }, "outputs": [], "source": [ "# # Merge metadata and papers into a new DataFrame\n", "# # Only rows with both an essay and its metadata are kept\n", "# final_paper_df = metadata_df.merge(paper_df, on='Filename')" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "C-8mOPIZA_HD" }, "outputs": [], "source": [ "# # Print DataFrame\n", "# final_paper_df.head()" ] },
{ "cell_type": "markdown", "metadata": { "id": "xdlHoqlBA_HD" }, "source": [ "## Text Enrichment with spaCy" ] },
{ "cell_type": "markdown", "source": [ "### Creating Doc Objects" ], "metadata": { "id": "ejyC6xvA3w9q" } },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "2P1TUR3oA_HD" }, "outputs": [], "source": [ "# Load the nlp pipeline\n", "nlp = spacy.load('en_core_web_sm')\n", "\n", "# Check which components the pipeline contains\n", "print(nlp.pipe_names)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "ceknDmJZA_HE" }, "outputs": [], "source": [ "# Define an example sentence\n", "sentence = \"This is 'an' example? sentence\"\n", "\n", "# Call the nlp model on the sentence\n", "doc = nlp(sentence)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "KWX1hxoBA_HE" }, "outputs": [], "source": [ "# Loop through each token in the doc object\n", "for token in doc:\n", "    # Print the text and part of speech of each token\n", "    print(token.text, token.pos_)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "5pE_8YJ_A_HE" }, "outputs": [], "source": [ "# Define a function that runs the nlp pipeline on any given input text\n", "def process_text(text):\n", "    return nlp(text)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "N3O8qhn1A_HE", "scrolled": true }, "outputs": [], "source": [ "# Apply the function to the \"Text\" column, so that the nlp pipeline is called on each student essay\n", "final_paper_df['Doc'] = final_paper_df['Text'].apply(process_text)" ] },
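{ "cell_type": "markdown", "metadata": {}, "source": [ "Calling `nlp` once per row works fine for a small corpus. For larger collections, spaCy's `nlp.pipe` processes texts in batches, which is usually faster. The optional sketch below is equivalent to the step above." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Equivalent batched alternative (illustrative sketch): nlp.pipe streams\n", "# the texts through the pipeline in batches\n", "final_paper_df['Doc'] = list(nlp.pipe(final_paper_df['Text']))" ] },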
] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "WbrgpAIjA_HF", "scrolled": true }, "outputs": [], "source": [ "print(f'\"Write\" appears in the text tokens column ' + str(final_paper_df['Tokens'].apply(lambda x: x.count('write')).sum()) + ' times.')\n", "print(f'\"Write\" appears in the lemmas column ' + str(final_paper_df['Lemmas'].apply(lambda x: x.count('write')).sum()) + ' times.')" ] }, { "cell_type": "markdown", "source": [ "### Text Annotation" ], "metadata": { "id": "GvH1xTvZ3-MW" } }, { "cell_type": "markdown", "metadata": { "id": "FLeLfyvVA_HF" }, "source": [ "#### Part of Speech Tagging" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "T24RkcuTA_HF" }, "outputs": [], "source": [ "# Define a function to retrieve lemmas from a doc object\n", "def get_pos(doc):\n", " #Return the coarse- and fine-grained part of speech text for each token in the doc\n", " return [(token.pos_, token.tag_) for token in doc]\n", "\n", "# Define a function to retrieve parts of speech from a doc object\n", "final_paper_df['POS'] = final_paper_df['Doc'].apply(get_pos)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "_1fZXUynA_HF", "scrolled": true }, "outputs": [], "source": [ "# Create a list of part of speech tags\n", "list(final_paper_df['POS'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2lhKj9xGA_HG" }, "outputs": [], "source": [ "spacy.explain(\"IN\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "L-k8yoClA_HG" }, "outputs": [], "source": [ "# Define function to extract proper nouns from Doc object\n", "def extract_proper_nouns(doc):\n", " return [token.text for token in doc if token.pos_ == 'PROPN']\n", "\n", "# Apply function to Doc column and store resulting proper nouns in new column\n", "final_paper_df['Proper_Nouns'] = final_paper_df['Doc'].apply(extract_proper_nouns)" ] }, { "cell_type": "code", "source": [ "list(final_paper_df.loc[[3, 163], 'Proper_Nouns'])" ], "metadata": { "id": "m98pVVJX1ZlK" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "e-8878WiA_HG" }, "source": [ "#### Dependency Parsing" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Kka-2u7eA_HG", "scrolled": true }, "outputs": [], "source": [ "# Extract the first sentence from the fifth Doc object\n", "doc = final_paper_df['Doc'][5]\n", "\n", "# Create a list of sentence from the doc object\n", "sentences = list(doc.sents)\n", "\n", "# Retrieve the first sentence\n", "sentence = sentences[0]\n", "\n", "# Create dependency visualization for the first sentence of the 5th essay\n", "displacy.render(sentence, style=\"dep\", jupyter=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "TqwjCcjDA_HG" }, "outputs": [], "source": [ "#Define function to extract parts of speech of all non-stopwords\n", "def extract_stopwords(doc):\n", " return [token.text for token in doc if token.text not in nlp.Defaults.stop_words]\n", "\n", "#Create list of tokens without stopwords\n", "final_paper_df['Tokens_NoStops'] = final_paper_df['Doc'].apply(extract_stopwords)\n", "\n", "#Turn list of stopwords into a string\n", "final_paper_df['Text_NoStops'] = [' '.join(map(str, l)) for l in final_paper_df['Tokens_NoStops']]\n", "\n", "#Create new doc object from texts without stopwords\n", "final_paper_df['Doc_NoStops'] = final_paper_df['Text_NoStops'].apply(process_text)\n", "\n", "# extract the first sentence from the first Doc 
object\n", "doc = final_paper_df['Doc_NoStops'][5]\n", "sentences = list(doc.sents)\n", "sentence = sentences[0]\n", "\n", "# visualize the dependency parse tree for the sentence\n", "displacy.render(sentence, style='dep', jupyter=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "yhnSUEC3A_HH" }, "outputs": [], "source": [ "# Define function to extract noun phrases from Doc object\n", "def extract_noun_phrases(doc):\n", " return [chunk.text for chunk in doc.noun_chunks]\n", "\n", "# Apply function to Doc column and store resulting proper nouns in new column\n", "final_paper_df['Noun_Phrases'] = final_paper_df['Doc'].apply(extract_noun_phrases)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "wod3AHAXA_HH", "scrolled": true }, "outputs": [], "source": [ "final_paper_df['Noun_Phrases'][0]" ] }, { "cell_type": "markdown", "metadata": { "id": "39utqaRyh_M1" }, "source": [ "#### Named Entity Recognition\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "wRffhMlUA_HI", "scrolled": false }, "outputs": [], "source": [ "# Get all NE labels and assign to variable\n", "labels = nlp.get_pipe(\"ner\").labels\n", "\n", "# Print each label and its description\n", "for label in labels:\n", " print(label + ' : ' + spacy.explain(label))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "mow27zbdA_HI" }, "outputs": [], "source": [ "# Define function to extract named entities from doc objects\n", "def extract_named_entities(doc):\n", " return [ent.label_ for ent in doc.ents]\n", "\n", "# Apply function to Doc column and store resulting named entities in new column\n", "final_paper_df['Named_Entities'] = final_paper_df['Doc'].apply(extract_named_entities)\n", "final_paper_df['Named_Entities']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "XQYCg6fkA_HJ" }, "outputs": [], "source": [ "# Define function to extract text tagged with named entities from doc objects\n", "def extract_named_entities(doc):\n", " return [ent for ent in doc.ents]\n", "\n", "# Apply function to Doc column and store resulting text in new column\n", "final_paper_df['NE_Words'] = final_paper_df['Doc'].apply(extract_named_entities)\n", "final_paper_df['NE_Words']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "7ovPPNn4A_HJ" }, "outputs": [], "source": [ "# Extract the first Doc object\n", "doc = final_paper_df['Doc'][1]\n", "\n", "# Visualize named entity tagging in a single paper\n", "displacy.render(doc, style='ent', jupyter=True)" ] }, { "cell_type": "markdown", "metadata": { "id": "rKiNOMoDB2ta" }, "source": [ "### Download Enriched Dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "S7USU7ipB9YK" }, "outputs": [], "source": [ "# Save DataFrame as csv (in Google Drive)\n", "# Use this step only to save csv to your computer's working directory\n", "final_paper_df.to_csv('MICUSP_papers_with_spaCy_tags.csv')\n", "\n", "# Download csv to your computer from Google Drive\n", "files.download('MICUSP_papers_with_spaCy_tags.csv')" ] }, { "cell_type": "markdown", "metadata": { "id": "XUGnN4BX4C4c" }, "source": [ "## Analysis of Linguistic Annotations" ] }, { "cell_type": "markdown", "metadata": { "id": "9QTJrt6byTlf" }, "source": [ "### Part of Speech Analysis" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "T5RAD_t_SJt2" }, "outputs": [], "source": [ "# Create doc object from single sentence\n", "doc = nlp(\"This is 'an' example? 
sentence\")\n", "\n", "# Print counts of each part of speech in sentence\n", "print(doc.count_by(spacy.attrs.POS))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "QGPdo6FtS4bq" }, "outputs": [], "source": [ "# Store dictionary with indexes and POS counts in a variable\n", "num_pos = doc.count_by(spacy.attrs.POS)\n", "\n", "dictionary = {}\n", "\n", "# Create a new dictionary which replaces the index of each part of speech for its label (NOUN, VERB, ADJECTIVE)\n", "for k,v in sorted(num_pos.items()):\n", " dictionary[doc.vocab[k].text] = v\n", "\n", "dictionary" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "TnDn9MpTA_HK" }, "outputs": [], "source": [ "# Create new DataFrame for analysis purposes\n", "pos_analysis_df = final_paper_df[['Filename','DISCIPLINE', 'Doc']]\n", "\n", "# Create list to store each dictionary\n", "num_list = []\n", "\n", "# Define a function to get part of speech tags and counts and append them to a new dictionary\n", "def get_pos_tags(doc):\n", " dictionary = {}\n", " num_pos = doc.count_by(spacy.attrs.POS)\n", " for k,v in sorted(num_pos.items()):\n", " dictionary[doc.vocab[k].text] = v\n", " num_list.append(dictionary)\n", "\n", "# Apply function to each doc object in DataFrame\n", "pos_analysis_df['C_POS'] = pos_analysis_df['Doc'].apply(get_pos_tags)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "DQ0UCcqMA_HK" }, "outputs": [], "source": [ "# Create new dataframe with part of speech counts\n", "pos_counts = pd.DataFrame(num_list)\n", "columns = list(pos_counts.columns)\n", "\n", "# Add discipline of each paper as new column to dataframe\n", "idx = 0\n", "new_col = pos_analysis_df['DISCIPLINE']\n", "pos_counts.insert(loc=idx, column='DISCIPLINE', value=new_col)\n", "\n", "pos_counts" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "rbK1VH5lZ7T6" }, "outputs": [], "source": [ "# Get average part of speech counts used in papers of each discipline\n", "average_pos_df = pos_counts.groupby(['DISCIPLINE']).mean()\n", "\n", "# Round calculations to the nearest whole number\n", "average_pos_df = average_pos_df.round(0)\n", "\n", "# Reset index to improve DataFrame readability\n", "average_pos_df = average_pos_df.reset_index()\n", "\n", "# Show dataframe\n", "average_pos_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "F0NVs2tojM9A" }, "outputs": [], "source": [ "# Use plotly to plot proper noun use per genre\n", "fig = px.bar(average_pos_df, x=\"DISCIPLINE\", y=[\"ADJ\", 'VERB', \"NUM\"], title=\"Average Part-of-Speech Use in Papers Written by Biology and English Students\", barmode='group')\n", "fig.show()" ] }, { "cell_type": "markdown", "source": [ "### Fine-Grained Part of Speech Analysis" ], "metadata": { "id": "ZzJrF2Pv4YQO" } }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "KO4hSnXxZdqR", "scrolled": true }, "outputs": [], "source": [ "# Create list to store each dictionary\n", "tag_num_list = []\n", "\n", "# Define a function to get part of speech tags and counts and append them to a new dictionary\n", "def get_fine_pos_tags(doc):\n", " dictionary = {}\n", " num_tag = doc.count_by(spacy.attrs.TAG)\n", " for k,v in sorted(num_tag.items()):\n", " dictionary[doc.vocab[k].text] = v\n", " tag_num_list.append(dictionary)\n", "\n", "# Apply function to each doc object in DataFrame\n", "pos_analysis_df['F_POS'] = pos_analysis_df['Doc'].apply(get_fine_pos_tags)\n", "\n", "# Create new dataframe with part of speech 
counts\n", "tag_counts = pd.DataFrame(tag_num_list)\n", "columns = list(tag_counts.columns)\n", "\n", "# Add discipline of each paper as new column to dataframe\n", "idx = 0\n", "new_col = pos_analysis_df['DISCIPLINE']\n", "tag_counts.insert(loc=idx, column='DISCIPLINE', value=new_col)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "W7NknlyqA_HL" }, "outputs": [], "source": [ "# Get average fine-grain part of speech counts used in papers of each discipline\n", "average_tag_df = tag_counts.groupby(['DISCIPLINE']).mean()\n", "\n", "# Round calculations to the nearest whole number\n", "average_tag_df = average_tag_df.round(0)\n", "\n", "# Reset index to improve DataFrame readability\n", "average_tag_df = average_tag_df.reset_index()\n", "\n", "# Show dataframe\n", "average_tag_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "IIt9zHCmA_HL" }, "outputs": [], "source": [ "# Use plotly to plot proper noun use per genre\n", "fig = px.bar(average_tag_df, x=\"DISCIPLINE\", y=[\"VBD\", 'VBP', 'VBZ'], title=\"Average Verb Tense Usage Differences in Biology and English Student Writing\", barmode='group')\n", "fig.show()" ] }, { "cell_type": "markdown", "metadata": { "id": "xC6eZIpIyb8k" }, "source": [ "### Named Entity Analysis" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "U_WJ6sFTifpA", "scrolled": true }, "outputs": [], "source": [ "# Create new DataFrame for analysis purposes\n", "ner_analysis_df = final_paper_df[['Filename','PAPER TYPE', 'Named_Entities', 'NE_Words']]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2MgWXgOL4K2E" }, "outputs": [], "source": [ "# Convert named entity lists to strings so we can count specific entities\n", "ner_analysis_df['Named_Entities'] = ner_analysis_df['Named_Entities'].apply(lambda x: ' '.join(x))\n", "\n", "# Get the number of each type of entity in each paper\n", "person_counts = ner_analysis_df['Named_Entities'].str.count('PERSON')\n", "loc_counts = ner_analysis_df['Named_Entities'].str.count('LOC')\n", "date_counts = ner_analysis_df['Named_Entities'].str.count('DATE')\n", "woa_counts = ner_analysis_df['Named_Entities'].str.count('WORK_OF_ART')\n", "\n", "# Append named entity counts to new DataFrame\n", "ner_counts_df = pd.DataFrame()\n", "ner_counts_df['Genre'] = ner_analysis_df[\"PAPER TYPE\"]\n", "ner_counts_df['PERSON_Counts'] = person_counts\n", "ner_counts_df['LOC_Counts'] = loc_counts\n", "ner_counts_df['DATE_Counts'] = date_counts\n", "ner_counts_df['WORK_OF_ART_Counts'] = woa_counts\n", "\n", "ner_counts_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "OXUlrCXhh_0f" }, "outputs": [], "source": [ "# Calculate average usage of each named entity type\n", "average_ner_df = ner_counts_df.groupby(['Genre']).mean()\n", "average_ner_df = average_ner_df.round(0)\n", "average_ner_df = average_ner_df.reset_index()\n", "average_ner_df\n", "\n", "# Use plotly to plot proper noun use per genre\n", "fig = px.bar(average_ner_df, x=\"Genre\", y=[\"PERSON_Counts\", 'LOC_Counts', \"DATE_Counts\", 'WORK_OF_ART_Counts'], title=\"Average Named Entity Usage Across Student Paper Genres\", barmode='group')\n", "fig.show()" ] }, { "cell_type": "markdown", "source": [ "### Analysis of ```DATE``` Named Entities" ], "metadata": { "id": "n8nl8w084fo4" } }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "6RvvclvkA_HM", "scrolled": true }, "outputs": [], "source": [ "# Define function to extract words tagged as 
\"date\" named entities from doc objects\n", "def extract_date_named_entities(doc):\n", " return [ent for ent in doc.ents if ent.label_ == 'DATE']\n", "\n", "# Get all date entity words and apply to new column of DataFrame\n", "ner_analysis_df['Date_Named_Entities'] = final_paper_df['Doc'].apply(extract_date_named_entities)\n", "\n", "\n", "# Make list of date entities a string so we can count their frequencies\n", "ner_analysis_df['Date_Named_Entities'] = [', '.join(map(str, l)) for l in ner_analysis_df['Date_Named_Entities']]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ELHvnBtNvLSH", "scrolled": true }, "outputs": [], "source": [ "# Search for only date words in proposal papers\n", "date_word_counts_df = ner_analysis_df[(ner_analysis_df == 'Proposal').any(axis=1)]\n", "\n", "# Count the frequency of each word in these essays and append to list\n", "date_word_frequencies = date_word_counts_df.Date_Named_Entities.str.split(expand=True).stack().value_counts()\n", "\n", "# Get top 10 most common words and their frequencies\n", "date_word_frequencies[:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2iKKPP-swqe2" }, "outputs": [], "source": [ "# Search for only date words in critique/evaluation papers\n", "date_word_counts_df = ner_analysis_df[(ner_analysis_df == 'Critique/Evaluation').any(axis=1)]\n", "\n", "# Count the frequency of each word in these essays and append to list\n", "date_word_frequencies = date_word_counts_df.Date_Named_Entities.str.split(expand=True).stack().value_counts()\n", "\n", "# Get top 10 most common words and their frequencies\n", "date_word_frequencies[:10]" ] } ], "metadata": { "colab": { "provenance": [], "include_colab_link": true }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 0 }