{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "wVeD4Ik7D43F" }, "source": [ "## Imports, Uploads, and Preprocessing\n" ] }, { "cell_type": "markdown", "source": [ "### Import Packages" ], "metadata": { "id": "n_BVwGof2vi9" } }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Tyh52tgEA_HA" }, "outputs": [], "source": [ "# Import spacy\n", "import spacy\n", "\n", "# Load spaCy visualizer\n", "from spacy import displacy\n", "\n", "# Import pandas DataFrame packages\n", "import pandas as pd\n", "\n", "# Import graphing package\n", "import plotly.graph_objects as go\n", "import plotly.express as px" ] }, { "cell_type": "markdown", "metadata": { "id": "IbNN7-PIBcW_" }, "source": [ "### Upload Text Files" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "-r9zTBIAlLsI" }, "outputs": [], "source": [ "# Import drive and files to facilitate file uploads\n", "from google.colab import files" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "XaVUPnFIE_kS" }, "outputs": [], "source": [ "# Selet multiple text files to upload from local folder\n", "uploaded_files = files.upload()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "N3f8cxLrgzUq" }, "outputs": [], "source": [ "type(uploaded_files)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "s2w09XuhKqOq" }, "outputs": [], "source": [ "# Add files into DataFrame\n", "paper_df = pd.DataFrame.from_dict(uploaded_files, orient='index')\n", "paper_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "BJJPgl5FL9qY" }, "outputs": [], "source": [ "# Reset index and add column names to make wrangling easier\n", "paper_df = paper_df.reset_index()\n", "paper_df.columns = [\"Filename\", \"Text\"]\n", "paper_df.head()" ] }, { "cell_type": "markdown", "source": [ "### Pre-process Text Files" ], "metadata": { "id": "uXI3nyVQ2-sf" } }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2QTKf4DV00Aa" }, "outputs": [], "source": [ "# Convert papers from bytes to strings\n", "paper_df['Text'] = paper_df['Text'].str.decode('utf-8')\n", "paper_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "k633V0JbBInq" }, "outputs": [], "source": [ "# Remove extra spaces from papers\n", "paper_df['Text'] = paper_df['Text'].str.replace('\\s+', ' ', regex=True).str.strip()\n", "paper_df.head()" ] }, { "cell_type": "markdown", "source": [ "### Upload and Merge Metadata Files" ], "metadata": { "id": "oBCoNFow2W4U" } }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ZCASvLyJcq7C" }, "outputs": [], "source": [ "# Upload csv with essay metadata\n", "metadata = files.upload()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "gby6n4lzcq-c" }, "outputs": [], "source": [ "metadata_df = pd.read_csv('metadata.csv')\n", "metadata_df = metadata_df.dropna(axis=1, how='all')\n", "metadata_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "T5s_ZEt-BcXP" }, "outputs": [], "source": [ "# Remove .txt from title of each paper\n", "paper_df['Filename'] = paper_df['Filename'].str.replace('.txt', '')\n", "\n", "# Rename column from paper ID to Title\n", "metadata_df.rename(columns={\"PAPER ID\": \"Filename\"}, inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": 
"s_Ta2d77BcXP" }, "outputs": [], "source": [ "# Merge metadata and papers into new DataFrame\n", "# Will only keep rows where both essay and metadata are present\n", "final_paper_df = metadata_df.merge(paper_df,on='Filename')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "zOMIXCyOMOKl" }, "outputs": [], "source": [ "# Print DataFrame\n", "final_paper_df.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "zQ8ve667EvxG" }, "source": [ "### Alternate Code: Installs, Imports and Preprocessing in Jupyter Notebook" ] }, { "cell_type": "code", "source": [ "# # Install and import spacy\n", "# !pip install spaCy\n", "\n", "# # Import spacy\n", "# import spacy\n", "\n", "# # Install English language model\n", "# !spacy download en_core_web_sm\n", "\n", "# # Import os to upload documents and metadata\n", "# import os\n", "\n", "# # Load spaCy visualizer\n", "# from spacy import displacy\n", "\n", "# # Import pandas DataFrame packages\n", "# import pandas as pd\n", "\n", "# # Import graphing package\n", "# import plotly.graph_objects as go\n", "# import plotly.express as px" ], "metadata": { "id": "KlvUa2oX1645" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ub8MflGfA_HB" }, "outputs": [], "source": [ "# # Create empty lists for file names and contents\n", "# texts = []\n", "# file_names = []\n", "# # Iterate through each file in the path\n", "# for _file_name in os.listdir('path_to_directory'):\n", "# # Look for only text files\n", "# if _file_name.endswith('.txt'):\n", "# # Append contents of each text file to text list\n", "# texts.append(open('path_to_directory' + '/' + _file_name, 'r').read())\n", "# # Append name of each file to file name list\n", "# file_names.append(_file_name)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Z7BmHGFBA_HB", "scrolled": true }, "outputs": [], "source": [ "# # Create dictionary object associating each file name with its text\n", "# d = {'Filename':file_names,'Text':texts}" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "yC5-sbOPA_HB" }, "outputs": [], "source": [ "# # Turn dictionary into a dataframe\n", "# paper_df = pd.DataFrame(d)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "cK3PvJkcA_HC" }, "outputs": [], "source": [ "# paper_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "kax8Ecu7A_HC" }, "outputs": [], "source": [ "# # Remove extra spaces from papers\n", "# paper_df['Text'] = paper_df['Text'].str.replace('\\s+', ' ', regex=True).str.strip()\n", "# paper_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "zGbZVIrFA_HC" }, "outputs": [], "source": [ "# metadata_df = pd.read_csv('path_to_directory/metadata.csv')\n", "# metadata_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "RO4lwuwJcrID" }, "outputs": [], "source": [ "# # Remove .txt from title of each paper\n", "# paper_df['Filename'] = paper_df['Filename'].str.replace('.txt', '', regex=True)\n", "\n", "# # Rename column from paper ID to Title\n", "# metadata_df.rename(columns={\"PAPER ID\": \"Filename\"}, inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2eCYbDExeuqM", "scrolled": true }, "outputs": [], "source": [ "# # Merge metadata and papers into new DataFrame\n", "# # Will only keep rows where both essay and metadata are present\n", "# final_paper_df = 
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "Z7BmHGFBA_HB", "scrolled": true }, "outputs": [], "source": [ "# # Create a dictionary object associating each file name with its text\n", "# d = {'Filename': file_names, 'Text': texts}" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "yC5-sbOPA_HB" }, "outputs": [], "source": [ "# # Turn the dictionary into a DataFrame\n", "# paper_df = pd.DataFrame(d)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "cK3PvJkcA_HC" }, "outputs": [], "source": [ "# paper_df.head()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "kax8Ecu7A_HC" }, "outputs": [], "source": [ "# # Remove extra spaces from papers\n", "# paper_df['Text'] = paper_df['Text'].str.replace(r'\\s+', ' ', regex=True).str.strip()\n", "# paper_df.head()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "zGbZVIrFA_HC" }, "outputs": [], "source": [ "# metadata_df = pd.read_csv('path_to_directory/metadata.csv')\n", "# metadata_df.head()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "RO4lwuwJcrID" }, "outputs": [], "source": [ "# # Remove .txt from the title of each paper\n", "# paper_df['Filename'] = paper_df['Filename'].str.replace('.txt', '', regex=False)\n", "\n", "# # Rename the PAPER ID column to Filename so it matches the papers DataFrame\n", "# metadata_df.rename(columns={\"PAPER ID\": \"Filename\"}, inplace=True)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "2eCYbDExeuqM", "scrolled": true }, "outputs": [], "source": [ "# # Merge metadata and papers into a new DataFrame\n", "# # Only rows with both an essay and its metadata are kept\n", "# final_paper_df = metadata_df.merge(paper_df, on='Filename')" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "C-8mOPIZA_HD" }, "outputs": [], "source": [ "# # Print DataFrame\n", "# final_paper_df.head()" ] },
{ "cell_type": "markdown", "metadata": { "id": "xdlHoqlBA_HD" }, "source": [ "## Text Enrichment with spaCy" ] },
{ "cell_type": "markdown", "source": [ "### Creating Doc Objects" ], "metadata": { "id": "ejyC6xvA3w9q" } },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "2P1TUR3oA_HD" }, "outputs": [], "source": [ "# Load the nlp pipeline\n", "nlp = spacy.load('en_core_web_sm')\n", "\n", "# Check which components the pipeline contains\n", "print(nlp.pipe_names)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "ceknDmJZA_HE" }, "outputs": [], "source": [ "# Define an example sentence\n", "sentence = \"This is 'an' example? sentence\"\n", "\n", "# Call the nlp model on the sentence\n", "doc = nlp(sentence)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "KWX1hxoBA_HE" }, "outputs": [], "source": [ "# Loop through each token in the doc object\n", "for token in doc:\n", "    # Print the text and part of speech of each token\n", "    print(token.text, token.pos_)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "5pE_8YJ_A_HE" }, "outputs": [], "source": [ "# Define a function that runs the nlp pipeline on any given input text\n", "def process_text(text):\n", "    return nlp(text)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "N3O8qhn1A_HE", "scrolled": true }, "outputs": [], "source": [ "# Apply the function to the \"Text\" column, so that the nlp pipeline is called on each student essay\n", "final_paper_df['Doc'] = final_paper_df['Text'].apply(process_text)" ] },
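{ "cell_type": "markdown", "metadata": {}, "source": [ "Calling `nlp` once per row works fine for a small corpus. For larger collections, spaCy's `nlp.pipe` processes texts in batches, which is usually faster. The optional sketch below is equivalent to the step above." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Equivalent batched alternative (illustrative sketch): nlp.pipe streams\n", "# the texts through the pipeline in batches\n", "final_paper_df['Doc'] = list(nlp.pipe(final_paper_df['Text']))" ] },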
] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "WbrgpAIjA_HF", "scrolled": true }, "outputs": [], "source": [ "print(f'\"Write\" appears in the text tokens column ' + str(final_paper_df['Tokens'].apply(lambda x: x.count('write')).sum()) + ' times.')\n", "print(f'\"Write\" appears in the lemmas column ' + str(final_paper_df['Lemmas'].apply(lambda x: x.count('write')).sum()) + ' times.')" ] }, { "cell_type": "markdown", "source": [ "### Text Annotation" ], "metadata": { "id": "GvH1xTvZ3-MW" } }, { "cell_type": "markdown", "metadata": { "id": "FLeLfyvVA_HF" }, "source": [ "#### Part of Speech Tagging" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "T24RkcuTA_HF" }, "outputs": [], "source": [ "# Define a function to retrieve lemmas from a doc object\n", "def get_pos(doc):\n", " #Return the coarse- and fine-grained part of speech text for each token in the doc\n", " return [(token.pos_, token.tag_) for token in doc]\n", "\n", "# Define a function to retrieve parts of speech from a doc object\n", "final_paper_df['POS'] = final_paper_df['Doc'].apply(get_pos)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "_1fZXUynA_HF", "scrolled": true }, "outputs": [], "source": [ "# Create a list of part of speech tags\n", "list(final_paper_df['POS'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2lhKj9xGA_HG" }, "outputs": [], "source": [ "spacy.explain(\"IN\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "L-k8yoClA_HG" }, "outputs": [], "source": [ "# Define function to extract proper nouns from Doc object\n", "def extract_proper_nouns(doc):\n", " return [token.text for token in doc if token.pos_ == 'PROPN']\n", "\n", "# Apply function to Doc column and store resulting proper nouns in new column\n", "final_paper_df['Proper_Nouns'] = final_paper_df['Doc'].apply(extract_proper_nouns)" ] }, { "cell_type": "code", "source": [ "list(final_paper_df.loc[[3, 163], 'Proper_Nouns'])" ], "metadata": { "id": "m98pVVJX1ZlK" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "e-8878WiA_HG" }, "source": [ "#### Dependency Parsing" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Kka-2u7eA_HG", "scrolled": true }, "outputs": [], "source": [ "# Extract the first sentence from the fifth Doc object\n", "doc = final_paper_df['Doc'][5]\n", "\n", "# Create a list of sentence from the doc object\n", "sentences = list(doc.sents)\n", "\n", "# Retrieve the first sentence\n", "sentence = sentences[0]\n", "\n", "# Create dependency visualization for the first sentence of the 5th essay\n", "displacy.render(sentence, style=\"dep\", jupyter=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "TqwjCcjDA_HG" }, "outputs": [], "source": [ "#Define function to extract parts of speech of all non-stopwords\n", "def extract_stopwords(doc):\n", " return [token.text for token in doc if token.text not in nlp.Defaults.stop_words]\n", "\n", "#Create list of tokens without stopwords\n", "final_paper_df['Tokens_NoStops'] = final_paper_df['Doc'].apply(extract_stopwords)\n", "\n", "#Turn list of stopwords into a string\n", "final_paper_df['Text_NoStops'] = [' '.join(map(str, l)) for l in final_paper_df['Tokens_NoStops']]\n", "\n", "#Create new doc object from texts without stopwords\n", "final_paper_df['Doc_NoStops'] = final_paper_df['Text_NoStops'].apply(process_text)\n", "\n", "# extract the first sentence from the first Doc 
object\n", "doc = final_paper_df['Doc_NoStops'][5]\n", "sentences = list(doc.sents)\n", "sentence = sentences[0]\n", "\n", "# visualize the dependency parse tree for the sentence\n", "displacy.render(sentence, style='dep', jupyter=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "yhnSUEC3A_HH" }, "outputs": [], "source": [ "# Define function to extract noun phrases from Doc object\n", "def extract_noun_phrases(doc):\n", " return [chunk.text for chunk in doc.noun_chunks]\n", "\n", "# Apply function to Doc column and store resulting proper nouns in new column\n", "final_paper_df['Noun_Phrases'] = final_paper_df['Doc'].apply(extract_noun_phrases)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "wod3AHAXA_HH", "scrolled": true }, "outputs": [], "source": [ "final_paper_df['Noun_Phrases'][0]" ] }, { "cell_type": "markdown", "metadata": { "id": "39utqaRyh_M1" }, "source": [ "#### Named Entity Recognition\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "wRffhMlUA_HI", "scrolled": false }, "outputs": [], "source": [ "# Get all NE labels and assign to variable\n", "labels = nlp.get_pipe(\"ner\").labels\n", "\n", "# Print each label and its description\n", "for label in labels:\n", " print(label + ' : ' + spacy.explain(label))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "mow27zbdA_HI" }, "outputs": [], "source": [ "# Define function to extract named entities from doc objects\n", "def extract_named_entities(doc):\n", " return [ent.label_ for ent in doc.ents]\n", "\n", "# Apply function to Doc column and store resulting named entities in new column\n", "final_paper_df['Named_Entities'] = final_paper_df['Doc'].apply(extract_named_entities)\n", "final_paper_df['Named_Entities']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "XQYCg6fkA_HJ" }, "outputs": [], "source": [ "# Define function to extract text tagged with named entities from doc objects\n", "def extract_named_entities(doc):\n", " return [ent for ent in doc.ents]\n", "\n", "# Apply function to Doc column and store resulting text in new column\n", "final_paper_df['NE_Words'] = final_paper_df['Doc'].apply(extract_named_entities)\n", "final_paper_df['NE_Words']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "7ovPPNn4A_HJ" }, "outputs": [], "source": [ "# Extract the first Doc object\n", "doc = final_paper_df['Doc'][1]\n", "\n", "# Visualize named entity tagging in a single paper\n", "displacy.render(doc, style='ent', jupyter=True)" ] }, { "cell_type": "markdown", "metadata": { "id": "rKiNOMoDB2ta" }, "source": [ "### Download Enriched Dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "S7USU7ipB9YK" }, "outputs": [], "source": [ "# Save DataFrame as csv (in Google Drive)\n", "# Use this step only to save csv to your computer's working directory\n", "final_paper_df.to_csv('MICUSP_papers_with_spaCy_tags.csv')\n", "\n", "# Download csv to your computer from Google Drive\n", "files.download('MICUSP_papers_with_spaCy_tags.csv')" ] }, { "cell_type": "markdown", "metadata": { "id": "XUGnN4BX4C4c" }, "source": [ "## Analysis of Linguistic Annotations" ] }, { "cell_type": "markdown", "metadata": { "id": "9QTJrt6byTlf" }, "source": [ "### Part of Speech Analysis" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "T5RAD_t_SJt2" }, "outputs": [], "source": [ "# Create doc object from single sentence\n", "doc = nlp(\"This is 'an' example? 
sentence\")\n", "\n", "# Print counts of each part of speech in sentence\n", "print(doc.count_by(spacy.attrs.POS))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "QGPdo6FtS4bq" }, "outputs": [], "source": [ "# Store dictionary with indexes and POS counts in a variable\n", "num_pos = doc.count_by(spacy.attrs.POS)\n", "\n", "dictionary = {}\n", "\n", "# Create a new dictionary which replaces the index of each part of speech for its label (NOUN, VERB, ADJECTIVE)\n", "for k,v in sorted(num_pos.items()):\n", " dictionary[doc.vocab[k].text] = v\n", "\n", "dictionary" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "TnDn9MpTA_HK" }, "outputs": [], "source": [ "# Create new DataFrame for analysis purposes\n", "pos_analysis_df = final_paper_df[['Filename','DISCIPLINE', 'Doc']]\n", "\n", "# Create list to store each dictionary\n", "num_list = []\n", "\n", "# Define a function to get part of speech tags and counts and append them to a new dictionary\n", "def get_pos_tags(doc):\n", " dictionary = {}\n", " num_pos = doc.count_by(spacy.attrs.POS)\n", " for k,v in sorted(num_pos.items()):\n", " dictionary[doc.vocab[k].text] = v\n", " num_list.append(dictionary)\n", "\n", "# Apply function to each doc object in DataFrame\n", "pos_analysis_df['C_POS'] = pos_analysis_df['Doc'].apply(get_pos_tags)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "DQ0UCcqMA_HK" }, "outputs": [], "source": [ "# Create new dataframe with part of speech counts\n", "pos_counts = pd.DataFrame(num_list)\n", "columns = list(pos_counts.columns)\n", "\n", "# Add discipline of each paper as new column to dataframe\n", "idx = 0\n", "new_col = pos_analysis_df['DISCIPLINE']\n", "pos_counts.insert(loc=idx, column='DISCIPLINE', value=new_col)\n", "\n", "pos_counts" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "rbK1VH5lZ7T6" }, "outputs": [], "source": [ "# Get average part of speech counts used in papers of each discipline\n", "average_pos_df = pos_counts.groupby(['DISCIPLINE']).mean()\n", "\n", "# Round calculations to the nearest whole number\n", "average_pos_df = average_pos_df.round(0)\n", "\n", "# Reset index to improve DataFrame readability\n", "average_pos_df = average_pos_df.reset_index()\n", "\n", "# Show dataframe\n", "average_pos_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "F0NVs2tojM9A" }, "outputs": [], "source": [ "# Use plotly to plot proper noun use per genre\n", "fig = px.bar(average_pos_df, x=\"DISCIPLINE\", y=[\"ADJ\", 'VERB', \"NUM\"], title=\"Average Part-of-Speech Use in Papers Written by Biology and English Students\", barmode='group')\n", "fig.show()" ] }, { "cell_type": "markdown", "source": [ "### Fine-Grained Part of Speech Analysis" ], "metadata": { "id": "ZzJrF2Pv4YQO" } }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "KO4hSnXxZdqR", "scrolled": true }, "outputs": [], "source": [ "# Create list to store each dictionary\n", "tag_num_list = []\n", "\n", "# Define a function to get part of speech tags and counts and append them to a new dictionary\n", "def get_fine_pos_tags(doc):\n", " dictionary = {}\n", " num_tag = doc.count_by(spacy.attrs.TAG)\n", " for k,v in sorted(num_tag.items()):\n", " dictionary[doc.vocab[k].text] = v\n", " tag_num_list.append(dictionary)\n", "\n", "# Apply function to each doc object in DataFrame\n", "pos_analysis_df['F_POS'] = pos_analysis_df['Doc'].apply(get_fine_pos_tags)\n", "\n", "# Create new dataframe with part of speech 
counts\n", "tag_counts = pd.DataFrame(tag_num_list)\n", "columns = list(tag_counts.columns)\n", "\n", "# Add discipline of each paper as new column to dataframe\n", "idx = 0\n", "new_col = pos_analysis_df['DISCIPLINE']\n", "tag_counts.insert(loc=idx, column='DISCIPLINE', value=new_col)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "W7NknlyqA_HL" }, "outputs": [], "source": [ "# Get average fine-grain part of speech counts used in papers of each discipline\n", "average_tag_df = tag_counts.groupby(['DISCIPLINE']).mean()\n", "\n", "# Round calculations to the nearest whole number\n", "average_tag_df = average_tag_df.round(0)\n", "\n", "# Reset index to improve DataFrame readability\n", "average_tag_df = average_tag_df.reset_index()\n", "\n", "# Show dataframe\n", "average_tag_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "IIt9zHCmA_HL" }, "outputs": [], "source": [ "# Use plotly to plot proper noun use per genre\n", "fig = px.bar(average_tag_df, x=\"DISCIPLINE\", y=[\"VBD\", 'VBP', 'VBZ'], title=\"Average Verb Tense Usage Differences in Biology and English Student Writing\", barmode='group')\n", "fig.show()" ] }, { "cell_type": "markdown", "metadata": { "id": "xC6eZIpIyb8k" }, "source": [ "### Named Entity Analysis" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "U_WJ6sFTifpA", "scrolled": true }, "outputs": [], "source": [ "# Create new DataFrame for analysis purposes\n", "ner_analysis_df = final_paper_df[['Filename','PAPER TYPE', 'Named_Entities', 'NE_Words']]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2MgWXgOL4K2E" }, "outputs": [], "source": [ "# Convert named entity lists to strings so we can count specific entities\n", "ner_analysis_df['Named_Entities'] = ner_analysis_df['Named_Entities'].apply(lambda x: ' '.join(x))\n", "\n", "# Get the number of each type of entity in each paper\n", "person_counts = ner_analysis_df['Named_Entities'].str.count('PERSON')\n", "loc_counts = ner_analysis_df['Named_Entities'].str.count('LOC')\n", "date_counts = ner_analysis_df['Named_Entities'].str.count('DATE')\n", "woa_counts = ner_analysis_df['Named_Entities'].str.count('WORK_OF_ART')\n", "\n", "# Append named entity counts to new DataFrame\n", "ner_counts_df = pd.DataFrame()\n", "ner_counts_df['Genre'] = ner_analysis_df[\"PAPER TYPE\"]\n", "ner_counts_df['PERSON_Counts'] = person_counts\n", "ner_counts_df['LOC_Counts'] = loc_counts\n", "ner_counts_df['DATE_Counts'] = date_counts\n", "ner_counts_df['WORK_OF_ART_Counts'] = woa_counts\n", "\n", "ner_counts_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "OXUlrCXhh_0f" }, "outputs": [], "source": [ "# Calculate average usage of each named entity type\n", "average_ner_df = ner_counts_df.groupby(['Genre']).mean()\n", "average_ner_df = average_ner_df.round(0)\n", "average_ner_df = average_ner_df.reset_index()\n", "average_ner_df\n", "\n", "# Use plotly to plot proper noun use per genre\n", "fig = px.bar(average_ner_df, x=\"Genre\", y=[\"PERSON_Counts\", 'LOC_Counts', \"DATE_Counts\", 'WORK_OF_ART_Counts'], title=\"Average Named Entity Usage Across Student Paper Genres\", barmode='group')\n", "fig.show()" ] }, { "cell_type": "markdown", "source": [ "### Analysis of ```DATE``` Named Entities" ], "metadata": { "id": "n8nl8w084fo4" } }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "6RvvclvkA_HM", "scrolled": true }, "outputs": [], "source": [ "# Define function to extract words tagged as 
\"date\" named entities from doc objects\n", "def extract_date_named_entities(doc):\n", " return [ent for ent in doc.ents if ent.label_ == 'DATE']\n", "\n", "# Get all date entity words and apply to new column of DataFrame\n", "ner_analysis_df['Date_Named_Entities'] = final_paper_df['Doc'].apply(extract_date_named_entities)\n", "\n", "\n", "# Make list of date entities a string so we can count their frequencies\n", "ner_analysis_df['Date_Named_Entities'] = [', '.join(map(str, l)) for l in ner_analysis_df['Date_Named_Entities']]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ELHvnBtNvLSH", "scrolled": true }, "outputs": [], "source": [ "# Search for only date words in proposal papers\n", "date_word_counts_df = ner_analysis_df[(ner_analysis_df == 'Proposal').any(axis=1)]\n", "\n", "# Count the frequency of each word in these essays and append to list\n", "date_word_frequencies = date_word_counts_df.Date_Named_Entities.str.split(expand=True).stack().value_counts()\n", "\n", "# Get top 10 most common words and their frequencies\n", "date_word_frequencies[:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2iKKPP-swqe2" }, "outputs": [], "source": [ "# Search for only date words in critique/evaluation papers\n", "date_word_counts_df = ner_analysis_df[(ner_analysis_df == 'Critique/Evaluation').any(axis=1)]\n", "\n", "# Count the frequency of each word in these essays and append to list\n", "date_word_frequencies = date_word_counts_df.Date_Named_Entities.str.split(expand=True).stack().value_counts()\n", "\n", "# Get top 10 most common words and their frequencies\n", "date_word_frequencies[:10]" ] } ], "metadata": { "colab": { "provenance": [], "include_colab_link": true }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 0 }