"Open

## Imports, Uploads, and Preprocessing


### Import Packages

In [None]:
# Import spacy
import spacy

# Load spaCy visualizer
from spacy import displacy

# Import pandas DataFrame packages
import pandas as pd

# Import graphing package
import plotly.graph_objects as go
import plotly.express as px

### Upload Text Files

In [None]:
# Import drive and files to facilitate file uploads
from google.colab import files

In [None]:
# Select multiple text files to upload from local folder
uploaded_files = files.upload()

In [None]:
type(uploaded_files)

In [None]:
# Add files into DataFrame
paper_df = pd.DataFrame.from_dict(uploaded_files, orient='index')
paper_df.head()

In [None]:
# Reset index and add column names to make wrangling easier
paper_df = paper_df.reset_index()
paper_df.columns = ["Filename", "Text"]
paper_df.head()

### Pre-process Text Files

In [None]:
# Convert papers from bytes to strings
paper_df['Text'] = paper_df['Text'].str.decode('utf-8')
paper_df.head()

In [None]:
# Remove extra spaces from papers
# Collapse every run of whitespace (spaces, tabs, newlines) into one space and
# trim both ends. The pattern is a raw string so that "\s" reaches the regex
# engine unchanged instead of being parsed as an (invalid) string escape.
paper_df['Text'] = paper_df['Text'].str.replace(r'\s+', ' ', regex=True).str.strip()
paper_df.head()

### Upload and Merge Metadata Files

In [None]:
# Upload csv with essay metadata
metadata = files.upload()

In [None]:
metadata_df = pd.read_csv('metadata.csv')
metadata_df = metadata_df.dropna(axis=1, how='all')
metadata_df.head()

In [None]:
# Remove the .txt extension from the filename of each paper
# The pattern is an explicit regex: the dot is escaped so it only matches a
# literal ".", and "$" anchors the match to the end of the filename, so "txt"
# occurring elsewhere in a name is left untouched. Passing regex=True makes the
# behavior independent of the pandas version's default.
paper_df['Filename'] = paper_df['Filename'].str.replace(r'\.txt$', '', regex=True)

# Rename the metadata column "PAPER ID" to "Filename" so both DataFrames
# share the column that the merge below joins on
metadata_df = metadata_df.rename(columns={"PAPER ID": "Filename"})

In [None]:
# Merge metadata and papers into new DataFrame
# Will only keep rows where both essay and metadata are present
final_paper_df = metadata_df.merge(paper_df,on='Filename')

In [None]:
# Print DataFrame
final_paper_df.head()

### Alternate Code: Installs, Imports and Preprocessing in Jupyter Notebook

In [None]:
# # Install and import spacy
# !pip install spaCy

# # Import spacy
# import spacy

# # Install English language model
# !spacy download en_core_web_sm

# # Import os to upload documents and metadata
# import os

# # Load spaCy visualizer
# from spacy import displacy

# # Import pandas DataFrame packages
# import pandas as pd

# # Import graphing package
# import plotly.graph_objects as go
# import plotly.express as px

In [None]:
# # Create empty lists for file names and contents
# texts = []
# file_names = []
# # Iterate through each file in the path
# for _file_name in os.listdir('path_to_directory'):
# # Look for only text files
# if _file_name.endswith('.txt'):
# # Append contents of each text file to text list
# texts.append(open('path_to_directory' + '/' + _file_name, 'r').read())
# # Append name of each file to file name list
# file_names.append(_file_name)

In [None]:
# # Create dictionary object associating each file name with its text
# d = {'Filename':file_names,'Text':texts}

In [None]:
# # Turn dictionary into a dataframe
# paper_df = pd.DataFrame(d)

In [None]:
# paper_df.head()

In [None]:
# # Remove extra spaces from papers
# paper_df['Text'] = paper_df['Text'].str.replace('\s+', ' ', regex=True).str.strip()
# paper_df.head()

In [None]:
# metadata_df = pd.read_csv('path_to_directory/metadata.csv')
# metadata_df.head()

In [None]:
# # Remove .txt from title of each paper
# paper_df['Filename'] = paper_df['Filename'].str.replace(r'\.txt$', '', regex=True)

# # Rename column from paper ID to Title
# metadata_df.rename(columns={"PAPER ID": "Filename"}, inplace=True)

In [None]:
# # Merge metadata and papers into new DataFrame
# # Will only keep rows where both essay and metadata are present
# final_paper_df = metadata_df.merge(paper_df,on='Filename')

In [None]:
# # Print DataFrame
# final_paper_df.head()

## Text Enrichment with spaCy

### Creating Doc Objects

In [None]:
# Load nlp pipeline
nlp = spacy.load('en_core_web_sm')

# Check what functions it performs
print(nlp.pipe_names)

In [None]:
#Define example sentence
sentence = "This is 'an' example? sentence"

# Call the nlp model on the sentence
doc = nlp(sentence)

In [None]:
# Loop through each token in doc object
for token in doc:
 # Print text and part of speech for each
 print(token.text, token.pos_)

In [None]:
# Define a function that runs the nlp pipeline on any given input text
def process_text(text):
    """Run the loaded ``nlp`` pipeline on ``text`` and return the result.

    Args:
        text: The string to process.

    Returns:
        Whatever ``nlp(text)`` returns (a spaCy Doc for the pipeline
        loaded with ``spacy.load``).
    """
    processed = nlp(text)
    return processed

In [None]:
# Apply the function to the "Text" column, so that the nlp pipeline is called on each student essay
final_paper_df['Doc'] = final_paper_df['Text'].apply(process_text)

### Text Reduction

#### Tokenization

In [None]:
# Define a function to retrieve tokens from a doc object
def get_token(doc):
    """Return the text of every token in ``doc`` as a list of strings.

    This is the explicit-loop form of the function. Returning from inside
    the loop would stop at the first token, so the texts are collected in
    a list and returned once the loop has finished.

    Args:
        doc: An iterable of tokens exposing a ``text`` attribute
            (e.g. a spaCy Doc).

    Returns:
        A list with one string per token; empty if ``doc`` has no tokens.
    """
    token_texts = []
    # Loop through each token in the doc object
    for token in doc:
        # Retrieve the text of each token
        token_texts.append(token.text)
    return token_texts

In [None]:
# Define a function to retrieve tokens from a doc object
def get_token(doc):
    """Return the text of every token in ``doc`` as a list of strings."""
    # Compact, comprehension-based form: one string per token, in order
    return [tok.text for tok in doc]

In [None]:
# Run the token retrieval function on the doc objects in the dataframe
final_paper_df['Tokens'] = final_paper_df['Doc'].apply(get_token)
final_paper_df.head()

In [None]:
tokens = final_paper_df[['Text', 'Tokens']].copy()
tokens.head()

#### Lemmatization

In [None]:
# Define a function to retrieve lemmas from a doc object
def get_lemma(doc):
    """Return the lemma (``lemma_``) of every token in ``doc`` as a list."""
    lemmas = []
    for tok in doc:
        lemmas.append(tok.lemma_)
    return lemmas

# Run the lemma retrieval function on the doc objects in the dataframe
final_paper_df['Lemmas'] = final_paper_df['Doc'].apply(get_lemma)

In [None]:
# Total occurrences of the exact string 'write' across all papers,
# first among the raw tokens and then among the lemmas
token_write_total = final_paper_df['Tokens'].apply(lambda x: x.count('write')).sum()
lemma_write_total = final_paper_df['Lemmas'].apply(lambda x: x.count('write')).sum()

print(f'"Write" appears in the text tokens column {token_write_total} times.')
print(f'"Write" appears in the lemmas column {lemma_write_total} times.')

### Text Annotation

#### Part of Speech Tagging

In [None]:
# Define a function to retrieve parts of speech from a doc object
def get_pos(doc):
    """Return a ``(pos_, tag_)`` tuple for every token in ``doc``.

    ``pos_`` is the coarse-grained and ``tag_`` the fine-grained
    part of speech label of the token.
    """
    pos_pairs = []
    for tok in doc:
        pos_pairs.append((tok.pos_, tok.tag_))
    return pos_pairs

# Run the part of speech retrieval function on the doc objects in the dataframe
final_paper_df['POS'] = final_paper_df['Doc'].apply(get_pos)

In [None]:
# Create a list of part of speech tags
list(final_paper_df['POS'])

In [None]:
spacy.explain("IN")

In [None]:
# Define function to extract proper nouns from Doc object
def extract_proper_nouns(doc):
    """Return the text of every token in ``doc`` whose ``pos_`` is 'PROPN'."""
    proper_nouns = []
    for tok in doc:
        # Skip everything that is not tagged as a proper noun
        if tok.pos_ != 'PROPN':
            continue
        proper_nouns.append(tok.text)
    return proper_nouns

# Apply function to Doc column and store resulting proper nouns in new column
final_paper_df['Proper_Nouns'] = final_paper_df['Doc'].apply(extract_proper_nouns)

In [None]:
list(final_paper_df.loc[[3, 163], 'Proper_Nouns'])

#### Dependency Parsing

In [None]:
# Select the Doc object stored at index label 5 of the 'Doc' column
doc = final_paper_df['Doc'][5]

# Create a list of the sentences in the doc object
sentences = list(doc.sents)

# Retrieve the first sentence
sentence = sentences[0]

# Create dependency visualization for the first sentence of that essay
displacy.render(sentence, style="dep", jupyter=True)

In [None]:
#Define function to extract the text of all non-stopword tokens
def extract_stopwords(doc):
    """Return the text of every token in ``doc`` that is not a stop word.

    Uses each token's ``is_stop`` flag rather than testing the raw text
    against the stop word set. The set holds lowercase entries, so a
    case-sensitive membership test lets capitalized stop words (for
    example a sentence-initial "The") slip through.

    Args:
        doc: An iterable of tokens exposing ``text`` and ``is_stop``
            (e.g. a spaCy Doc).

    Returns:
        A list of token texts with the stop words removed.
    """
    return [token.text for token in doc if not token.is_stop]

#Create list of tokens without stopwords
final_paper_df['Tokens_NoStops'] = final_paper_df['Doc'].apply(extract_stopwords)

#Join the remaining (non-stopword) tokens of each paper into a single string
final_paper_df['Text_NoStops'] = [' '.join(map(str, l)) for l in final_paper_df['Tokens_NoStops']]

#Create new doc object from texts without stopwords
final_paper_df['Doc_NoStops'] = final_paper_df['Text_NoStops'].apply(process_text)

# Extract the first sentence from the stopword-free Doc at index label 5
# (the same index used for the previous dependency visualization)
doc = final_paper_df['Doc_NoStops'][5]
sentences = list(doc.sents)
sentence = sentences[0]

# visualize the dependency parse tree for the sentence
displacy.render(sentence, style='dep', jupyter=True)

In [None]:
# Define function to extract noun phrases from Doc object
def extract_noun_phrases(doc):
    """Return the text of every noun chunk in ``doc.noun_chunks`` as a list."""
    phrases = []
    for noun_chunk in doc.noun_chunks:
        phrases.append(noun_chunk.text)
    return phrases

# Apply function to Doc column and store resulting noun phrases in new column
final_paper_df['Noun_Phrases'] = final_paper_df['Doc'].apply(extract_noun_phrases)

In [None]:
final_paper_df['Noun_Phrases'][0]

#### Named Entity Recognition


In [None]:
# Get all NE labels and assign to variable
labels = nlp.get_pipe("ner").labels

# Print each label and its description
for label in labels:
 print(label + ' : ' + spacy.explain(label))

In [None]:
# Define function to extract named entities from doc objects
def extract_named_entities(doc):
    """Return the label (``label_``) of every entity in ``doc.ents`` as a list."""
    entity_labels = []
    for entity in doc.ents:
        entity_labels.append(entity.label_)
    return entity_labels

# Apply function to Doc column and store resulting named entities in new column
final_paper_df['Named_Entities'] = final_paper_df['Doc'].apply(extract_named_entities)
final_paper_df['Named_Entities']

In [None]:
# Define function to extract text tagged with named entities from doc objects
def extract_named_entities(doc):
    """Return the entities in ``doc.ents`` themselves, as a list.

    NOTE: this redefines ``extract_named_entities`` from the previous cell,
    which returned the entity labels. After this cell runs, the name refers
    to this version.
    """
    return list(doc.ents)

# Apply function to Doc column and store resulting text in new column
final_paper_df['NE_Words'] = final_paper_df['Doc'].apply(extract_named_entities)
final_paper_df['NE_Words']

In [None]:
# Select the Doc object stored at index label 1 of the 'Doc' column
doc = final_paper_df['Doc'][1]

# Visualize named entity tagging in a single paper
displacy.render(doc, style='ent', jupyter=True)

### Download Enriched Dataset

In [None]:
# Save DataFrame as csv (in Google Drive)
# Use this step only to save csv to your computer's working directory
final_paper_df.to_csv('MICUSP_papers_with_spaCy_tags.csv')

# Download csv to your computer from Google Drive
files.download('MICUSP_papers_with_spaCy_tags.csv')

## Analysis of Linguistic Annotations

### Part of Speech Analysis

In [None]:
# Create doc object from single sentence
doc = nlp("This is 'an' example? sentence")

# Print counts of each part of speech in sentence
print(doc.count_by(spacy.attrs.POS))

In [None]:
# Store dictionary with indexes and POS counts in a variable
num_pos = doc.count_by(spacy.attrs.POS)

# Build a new dictionary that swaps each numeric part of speech index for its
# text label (NOUN, VERB, ADJECTIVE), keeping the entries sorted by index
dictionary = {
    doc.vocab[pos_id].text: count
    for pos_id, count in sorted(num_pos.items())
}

dictionary

In [None]:
# Create new DataFrame for analysis purposes
# .copy() makes this an independent frame, so adding columns to it below does
# not raise pandas' SettingWithCopyWarning
pos_analysis_df = final_paper_df[['Filename','DISCIPLINE', 'Doc']].copy()

# Create list to store each dictionary
num_list = []

# Define a function to get part of speech tags and counts and append them to a new dictionary
def get_pos_tags(doc):
    """Count the coarse-grained parts of speech in ``doc``.

    Builds a dictionary mapping each part of speech label to the number
    of times it occurs in ``doc``, appends that dictionary to the
    module-level ``num_list`` (used later to build the counts DataFrame),
    and also returns it so the column created with ``.apply`` holds the
    counts instead of ``None``.

    Args:
        doc: A spaCy Doc.

    Returns:
        dict mapping part of speech label to its count in ``doc``.
    """
    num_pos = doc.count_by(spacy.attrs.POS)
    # count_by keys are numeric ids; look each one up in the vocab to get its label
    dictionary = {doc.vocab[k].text: v for k, v in sorted(num_pos.items())}
    num_list.append(dictionary)
    return dictionary

# Apply function to each doc object in DataFrame
pos_analysis_df['C_POS'] = pos_analysis_df['Doc'].apply(get_pos_tags)

In [None]:
# Create new dataframe with part of speech counts
pos_counts = pd.DataFrame(num_list)
columns = list(pos_counts.columns)

# Add discipline of each paper as new column to dataframe
idx = 0
new_col = pos_analysis_df['DISCIPLINE']
pos_counts.insert(loc=idx, column='DISCIPLINE', value=new_col)

pos_counts

In [None]:
# Get average part of speech counts used in papers of each discipline
# A part of speech that never occurs in a paper is missing (NaN) in pos_counts,
# and mean() skips NaN values. Fill the counts with 0 first so that such papers
# still contribute to their discipline's average instead of inflating it.
average_pos_df = (
    pos_counts
    .set_index('DISCIPLINE')
    .fillna(0)
    .groupby(level='DISCIPLINE')
    .mean()
)

# Round calculations to the nearest whole number
average_pos_df = average_pos_df.round(0)

# Reset index to improve DataFrame readability
average_pos_df = average_pos_df.reset_index()

# Show dataframe
average_pos_df

In [None]:
# Use plotly to plot average part-of-speech use per discipline
fig = px.bar(average_pos_df, x="DISCIPLINE", y=["ADJ", 'VERB', "NUM"], title="Average Part-of-Speech Use in Papers Written by Biology and English Students", barmode='group')
fig.show()

### Fine-Grained Part of Speech Analysis

In [None]:
# Create list to store each dictionary
tag_num_list = []

# Define a function to get part of speech tags and counts and append them to a new dictionary
def get_fine_pos_tags(doc):
    """Count the fine-grained part of speech tags in ``doc``.

    Builds a dictionary mapping each fine-grained tag to the number of
    times it occurs in ``doc``, appends that dictionary to the
    module-level ``tag_num_list`` (used below to build ``tag_counts``),
    and also returns it so the column created with ``.apply`` holds the
    counts instead of ``None``.

    Args:
        doc: A spaCy Doc.

    Returns:
        dict mapping fine-grained tag to its count in ``doc``.
    """
    num_tag = doc.count_by(spacy.attrs.TAG)
    # count_by keys are numeric ids; look each one up in the vocab to get its label
    dictionary = {doc.vocab[k].text: v for k, v in sorted(num_tag.items())}
    tag_num_list.append(dictionary)
    return dictionary

# Apply function to each doc object in DataFrame
pos_analysis_df['F_POS'] = pos_analysis_df['Doc'].apply(get_fine_pos_tags)

# Create new dataframe with part of speech counts
tag_counts = pd.DataFrame(tag_num_list)
columns = list(tag_counts.columns)

# Add discipline of each paper as new column to dataframe
idx = 0
new_col = pos_analysis_df['DISCIPLINE']
tag_counts.insert(loc=idx, column='DISCIPLINE', value=new_col)

In [None]:
# Get average fine-grain part of speech counts used in papers of each discipline
# A tag that never occurs in a paper is missing (NaN) in tag_counts, and mean()
# skips NaN values. Fill the counts with 0 first so that papers without a tag
# still contribute to their discipline's average instead of inflating it.
average_tag_df = (
    tag_counts
    .set_index('DISCIPLINE')
    .fillna(0)
    .groupby(level='DISCIPLINE')
    .mean()
)

# Round calculations to the nearest whole number
average_tag_df = average_tag_df.round(0)

# Reset index to improve DataFrame readability
average_tag_df = average_tag_df.reset_index()

# Show dataframe
average_tag_df

In [None]:
# Use plotly to plot average use of the verb tags VBD, VBP and VBZ per discipline
fig = px.bar(average_tag_df, x="DISCIPLINE", y=["VBD", 'VBP', 'VBZ'], title="Average Verb Tense Usage Differences in Biology and English Student Writing", barmode='group')
fig.show()

### Named Entity Analysis

In [None]:
# Create new DataFrame for analysis purposes
# .copy() makes this an independent frame: later cells overwrite and add
# columns on it, which on a plain slice raises SettingWithCopyWarning
ner_analysis_df = final_paper_df[['Filename','PAPER TYPE', 'Named_Entities', 'NE_Words']].copy()

In [None]:
# Convert named entity lists to strings so we can count specific entities
# Values that are already strings are left untouched: joining a string would
# put a space between every character and make all counts below 0, so this
# guard keeps the cell safe to re-run.
ner_analysis_df['Named_Entities'] = ner_analysis_df['Named_Entities'].apply(
    lambda labels: labels if isinstance(labels, str) else ' '.join(labels)
)

# Get the number of each type of entity in each paper
person_counts = ner_analysis_df['Named_Entities'].str.count('PERSON')
loc_counts = ner_analysis_df['Named_Entities'].str.count('LOC')
date_counts = ner_analysis_df['Named_Entities'].str.count('DATE')
woa_counts = ner_analysis_df['Named_Entities'].str.count('WORK_OF_ART')

# Collect the genre of each paper and its named entity counts in a new DataFrame
ner_counts_df = pd.DataFrame({
    'Genre': ner_analysis_df["PAPER TYPE"],
    'PERSON_Counts': person_counts,
    'LOC_Counts': loc_counts,
    'DATE_Counts': date_counts,
    'WORK_OF_ART_Counts': woa_counts,
})

ner_counts_df.head()

In [None]:
# Calculate average usage of each named entity type
average_ner_df = ner_counts_df.groupby(['Genre']).mean()
average_ner_df = average_ner_df.round(0)
average_ner_df = average_ner_df.reset_index()
average_ner_df

# Use plotly to plot average named entity use per genre
fig = px.bar(average_ner_df, x="Genre", y=["PERSON_Counts", 'LOC_Counts', "DATE_Counts", 'WORK_OF_ART_Counts'], title="Average Named Entity Usage Across Student Paper Genres", barmode='group')
fig.show()

### Analysis of ```DATE``` Named Entities

In [None]:
# Define function to extract words tagged as "date" named entities from doc objects
def extract_date_named_entities(doc):
    """Return the entities in ``doc.ents`` whose ``label_`` is 'DATE'."""
    date_entities = []
    for entity in doc.ents:
        if entity.label_ == 'DATE':
            date_entities.append(entity)
    return date_entities

# Get all date entity words and apply to new column of DataFrame
ner_analysis_df['Date_Named_Entities'] = final_paper_df['Doc'].apply(extract_date_named_entities)


# Make list of date entities a string so we can count their frequencies
ner_analysis_df['Date_Named_Entities'] = [', '.join(map(str, l)) for l in ner_analysis_df['Date_Named_Entities']]

In [None]:
# Search for only date words in proposal papers
# Filter on the 'PAPER TYPE' column explicitly instead of comparing every
# column of the frame (including the list-valued ones) with the genre name
date_word_counts_df = ner_analysis_df[ner_analysis_df['PAPER TYPE'] == 'Proposal']

# Count the frequency of each word in these essays
# The entities were joined with ", "; turn that separator back into a plain
# space before splitting so a word is not counted separately as "2004," and "2004"
date_word_frequencies = (
    date_word_counts_df['Date_Named_Entities']
    .str.replace(', ', ' ', regex=False)
    .str.split(expand=True)
    .stack()
    .value_counts()
)

# Get top 10 most common words and their frequencies
date_word_frequencies[:10]

In [None]:
# Search for only date words in critique/evaluation papers
# Filter on the 'PAPER TYPE' column explicitly instead of comparing every
# column of the frame (including the list-valued ones) with the genre name
date_word_counts_df = ner_analysis_df[ner_analysis_df['PAPER TYPE'] == 'Critique/Evaluation']

# Count the frequency of each word in these essays
# The entities were joined with ", "; turn that separator back into a plain
# space before splitting so a word is not counted separately as "2004," and "2004"
date_word_frequencies = (
    date_word_counts_df['Date_Named_Entities']
    .str.replace(', ', ' ', regex=False)
    .str.split(expand=True)
    .stack()
    .value_counts()
)

# Get top 10 most common words and their frequencies
date_word_frequencies[:10]