{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# CoNLL_4.ipynb\n",
"\n",
"This notebook contains the fourth part of the model training and analysis code from our CoNLL-2020 paper, [\"Identifying Incorrect Labels in the CoNLL-2003 Corpus\"](https://www.aclweb.org/anthology/2020.conll-1.16/).\n",
"\n",
"If you're new to the Text Extensions for Pandas library, we recommend that you start\n",
"by reading through the notebook [`Analyze_Model_Outputs.ipynb`](https://github.com/CODAIT/text-extensions-for-pandas/blob/master/notebooks/Analyze_Model_Outputs.ipynb), which explains the \n",
"portions of the library that we use in the notebooks in this directory.\n",
"\n",
"### Summary\n",
"\n",
"This notebook repeats the model training process from `CoNLL_3.ipynb`, but performs a 10-fold cross-validation. This process involves training a total of 170 models -- 10 groups of 17. Next, this notebook evaluates each group of models over the holdout set from the associated fold of the cross-validation. Then it aggregates together these outputs and uses the same techniques used in `CoNLL_2.ipynb` to flag potentially-incorrect labels. Finally, the notebook writes out CSV files containing ranked lists of potentially-incorrect labels.\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Libraries and constants"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']\n",
"- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
]
}
],
"source": [
"# Libraries\n",
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"import sys\n",
"import time\n",
"import torch\n",
"import transformers\n",
"from typing import *\n",
"import sklearn.model_selection\n",
"import sklearn.pipeline\n",
"import matplotlib.pyplot as plt\n",
"import multiprocessing\n",
"import gc\n",
"\n",
"# And of course we need the text_extensions_for_pandas library itself.\n",
"try:\n",
" import text_extensions_for_pandas as tp\n",
"except ModuleNotFoundError as e:\n",
" raise Exception(\"text_extensions_for_pandas package not found on the Jupyter \"\n",
" \"kernel's path. Please either run:\\n\"\n",
" \" ln -s ../../text_extensions_for_pandas .\\n\"\n",
" \"from the directory containing this notebook, or use a Python \"\n",
" \"environment on which you have used `pip` to install the package.\")\n",
"\n",
"from text_extensions_for_pandas import cleaning\n",
" \n",
"# BERT Configuration\n",
"# Keep this in sync with `CoNLL_3.ipynb`.\n",
"#bert_model_name = \"bert-base-uncased\"\n",
"#bert_model_name = \"bert-large-uncased\"\n",
"bert_model_name = \"dslim/bert-base-NER\"\n",
"tokenizer = transformers.BertTokenizerFast.from_pretrained(bert_model_name, \n",
" add_special_tokens=True)\n",
"bert = transformers.BertModel.from_pretrained(bert_model_name)\n",
"\n",
"# If False, use cached values, provided those values are present on disk\n",
"_REGENERATE_EMBEDDINGS = True\n",
"_REGENERATE_MODELS = True\n",
"\n",
"# Number of dimensions that we reduce the BERT embeddings down to when\n",
"# training reduced-quality models.\n",
"#_REDUCED_DIMS = [8, 16, 32, 64, 128, 256]\n",
"_REDUCED_DIMS = [32, 64, 128, 256]\n",
"\n",
"# How many models we train at each level of dimensionality reduction\n",
"_MODELS_AT_DIM = [4] * len(_REDUCED_DIMS)\n",
"\n",
"# Consistent set of random seeds to use when generating dimension-reduced\n",
"# models. Index is [index into _REDUCED_DIMS, model number], and there are\n",
"# lots of extra entries so we don't need to resize this matrix.\n",
"from numpy.random import default_rng\n",
"_MASTER_SEED = 42\n",
"rng = default_rng(_MASTER_SEED)\n",
"_MODEL_RANDOM_SEEDS = rng.integers(0, 1e6, size=(8, 8))\n",
"\n",
"# Create a Pandas categorical type for consistent encoding of categories\n",
"# across all documents.\n",
"_ENTITY_TYPES = [\"LOC\", \"MISC\", \"ORG\", \"PER\"]\n",
"token_class_dtype, int_to_label, label_to_int = tp.io.conll.make_iob_tag_categories(_ENTITY_TYPES)\n",
"\n",
"# Parameters for splitting the corpus into folds\n",
"_KFOLD_RANDOM_SEED = _MASTER_SEED\n",
"_KFOLD_NUM_FOLDS = 10\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Read inputs\n",
"\n",
"Read in the corpus, retokenize it with the BERT tokenizer, add BERT embeddings, and convert\n",
"to a single dataframe."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'train': 'outputs/eng.train',\n",
" 'dev': 'outputs/eng.testa',\n",
" 'test': 'outputs/eng.testb'}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Download and cache the data set.\n",
"# NOTE: This data set is licensed for research use only. Be sure to adhere\n",
"# to the terms of the license when using this data set!\n",
"data_set_info = tp.io.conll.maybe_download_conll_data(\"outputs\")\n",
"data_set_info"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# The raw dataset in its original tokenization\n",
"corpus_raw = {}\n",
"for fold_name, file_name in data_set_info.items():\n",
" df_list = tp.io.conll.conll_2003_to_dataframes(file_name, \n",
" [\"pos\", \"phrase\", \"ent\"],\n",
" [False, True, True])\n",
" corpus_raw[fold_name] = [\n",
" df.drop(columns=[\"pos\", \"phrase_iob\", \"phrase_type\"])\n",
" for df in df_list\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']\n",
"- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"preprocessing fold train\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3e030cae2b3b4c898bfc57ba945740ab",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=946, style=ProgressStyle(desc…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Token indices sequence length is longer than the specified maximum sequence length for this model (559 > 512). Running this sequence through the model will result in indexing errors\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"preprocessing fold dev\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "55361e25bc62408082d864b813e9ab61",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=216, style=ProgressStyle(desc…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"preprocessing fold test\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f9753feeace94e65a24961292d4420db",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=231, style=ProgressStyle(desc…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Retokenize with the BERT tokenizer and regenerate embeddings.\n",
"corpus_df,token_class_dtype, int_to_label, label_to_int = cleaning.preprocess.preprocess_documents(corpus_raw,'ent_type',True,carry_cols=['line_num'],iob_col='ent_iob')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prepare folds for a 10-fold cross-validation\n",
"\n",
"We divide the documents of the corpus into 10 random samples."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
fold
\n",
"
doc_num
\n",
"
\n",
" \n",
" \n",
"
\n",
"
0
\n",
"
train
\n",
"
0
\n",
"
\n",
"
\n",
"
1
\n",
"
train
\n",
"
1
\n",
"
\n",
"
\n",
"
2
\n",
"
train
\n",
"
2
\n",
"
\n",
"
\n",
"
3
\n",
"
train
\n",
"
3
\n",
"
\n",
"
\n",
"
4
\n",
"
train
\n",
"
4
\n",
"
\n",
"
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
\n",
"
\n",
"
1388
\n",
"
test
\n",
"
226
\n",
"
\n",
"
\n",
"
1389
\n",
"
test
\n",
"
227
\n",
"
\n",
"
\n",
"
1390
\n",
"
test
\n",
"
228
\n",
"
\n",
"
\n",
"
1391
\n",
"
test
\n",
"
229
\n",
"
\n",
"
\n",
"
1392
\n",
"
test
\n",
"
230
\n",
"
\n",
" \n",
"
\n",
"
1393 rows × 2 columns
\n",
"
"
],
"text/plain": [
" fold doc_num\n",
"0 train 0\n",
"1 train 1\n",
"2 train 2\n",
"3 train 3\n",
"4 train 4\n",
"... ... ...\n",
"1388 test 226\n",
"1389 test 227\n",
"1390 test 228\n",
"1391 test 229\n",
"1392 test 230\n",
"\n",
"[1393 rows x 2 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# IDs for each of the keys\n",
"doc_keys = corpus_df[[\"fold\", \"doc_num\"]].drop_duplicates().reset_index(drop=True)\n",
"doc_keys"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
fold
\n",
"
doc_num
\n",
"
\n",
" \n",
" \n",
"
\n",
"
146
\n",
"
train
\n",
"
146
\n",
"
\n",
"
\n",
"
1164
\n",
"
test
\n",
"
2
\n",
"
\n",
"
\n",
"
483
\n",
"
train
\n",
"
483
\n",
"
\n",
"
\n",
"
1190
\n",
"
test
\n",
"
28
\n",
"
\n",
"
\n",
"
20
\n",
"
train
\n",
"
20
\n",
"
\n",
"
\n",
"
237
\n",
"
train
\n",
"
237
\n",
"
\n",
"
\n",
"
86
\n",
"
train
\n",
"
86
\n",
"
\n",
"
\n",
"
408
\n",
"
train
\n",
"
408
\n",
"
\n",
"
\n",
"
1252
\n",
"
test
\n",
"
90
\n",
"
\n",
"
\n",
"
1213
\n",
"
test
\n",
"
51
\n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" fold doc_num\n",
"146 train 146\n",
"1164 test 2\n",
"483 train 483\n",
"1190 test 28\n",
"20 train 20\n",
"237 train 237\n",
"86 train 86\n",
"408 train 408\n",
"1252 test 90\n",
"1213 test 51"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# We want to split the documents randomly into _NUM_FOLDS sets, then\n",
"# for each stage of cross-validation train a model on the union of\n",
"# (_NUM_FOLDS - 1) of them while testing on the remaining fold.\n",
"# sklearn.model_selection doesn't implement this approach directly,\n",
"# but we can piece it together with some help from Numpy.\n",
"#from numpy.random import default_rng\n",
"rng = np.random.default_rng(seed=_KFOLD_RANDOM_SEED)\n",
"iloc_order = rng.permutation(len(doc_keys.index))\n",
"kf = sklearn.model_selection.KFold(n_splits=_KFOLD_NUM_FOLDS)\n",
"\n",
"train_keys = []\n",
"test_keys = []\n",
"for train_ix, test_ix in kf.split(iloc_order):\n",
" # sklearn.model_selection.KFold gives us a partitioning of the\n",
" # numbers from 0 to len(iloc_order). Use that partitioning to \n",
" # choose elements from iloc_order, then use those elements to \n",
" # index into doc_keys.\n",
" train_iloc = iloc_order[train_ix]\n",
" test_iloc = iloc_order[test_ix]\n",
" train_keys.append(doc_keys.iloc[train_iloc])\n",
" test_keys.append(doc_keys.iloc[test_iloc])\n",
"\n",
"train_keys[1].head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Dry run: Train and evaluate models on the first fold\n",
"\n",
"Train models on the first of our 10 folds and manually examine some of the \n",
"model outputs."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
fold
\n",
"
doc_num
\n",
"
token_id
\n",
"
span
\n",
"
input_id
\n",
"
token_type_id
\n",
"
attention_mask
\n",
"
special_tokens_mask
\n",
"
raw_span
\n",
"
line_num
\n",
"
raw_span_id
\n",
"
ent_iob
\n",
"
ent_type
\n",
"
embedding
\n",
"
token_class
\n",
"
token_class_id
\n",
"
\n",
" \n",
" \n",
"
\n",
"
0
\n",
"
train
\n",
"
0
\n",
"
0
\n",
"
[0, 0): ''
\n",
"
101
\n",
"
0
\n",
"
1
\n",
"
True
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
O
\n",
"
<NA>
\n",
"
[ -0.098505184, -0.4050192, 0.7428884...
\n",
"
O
\n",
"
0
\n",
"
\n",
"
\n",
"
1
\n",
"
train
\n",
"
0
\n",
"
1
\n",
"
[0, 1): '-'
\n",
"
118
\n",
"
0
\n",
"
1
\n",
"
False
\n",
"
[0, 10): '-DOCSTART-'
\n",
"
0.0
\n",
"
0.0
\n",
"
O
\n",
"
<NA>
\n",
"
[ -0.057021223, -0.48112097, 0.989868...
\n",
"
O
\n",
"
0
\n",
"
\n",
"
\n",
"
2
\n",
"
train
\n",
"
0
\n",
"
2
\n",
"
[1, 2): 'D'
\n",
"
141
\n",
"
0
\n",
"
1
\n",
"
False
\n",
"
[0, 10): '-DOCSTART-'
\n",
"
0.0
\n",
"
0.0
\n",
"
O
\n",
"
<NA>
\n",
"
[ -0.04824195, -0.25330004, 1.167191...
\n",
"
O
\n",
"
0
\n",
"
\n",
"
\n",
"
3
\n",
"
train
\n",
"
0
\n",
"
3
\n",
"
[2, 4): 'OC'
\n",
"
9244
\n",
"
0
\n",
"
1
\n",
"
False
\n",
"
[0, 10): '-DOCSTART-'
\n",
"
0.0
\n",
"
0.0
\n",
"
O
\n",
"
<NA>
\n",
"
[ -0.26682988, -0.31008753, 1.007472...
\n",
"
O
\n",
"
0
\n",
"
\n",
"
\n",
"
4
\n",
"
train
\n",
"
0
\n",
"
4
\n",
"
[4, 6): 'ST'
\n",
"
9272
\n",
"
0
\n",
"
1
\n",
"
False
\n",
"
[0, 10): '-DOCSTART-'
\n",
"
0.0
\n",
"
0.0
\n",
"
O
\n",
"
<NA>
\n",
"
[ -0.22296889, -0.21308492, 0.9331016...
\n",
"
O
\n",
"
0
\n",
"
\n",
"
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
\n",
"
\n",
"
371472
\n",
"
test
\n",
"
230
\n",
"
314
\n",
"
[1386, 1393): 'brother'
\n",
"
1711
\n",
"
0
\n",
"
1
\n",
"
False
\n",
"
[1386, 1393): 'brother'
\n",
"
50345.0
\n",
"
267.0
\n",
"
O
\n",
"
<NA>
\n",
"
[ -0.028172785, -0.08062388, 0.9804888...
\n",
"
O
\n",
"
0
\n",
"
\n",
"
\n",
"
371473
\n",
"
test
\n",
"
230
\n",
"
315
\n",
"
[1393, 1394): ','
\n",
"
117
\n",
"
0
\n",
"
1
\n",
"
False
\n",
"
[1393, 1394): ','
\n",
"
50346.0
\n",
"
268.0
\n",
"
O
\n",
"
<NA>
\n",
"
[ 0.11817408, -0.07008513, 0.865484...
\n",
"
O
\n",
"
0
\n",
"
\n",
"
\n",
"
371474
\n",
"
test
\n",
"
230
\n",
"
316
\n",
"
[1395, 1400): 'Bobby'
\n",
"
5545
\n",
"
0
\n",
"
1
\n",
"
False
\n",
"
[1395, 1400): 'Bobby'
\n",
"
50347.0
\n",
"
269.0
\n",
"
B
\n",
"
PER
\n",
"
[ -0.35689482, 0.31400457, 1.573853...
\n",
"
B-PER
\n",
"
3
\n",
"
\n",
"
\n",
"
371475
\n",
"
test
\n",
"
230
\n",
"
317
\n",
"
[1400, 1401): '.'
\n",
"
119
\n",
"
0
\n",
"
1
\n",
"
False
\n",
"
[1400, 1401): '.'
\n",
"
50348.0
\n",
"
270.0
\n",
"
O
\n",
"
<NA>
\n",
"
[ -0.18957126, -0.24581163, 0.66257...
\n",
"
O
\n",
"
0
\n",
"
\n",
"
\n",
"
371476
\n",
"
test
\n",
"
230
\n",
"
318
\n",
"
[0, 0): ''
\n",
"
102
\n",
"
0
\n",
"
1
\n",
"
True
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
O
\n",
"
<NA>
\n",
"
[ -0.44689128, -0.31665266, 0.779688...
\n",
"
O
\n",
"
0
\n",
"
\n",
" \n",
"
\n",
"
371477 rows × 16 columns
\n",
"
"
],
"text/plain": [
" fold doc_num token_id span input_id \\\n",
"0 train 0 0 [0, 0): '' 101 \n",
"1 train 0 1 [0, 1): '-' 118 \n",
"2 train 0 2 [1, 2): 'D' 141 \n",
"3 train 0 3 [2, 4): 'OC' 9244 \n",
"4 train 0 4 [4, 6): 'ST' 9272 \n",
"... ... ... ... ... ... \n",
"371472 test 230 314 [1386, 1393): 'brother' 1711 \n",
"371473 test 230 315 [1393, 1394): ',' 117 \n",
"371474 test 230 316 [1395, 1400): 'Bobby' 5545 \n",
"371475 test 230 317 [1400, 1401): '.' 119 \n",
"371476 test 230 318 [0, 0): '' 102 \n",
"\n",
" token_type_id attention_mask special_tokens_mask \\\n",
"0 0 1 True \n",
"1 0 1 False \n",
"2 0 1 False \n",
"3 0 1 False \n",
"4 0 1 False \n",
"... ... ... ... \n",
"371472 0 1 False \n",
"371473 0 1 False \n",
"371474 0 1 False \n",
"371475 0 1 False \n",
"371476 0 1 True \n",
"\n",
" raw_span line_num raw_span_id ent_iob ent_type \\\n",
"0 NaN NaN NaN O \n",
"1 [0, 10): '-DOCSTART-' 0.0 0.0 O \n",
"2 [0, 10): '-DOCSTART-' 0.0 0.0 O \n",
"3 [0, 10): '-DOCSTART-' 0.0 0.0 O \n",
"4 [0, 10): '-DOCSTART-' 0.0 0.0 O \n",
"... ... ... ... ... ... \n",
"371472 [1386, 1393): 'brother' 50345.0 267.0 O \n",
"371473 [1393, 1394): ',' 50346.0 268.0 O \n",
"371474 [1395, 1400): 'Bobby' 50347.0 269.0 B PER \n",
"371475 [1400, 1401): '.' 50348.0 270.0 O \n",
"371476 NaN NaN NaN O \n",
"\n",
" embedding token_class \\\n",
"0 [ -0.098505184, -0.4050192, 0.7428884... O \n",
"1 [ -0.057021223, -0.48112097, 0.989868... O \n",
"2 [ -0.04824195, -0.25330004, 1.167191... O \n",
"3 [ -0.26682988, -0.31008753, 1.007472... O \n",
"4 [ -0.22296889, -0.21308492, 0.9331016... O \n",
"... ... ... \n",
"371472 [ -0.028172785, -0.08062388, 0.9804888... O \n",
"371473 [ 0.11817408, -0.07008513, 0.865484... O \n",
"371474 [ -0.35689482, 0.31400457, 1.573853... B-PER \n",
"371475 [ -0.18957126, -0.24581163, 0.66257... O \n",
"371476 [ -0.44689128, -0.31665266, 0.779688... O \n",
"\n",
" token_class_id \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"... ... \n",
"371472 0 \n",
"371473 0 \n",
"371474 3 \n",
"371475 0 \n",
"371476 0 \n",
"\n",
"[371477 rows x 16 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Gather the training set together by joining our list of documents\n",
"# with the entire corpus on the composite key \n",
"train_inputs_df = corpus_df.merge(train_keys[0])\n",
"train_inputs_df"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
fold
\n",
"
doc_num
\n",
"
token_id
\n",
"
span
\n",
"
input_id
\n",
"
token_type_id
\n",
"
attention_mask
\n",
"
special_tokens_mask
\n",
"
raw_span
\n",
"
line_num
\n",
"
raw_span_id
\n",
"
ent_iob
\n",
"
ent_type
\n",
"
embedding
\n",
"
token_class
\n",
"
token_class_id
\n",
"
\n",
" \n",
" \n",
"
\n",
"
0
\n",
"
train
\n",
"
12
\n",
"
0
\n",
"
[0, 0): ''
\n",
"
101
\n",
"
0
\n",
"
1
\n",
"
True
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
O
\n",
"
<NA>
\n",
"
[ -0.101977676, -0.42442498, 0.8440171...
\n",
"
O
\n",
"
0
\n",
"
\n",
"
\n",
"
1
\n",
"
train
\n",
"
12
\n",
"
1
\n",
"
[0, 1): '-'
\n",
"
118
\n",
"
0
\n",
"
1
\n",
"
False
\n",
"
[0, 10): '-DOCSTART-'
\n",
"
2664.0
\n",
"
0.0
\n",
"
O
\n",
"
<NA>
\n",
"
[ -0.09124618, -0.47710702, 1.120292...
\n",
"
O
\n",
"
0
\n",
"
\n",
"
\n",
"
2
\n",
"
train
\n",
"
12
\n",
"
2
\n",
"
[1, 2): 'D'
\n",
"
141
\n",
"
0
\n",
"
1
\n",
"
False
\n",
"
[0, 10): '-DOCSTART-'
\n",
"
2664.0
\n",
"
0.0
\n",
"
O
\n",
"
<NA>
\n",
"
[ -0.1695277, -0.27063507, 1.209566...
\n",
"
O
\n",
"
0
\n",
"
\n",
"
\n",
"
3
\n",
"
train
\n",
"
12
\n",
"
3
\n",
"
[2, 4): 'OC'
\n",
"
9244
\n",
"
0
\n",
"
1
\n",
"
False
\n",
"
[0, 10): '-DOCSTART-'
\n",
"
2664.0
\n",
"
0.0
\n",
"
O
\n",
"
<NA>
\n",
"
[ -0.27648172, -0.3675844, 1.092024...
\n",
"
O
\n",
"
0
\n",
"
\n",
"
\n",
"
4
\n",
"
train
\n",
"
12
\n",
"
4
\n",
"
[4, 6): 'ST'
\n",
"
9272
\n",
"
0
\n",
"
1
\n",
"
False
\n",
"
[0, 10): '-DOCSTART-'
\n",
"
2664.0
\n",
"
0.0
\n",
"
O
\n",
"
<NA>
\n",
"
[ -0.24050614, -0.24247544, 1.07511...
\n",
"
O
\n",
"
0
\n",
"
\n",
"
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
\n",
"
\n",
"
45059
\n",
"
test
\n",
"
225
\n",
"
75
\n",
"
[208, 213): 'fight'
\n",
"
2147
\n",
"
0
\n",
"
1
\n",
"
False
\n",
"
[208, 213): 'fight'
\n",
"
49418.0
\n",
"
29.0
\n",
"
O
\n",
"
<NA>
\n",
"
[ -0.09621397, -0.48016888, 0.510937...
\n",
"
O
\n",
"
0
\n",
"
\n",
"
\n",
"
45060
\n",
"
test
\n",
"
225
\n",
"
76
\n",
"
[214, 216): 'on'
\n",
"
1113
\n",
"
0
\n",
"
1
\n",
"
False
\n",
"
[214, 216): 'on'
\n",
"
49419.0
\n",
"
30.0
\n",
"
O
\n",
"
<NA>
\n",
"
[ -0.0858628, -0.2341724, 0.832928...
\n",
"
O
\n",
"
0
\n",
"
\n",
"
\n",
"
45061
\n",
"
test
\n",
"
225
\n",
"
77
\n",
"
[217, 225): 'Saturday'
\n",
"
4306
\n",
"
0
\n",
"
1
\n",
"
False
\n",
"
[217, 225): 'Saturday'
\n",
"
49420.0
\n",
"
31.0
\n",
"
O
\n",
"
<NA>
\n",
"
[ -0.012238501, -0.4282664, 0.619483...
\n",
"
O
\n",
"
0
\n",
"
\n",
"
\n",
"
45062
\n",
"
test
\n",
"
225
\n",
"
78
\n",
"
[225, 226): '.'
\n",
"
119
\n",
"
0
\n",
"
1
\n",
"
False
\n",
"
[225, 226): '.'
\n",
"
49421.0
\n",
"
32.0
\n",
"
O
\n",
"
<NA>
\n",
"
[ -0.042955935, -0.36315423, 0.660203...
\n",
"
O
\n",
"
0
\n",
"
\n",
"
\n",
"
45063
\n",
"
test
\n",
"
225
\n",
"
79
\n",
"
[0, 0): ''
\n",
"
102
\n",
"
0
\n",
"
1
\n",
"
True
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
O
\n",
"
<NA>
\n",
"
[ -0.9504192, 0.012983555, 0.7374987...
\n",
"
O
\n",
"
0
\n",
"
\n",
" \n",
"
\n",
"
45064 rows × 16 columns
\n",
"
"
],
"text/plain": [
" fold doc_num token_id span input_id \\\n",
"0 train 12 0 [0, 0): '' 101 \n",
"1 train 12 1 [0, 1): '-' 118 \n",
"2 train 12 2 [1, 2): 'D' 141 \n",
"3 train 12 3 [2, 4): 'OC' 9244 \n",
"4 train 12 4 [4, 6): 'ST' 9272 \n",
"... ... ... ... ... ... \n",
"45059 test 225 75 [208, 213): 'fight' 2147 \n",
"45060 test 225 76 [214, 216): 'on' 1113 \n",
"45061 test 225 77 [217, 225): 'Saturday' 4306 \n",
"45062 test 225 78 [225, 226): '.' 119 \n",
"45063 test 225 79 [0, 0): '' 102 \n",
"\n",
" token_type_id attention_mask special_tokens_mask \\\n",
"0 0 1 True \n",
"1 0 1 False \n",
"2 0 1 False \n",
"3 0 1 False \n",
"4 0 1 False \n",
"... ... ... ... \n",
"45059 0 1 False \n",
"45060 0 1 False \n",
"45061 0 1 False \n",
"45062 0 1 False \n",
"45063 0 1 True \n",
"\n",
" raw_span line_num raw_span_id ent_iob ent_type \\\n",
"0 NaN NaN NaN O \n",
"1 [0, 10): '-DOCSTART-' 2664.0 0.0 O \n",
"2 [0, 10): '-DOCSTART-' 2664.0 0.0 O \n",
"3 [0, 10): '-DOCSTART-' 2664.0 0.0 O \n",
"4 [0, 10): '-DOCSTART-' 2664.0 0.0 O \n",
"... ... ... ... ... ... \n",
"45059 [208, 213): 'fight' 49418.0 29.0 O \n",
"45060 [214, 216): 'on' 49419.0 30.0 O \n",
"45061 [217, 225): 'Saturday' 49420.0 31.0 O \n",
"45062 [225, 226): '.' 49421.0 32.0 O \n",
"45063 NaN NaN NaN O \n",
"\n",
" embedding token_class \\\n",
"0 [ -0.101977676, -0.42442498, 0.8440171... O \n",
"1 [ -0.09124618, -0.47710702, 1.120292... O \n",
"2 [ -0.1695277, -0.27063507, 1.209566... O \n",
"3 [ -0.27648172, -0.3675844, 1.092024... O \n",
"4 [ -0.24050614, -0.24247544, 1.07511... O \n",
"... ... ... \n",
"45059 [ -0.09621397, -0.48016888, 0.510937... O \n",
"45060 [ -0.0858628, -0.2341724, 0.832928... O \n",
"45061 [ -0.012238501, -0.4282664, 0.619483... O \n",
"45062 [ -0.042955935, -0.36315423, 0.660203... O \n",
"45063 [ -0.9504192, 0.012983555, 0.7374987... O \n",
"\n",
" token_class_id \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"... ... \n",
"45059 0 \n",
"45060 0 \n",
"45061 0 \n",
"45062 0 \n",
"45063 0 \n",
"\n",
"[45064 rows x 16 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Repeat the same process for the test set\n",
"test_inputs_df = corpus_df.merge(test_keys[0])\n",
"test_inputs_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train an ensemble of models"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2021-07-12 18:16:33,117\tINFO services.py:1267 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training model using all of 768-dimension embeddings.\n",
"Training model '32_1' (#1 at 32 dimensions) with seed 89250\n",
"Training model '32_2' (#2 at 32 dimensions) with seed 773956\n",
"Training model '32_3' (#3 at 32 dimensions) with seed 654571\n",
"Training model '32_4' (#4 at 32 dimensions) with seed 438878\n",
"Training model '64_1' (#1 at 64 dimensions) with seed 201469\n",
"Training model '64_2' (#2 at 64 dimensions) with seed 94177\n",
"Training model '64_3' (#3 at 64 dimensions) with seed 526478\n",
"Training model '64_4' (#4 at 64 dimensions) with seed 975622\n",
"Training model '128_1' (#1 at 128 dimensions) with seed 513226\n",
"Training model '128_2' (#2 at 128 dimensions) with seed 128113\n",
"Training model '128_3' (#3 at 128 dimensions) with seed 839748\n",
"Training model '128_4' (#4 at 128 dimensions) with seed 450385\n",
"Training model '256_1' (#1 at 256 dimensions) with seed 781567\n",
"Training model '256_2' (#2 at 256 dimensions) with seed 643865\n",
"Training model '256_3' (#3 at 256 dimensions) with seed 402414\n",
"Training model '256_4' (#4 at 256 dimensions) with seed 822761\n",
"\u001b[2m\u001b[36m(pid=72363)\u001b[0m Training model with n_components=256 and seed=781567.\n",
"\u001b[2m\u001b[36m(pid=72362)\u001b[0m Training model with n_components=256 and seed=402414.\n",
"\u001b[2m\u001b[36m(pid=72366)\u001b[0m Training model with n_components=128 and seed=839748.\n",
"\u001b[2m\u001b[36m(pid=72365)\u001b[0m Training model with n_components=256 and seed=643865.\n",
"\u001b[2m\u001b[36m(pid=72372)\u001b[0m Training model with n_components=32 and seed=438878.\n",
"\u001b[2m\u001b[36m(pid=72368)\u001b[0m Training model with n_components=32 and seed=89250.\n",
"\u001b[2m\u001b[36m(pid=72375)\u001b[0m Training model with n_components=32 and seed=654571.\n",
"\u001b[2m\u001b[36m(pid=72371)\u001b[0m Training model with n_components=32 and seed=773956.\n",
"\u001b[2m\u001b[36m(pid=72364)\u001b[0m Training model with n_components=128 and seed=128113.\n",
"\u001b[2m\u001b[36m(pid=72370)\u001b[0m Training model with n_components=64 and seed=526478.\n",
"\u001b[2m\u001b[36m(pid=72374)\u001b[0m Training model with n_components=64 and seed=94177.\n",
"\u001b[2m\u001b[36m(pid=72376)\u001b[0m Training model with n_components=64 and seed=975622.\n",
"\u001b[2m\u001b[36m(pid=72373)\u001b[0m Training model with n_components=64 and seed=201469.\n",
"\u001b[2m\u001b[36m(pid=72369)\u001b[0m Training model with n_components=128 and seed=513226.\n",
"\u001b[2m\u001b[36m(pid=72367)\u001b[0m Training model with n_components=128 and seed=450385.\n",
"\u001b[2m\u001b[36m(pid=72368)\u001b[0m Training model with n_components=256 and seed=822761.\n",
"Trained 17 models.\n",
"Model names after loading or training: 768_1, 32_1, 32_2, 32_3, 32_4, 64_1, 64_2, 64_3, 64_4, 128_1, 128_2, 128_3, 128_4, 256_1, 256_2, 256_3, 256_4\n"
]
}
],
"source": [
"import importlib\n",
"import sklearn.linear_model\n",
"import ray\n",
"ray.init()\n",
"\n",
"# Wrap train_reduced_model in a Ray task\n",
"@ray.remote\n",
"def train_reduced_model_task(\n",
" x_values: np.ndarray, y_values: np.ndarray, n_components: int,\n",
" seed: int, max_iter: int = 10000) -> sklearn.base.BaseEstimator:\n",
" return cleaning.ensemble.train_reduced_model(x_values, y_values, n_components, seed, max_iter)\n",
"\n",
"# Ray task that trains a model using the entire embedding\n",
"@ray.remote\n",
"def train_full_model_task(x_values: np.ndarray, y_values: np.ndarray, \n",
" max_iter: int = 10000) -> sklearn.base.BaseEstimator:\n",
" return (\n",
" sklearn.linear_model.LogisticRegression(\n",
" multi_class=\"multinomial\", max_iter=max_iter\n",
" )\n",
" .fit(x_values, y_values)\n",
" )\n",
"\n",
"def train_models(train_df: pd.DataFrame) \\\n",
" -> Dict[str, sklearn.base.BaseEstimator]:\n",
" \"\"\"\n",
" Train an ensemble of models with different levels of noise.\n",
" \n",
" :param train_df: DataFrame of labeled training documents, with one\n",
" row per token. Must contain the columns \"embedding\" (precomputed \n",
" BERT embeddings) and \"token_class_id\" (integer ID of token type)\n",
" \n",
" :returns: A mapping from mnemonic model name to trained model\n",
" \"\"\"\n",
" X = train_df[\"embedding\"].values\n",
" Y = train_df[\"token_class_id\"]\n",
" \n",
"\n",
" # Push the X and Y values to Plasma so that our tasks can share them.\n",
" X_id = ray.put(X.to_numpy().copy())\n",
" Y_id = ray.put(Y.to_numpy().copy())\n",
" \n",
" names_list = []\n",
" futures_list = []\n",
" \n",
" print(f\"Training model using all of \"\n",
" f\"{X._tensor.shape[1]}-dimension embeddings.\")\n",
" names_list.append(f\"{X._tensor.shape[1]}_1\")\n",
" futures_list.append(train_full_model_task.remote(X_id, Y_id)) \n",
" \n",
" for i in range(len(_REDUCED_DIMS)):\n",
" num_dims = _REDUCED_DIMS[i]\n",
" num_models = _MODELS_AT_DIM[i]\n",
" for j in range(num_models):\n",
" model_name = f\"{num_dims}_{j + 1}\"\n",
" seed = _MODEL_RANDOM_SEEDS[i, j]\n",
" print(f\"Training model '{model_name}' (#{j + 1} \"\n",
" f\"at {num_dims} dimensions) with seed {seed}\")\n",
" names_list.append(model_name)\n",
" futures_list.append(train_reduced_model_task.remote(X_id, Y_id, \n",
" num_dims, seed))\n",
" \n",
" # Block until all training tasks have completed and fetch the resulting models.\n",
" models_list = ray.get(futures_list)\n",
" models = {\n",
" n: m for n, m in zip(names_list, models_list)\n",
" }\n",
" return models\n",
"\n",
"def maybe_train_models(train_df: pd.DataFrame, fold_num: int):\n",
" import pickle\n",
" _CACHED_MODELS_FILE = f\"outputs/fold_{fold_num}_models.pickle\"\n",
" if _REGENERATE_MODELS or not os.path.exists(_CACHED_MODELS_FILE):\n",
" m = train_models(train_df)\n",
" print(f\"Trained {len(m)} models.\")\n",
" with open(_CACHED_MODELS_FILE, \"wb\") as f:\n",
" pickle.dump(m, f)\n",
" else:\n",
" # Use a cached model when using cached embeddings\n",
" with open(_CACHED_MODELS_FILE, \"rb\") as f:\n",
" m = pickle.load(f)\n",
" print(f\"Loaded {len(m)} models from {_CACHED_MODELS_FILE}.\")\n",
" return m\n",
"\n",
"models = maybe_train_models(train_inputs_df, 0)\n",
"print(f\"Model names after loading or training: {', '.join(models.keys())}\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Uncomment this code if you need to have the cells that follow ignore\n",
"# some of the models saved to disk.\n",
"# _MODEL_SIZES_TO_KEEP = [32, 64, 128, 256]\n",
"# _RUNS_TO_KEEP = [4] * len(_MODEL_SIZES_TO_KEEP)\n",
"# _OTHER_MODELS_TO_KEEP = [\"768_1\"]\n",
"\n",
"# to_keep = _OTHER_MODELS_TO_KEEP.copy()\n",
"# for size in _MODEL_SIZES_TO_KEEP:\n",
"# for num_runs in _RUNS_TO_KEEP:\n",
"# for i in range(num_runs):\n",
"# to_keep.append(f\"{size}_{i+1}\")\n",
"\n",
"# models = {k: v for k, v in models.items() if k in to_keep}\n",
"\n",
"# print(f\"Model names after filtering: {', '.join(models.keys())}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluate the models on this fold's test set"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6eb2c9cdd5f242ffb017fc96bab994b5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=17, style=ProgressStyle(descr…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
span
\n",
"
ent_type
\n",
"
fold
\n",
"
doc_num
\n",
"
\n",
" \n",
" \n",
"
\n",
"
0
\n",
"
[11, 16): 'Saudi'
\n",
"
MISC
\n",
"
train
\n",
"
12
\n",
"
\n",
"
\n",
"
1
\n",
"
[59, 65): 'MANAMA'
\n",
"
LOC
\n",
"
train
\n",
"
12
\n",
"
\n",
"
\n",
"
2
\n",
"
[86, 91): 'Saudi'
\n",
"
MISC
\n",
"
train
\n",
"
12
\n",
"
\n",
"
\n",
"
3
\n",
"
[259, 264): 'Saudi'
\n",
"
MISC
\n",
"
train
\n",
"
12
\n",
"
\n",
"
\n",
"
0
\n",
"
[55, 65): 'MONTGOMERY'
\n",
"
LOC
\n",
"
train
\n",
"
20
\n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" span ent_type fold doc_num\n",
"0 [11, 16): 'Saudi' MISC train 12\n",
"1 [59, 65): 'MANAMA' LOC train 12\n",
"2 [86, 91): 'Saudi' MISC train 12\n",
"3 [259, 264): 'Saudi' MISC train 12\n",
"0 [55, 65): 'MONTGOMERY' LOC train 20"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def eval_models(models: Dict[str, sklearn.base.BaseEstimator],\n",
" test_df: pd.DataFrame):\n",
" \"\"\"\n",
" Bulk-evaluate an ensemble of models generated by :func:`train_models`.\n",
" \n",
" :param models: Output of :func:`train_models`\n",
" :param test_df: DataFrame of labeled test documents, with one\n",
" row per token. Must contain the columns \"embedding\" (precomputed \n",
" BERT embeddings) and \"token_class_id\" (integer ID of token type)\n",
" \n",
" :returns: A dictionary from model name to results of \n",
" :func:`util.analyze_model`\n",
" \"\"\"\n",
" todo = [(name, model) for name, model in models.items()]\n",
" results = tp.jupyter.run_with_progress_bar(\n",
" len(todo),\n",
" lambda i: cleaning.infer_and_extract_entities_iob(test_df,corpus_raw, int_to_label, todo[i][1]),\n",
" \"model\"\n",
" )\n",
" return {t[0]: result for t, result in zip(todo, results)}\n",
"\n",
"evals = eval_models(models, test_inputs_df)\n",
"# display one of the results\n",
"evals[list(evals.keys())[0]].head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
"
],
"text/plain": [
" fold doc_num span ent_type in_gold \\\n",
"3 dev 21 [86, 90): 'UEFA' ORG True \n",
"0 dev 21 [25, 39): 'STANDARD LIEGE' ORG True \n",
"78 dev 64 [2571, 2575): 'AIDS' MISC True \n",
"246 dev 120 [63, 70): 'English' MISC True \n",
"374 dev 149 [81, 93): 'Major League' MISC True \n",
"498 dev 182 [2173, 2177): 'Ruch' ORG True \n",
"462 dev 182 [662, 670): 'division' MISC True \n",
"512 dev 203 [879, 881): '90' LOC True \n",
"622 dev 214 [1689, 1705): 'Schindler's List' MISC True \n",
"621 dev 214 [1643, 1648): 'Oscar' PER True \n",
"583 dev 214 [285, 305): 'Venice Film Festival' MISC True \n",
"569 dev 214 [187, 202): 'Michael Collins' MISC True \n",
"802 test 15 [44, 56): 'WORLD SERIES' MISC True \n",
"801 test 15 [32, 43): 'WEST INDIES' LOC True \n",
"942 test 21 [719, 725): 'Wijaya' PER True \n",
"896 test 21 [22, 38): 'WORLD GRAND PRIX' MISC True \n",
"1057 test 23 [1117, 1127): 'NY RANGERS' ORG True \n",
"1052 test 23 [1106, 1113): 'TORONTO' ORG True \n",
"1025 test 23 [673, 689): 'CENTRAL DIVISION' MISC True \n",
"1016 test 23 [599, 611): 'NY ISLANDERS' ORG True \n",
"\n",
" count \n",
"3 0 \n",
"0 0 \n",
"78 0 \n",
"246 0 \n",
"374 0 \n",
"498 0 \n",
"462 0 \n",
"512 0 \n",
"622 0 \n",
"621 0 \n",
"583 0 \n",
"569 0 \n",
"802 0 \n",
"801 0 \n",
"942 0 \n",
"896 0 \n",
"1057 0 \n",
"1052 0 \n",
"1025 0 \n",
"1016 0 "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Pull out some hard-to-find examples, sorting by document to make labeling easier\n",
"hard_to_get = results[results[\"in_gold\"]].sort_values([\"count\", \"fold\", \"doc_num\"]).head(20)\n",
"hard_to_get"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### TODO: Relabel the above 20 examples with a Markdown table (copy from CSV)\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
fold
\n",
"
doc_num
\n",
"
span
\n",
"
ent_type
\n",
"
in_gold
\n",
"
count
\n",
"
\n",
" \n",
" \n",
"
\n",
"
373
\n",
"
dev
\n",
"
149
\n",
"
[81, 102): 'Major League Baseball'
\n",
"
MISC
\n",
"
False
\n",
"
17
\n",
"
\n",
"
\n",
"
570
\n",
"
dev
\n",
"
214
\n",
"
[187, 202): 'Michael Collins'
\n",
"
PER
\n",
"
False
\n",
"
17
\n",
"
\n",
"
\n",
"
983
\n",
"
test
\n",
"
23
\n",
"
[94, 116): 'National Hockey League'
\n",
"
MISC
\n",
"
False
\n",
"
17
\n",
"
\n",
"
\n",
"
1110
\n",
"
test
\n",
"
25
\n",
"
[856, 864): 'NFC East'
\n",
"
MISC
\n",
"
False
\n",
"
17
\n",
"
\n",
"
\n",
"
1109
\n",
"
test
\n",
"
25
\n",
"
[823, 835): 'Philadelphia'
\n",
"
ORG
\n",
"
False
\n",
"
17
\n",
"
\n",
"
\n",
"
1184
\n",
"
test
\n",
"
41
\n",
"
[674, 688): 'Sporting Gijon'
\n",
"
ORG
\n",
"
False
\n",
"
17
\n",
"
\n",
"
\n",
"
1323
\n",
"
test
\n",
"
114
\n",
"
[51, 61): 'sales-USDA'
\n",
"
ORG
\n",
"
False
\n",
"
17
\n",
"
\n",
"
\n",
"
1367
\n",
"
test
\n",
"
118
\n",
"
[776, 791): 'mid-Mississippi'
\n",
"
LOC
\n",
"
False
\n",
"
17
\n",
"
\n",
"
\n",
"
1362
\n",
"
test
\n",
"
118
\n",
"
[535, 550): 'mid-Mississippi'
\n",
"
LOC
\n",
"
False
\n",
"
17
\n",
"
\n",
"
\n",
"
1509
\n",
"
test
\n",
"
178
\n",
"
[1787, 1800): 'Uruguay Round'
\n",
"
MISC
\n",
"
False
\n",
"
17
\n",
"
\n",
"
\n",
"
1560
\n",
"
test
\n",
"
180
\n",
"
[588, 592): 'BILO'
\n",
"
ORG
\n",
"
False
\n",
"
17
\n",
"
\n",
"
\n",
"
1558
\n",
"
test
\n",
"
180
\n",
"
[579, 583): 'TOPS'
\n",
"
ORG
\n",
"
False
\n",
"
17
\n",
"
\n",
"
\n",
"
1550
\n",
"
test
\n",
"
180
\n",
"
[395, 399): 'BILO'
\n",
"
ORG
\n",
"
False
\n",
"
17
\n",
"
\n",
"
\n",
"
1544
\n",
"
test
\n",
"
180
\n",
"
[286, 293): 'Malysia'
\n",
"
ORG
\n",
"
False
\n",
"
17
\n",
"
\n",
"
\n",
"
1542
\n",
"
test
\n",
"
180
\n",
"
[259, 263): 'BILO'
\n",
"
ORG
\n",
"
False
\n",
"
17
\n",
"
\n",
"
\n",
"
1649
\n",
"
test
\n",
"
207
\n",
"
[1041, 1047): 'Oxford'
\n",
"
ORG
\n",
"
False
\n",
"
17
\n",
"
\n",
"
\n",
"
1786
\n",
"
test
\n",
"
219
\n",
"
[368, 381): 'Koo Jeon Woon'
\n",
"
PER
\n",
"
False
\n",
"
17
\n",
"
\n",
"
\n",
"
1807
\n",
"
test
\n",
"
222
\n",
"
[218, 225): 'EASTERN'
\n",
"
MISC
\n",
"
False
\n",
"
17
\n",
"
\n",
"
\n",
"
1805
\n",
"
test
\n",
"
222
\n",
"
[92, 114): 'National Hockey League'
\n",
"
MISC
\n",
"
False
\n",
"
17
\n",
"
\n",
"
\n",
"
2054
\n",
"
train
\n",
"
48
\n",
"
[885, 899): 'Sjeng Schalken'
\n",
"
ORG
\n",
"
False
\n",
"
17
\n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" fold doc_num span ent_type in_gold \\\n",
"373 dev 149 [81, 102): 'Major League Baseball' MISC False \n",
"570 dev 214 [187, 202): 'Michael Collins' PER False \n",
"983 test 23 [94, 116): 'National Hockey League' MISC False \n",
"1110 test 25 [856, 864): 'NFC East' MISC False \n",
"1109 test 25 [823, 835): 'Philadelphia' ORG False \n",
"1184 test 41 [674, 688): 'Sporting Gijon' ORG False \n",
"1323 test 114 [51, 61): 'sales-USDA' ORG False \n",
"1367 test 118 [776, 791): 'mid-Mississippi' LOC False \n",
"1362 test 118 [535, 550): 'mid-Mississippi' LOC False \n",
"1509 test 178 [1787, 1800): 'Uruguay Round' MISC False \n",
"1560 test 180 [588, 592): 'BILO' ORG False \n",
"1558 test 180 [579, 583): 'TOPS' ORG False \n",
"1550 test 180 [395, 399): 'BILO' ORG False \n",
"1544 test 180 [286, 293): 'Malysia' ORG False \n",
"1542 test 180 [259, 263): 'BILO' ORG False \n",
"1649 test 207 [1041, 1047): 'Oxford' ORG False \n",
"1786 test 219 [368, 381): 'Koo Jeon Woon' PER False \n",
"1807 test 222 [218, 225): 'EASTERN' MISC False \n",
"1805 test 222 [92, 114): 'National Hockey League' MISC False \n",
"2054 train 48 [885, 899): 'Sjeng Schalken' ORG False \n",
"\n",
" count \n",
"373 17 \n",
"570 17 \n",
"983 17 \n",
"1110 17 \n",
"1109 17 \n",
"1184 17 \n",
"1323 17 \n",
"1367 17 \n",
"1362 17 \n",
"1509 17 \n",
"1560 17 \n",
"1558 17 \n",
"1550 17 \n",
"1544 17 \n",
"1542 17 \n",
"1649 17 \n",
"1786 17 \n",
"1807 17 \n",
"1805 17 \n",
"2054 17 "
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Hardest results not in the gold standard for models to avoid\n",
"hard_to_avoid = results[~results[\"in_gold\"]].sort_values([\"count\", \"fold\", \"doc_num\"], ascending=[False, True, True]).head(20)\n",
"hard_to_avoid"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### TODO: Relabel the above 20 examples (copy from CSV)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Remainder of Experiment\n",
"\n",
"For each of the 10 folds, train a model on the fold's training set and run\n",
"analysis on the fold's test set."
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Starting fold 1.\n",
"Training model using all of 768-dimension embeddings.\n",
"Training model '32_1' (#1 at 32 dimensions) with seed 89250\n",
"Training model '32_2' (#2 at 32 dimensions) with seed 773956\n",
"Training model '32_3' (#3 at 32 dimensions) with seed 654571\n",
"Training model '32_4' (#4 at 32 dimensions) with seed 438878\n",
"Training model '64_1' (#1 at 64 dimensions) with seed 201469\n",
"Training model '64_2' (#2 at 64 dimensions) with seed 94177\n",
"Training model '64_3' (#3 at 64 dimensions) with seed 526478\n",
"Training model '64_4' (#4 at 64 dimensions) with seed 975622\n",
"Training model '128_1' (#1 at 128 dimensions) with seed 513226\n",
"Training model '128_2' (#2 at 128 dimensions) with seed 128113\n",
"Training model '128_3' (#3 at 128 dimensions) with seed 839748\n",
"Training model '128_4' (#4 at 128 dimensions) with seed 450385\n",
"Training model '256_1' (#1 at 256 dimensions) with seed 781567\n",
"Training model '256_2' (#2 at 256 dimensions) with seed 643865\n",
"Training model '256_3' (#3 at 256 dimensions) with seed 402414\n",
"Training model '256_4' (#4 at 256 dimensions) with seed 822761\n",
"\u001b[2m\u001b[36m(pid=72363)\u001b[0m Training model with n_components=32 and seed=438878.\n",
"\u001b[2m\u001b[36m(pid=72362)\u001b[0m Training model with n_components=32 and seed=654571.\n",
"\u001b[2m\u001b[36m(pid=72366)\u001b[0m Training model with n_components=64 and seed=975622.\n",
"\u001b[2m\u001b[36m(pid=72365)\u001b[0m Training model with n_components=32 and seed=773956.\n",
"\u001b[2m\u001b[36m(pid=72368)\u001b[0m Training model with n_components=32 and seed=89250.\n",
"\u001b[2m\u001b[36m(pid=72375)\u001b[0m Training model with n_components=256 and seed=643865.\n",
"\u001b[2m\u001b[36m(pid=72371)\u001b[0m Training model with n_components=256 and seed=781567.\n",
"\u001b[2m\u001b[36m(pid=72364)\u001b[0m Training model with n_components=64 and seed=201469.\n",
"\u001b[2m\u001b[36m(pid=72370)\u001b[0m Training model with n_components=128 and seed=839748.\n",
"\u001b[2m\u001b[36m(pid=72374)\u001b[0m Training model with n_components=128 and seed=513226.\n",
"\u001b[2m\u001b[36m(pid=72376)\u001b[0m Training model with n_components=128 and seed=450385.\n",
"\u001b[2m\u001b[36m(pid=72373)\u001b[0m Training model with n_components=128 and seed=128113.\n",
"\u001b[2m\u001b[36m(pid=72369)\u001b[0m Training model with n_components=64 and seed=526478.\n",
"\u001b[2m\u001b[36m(pid=72367)\u001b[0m Training model with n_components=64 and seed=94177.\n",
"\u001b[2m\u001b[36m(pid=72372)\u001b[0m Training model with n_components=256 and seed=402414.\n",
"\u001b[2m\u001b[36m(pid=72368)\u001b[0m Training model with n_components=256 and seed=822761.\n",
"Trained 17 models.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "8e6ab8836b6246edb1d3e635fda3ad7f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=17, style=ProgressStyle(descr…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done with fold 1.\n",
"Starting fold 2.\n",
"Training model using all of 768-dimension embeddings.\n",
"Training model '32_1' (#1 at 32 dimensions) with seed 89250\n",
"Training model '32_2' (#2 at 32 dimensions) with seed 773956\n",
"Training model '32_3' (#3 at 32 dimensions) with seed 654571\n",
"Training model '32_4' (#4 at 32 dimensions) with seed 438878\n",
"Training model '64_1' (#1 at 64 dimensions) with seed 201469\n",
"Training model '64_2' (#2 at 64 dimensions) with seed 94177\n",
"Training model '64_3' (#3 at 64 dimensions) with seed 526478\n",
"Training model '64_4' (#4 at 64 dimensions) with seed 975622\n",
"Training model '128_1' (#1 at 128 dimensions) with seed 513226\n",
"Training model '128_2' (#2 at 128 dimensions) with seed 128113\n",
"Training model '128_3' (#3 at 128 dimensions) with seed 839748\n",
"Training model '128_4' (#4 at 128 dimensions) with seed 450385\n",
"Training model '256_1' (#1 at 256 dimensions) with seed 781567\n",
"Training model '256_2' (#2 at 256 dimensions) with seed 643865\n",
"Training model '256_3' (#3 at 256 dimensions) with seed 402414\n",
"Training model '256_4' (#4 at 256 dimensions) with seed 822761\n",
"\u001b[2m\u001b[36m(pid=72363)\u001b[0m Training model with n_components=256 and seed=643865.\n",
"\u001b[2m\u001b[36m(pid=72362)\u001b[0m Training model with n_components=256 and seed=781567.\n",
"\u001b[2m\u001b[36m(pid=72366)\u001b[0m Training model with n_components=128 and seed=450385.\n",
"\u001b[2m\u001b[36m(pid=72365)\u001b[0m Training model with n_components=256 and seed=402414.\n",
"\u001b[2m\u001b[36m(pid=72372)\u001b[0m Training model with n_components=32 and seed=654571.\n",
"\u001b[2m\u001b[36m(pid=72368)\u001b[0m Training model with n_components=32 and seed=89250.\n",
"\u001b[2m\u001b[36m(pid=72375)\u001b[0m Training model with n_components=32 and seed=438878.\n",
"\u001b[2m\u001b[36m(pid=72371)\u001b[0m Training model with n_components=32 and seed=773956.\n",
"\u001b[2m\u001b[36m(pid=72364)\u001b[0m Training model with n_components=128 and seed=128113.\n",
"\u001b[2m\u001b[36m(pid=72370)\u001b[0m Training model with n_components=64 and seed=526478.\n",
"\u001b[2m\u001b[36m(pid=72374)\u001b[0m Training model with n_components=64 and seed=975622.\n",
"\u001b[2m\u001b[36m(pid=72376)\u001b[0m Training model with n_components=64 and seed=94177.\n",
"\u001b[2m\u001b[36m(pid=72373)\u001b[0m Training model with n_components=64 and seed=201469.\n",
"\u001b[2m\u001b[36m(pid=72369)\u001b[0m Training model with n_components=128 and seed=839748.\n",
"\u001b[2m\u001b[36m(pid=72367)\u001b[0m Training model with n_components=128 and seed=513226.\n",
"\u001b[2m\u001b[36m(pid=72368)\u001b[0m Training model with n_components=256 and seed=822761.\n",
"Trained 17 models.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "51af4ff8c89d4dca999d510eccb7e575",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=17, style=ProgressStyle(descr…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done with fold 2.\n",
"Starting fold 3.\n",
"Training model using all of 768-dimension embeddings.\n",
"Training model '32_1' (#1 at 32 dimensions) with seed 89250\n",
"Training model '32_2' (#2 at 32 dimensions) with seed 773956\n",
"Training model '32_3' (#3 at 32 dimensions) with seed 654571\n",
"Training model '32_4' (#4 at 32 dimensions) with seed 438878\n",
"Training model '64_1' (#1 at 64 dimensions) with seed 201469\n",
"Training model '64_2' (#2 at 64 dimensions) with seed 94177\n",
"Training model '64_3' (#3 at 64 dimensions) with seed 526478\n",
"Training model '64_4' (#4 at 64 dimensions) with seed 975622\n",
"Training model '128_1' (#1 at 128 dimensions) with seed 513226\n",
"Training model '128_2' (#2 at 128 dimensions) with seed 128113\n",
"Training model '128_3' (#3 at 128 dimensions) with seed 839748\n",
"Training model '128_4' (#4 at 128 dimensions) with seed 450385\n",
"Training model '256_1' (#1 at 256 dimensions) with seed 781567\n",
"Training model '256_2' (#2 at 256 dimensions) with seed 643865\n",
"Training model '256_3' (#3 at 256 dimensions) with seed 402414\n",
"Training model '256_4' (#4 at 256 dimensions) with seed 822761\n",
"\u001b[2m\u001b[36m(pid=72363)\u001b[0m Training model with n_components=32 and seed=773956.\n",
"\u001b[2m\u001b[36m(pid=72362)\u001b[0m Training model with n_components=32 and seed=654571.\n",
"\u001b[2m\u001b[36m(pid=72366)\u001b[0m Training model with n_components=64 and seed=975622.\n",
"\u001b[2m\u001b[36m(pid=72365)\u001b[0m Training model with n_components=32 and seed=438878.\n",
"\u001b[2m\u001b[36m(pid=72368)\u001b[0m Training model with n_components=32 and seed=89250.\n",
"\u001b[2m\u001b[36m(pid=72375)\u001b[0m Training model with n_components=256 and seed=781567.\n",
"\u001b[2m\u001b[36m(pid=72371)\u001b[0m Training model with n_components=256 and seed=643865.\n",
"\u001b[2m\u001b[36m(pid=72364)\u001b[0m Training model with n_components=64 and seed=201469.\n",
"\u001b[2m\u001b[36m(pid=72370)\u001b[0m Training model with n_components=128 and seed=839748.\n",
"\u001b[2m\u001b[36m(pid=72374)\u001b[0m Training model with n_components=128 and seed=450385.\n",
"\u001b[2m\u001b[36m(pid=72376)\u001b[0m Training model with n_components=128 and seed=513226.\n",
"\u001b[2m\u001b[36m(pid=72373)\u001b[0m Training model with n_components=128 and seed=128113.\n",
"\u001b[2m\u001b[36m(pid=72369)\u001b[0m Training model with n_components=64 and seed=94177.\n",
"\u001b[2m\u001b[36m(pid=72367)\u001b[0m Training model with n_components=64 and seed=526478.\n",
"\u001b[2m\u001b[36m(pid=72372)\u001b[0m Training model with n_components=256 and seed=402414.\n",
"\u001b[2m\u001b[36m(pid=72368)\u001b[0m Training model with n_components=256 and seed=822761.\n",
"Trained 17 models.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d7b85c9d3d9f4c7dbad63f5a7b00ce03",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=17, style=ProgressStyle(descr…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done with fold 3.\n",
"Starting fold 4.\n",
"Training model using all of 768-dimension embeddings.\n",
"Training model '32_1' (#1 at 32 dimensions) with seed 89250\n",
"Training model '32_2' (#2 at 32 dimensions) with seed 773956\n",
"Training model '32_3' (#3 at 32 dimensions) with seed 654571\n",
"Training model '32_4' (#4 at 32 dimensions) with seed 438878\n",
"Training model '64_1' (#1 at 64 dimensions) with seed 201469\n",
"Training model '64_2' (#2 at 64 dimensions) with seed 94177\n",
"Training model '64_3' (#3 at 64 dimensions) with seed 526478\n",
"Training model '64_4' (#4 at 64 dimensions) with seed 975622\n",
"Training model '128_1' (#1 at 128 dimensions) with seed 513226\n",
"Training model '128_2' (#2 at 128 dimensions) with seed 128113\n",
"Training model '128_3' (#3 at 128 dimensions) with seed 839748\n",
"Training model '128_4' (#4 at 128 dimensions) with seed 450385\n",
"Training model '256_1' (#1 at 256 dimensions) with seed 781567\n",
"Training model '256_2' (#2 at 256 dimensions) with seed 643865\n",
"Training model '256_3' (#3 at 256 dimensions) with seed 402414\n",
"Training model '256_4' (#4 at 256 dimensions) with seed 822761\n",
"\u001b[2m\u001b[36m(pid=72362)\u001b[0m Training model with n_components=256 and seed=643865.\n",
"\u001b[2m\u001b[36m(pid=72366)\u001b[0m Training model with n_components=128 and seed=450385.\n",
"\u001b[2m\u001b[36m(pid=72365)\u001b[0m Training model with n_components=256 and seed=781567.\n",
"\u001b[2m\u001b[36m(pid=72372)\u001b[0m Training model with n_components=32 and seed=438878.\n",
"\u001b[2m\u001b[36m(pid=72368)\u001b[0m Training model with n_components=32 and seed=89250.\n",
"\u001b[2m\u001b[36m(pid=72375)\u001b[0m Training model with n_components=32 and seed=654571.\n",
"\u001b[2m\u001b[36m(pid=72371)\u001b[0m Training model with n_components=32 and seed=773956.\n",
"\u001b[2m\u001b[36m(pid=72364)\u001b[0m Training model with n_components=128 and seed=839748.\n",
"\u001b[2m\u001b[36m(pid=72370)\u001b[0m Training model with n_components=64 and seed=975622.\n",
"\u001b[2m\u001b[36m(pid=72374)\u001b[0m Training model with n_components=64 and seed=94177.\n",
"\u001b[2m\u001b[36m(pid=72376)\u001b[0m Training model with n_components=64 and seed=526478.\n",
"\u001b[2m\u001b[36m(pid=72373)\u001b[0m Training model with n_components=64 and seed=201469.\n",
"\u001b[2m\u001b[36m(pid=72369)\u001b[0m Training model with n_components=128 and seed=513226.\n",
"\u001b[2m\u001b[36m(pid=72367)\u001b[0m Training model with n_components=128 and seed=128113.\n",
"\u001b[2m\u001b[36m(pid=72363)\u001b[0m Training model with n_components=256 and seed=402414.\n",
"\u001b[2m\u001b[36m(pid=72368)\u001b[0m Training model with n_components=256 and seed=822761.\n",
"Trained 17 models.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "01b0a7edf3714a63b0237cf286fd041d",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=17, style=ProgressStyle(descr…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done with fold 4.\n",
"Starting fold 5.\n",
"Training model using all of 768-dimension embeddings.\n",
"Training model '32_1' (#1 at 32 dimensions) with seed 89250\n",
"Training model '32_2' (#2 at 32 dimensions) with seed 773956\n",
"Training model '32_3' (#3 at 32 dimensions) with seed 654571\n",
"Training model '32_4' (#4 at 32 dimensions) with seed 438878\n",
"Training model '64_1' (#1 at 64 dimensions) with seed 201469\n",
"Training model '64_2' (#2 at 64 dimensions) with seed 94177\n",
"Training model '64_3' (#3 at 64 dimensions) with seed 526478\n",
"Training model '64_4' (#4 at 64 dimensions) with seed 975622\n",
"Training model '128_1' (#1 at 128 dimensions) with seed 513226\n",
"Training model '128_2' (#2 at 128 dimensions) with seed 128113\n",
"Training model '128_3' (#3 at 128 dimensions) with seed 839748\n",
"Training model '128_4' (#4 at 128 dimensions) with seed 450385\n",
"Training model '256_1' (#1 at 256 dimensions) with seed 781567\n",
"Training model '256_2' (#2 at 256 dimensions) with seed 643865\n",
"Training model '256_3' (#3 at 256 dimensions) with seed 402414\n",
"Training model '256_4' (#4 at 256 dimensions) with seed 822761\n",
"\u001b[2m\u001b[36m(pid=72363)\u001b[0m Training model with n_components=32 and seed=438878.\n",
"\u001b[2m\u001b[36m(pid=72362)\u001b[0m Training model with n_components=32 and seed=773956.\n",
"\u001b[2m\u001b[36m(pid=72366)\u001b[0m Training model with n_components=64 and seed=526478.\n",
"\u001b[2m\u001b[36m(pid=72365)\u001b[0m Training model with n_components=32 and seed=654571.\n",
"\u001b[2m\u001b[36m(pid=72368)\u001b[0m Training model with n_components=32 and seed=89250.\n",
"\u001b[2m\u001b[36m(pid=72364)\u001b[0m Training model with n_components=64 and seed=94177.\n",
"\u001b[2m\u001b[36m(pid=72374)\u001b[0m Training model with n_components=128 and seed=513226.\n",
"\u001b[2m\u001b[36m(pid=72369)\u001b[0m Training model with n_components=64 and seed=975622.\n",
"\u001b[2m\u001b[36m(pid=72367)\u001b[0m Training model with n_components=64 and seed=201469.\n",
"\u001b[2m\u001b[36m(pid=72373)\u001b[0m Training model with n_components=128 and seed=128113.\n",
"\u001b[2m\u001b[36m(pid=72376)\u001b[0m Training model with n_components=128 and seed=839748.\n",
"\u001b[2m\u001b[36m(pid=72370)\u001b[0m Training model with n_components=128 and seed=450385.\n",
"\u001b[2m\u001b[36m(pid=72371)\u001b[0m Training model with n_components=256 and seed=781567.\n",
"\u001b[2m\u001b[36m(pid=72375)\u001b[0m Training model with n_components=256 and seed=643865.\n",
"\u001b[2m\u001b[36m(pid=72372)\u001b[0m Training model with n_components=256 and seed=402414.\n",
"\u001b[2m\u001b[36m(pid=72362)\u001b[0m Training model with n_components=256 and seed=822761.\n",
"Trained 17 models.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c003ad7644b041b18315f34e8bf1f85c",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=17, style=ProgressStyle(descr…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done with fold 5.\n",
"Starting fold 6.\n",
"Training model using all of 768-dimension embeddings.\n",
"Training model '32_1' (#1 at 32 dimensions) with seed 89250\n",
"Training model '32_2' (#2 at 32 dimensions) with seed 773956\n",
"Training model '32_3' (#3 at 32 dimensions) with seed 654571\n",
"Training model '32_4' (#4 at 32 dimensions) with seed 438878\n",
"Training model '64_1' (#1 at 64 dimensions) with seed 201469\n",
"Training model '64_2' (#2 at 64 dimensions) with seed 94177\n",
"Training model '64_3' (#3 at 64 dimensions) with seed 526478\n",
"Training model '64_4' (#4 at 64 dimensions) with seed 975622\n",
"Training model '128_1' (#1 at 128 dimensions) with seed 513226\n",
"Training model '128_2' (#2 at 128 dimensions) with seed 128113\n",
"Training model '128_3' (#3 at 128 dimensions) with seed 839748\n",
"Training model '128_4' (#4 at 128 dimensions) with seed 450385\n",
"Training model '256_1' (#1 at 256 dimensions) with seed 781567\n",
"Training model '256_2' (#2 at 256 dimensions) with seed 643865\n",
"Training model '256_3' (#3 at 256 dimensions) with seed 402414\n",
"Training model '256_4' (#4 at 256 dimensions) with seed 822761\n",
"\u001b[2m\u001b[36m(pid=72363)\u001b[0m Training model with n_components=256 and seed=781567.\n",
"\u001b[2m\u001b[36m(pid=72362)\u001b[0m Training model with n_components=32 and seed=89250.\n",
"\u001b[2m\u001b[36m(pid=72366)\u001b[0m Training model with n_components=128 and seed=839748.\n",
"\u001b[2m\u001b[36m(pid=72372)\u001b[0m Training model with n_components=32 and seed=438878.\n",
"\u001b[2m\u001b[36m(pid=72368)\u001b[0m Training model with n_components=256 and seed=643865.\n",
"\u001b[2m\u001b[36m(pid=72375)\u001b[0m Training model with n_components=32 and seed=654571.\n",
"\u001b[2m\u001b[36m(pid=72371)\u001b[0m Training model with n_components=32 and seed=773956.\n",
"\u001b[2m\u001b[36m(pid=72364)\u001b[0m Training model with n_components=128 and seed=513226.\n",
"\u001b[2m\u001b[36m(pid=72370)\u001b[0m Training model with n_components=64 and seed=94177.\n",
"\u001b[2m\u001b[36m(pid=72374)\u001b[0m Training model with n_components=64 and seed=526478.\n",
"\u001b[2m\u001b[36m(pid=72376)\u001b[0m Training model with n_components=64 and seed=975622.\n",
"\u001b[2m\u001b[36m(pid=72373)\u001b[0m Training model with n_components=64 and seed=201469.\n",
"\u001b[2m\u001b[36m(pid=72369)\u001b[0m Training model with n_components=128 and seed=450385.\n",
"\u001b[2m\u001b[36m(pid=72367)\u001b[0m Training model with n_components=128 and seed=128113.\n",
"\u001b[2m\u001b[36m(pid=72365)\u001b[0m Training model with n_components=256 and seed=402414.\n",
"\u001b[2m\u001b[36m(pid=72375)\u001b[0m Training model with n_components=256 and seed=822761.\n",
"Trained 17 models.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "79e940db371644e9bf2e8c945e0806bf",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=17, style=ProgressStyle(descr…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done with fold 6.\n",
"Starting fold 7.\n",
"Training model using all of 768-dimension embeddings.\n",
"Training model '32_1' (#1 at 32 dimensions) with seed 89250\n",
"Training model '32_2' (#2 at 32 dimensions) with seed 773956\n",
"Training model '32_3' (#3 at 32 dimensions) with seed 654571\n",
"Training model '32_4' (#4 at 32 dimensions) with seed 438878\n",
"Training model '64_1' (#1 at 64 dimensions) with seed 201469\n",
"Training model '64_2' (#2 at 64 dimensions) with seed 94177\n",
"Training model '64_3' (#3 at 64 dimensions) with seed 526478\n",
"Training model '64_4' (#4 at 64 dimensions) with seed 975622\n",
"Training model '128_1' (#1 at 128 dimensions) with seed 513226\n",
"Training model '128_2' (#2 at 128 dimensions) with seed 128113\n",
"Training model '128_3' (#3 at 128 dimensions) with seed 839748\n",
"Training model '128_4' (#4 at 128 dimensions) with seed 450385\n",
"Training model '256_1' (#1 at 256 dimensions) with seed 781567\n",
"Training model '256_2' (#2 at 256 dimensions) with seed 643865\n",
"Training model '256_3' (#3 at 256 dimensions) with seed 402414\n",
"Training model '256_4' (#4 at 256 dimensions) with seed 822761\n",
"\u001b[2m\u001b[36m(pid=72363)\u001b[0m Training model with n_components=32 and seed=654571.\n",
"\u001b[2m\u001b[36m(pid=72366)\u001b[0m Training model with n_components=64 and seed=975622.\n",
"\u001b[2m\u001b[36m(pid=72365)\u001b[0m Training model with n_components=32 and seed=438878.\n",
"\u001b[2m\u001b[36m(pid=72372)\u001b[0m Training model with n_components=256 and seed=781567.\n",
"\u001b[2m\u001b[36m(pid=72368)\u001b[0m Training model with n_components=32 and seed=773956.\n",
"\u001b[2m\u001b[36m(pid=72375)\u001b[0m Training model with n_components=32 and seed=89250.\n",
"\u001b[2m\u001b[36m(pid=72371)\u001b[0m Training model with n_components=256 and seed=402414.\n",
"\u001b[2m\u001b[36m(pid=72364)\u001b[0m Training model with n_components=64 and seed=94177.\n",
"\u001b[2m\u001b[36m(pid=72370)\u001b[0m Training model with n_components=128 and seed=513226.\n",
"\u001b[2m\u001b[36m(pid=72374)\u001b[0m Training model with n_components=128 and seed=128113.\n",
"\u001b[2m\u001b[36m(pid=72376)\u001b[0m Training model with n_components=128 and seed=450385.\n",
"\u001b[2m\u001b[36m(pid=72373)\u001b[0m Training model with n_components=128 and seed=839748.\n",
"\u001b[2m\u001b[36m(pid=72369)\u001b[0m Training model with n_components=64 and seed=526478.\n",
"\u001b[2m\u001b[36m(pid=72367)\u001b[0m Training model with n_components=64 and seed=201469.\n",
"\u001b[2m\u001b[36m(pid=72362)\u001b[0m Training model with n_components=256 and seed=643865.\n",
"\u001b[2m\u001b[36m(pid=72368)\u001b[0m Training model with n_components=256 and seed=822761.\n",
"Trained 17 models.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "04a013f28dcd46d1831ff6ca1be8266a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=17, style=ProgressStyle(descr…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done with fold 7.\n",
"Starting fold 8.\n",
"Training model using all of 768-dimension embeddings.\n",
"Training model '32_1' (#1 at 32 dimensions) with seed 89250\n",
"Training model '32_2' (#2 at 32 dimensions) with seed 773956\n",
"Training model '32_3' (#3 at 32 dimensions) with seed 654571\n",
"Training model '32_4' (#4 at 32 dimensions) with seed 438878\n",
"Training model '64_1' (#1 at 64 dimensions) with seed 201469\n",
"Training model '64_2' (#2 at 64 dimensions) with seed 94177\n",
"Training model '64_3' (#3 at 64 dimensions) with seed 526478\n",
"Training model '64_4' (#4 at 64 dimensions) with seed 975622\n",
"Training model '128_1' (#1 at 128 dimensions) with seed 513226\n",
"Training model '128_2' (#2 at 128 dimensions) with seed 128113\n",
"Training model '128_3' (#3 at 128 dimensions) with seed 839748\n",
"Training model '128_4' (#4 at 128 dimensions) with seed 450385\n",
"Training model '256_1' (#1 at 256 dimensions) with seed 781567\n",
"Training model '256_2' (#2 at 256 dimensions) with seed 643865\n",
"Training model '256_3' (#3 at 256 dimensions) with seed 402414\n",
"Training model '256_4' (#4 at 256 dimensions) with seed 822761\n",
"\u001b[2m\u001b[36m(pid=72363)\u001b[0m Training model with n_components=256 and seed=781567.\n",
"\u001b[2m\u001b[36m(pid=72362)\u001b[0m Training model with n_components=32 and seed=773956.\n",
"\u001b[2m\u001b[36m(pid=72366)\u001b[0m Training model with n_components=128 and seed=450385.\n",
"\u001b[2m\u001b[36m(pid=72365)\u001b[0m Training model with n_components=256 and seed=402414.\n",
"\u001b[2m\u001b[36m(pid=72372)\u001b[0m Training model with n_components=32 and seed=438878.\n",
"\u001b[2m\u001b[36m(pid=72368)\u001b[0m Training model with n_components=32 and seed=89250.\n",
"\u001b[2m\u001b[36m(pid=72375)\u001b[0m Training model with n_components=256 and seed=643865.\n",
"\u001b[2m\u001b[36m(pid=72371)\u001b[0m Training model with n_components=32 and seed=654571.\n",
"\u001b[2m\u001b[36m(pid=72364)\u001b[0m Training model with n_components=128 and seed=513226.\n",
"\u001b[2m\u001b[36m(pid=72370)\u001b[0m Training model with n_components=64 and seed=94177.\n",
"\u001b[2m\u001b[36m(pid=72374)\u001b[0m Training model with n_components=64 and seed=201469.\n",
"\u001b[2m\u001b[36m(pid=72376)\u001b[0m Training model with n_components=64 and seed=975622.\n",
"\u001b[2m\u001b[36m(pid=72373)\u001b[0m Training model with n_components=64 and seed=526478.\n",
"\u001b[2m\u001b[36m(pid=72369)\u001b[0m Training model with n_components=128 and seed=839748.\n",
"\u001b[2m\u001b[36m(pid=72367)\u001b[0m Training model with n_components=128 and seed=128113.\n",
"\u001b[2m\u001b[36m(pid=72368)\u001b[0m Training model with n_components=256 and seed=822761.\n",
"Trained 17 models.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4a17afd5ec08439a8956146853fd9679",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=17, style=ProgressStyle(descr…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done with fold 8.\n",
"Starting fold 9.\n",
"Training model using all of 768-dimension embeddings.\n",
"Training model '32_1' (#1 at 32 dimensions) with seed 89250\n",
"Training model '32_2' (#2 at 32 dimensions) with seed 773956\n",
"Training model '32_3' (#3 at 32 dimensions) with seed 654571\n",
"Training model '32_4' (#4 at 32 dimensions) with seed 438878\n",
"Training model '64_1' (#1 at 64 dimensions) with seed 201469\n",
"Training model '64_2' (#2 at 64 dimensions) with seed 94177\n",
"Training model '64_3' (#3 at 64 dimensions) with seed 526478\n",
"Training model '64_4' (#4 at 64 dimensions) with seed 975622\n",
"Training model '128_1' (#1 at 128 dimensions) with seed 513226\n",
"Training model '128_2' (#2 at 128 dimensions) with seed 128113\n",
"Training model '128_3' (#3 at 128 dimensions) with seed 839748\n",
"Training model '128_4' (#4 at 128 dimensions) with seed 450385\n",
"Training model '256_1' (#1 at 256 dimensions) with seed 781567\n",
"Training model '256_2' (#2 at 256 dimensions) with seed 643865\n",
"Training model '256_3' (#3 at 256 dimensions) with seed 402414\n",
"Training model '256_4' (#4 at 256 dimensions) with seed 822761\n",
"\u001b[2m\u001b[36m(pid=72363)\u001b[0m Training model with n_components=32 and seed=773956.\n",
"\u001b[2m\u001b[36m(pid=72366)\u001b[0m Training model with n_components=64 and seed=975622.\n",
"\u001b[2m\u001b[36m(pid=72365)\u001b[0m Training model with n_components=32 and seed=438878.\n",
"\u001b[2m\u001b[36m(pid=72368)\u001b[0m Training model with n_components=32 and seed=89250.\n",
"\u001b[2m\u001b[36m(pid=72375)\u001b[0m Training model with n_components=32 and seed=654571.\n",
"\u001b[2m\u001b[36m(pid=72364)\u001b[0m Training model with n_components=64 and seed=526478.\n",
"\u001b[2m\u001b[36m(pid=72370)\u001b[0m Training model with n_components=128 and seed=513226.\n",
"\u001b[2m\u001b[36m(pid=72369)\u001b[0m Training model with n_components=64 and seed=94177.\n",
"\u001b[2m\u001b[36m(pid=72367)\u001b[0m Training model with n_components=64 and seed=201469.\n",
"\u001b[2m\u001b[36m(pid=72374)\u001b[0m Training model with n_components=128 and seed=128113.\n",
"\u001b[2m\u001b[36m(pid=72376)\u001b[0m Training model with n_components=128 and seed=839748.\n",
"\u001b[2m\u001b[36m(pid=72373)\u001b[0m Training model with n_components=128 and seed=450385.\n",
"\u001b[2m\u001b[36m(pid=72372)\u001b[0m Training model with n_components=256 and seed=781567.\n",
"\u001b[2m\u001b[36m(pid=72371)\u001b[0m Training model with n_components=256 and seed=643865.\n",
"\u001b[2m\u001b[36m(pid=72362)\u001b[0m Training model with n_components=256 and seed=402414.\n",
"\u001b[2m\u001b[36m(pid=72368)\u001b[0m Training model with n_components=256 and seed=822761.\n",
"Trained 17 models.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "038d5be9065f4ac6837021045d963b06",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"IntProgress(value=0, description='Starting...', layout=Layout(width='100%'), max=17, style=ProgressStyle(descr…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done with fold 9.\n"
]
}
],
"source": [
 "def handle_fold(fold_ix: int) -> Dict[str, Any]:\n",
 "    \"\"\"\n",
 "    The per-fold processing of the previous section's cells, collapsed into \n",
 "    a single function.\n",
 "    \n",
 "    :param fold_ix: 0-based index of fold\n",
 "    \n",
 "    :returns: a dictionary that maps data structure name to data structure\n",
 "    \"\"\"\n",
 "    # To avoid accidentally picking up leftover data from a previous cell,\n",
 "    # variables local to this function are named with a leading underscore\n",
 "    _train_inputs_df = corpus_df.merge(train_keys[fold_ix])\n",
 "    _test_inputs_df = corpus_df.merge(test_keys[fold_ix])\n",
 "    _models = maybe_train_models(_train_inputs_df, fold_ix)\n",
 "    _evals = eval_models(_models, _test_inputs_df)\n",
 "    _summary_df = make_summary_df(_evals)\n",
 "    # BUGFIX: this previously read `evals` -- a *global* left over from the\n",
 "    # fold-0 cells -- instead of the local `_evals`. Using the local dict\n",
 "    # removes the hidden-kernel-state dependency and guarantees the key\n",
 "    # actually exists in `_evals`.\n",
 "    _gold_elts = cleaning.preprocess.combine_raw_spans_docs_to_match(corpus_raw,_evals[list(_evals.keys())[0]])\n",
 "    _full_results = cleaning.flag_suspicious_labels(_evals,'ent_type','ent_type',\n",
 "                                                    label_name='ent_type',\n",
 "                                                    gold_feats=_gold_elts,\n",
 "                                                    align_over_cols=['fold','doc_num','span'],\n",
 "                                                    keep_cols=[],split_doc=False)\n",
 "    # Narrow to the columns the downstream ranking/export cells need\n",
 "    _results = _full_results[[\"fold\", \"doc_num\", \"span\", \n",
 "                              \"ent_type\", \"in_gold\", \"count\"]]\n",
 "    return {\n",
 "        \"models\": _models,\n",
 "        \"summary_df\": _summary_df,\n",
 "        \"full_results\": _full_results,\n",
 "        \"results\": _results\n",
 "    }\n",
 "\n",
 "# Start with the (already computed) results for fold 0\n",
 "results_by_fold = [\n",
 "    {\n",
 "        \"models\": models,\n",
 "        \"summary_df\": summary_df,\n",
 "        \"full_results\": full_results,\n",
 "        \"results\": results\n",
 "    }\n",
 "]\n",
 "\n",
 "# Folds 1..N-1 are computed here; fold 0's results came from the cells above.\n",
 "for fold in range(1, _KFOLD_NUM_FOLDS):\n",
 "    print(f\"Starting fold {fold}.\")\n",
 "    results_by_fold.append(handle_fold(fold))\n",
 "    print(f\"Done with fold {fold}.\")\n",
 "    "
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"