In [1]:
# (Re)load the autoreload extension and set mode 2 so edited modules are
# re-imported automatically before code runs; render matplotlib figures inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import os
# GPU selection: enumerate devices in PCI bus order and expose only device 0
# to this process. This is set before ktrain is imported in the next cell.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})

In [2]:
import ktrain
from ktrain import text

Using TensorFlow backend.


## STEP 1: Load and Preprocess Data


The CoNLL2003 NER dataset can be downloaded from [here](https://github.com/amaiya/ktrain/tree/master/ktrain/tests/conll2003).

In [3]:
# Locations of the CoNLL2003 training and validation files, relative to the
# directory the notebook is run from.
TDATA = "data/conll2003/train.txt"
VDATA = "data/conll2003/valid.txt"

# Load both splits. The call returns the training data, the validation data,
# and a preprocessor object; `preproc` is reused below when building the model
# and the predictor.
trn, val, preproc = text.entities_from_conll2003(TDATA, val_filepath=VDATA)

Number of sentences:  14041
Number of words in the dataset:  23623
Tags: ['B-MISC', 'I-MISC', 'I-PER', 'B-ORG', 'I-ORG', 'I-LOC', 'O', 'B-PER', 'B-LOC']
Number of Labels:  9
Longest sentence: 113 words


## STEP 2: Define a Model

In [4]:
# Build the 'bilstm-crf' sequence tagger. `preproc` from STEP 1 is passed in,
# presumably so the model matches the loaded vocabulary and tag set — confirm
# in the ktrain docs.
model = text.sequence_tagger('bilstm-crf', preproc)

In [5]:
# Wrap the model together with its training and validation data in a learner;
# the learner is what the cells below use to fit, validate and inspect losses.
learner = ktrain.get_learner(model, train_data=trn, val_data=val)

## STEP 3: Train and Evaluate Model

In [6]:
# Positional arguments: 0.001 is presumably the learning rate and 5 the number
# of epochs (the training log shows 5 epochs) — confirm against the signature
# of ktrain's Learner.fit.
learner.fit(0.001, 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb8cde39390>

In [7]:
# Print the overall F1 and a per-entity-type precision/recall/F1 report
# (presumably computed on the validation data given to get_learner — confirm).
learner.validate()

   F1: 87.20
           precision    recall  f1-score   support

      LOC       0.87      0.94      0.91      1837
      ORG       0.82      0.80      0.81      1341
     MISC       0.88      0.78      0.83       922
      PER       0.89      0.91      0.90      1842

micro avg       0.87      0.88      0.87      5942
macro avg       0.87      0.88      0.87      5942



We can use the `view_top_losses` method to inspect the sentences the model gets most wrong. Here, we can see that our model has trouble with movie titles, which is understandable since they are lumped into the catch-all miscellaneous (`MISC`) category.

In [8]:
# Show the single (n=1) worst sentence, printing the true and predicted tag
# for every word so individual mistakes can be inspected.
learner.view_top_losses(n=1)

total incorrect: 12
Word            True : (Pred)
Best           :O     (B-ORG)
known          :O     (O)
for            :O     (O)
appearances    :O     (O)
in             :O     (O)
"              :O     (O)
Ice            :B-MISC (O)
Cold           :I-MISC (O)
in             :I-MISC (O)
Alex           :I-MISC (B-PER)
"              :O     (O)
,              :O     (O)
"              :O     (O)
Lawrence       :B-MISC (B-PER)
of             :I-MISC (I-MISC)
Arabia         :I-MISC (I-LOC)
"              :O     (O)
and            :O     (O)
,              :O     (O)
as             :O     (O)
Cardinal       :O     (B-PER)
Wolsey         :B-PER (I-PER)
,              :O     (O)
in             :O     (O)
"              :O     (O)
Anne           :B-MISC (B-PER)
of             :I-MISC (O)
a              :I-MISC (O)
Thousand       :I-MISC (I-MISC)
Days           :I-MISC (I-MISC)
"              :O     (O)
.              :O     (O)




## STEP 4: Make Predictions on New Data

In [13]:
# Bundle the trained model with the preprocessor so that raw text can be
# tagged directly, as in the next cell.
predictor = ktrain.get_predictor(learner.model, preproc)

In [14]:
# Tag a raw sentence; the result is a list of (token, tag) pairs.
predictor.predict('As of 2019, Donald Trump is still the President of the United States.')

[('As', 'O'),
 ('of', 'O'),
 ('2019', 'O'),
 (',', 'O'),
 ('Donald', 'B-PER'),
 ('Trump', 'I-PER'),
 ('is', 'O'),
 ('still', 'O'),
 ('the', 'O'),
 ('President', 'O'),
 ('of', 'O'),
 ('the', 'O'),
 ('United', 'B-LOC'),
 ('States', 'I-LOC'),
 ('.', 'O')]

In [15]:
# Persist the predictor to disk so it can be reloaded later without
# retraining; '/tmp/mypred' is only an example location.
predictor.save('/tmp/mypred')

In [16]:
# Load the predictor back from the path it was saved to by predictor.save.
reloaded_predictor = ktrain.load_predictor('/tmp/mypred')

In [18]:
# Check that the reloaded predictor can tag new text.
reloaded_predictor.predict('Paul Newman is my favorite actor.')

[('Paul', 'B-PER'),
 ('Newman', 'I-PER'),
 ('is', 'O'),
 ('my', 'O'),
 ('favorite', 'O'),
 ('actor', 'O'),
 ('.', 'O')]