 # Naive Bayes NLTK Demo
 
 $$ P(y \mid x) \propto \underbrace{P(y)}_{\textit{prior}} \prod_i P( f_i \mid y) $$
 

In [1]:
import pprint
from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.classify.util import names_demo, names_demo_features

# Pretty-printer kept at notebook level: later cells reuse `pp` to show feature dicts.
pp = pprint.PrettyPrinter(indent=4)

# Extract the demo features for one sample name, then display them under a heading.
test_features = names_demo_features("anoop")
print("Features used to classify a name as male or female:")
pp.pprint(test_features)

Features used to classify a name as male or female:
{ 'alwayson': True,
 'count(a)': 1,
 'count(b)': 0,
 'count(c)': 0,
 'count(d)': 0,
 'count(e)': 0,
 'count(f)': 0,
 'count(g)': 0,
 'count(h)': 0,
 'count(i)': 0,
 'count(j)': 0,
 'count(k)': 0,
 'count(l)': 0,
 'count(m)': 0,
 'count(n)': 1,
 'count(o)': 2,
 'count(p)': 1,
 'count(q)': 0,
 'count(r)': 0,
 'count(s)': 0,
 'count(t)': 0,
 'count(u)': 0,
 'count(v)': 0,
 'count(w)': 0,
 'count(x)': 0,
 'count(y)': 0,
 'count(z)': 0,
 'endswith': 'p',
 'has(a)': True,
 'has(b)': False,
 'has(c)': False,
 'has(d)': False,
 'has(e)': False,
 'has(f)': False,
 'has(g)': False,
 'has(h)': False,
 'has(i)': False,
 'has(j)': False,
 'has(k)': False,
 'has(l)': False,
 'has(m)': False,
 'has(n)': True,
 'has(o)': True,
 'has(p)': True,
 'has(q)': False,
 'has(r)': False,
 'has(s)': False,
 'has(t)': False,
 'has(u)': False,
 'has(v)': False,
 'has(w)': False,
 'has(x)': False,
 'has(y)': False,
 'has(z)': False,
 'startswith': 'a'}


In [3]:
# names_demo trains a classifier with the given trainer, evaluates it, and prints
# the accuracy plus predictions for a few names (see the cell output). No feature
# extractor is passed here, so the demo's default is used (presumably
# names_demo_features, imported above -- confirm against the NLTK docs).
# The returned classifier is kept for the cells that follow.
print("Train NaiveBayes classifier and run on some example input names:")
classifier = names_demo(NaiveBayesClassifier.train)

Train NaiveBayes classifier and run on some example input names:
Training classifier...
Testing classifier...
Accuracy: 0.7820
Avg. log likelihood: -0.7476

Unseen Names P(Male) P(Female)
----------------------------------------
 Kelli 0.0132 *0.9868
 Er *0.8826 0.1174
 Ally 0.0903 *0.9097
 Stephan *0.8361 0.1639
 Chriss 0.6864 *0.3136


In [4]:
# Name to classify; change it and re-run this cell to try other inputs.
name = 'nate'
print("Run trained classifier on input name:", name)

# Features must come from the same extractor the classifier was trained with.
test_features = names_demo_features(name)
output = classifier.prob_classify(test_features)

# Pull the two label probabilities out of the returned distribution before printing.
p_male = output.prob('male')
p_female = output.prob('female')
print("P(male|{0})={1}".format(name, p_male))
print("P(female|{0})={1}".format(name, p_female))

Run trained classifier on input name: nate
P(male|nate)=0.08246413295145613
P(female|nate)=0.9175358670485438


## Most informative features

The informativeness of a feature `feature_type = feature_value`, written $f=v$, is the ratio between the probability of observing that feature value under one label and its probability under the other. With two labels, $\ell_1$ and $\ell_2$:

$$ score(f=v) = \frac{ P( f=v \mid \ell_1 ) }{ P( f=v \mid \ell_2 ) } $$

If there are more than two labels, say $\ell_1, \ldots, \ell_n$, then compare one label against all the others:

$$ score(f=v) = \frac{ P( f=v \mid \ell_i ) }{ \sum_{k \neq i} P( f=v \mid \ell_k ) } $$

We sort all the features by this score and report the top 10 below.


In [5]:
# Print the features whose values most strongly favour one label over the other
# for the first classifier (the output below lists 10 rows as label ratios).
classifier.show_most_informative_features()

Most Informative Features
 endswith = 'a' female : male = 31.5 : 1.0
 endswith = 'p' male : female = 14.2 : 1.0
 endswith = 'v' male : female = 13.0 : 1.0
 endswith = 'f' male : female = 10.5 : 1.0
 endswith = 'm' male : female = 10.3 : 1.0
 endswith = 'd' male : female = 10.2 : 1.0
 endswith = 'o' male : female = 7.7 : 1.0
 count(v) = 2 female : male = 6.5 : 1.0
 endswith = 'r' male : female = 6.4 : 1.0
 endswith = 'w' male : female = 6.1 : 1.0


In [6]:
def bigram_features(name):
    """Build a feature dictionary for ``name`` from its letters and letter bigrams.

    All matching is case-insensitive (the name is lower-cased first) and only
    the 26 ASCII letters ``a``-``z`` are considered.

    Keys produced:
      * ``'alwayson'``            -- always ``True``.
      * ``'startswith'``          -- lower-cased first character (``''`` if empty).
      * ``'endswith'``            -- lower-cased last character (``''`` if empty).
      * ``'count(x)'``/``'has(x)'``    -- occurrences / presence of letter ``x``.
      * ``'count2(xy)'``/``'has2(xy)'`` -- occurrences / presence of bigram ``xy``.

    Args:
        name: The name to featurize (a ``str``). An empty string is accepted.

    Returns:
        A dict with 3 + 2*26 + 2*26*26 = 1407 entries.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    # Lower-case once instead of on every one of the ~1400 lookups below.
    lowered = name.lower()

    features = {}
    features['alwayson'] = True
    # Slice rather than index so an empty name gives '' instead of IndexError.
    features['startswith'] = name[:1].lower()
    features['endswith'] = name[-1:].lower()

    for letter in alphabet:
        features['count(%s)' % letter] = lowered.count(letter)
        features['has(%s)' % letter] = letter in lowered

    for letter1 in alphabet:
        for letter2 in alphabet:
            bigram = "%s%s" % (letter1, letter2)
            # str.count counts non-overlapping matches, e.g. 'aaa' -> count2(aa) == 1.
            features['count2(%s)' % bigram] = lowered.count(bigram)
            features['has2(%s)' % bigram] = bigram in lowered
    return features

# Sanity-check the extractor on one name. NOTE: the dict has 1,407 entries
# (3 fixed + 2*26 letter + 2*26*26 bigram features), so the printed output is long.
pp.pprint(bigram_features("Dementor"))

{ 'alwayson': True,
 'count(a)': 0,
 'count(b)': 0,
 'count(c)': 0,
 'count(d)': 1,
 'count(e)': 2,
 'count(f)': 0,
 'count(g)': 0,
 'count(h)': 0,
 'count(i)': 0,
 'count(j)': 0,
 'count(k)': 0,
 'count(l)': 0,
 'count(m)': 1,
 'count(n)': 1,
 'count(o)': 1,
 'count(p)': 0,
 'count(q)': 0,
 'count(r)': 1,
 'count(s)': 0,
 'count(t)': 1,
 'count(u)': 0,
 'count(v)': 0,
 'count(w)': 0,
 'count(x)': 0,
 'count(y)': 0,
 'count(z)': 0,
 'count2(aa)': 0,
 'count2(ab)': 0,
 'count2(ac)': 0,
 'count2(ad)': 0,
 'count2(ae)': 0,
 'count2(af)': 0,
 'count2(ag)': 0,
 'count2(ah)': 0,
 'count2(ai)': 0,
 'count2(aj)': 0,
 'count2(ak)': 0,
 'count2(al)': 0,
 'count2(am)': 0,
 'count2(an)': 0,
 'count2(ao)': 0,
 'count2(ap)': 0,
 'count2(aq)': 0,
 'count2(ar)': 0,
 'count2(as)': 0,
 'count2(at)': 0,
 'count2(au)': 0,
 'count2(av)': 0,
 'count2(aw)': 0,
 'count2(ax)': 0,
 'count2(ay)': 0,
 'count2(az)': 0,
 'count2(ba)': 0,
 'count2(bb)': 0,
 'count2(bc)': 0,
 'count2(bd)': 0,
 'count2(be)': 0,
 'coun

 'has2(jf)': False,
 'has2(jg)': False,
 'has2(jh)': False,
 'has2(ji)': False,
 'has2(jj)': False,
 'has2(jk)': False,
 'has2(jl)': False,
 'has2(jm)': False,
 'has2(jn)': False,
 'has2(jo)': False,
 'has2(jp)': False,
 'has2(jq)': False,
 'has2(jr)': False,
 'has2(js)': False,
 'has2(jt)': False,
 'has2(ju)': False,
 'has2(jv)': False,
 'has2(jw)': False,
 'has2(jx)': False,
 'has2(jy)': False,
 'has2(jz)': False,
 'has2(ka)': False,
 'has2(kb)': False,
 'has2(kc)': False,
 'has2(kd)': False,
 'has2(ke)': False,
 'has2(kf)': False,
 'has2(kg)': False,
 'has2(kh)': False,
 'has2(ki)': False,
 'has2(kj)': False,
 'has2(kk)': False,
 'has2(kl)': False,
 'has2(km)': False,
 'has2(kn)': False,
 'has2(ko)': False,
 'has2(kp)': False,
 'has2(kq)': False,
 'has2(kr)': False,
 'has2(ks)': False,
 'has2(kt)': False,
 'has2(ku)': False,
 'has2(kv)': False,
 'has2(kw)': False,
 'has2(kx)': False,
 'has2(ky)': False,
 'has2(kz)': False,
 'has2(la)': False,
 'has2(lb)': False,
 'has2(lc)': False,


In [7]:
# Run the same names_demo train/evaluate routine, this time passing
# bigram_features as the feature extractor. The result is stored as classifier2,
# leaving the first `classifier` untouched.
print("Train new classifier using bigram features")
classifier2 = names_demo(NaiveBayesClassifier.train, bigram_features)

Train new classifier using bigram features
Training classifier...
Testing classifier...
Accuracy: 0.8020
Avg. log likelihood: -1.0164

Unseen Names P(Male) P(Female)
----------------------------------------
 Kelli 0.0013 *0.9987
 Er *0.9782 0.0218
 Ally 0.0076 *0.9924
 Stephan *0.9741 0.0259
 Chriss 0.1445 *0.8555


In [8]:
# Name to classify; change it and re-run this cell to try other inputs.
name = 'nate'
print("Run trained classifier on input name:", name)

# classifier2 was trained on bigram features, so featurize with bigram_features.
test_features = bigram_features(name)
output = classifier2.prob_classify(test_features)

# Pull the two label probabilities out of the returned distribution before printing.
p_male = output.prob('male')
p_female = output.prob('female')
print("P(male|{0})={1}".format(name, p_male))
print("P(female|{0})={1}".format(name, p_female))

# try the following:
# luke, lee, leigh, karol, chris, kris, pat

Run trained classifier on input name: nate
P(male|nate)=0.0012201724411289498
P(female|nate)=0.9987798275588747


In [9]:
# Print the most informative features of the bigram-feature classifier
# (classifier2); bigram keys such as has2(..)/count2(..) appear in the output.
classifier2.show_most_informative_features()

Most Informative Features
 endswith = 'a' female : male = 31.5 : 1.0
 count2(hu) = 1 male : female = 26.7 : 1.0
 has2(hu) = True male : female = 26.7 : 1.0
 has2(rv) = True male : female = 23.3 : 1.0
 count2(rv) = 1 male : female = 23.3 : 1.0
 count2(lt) = 1 male : female = 19.9 : 1.0
 has2(lt) = True male : female = 19.9 : 1.0
 has2(rk) = True male : female = 15.3 : 1.0
 has2(fo) = True male : female = 15.3 : 1.0
 count2(rk) = 1 male : female = 15.3 : 1.0
