# Predicting Movie Genres from Scripts with Naive Bayes


## Imports

In [148]:
import math
import statistics
import string
import warnings
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import psycopg2
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")

## Data
The methodology for constructing this database can be found in the 'Building a Database' notebook on [GitHub](https://github.com/mocboch/Movie-Script-Data-Analysis/blob/master/Building%20a%20Database.ipynb).

In [2]:
# Load the raw tables from the local `bechdel_test` Postgres database.
# NOTE(review): the credentials are hard-coded; consider reading them from the
# environment before sharing this notebook.
conn = psycopg2.connect(dbname='bechdel_test', user='postgres', password='guest')
try:
    # The cursor context manager closes the cursor even if a query raises.
    with conn.cursor() as cur:
        # Scripts joined with their Bechdel rating and TMDB metadata.
        cur.execute('SELECT * FROM imsdb_scripts JOIN bechdel_ratings ON imsdb_scripts.imdb_id = bechdel_ratings.imdb_id JOIN tmdb_data ON tmdb_data.imdb_id = imsdb_scripts.imdb_id;')
        data = pd.DataFrame(cur.fetchall())

        # (imdb_id, genre) pairs for every movie that has a script.
        cur.execute('SELECT genre.imdb_id, genre FROM genre JOIN imsdb_scripts ON imsdb_scripts.imdb_id = genre.imdb_id;')
        genre = pd.DataFrame(cur.fetchall())
finally:
    # Always release the connection, including when a query fails.
    conn.close()

# Work on a copy so the raw query result stays available in `data`.
# Column 0 (presumably the IMDb id -- confirm against the table schema)
# becomes the row index.
df = data.copy()
df.set_index(0, inplace=True)

In [3]:
# Add one indicator column per distinct genre, initially all missing.
for genre_ in genre[1].unique():
    # Explicit object dtype so the 0/1 flags written below stay Python ints
    # (they are later summed and used as a slice bound).
    df[genre_] = pd.Series(dtype=object)

# Flag every (movie, genre) pair. A single `.loc` call writes into `df`
# itself; the previous chained form `df[col][row] = 1` assigns through a
# temporary Series, which pandas does not guarantee to propagate (and which
# never propagates under copy-on-write).
# Ids that are absent from `df` are skipped so `.loc` cannot append new rows.
genre_in_df = genre[genre[0].isin(df.index)]
for imdb_id, genre_name in zip(genre_in_df[0], genre_in_df[1]):
    df.loc[imdb_id, genre_name] = 1

# Give the positional columns from the SELECT readable names.
# Column 0 was moved into the index earlier in the notebook, so its entry
# here has no effect on the columns.
df.rename(columns={0:'imdb_id',
                        1:'script_date',
                        2:'script',
                        3:'bechdel_id',
                        5:'title',
                        6:'release_year',
                        7:'bechdel_rating',
                        11:'language',
                        13:'popularity',
                        14:'vote_average',
                        15:'vote_count',
                        16:'overview'
                        }, 
               inplace=True)
# Drop the positional columns that were not given a name above.
df.drop(columns=[4, 8, 9, 10, 12], inplace=True)
# Unflagged genre cells (and any other missing values) become 0 ...
df.fillna(0, inplace=True)
# ... and the literal string 'none' is turned into a real missing value.
df.replace('none', np.nan, inplace=True)

## Cleaning the Text
This function will clean and tokenize each script, eliminating stop words and punctuation.

In [4]:
def clean_text(text: str) -> list[str]:
    """Lower-case and tokenize *text*, dropping punctuation and stop words.

    Args:
        text: Raw script text.

    Returns:
        The tokens produced by ``word_tokenize`` on the lower-cased text, in
        their original order, without single punctuation characters, English
        stop words, or the multi-character punctuation tokens listed below.
    """
    # A set gives O(1) membership tests; the earlier list lookup combined with
    # in-place ``list.remove`` made the filter quadratic in script length.
    excluded = (
        set(string.punctuation)
        | set(stopwords.words('english'))
        | {'...', '--', '\'\'', '``'}
    )
    return [token for token in word_tokenize(text.lower()) if token not in excluded]

A couple of leftover NAs remain in the dataset; once those rows are dropped, we can go ahead and run the function on every script.

In [5]:
# Discard rows that have no script, then tokenize every remaining script.
df = df.dropna(subset='script')
df['clean_text'] = df['script'].map(clean_text)

## The UpdateWeights Function
This function updates the weights of the naive bayes classifier for a single row of data.

In [8]:
# Column labels from position 11 up to, but excluding, the last column.
# Presumably these are exactly the genre indicator columns -- this relies on
# the current column order of `df`; TODO confirm if columns are added or removed.
genres = list(df.columns[11:-1])


def UpdateWeights(row: pd.Series,
                  weights: dict[str, dict[str, int]],
                  total_words_per_genre: dict[str, int],
                  genres: list[str] = genres) -> None:
    """Add one movie's token counts to the running per-genre totals.

    Both ``weights`` and ``total_words_per_genre`` are modified in place;
    nothing is returned.

    Args:
        row: One movie. Must provide ``row['clean_text']`` (list of tokens)
            and a value for every name in ``genres`` (1 marks membership).
        weights: Maps token -> {genre: occurrence count}. Tokens not seen
            before get an entry with a zero count for every genre.
        total_words_per_genre: Maps genre -> number of tokens seen so far in
            movies of that genre.
        genres: Genre names to consider. The default is the module-level
            ``genres`` list as it was when this function was defined.
    """
    tokens = row['clean_text']

    # Genres this movie belongs to; each one is credited with all its tokens.
    row_genres = [genre for genre in genres if row[genre] == 1]
    for genre in row_genres:
        total_words_per_genre[genre] += len(tokens)

    for token in tokens:
        token_counts = weights.get(token)
        if token_counts is None:
            # First sighting: start every genre at zero so later lookups
            # never miss a genre key.
            token_counts = weights[token] = dict.fromkeys(genres, 0)
        for genre in row_genres:
            token_counts[genre] += 1

        

            


A couple of duplicates remain in the dataset:

In [77]:
# Keep the first occurrence of each script and discard the repeats.
# Filtering by position (rather than dropping by index label) matters when
# repeated rows share an index label: dropping by label would remove every
# row with that label, including the first occurrence we want to keep.
x = df.duplicated(subset='script')
df = df.drop_duplicates(subset='script')

## Splitting Off a Test Set

In [10]:
# Hold out 20% of the movies as a test set. Features are the token lists,
# labels are the indicator columns from 'Drama' through 'History'.
# A fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df.loc[:,'Drama':'History'], test_size=0.2, random_state=42)
# Recombine labels and tokens of the training rows into one frame (joined on the index).
train_df = y_train.join(X_train)

## The NaiveBayes Function
This function initializes the weights dictionary, updates it for each row in the dataframe, and then divides each count by the total number of words seen for that genre.

In [11]:
def NaiveBayes(df: pd.DataFrame) -> dict[str, dict[str, float]]:
    """Estimate per-genre token frequencies from a training frame.

    Args:
        df: Training rows; each row is passed to ``UpdateWeights``, so it
            needs a ``clean_text`` token list and the genre indicator columns.

    Returns:
        Maps token -> {genre: relative frequency}, where the relative
        frequency is the token's count in movies of that genre divided by the
        total number of tokens seen for that genre. A genre with no training
        tokens gets a frequency of 0.0 for every token.
    """
    total_words_per_genre = dict.fromkeys(genres, 0)
    weights = {}

    # iterrows yields one Series per row even when index labels repeat,
    # whereas df.loc[label] would return a whole frame in that case.
    for _, row in df.iterrows():
        UpdateWeights(row, weights, total_words_per_genre)

    for genre_counts in weights.values():
        for genre in genre_counts:
            total = total_words_per_genre[genre]
            # A genre without any training tokens has all-zero counts;
            # leave them at zero instead of dividing by zero.
            genre_counts[genre] = genre_counts[genre] / total if total else 0.0
    return weights
        

In [12]:
# Fit on the training split: token -> {genre: relative frequency}.
weights = NaiveBayes(train_df)

## The LogWeights Function
This function returns the natural logarithm of each weight, or -10,000 if the weight is 0.

In [13]:
def LogWeights(weights: dict[str, dict[str, float]], zero_log_weight: float = -10000) -> None:
    """Replace every weight with its natural logarithm, in place.

    Args:
        weights: Maps token -> {genre: weight}. Modified in place.
        zero_log_weight: Value stored where the weight is exactly 0, since
            ``log(0)`` is undefined. Defaults to -10000.

    Raises:
        ValueError: If a weight is negative (raised by ``math.log``), which
            also happens if this is applied twice to the same dictionary.
    """
    for genre_weights in weights.values():
        # Only values of existing keys are replaced, so iterating the dict
        # while assigning is safe.
        for genre, weight in genre_weights.items():
            genre_weights[genre] = zero_log_weight if weight == 0 else math.log(weight)

In [14]:
# Converts `weights` to log space in place. Run once per trained `weights`:
# a second run would feed the already-logged values back into math.log.
LogWeights(weights)

## The Feature Function and Score Functions
These functions compute the feature function and the prediction scores for a script. The n highest-scoring genres will be considered the model's predictions, where n is the number of genres listed for the movie.

In [None]:
def FeatureFunction(tokens: list[str]) -> list[tuple[str, int]]:
    """Return ``(token, count)`` pairs for the distinct tokens in *tokens*.

    Args:
        tokens: Token list of one script; may be empty.

    Returns:
        One pair per distinct token, with the number of times it occurs.
        Pairs are ordered by first occurrence of the token.
    """
    # Counter tallies in a single pass; calling tokens.count() once per
    # distinct token rescans the whole list each time.
    return list(Counter(tokens).items())

In [36]:
def Score(script: list[str],
          weights: dict[str, dict[str, float]] = weights,
          genres: list[str] = genres) -> dict[str, float]:
    """Score a tokenized script against every genre.

    Args:
        script: Token list of the script to classify.
        weights: Maps token -> {genre: weight}. The default is the
            module-level ``weights`` object as it was when this function was
            defined; re-run this definition after rebinding ``weights``.
        genres: Genre names to score; same definition-time default rule.

    Returns:
        Maps each genre to the sum, over the script's distinct tokens, of
        ``weight * occurrence count``. Tokens missing from ``weights`` do not
        contribute. A higher score means a better match.
    """
    score = dict.fromkeys(genres, 0)

    for word, count in FeatureFunction(script):
        word_weights = weights.get(word)
        if word_weights is None:
            # Token never seen in training: no evidence for any genre.
            continue
        for genre in score:
            score[genre] += word_weights[genre] * count
    return score
        

For the first entry, index #349903, Crime and Thriller are listed as the movie's genres. The Score function scores those two genres highest by an order of magnitude!

In [78]:
# Display the training frame: genre indicator columns plus the token lists.
train_df

Unnamed: 0_level_0,Drama,Romance,Adventure,Fantasy,Family,Mystery,Crime,Thriller,War,Comedy,Music,Western,Horror,Science Fiction,Action,Animation,History,clean_text
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
349903,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,"[ocean, 's, twelve, written, george, nolfi, ro..."
43014,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"[sunset, boulevard, charles, brackett, billy, ..."
86510,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,"[fire, screenplay, clayton, frohman, ron, shel..."
114369,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,"[seven, andrew, kevin, walker, january, 27,199..."
758758,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"[wild, written, sean, penn, based, book, jon, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100405,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,"[p, r, e, w, n, jonathan, lawton, stephen, met..."
110632,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,"[natural, born, killers, written, quentin, tar..."
448157,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,"[hancock, written, vincent, ngo, vince, gillig..."
1441326,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,"[martha, marcy, may, marlene, written, sean, d..."


In [32]:
# Genre scores for movie 349903 (a training row), best match first.
pd.Series(Score(train_df['clean_text'][349903])).sort_values(ascending=False)

Crime             -1.169829e+05
Thriller          -1.187586e+05
Drama             -6.394165e+06
Comedy            -9.610512e+06
Romance           -1.141861e+07
Action            -1.331699e+07
Adventure         -1.500677e+07
Science Fiction   -1.607495e+07
Mystery           -1.653374e+07
Fantasy           -1.806365e+07
Horror            -1.845203e+07
History           -2.054818e+07
Family            -2.386636e+07
Music             -2.577168e+07
Animation         -2.709325e+07
Western           -4.132910e+07
War               -4.240746e+07
dtype: float64

In [65]:
# Recombine labels and tokens of the held-out rows into one frame.
test_df = y_test.join(X_test)
# Number of genres listed for each movie: the row-wise sum of the indicator
# columns up to and including 'History'. Computed in one vectorised step;
# the previous per-row `test_df[col][i] = ...` was chained assignment, which
# pandas does not guarantee to write back into the frame.
test_df['genres_listed'] = test_df.loc[:, :'History'].astype(int).sum(axis=1)
test_df

Unnamed: 0_level_0,Drama,Romance,Adventure,Fantasy,Family,Mystery,Crime,Thriller,War,Comedy,Music,Western,Horror,Science Fiction,Action,Animation,History,clean_text,genres_listed
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1126590,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"[big, eyes, written, scott, alexander, larry, ...",1
1655420,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"[week, marilyn, written, adrian, hodges, 1, ex...",2
1365050,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,"[beasts, nation, written, cary, joji, fukunaga...",2
1067774,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,"[monte, carlo, written, ron, bass, based, nove...",3
164052,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,"[hollow, man, written, andrew, w., marlowe, re...",3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1201167,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,"[funny, people, written, judd, apatow, april, ...",2
1027718,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,"[wall, street, money, never, sleeps, written, ...",2
162346,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,"[ghost, world, daniel, clowes, terry, zwigoff,...",2
824747,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,"[changeling, true, story, written, j., michael...",3


## Calculating Accuracy
We can define accuracy as how many of the model's first n predictions are correct over n. Informally, this accuracy represents the percentage of correct genres the model is able to identify.

In [85]:
def PredictionAccuracy(test_df: pd.DataFrame) -> float:
    """Mean fraction of a movie's listed genres found among its top-n predictions.

    For each row, n is the row's ``genres_listed`` value; the n highest
    scoring genres are taken as predictions and the row's score is the share
    of them that the row is actually labelled with.

    Args:
        test_df: Rows with a ``clean_text`` token list, a ``genres_listed``
            count, and one indicator column per genre returned by ``Score``.

    Returns:
        The average row score over all rows with at least one listed genre,
        or 0.0 if there is no such row.
    """
    total_score = 0.0
    rows_scored = 0
    for i in test_df.index:
        row = test_df.loc[i]
        num_genres = int(row['genres_listed'])
        if num_genres <= 0:
            # Accuracy is undefined for a movie without listed genres.
            continue

        preds = list(pd.Series(Score(row['clean_text'])).sort_values(ascending=False).index)

        # Labels are read from the frame that was passed in rather than from
        # the module-level `df`, so the function works on any labelled frame.
        hits = sum(1 for genre in preds[:num_genres] if row[genre] == 1)

        total_score += hits / num_genres
        rows_scored += 1

    return total_score / rows_scored if rows_scored else 0.0
        
        


In [83]:
# Mean per-movie accuracy on the held-out test set.
PredictionAccuracy(test_df)

0.40843373493975893

## Precision, Recall, and F-score
Calculating the precision, recall, and F score can give a more complete picture of the model's accuracy.

These scores will all be the same with the previously described mode of making predictions, because it always predicts exactly as many labels as the movie actually has: each false positive is accompanied by a false negative. This approach can be useful for tuning a larger generative model, which is how I ultimately plan to use this code. In order to calculate the accuracy in a more granular way, however, we can define a prediction threshold, either as a discrete quantity or as a function of the predicted probabilities for the entire set of classes.

In [202]:
def Precision_Recall_F(test_df: pd.DataFrame, threshold_function, thresh_func_args: tuple) -> tuple[float, float, float]:
    """Compute micro-averaged precision, recall and F-score over a labelled frame.

    Args:
        test_df: Rows with a ``clean_text`` token list, a ``genres_listed``
            count, and indicator columns from 'Drama' through 'History'.
        threshold_function: Callable ``(scores, args) -> list of genre names``
            that turns a row's descending-sorted score Series into the
            predicted genres.
        thresh_func_args: Passed unchanged as the second argument of
            ``threshold_function``.

    Returns:
        ``(precision, recall, f)``, pooled over every (row, genre) pair. Each
        value is 0.0 when its denominator would be zero.
    """
    # `correct_number_of_preds` reads the current row's genre count from the
    # module-level name `num_genres`. Declaring it global makes the per-row
    # assignment below visible there; as a plain local it never was.
    global num_genres

    # Loop-invariant: the label columns are the same for every row.
    genre_columns = list(test_df.loc[:, 'Drama':'History'].columns)

    true_positives = 0
    false_positives = 0
    false_negatives = 0

    for i in test_df.index:
        row = test_df.loc[i]
        num_genres = row['genres_listed']

        p = pd.Series(Score(row['clean_text'])).sort_values(ascending=False)
        predicted = set(threshold_function(p, thresh_func_args))

        for genre in genre_columns:
            pred = genre in predicted   # positive prediction?
            obs = row[genre] != 0       # positive observed value?

            if pred and obs:
                true_positives += 1
            elif pred:
                false_positives += 1
            elif obs:
                false_negatives += 1
            # True negatives do not enter precision, recall or F.

    predicted_positives = true_positives + false_positives
    observed_positives = true_positives + false_negatives

    # Guard the denominators: a strict threshold can yield no predictions.
    precision = true_positives / predicted_positives if predicted_positives else 0.0
    recall = true_positives / observed_positives if observed_positives else 0.0
    if precision + recall:
        f = 2 * ((precision * recall) / (precision + recall))
    else:
        f = 0.0

    return (precision, recall, f)
   

In [203]:
def thresh_stdev(p, args=(1,)):
    '''Predict every genre whose score lies more than args[0] sample standard deviations above the row's mean score'''
    z_score = args[0]
    cutoff = statistics.mean(p) + (statistics.stdev(p) * z_score)
    above_cutoff = p > cutoff
    return list(p[above_cutoff].index)

In [204]:
def correct_number_of_preds(p, args=(0,)):
    '''Predict the first num_genres + args[0] genres of p, in p's existing order.

    NOTE: num_genres is read from a module-level variable of that name, not
    from the arguments; it must hold the current row's genre count when this
    is called.
    '''
    global num_genres
    how_many = num_genres + args[0]
    ranked_genres = list(p.index)
    return ranked_genres[:how_many]

In [205]:
def thresh_constant(p, args=(-3000000,)):
    '''Predict every genre whose score is strictly greater than the constant args[0]'''
    # Index the filtered Series directly. Wrapping it in a list first made
    # `.index` resolve to the `list.index` method, and list(<method>) raises
    # TypeError.
    return list(p[p > args[0]].index)

In [206]:
def thresh_linear_wrt_mean(p, args=(10,)):
    '''Predict every genre whose score is strictly greater than the row's mean score divided by args[0]'''
    divisor = args[0]
    cutoff = statistics.mean(p) / divisor
    above_cutoff = p > cutoff
    return list(p[above_cutoff].index)

In [223]:
# Evaluate on the test set, predicting exactly as many genres as each movie lists (0 extra).
(precision, recall, f) = Precision_Recall_F(test_df, correct_number_of_preds, (0,))

## Discussion
Ideally, I would have done this type of hyperparameter tuning before making predictions on the test set. As it is, it would be hard to pick an ideal model without overfitting the available data.

However, this data is very limiting to begin with. The model leans heavily in favor of predicting certain categories, predicting drama and thriller significantly more often than the other classes. Looking at the training data, these categories are heavily overrepresented. 

In [221]:
# Number of training movies labelled with each of the first 17 columns (the genres).
for genre in train_df.columns[:17]:
    is_labelled = train_df[genre] == 1
    print(genre, int(is_labelled.sum()))


Drama 160
Romance 53
Adventure 59
Fantasy 40
Family 19
Mystery 45
Crime 70
Thriller 118
War 5
Comedy 90
Music 7
Western 4
Horror 46
Science Fiction 61
Action 81
Animation 15
History 14


With more data this model can likely be made more accurate, and with the addition of some fresh validation and testing data, a 'final' model can be tuned. More than likely, this simple 'bag-of-words' classifier will only be useful as part of a larger ensemble or GAN if at all.