In [1]:
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", 999)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
import plotly.graph_objects as go
import plotly.express as px
from sklearn.tree import DecisionTreeClassifier, _tree
from IPython.display import display, HTML

# Data preparation

In [2]:
# Load the Telco churn CSV, forcing MonthlyCharges to be parsed as float.
df = pd.read_csv(
    "WA_Fn-UseC_-Telco-Customer-Churn.csv",
    dtype={"MonthlyCharges": float},
)
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
def to_num(total_charges, monthly_charges, tenure):
    """Convert a raw ``TotalCharges`` value to a float, with a computed fallback.

    Parameters
    ----------
    total_charges
        Value handed to ``float()`` (for example a numeric string).
    monthly_charges, tenure
        Numbers whose product is used when ``total_charges`` cannot be
        converted.

    Returns
    -------
    ``float(total_charges)`` when the conversion succeeds, otherwise
    ``monthly_charges * tenure``.
    """
    try:
        return float(total_charges)
    except (TypeError, ValueError):
        # Only conversion failures (blank / non-numeric strings, None) trigger
        # the fallback; any other exception is a real error and must surface
        # instead of being silently replaced by an estimate.
        return monthly_charges * tenure
    
# Rebuild TotalCharges row by row through to_num: values float() cannot convert
# are replaced by MonthlyCharges * tenure.
df_with_fixed_types = df.assign(
    TotalCharges=lambda frame: frame.apply(
        lambda row: to_num(row["TotalCharges"], row["MonthlyCharges"], row["tenure"]),
        axis=1,
    )
)

In [4]:
# Keep only the integer- and float-typed columns.
df_num = df_with_fixed_types.select_dtypes(include=[int, float])
df_num.columns

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'], dtype='object')

In [5]:
# Object-dtype columns; customerID is dropped so it is not one-hot encoded below.
df_cat = (
    df_with_fixed_types
    .select_dtypes(include="object")
    .drop(columns=["customerID"])
)
df_cat.columns

Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod', 'Churn'],
      dtype='object')

In [6]:
# One-hot encode every categorical column; drop_first=True omits the first
# level of each column.
df_dummies = pd.get_dummies(data=df_cat, drop_first=True)
df_dummies.head()

Unnamed: 0,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn_Yes
0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0
2,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1
3,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0
4,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1


In [7]:
# Put the numeric columns and the dummy columns side by side in one frame.
df_features_and_target = pd.concat(objs=[df_num, df_dummies], axis="columns")
df_features_and_target.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn_Yes
0,0,1,29.85,29.85,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1,0,34,56.95,1889.5,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0
2,0,2,53.85,108.15,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1
3,0,45,42.3,1840.75,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0
4,0,2,70.7,151.65,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1


In [8]:
# Separate the target column (y) from the remaining feature columns (x).
target = "Churn_Yes"
y = df_features_and_target[target]
x = df_features_and_target.drop(columns=[target])

In [9]:
# Summary statistics of the 0/1 target; the mean is the share of rows with Churn_Yes == 1.
y.describe()

count    7043.000000
mean        0.265370
std         0.441561
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: Churn_Yes, dtype: float64

In [10]:
# Hold out 10% of the rows for testing. random_state pins the shuffle (same seed
# as the random forest below) so Restart & Run All reproduces the same split,
# scores and downstream clusters.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=123)

# Random Forest

In [11]:
# Random forest of 200 trees with at least 20 training samples per leaf,
# fitted on the training split only.
rf = RandomForestClassifier(
    n_estimators=200,
    min_samples_leaf=20,
    n_jobs=8,
    random_state=123,
)
rf.fit(x_train, y_train)

RandomForestClassifier(min_samples_leaf=20, n_estimators=200, n_jobs=8,
                       random_state=123)

In [12]:
# (train accuracy, test accuracy): classifier .score() is the mean accuracy on the given data.
rf.score(x_train, y_train), rf.score(x_test, y_test)

(0.8202903124013885, 0.7971631205673759)

In [13]:
# One bar per feature column, with the forest's importance as bar height.
feature_importances = rf.feature_importances_

fig = go.Figure()
fig.add_trace(
    go.Bar(name='Features importances', x=x.columns, y=feature_importances)
)
fig.show()

# Clustering

In [14]:
# Number of clusters requested from each of the three agglomerative clusterings below.
n_clusters = 5

In [15]:
# Keep the rows whose predicted probability for the forest's second class
# (column 1 of predict_proba) is above 0.3, then drop the target column.
df_with_likely_churn = df_features_and_target.loc[rf.predict_proba(x)[:, 1] > 0.3]
x_with_likely_churn = df_with_likely_churn.drop(columns=[target])

In [16]:
# Standardize the features, then cluster them with average-linkage
# agglomerative clustering (default distance).
x_scaled = StandardScaler().fit(x_with_likely_churn).transform(x_with_likely_churn)
aggl_clustering_euclidian = AgglomerativeClustering(
    linkage="average",
    n_clusters=n_clusters,
)
aggl_clustering_euclidian.fit(x_scaled)

AgglomerativeClustering(linkage='average', n_clusters=5)

In [17]:
# Weight each standardized column by its random-forest importance. Broadcasting
# multiplies column j of x_scaled by feature_importances[j].
x_with_rf_feature_importance = x_scaled * feature_importances
aggl_clustering_feature_importance = AgglomerativeClustering(
    linkage="average",
    n_clusters=n_clusters,
)
aggl_clustering_feature_importance.fit(x_with_rf_feature_importance)

AgglomerativeClustering(linkage='average', n_clusters=5)

In [18]:
# rf.apply returns, for every sample, the index of the leaf it reaches in each tree.
encoding = rf.apply(x_with_likely_churn)
# Hamming distance between two rows = fraction of trees in which the two
# samples end up in different leaves.
distance_matrix = pairwise_distances(encoding, metric="hamming")

# scikit-learn 1.2 renamed AgglomerativeClustering's `affinity` argument to
# `metric`, and 1.4 removed `affinity`. Pick whichever keyword the installed
# version exposes so the cell runs on both old and current releases.
precomputed_keyword = "metric" if "metric" in AgglomerativeClustering().get_params() else "affinity"

aggl_clustering_from_rf = AgglomerativeClustering(n_clusters=n_clusters,
                                                  linkage="average",
                                                  **{precomputed_keyword: "precomputed"})
aggl_clustering_from_rf.fit(distance_matrix)

AgglomerativeClustering(affinity='precomputed', linkage='average', n_clusters=5)

## Analyze clustering

In [19]:
# Attach the three labelings as string columns (presumably so plotly treats
# the cluster ids as discrete categories rather than a numeric scale).
df_clusters = df_with_likely_churn.assign(
    cluster_rf=[str(label) for label in aggl_clustering_from_rf.labels_],
    cluster_eucl=[str(label) for label in aggl_clustering_euclidian.labels_],
    cluster_fi=[str(label) for label in aggl_clustering_feature_importance.labels_],
)

### Cluster interpretation

In [20]:
# Functions retrieved from https://towardsdatascience.com/the-easiest-way-to-interpret-clustering-result-8137e488a127

def pretty_print(df):
    """Display ``df`` as an HTML table with line breaks inside cells.

    Every backslash-n character pair found in the markup produced by
    ``df.to_html()`` is replaced by ``<br>`` before the table is shown.
    Returns whatever ``display`` returns.
    """
    html_table = df.to_html()
    html_table = html_table.replace("\\n", "<br>")
    return display(HTML(html_table))

def get_class_rules(tree: DecisionTreeClassifier, feature_names: list):
    """Turn a fitted decision tree into human-readable rules grouped by class.

    The tree is walked depth-first, left child before right child. Each leaf
    yields one rule: the conjunction of the split conditions on the path from
    the root (or "ALL" when the root itself is a leaf), paired with the
    proportion of the leaf's majority class.

    Parameters
    ----------
    tree
        Fitted classifier; its ``tree_`` and ``classes_`` attributes are read.
    feature_names
        Indexable by feature position; supplies the names used in the rules.

    Returns
    -------
    dict mapping each class that wins at least one leaf to a list of
    ``(rule_string, class_probability)`` tuples, in leaf visiting order.
    """
    fitted = tree.tree_
    labels = tree.classes_
    rules_by_class = {}

    # Explicit stack instead of recursion. The right child is pushed first so
    # the left subtree is popped (and fully explored) before the right one.
    pending = [(0, [])]  # (node id, conditions accumulated from the root)
    while pending:
        node, conditions = pending.pop()
        feature_idx = fitted.feature[node]

        if feature_idx == _tree.TREE_UNDEFINED:
            # Leaf: normalise the stored class values and keep the majority class.
            proportions = fitted.value[node][0]
            proportions = proportions / proportions.sum()
            winner = proportions.argmax()
            description = " and ".join(conditions) if conditions else "ALL"
            rules_by_class.setdefault(labels[winner], []).append(
                (description, proportions[winner])
            )
            continue

        # Internal node: extend the path with the split condition of each branch.
        column = feature_names[feature_idx]
        cutoff = fitted.threshold[node]
        pending.append(
            (fitted.children_right[node], conditions + ["({} > {})".format(column, cutoff)])
        )
        pending.append(
            (fitted.children_left[node], conditions + ["({} <= {})".format(column, cutoff)])
        )

    return rules_by_class

def cluster_report(data: pd.DataFrame, clusters, min_samples_leaf=50, pruning_level=0.01):
    """Explain cluster labels with a decision tree and display a summary table.

    A ``DecisionTreeClassifier`` is fitted to predict ``clusters`` from
    ``data``; its leaves are converted to rules with ``get_class_rules``. The
    displayed table has one row per cluster with its size and its rules.
    Clusters that win no leaf of the tree keep a missing ``rule_list``,
    because the rules are left-joined onto the cluster counts.

    Parameters
    ----------
    data
        Feature frame; its column names appear in the rules.
    clusters
        One cluster label per row of ``data``.
    min_samples_leaf
        Forwarded to the tree's ``min_samples_leaf``.
    pruning_level
        Forwarded to the tree's ``ccp_alpha``.
    """
    # Surrogate model: learn to predict the cluster label from the features.
    surrogate = DecisionTreeClassifier(ccp_alpha=pruning_level, min_samples_leaf=min_samples_leaf)
    surrogate.fit(data, clusters)

    rules_by_class = get_class_rules(surrogate, data.columns)

    # One text blob per class: "[probability] rule" entries separated by blank lines.
    rule_rows = [
        (label, "".join("[{}] {}\n\n".format(rule[1], rule[0]) for rule in rules))
        for label, rules in rules_by_class.items()
    ]
    rules_df = pd.DataFrame(rule_rows, columns=["class_name", "rule_list"])

    # Number of samples per cluster.
    counts_df = pd.Series(clusters).value_counts().reset_index()
    counts_df.columns = ["class_name", "instance_count"]

    merged = pd.merge(counts_df, rules_df, on="class_name", how="left")
    ordered = merged.sort_values(by="class_name").reset_index(drop=True)
    pretty_print(ordered[["class_name", "instance_count", "rule_list"]])

In [21]:
# Rules describing the clusters obtained from the standardized features.
cluster_report(
    data=x_with_likely_churn,
    clusters=aggl_clustering_euclidian.labels_,
    min_samples_leaf=20,
    pruning_level=0.01,
)

Unnamed: 0,class_name,instance_count,rule_list
0,0,2197,[0.9995446265938069] (MonthlyCharges > 43.70000076293945) and (Contract_One year <= 0.5) and (PhoneService_Yes > 0.5)
1,1,217,[1.0] (MonthlyCharges <= 43.70000076293945) and (PhoneService_Yes <= 0.5) [1.0] (MonthlyCharges > 43.70000076293945) and (Contract_One year <= 0.5) and (PhoneService_Yes <= 0.5)
2,2,141,[0.986013986013986] (MonthlyCharges <= 43.70000076293945) and (PhoneService_Yes > 0.5)
3,3,1,
4,4,79,[0.9875] (MonthlyCharges > 43.70000076293945) and (Contract_One year > 0.5)


In [22]:
# Rules describing the clusters obtained from the importance-weighted features.
cluster_report(
    data=x_with_likely_churn,
    clusters=aggl_clustering_feature_importance.labels_,
    min_samples_leaf=20,
    pruning_level=0.01,
)

Unnamed: 0,class_name,instance_count,rule_list
0,0,17,[0.7727272727272727] (tenure > 20.5) and (TotalCharges <= 5715.474853515625) and (MonthlyCharges <= 69.2750015258789)
1,1,1946,[0.9896854048478597] (tenure <= 20.5)
2,2,66,[0.9672131147540983] (tenure > 20.5) and (TotalCharges > 5715.474853515625)
3,3,1,
4,4,605,[0.9494290375203915] (tenure > 20.5) and (TotalCharges <= 5715.474853515625) and (MonthlyCharges > 69.2750015258789)


In [23]:
# Rules describing the clusters obtained from the random-forest leaf (hamming) distances.
cluster_report(
    data=x_with_likely_churn,
    clusters=aggl_clustering_from_rf.labels_,
    min_samples_leaf=10,
    pruning_level=0.01,
)

Unnamed: 0,class_name,instance_count,rule_list
0,0,593,[1.0] (InternetService_Fiber optic <= 0.5) and (TechSupport_No internet service <= 0.5)
1,1,1127,[0.9725177304964538] (InternetService_Fiber optic > 0.5) and (tenure <= 17.5)
2,2,541,[0.9460966542750929] (InternetService_Fiber optic > 0.5) and (tenure > 17.5) and (PaymentMethod_Electronic check > 0.5)
3,3,141,[1.0] (InternetService_Fiber optic <= 0.5) and (TechSupport_No internet service > 0.5)
4,4,233,[0.9787234042553191] (InternetService_Fiber optic > 0.5) and (tenure > 17.5) and (PaymentMethod_Electronic check <= 0.5)


## Cluster visualization

In [32]:
def get_distinct_legend_elements_for_cluster_and_payement(fig):
    """Simplify the legend of a cluster/payment scatter plot, in place.

    The figure is expected to hold one trace per (cluster, payment flag) pair,
    named "<cluster>, <flag>". Only the first trace of each cluster keeps a
    legend entry, renamed "Cluster <cluster>". Two grey placeholder traces are
    then appended to explain the marker symbols.

    The placeholder symbols follow the ``symbol_map={0: "square", 1: "x"}``
    used by the plotting cells of this notebook on the
    "PaymentMethod_Electronic check" column: value 1 (electronic check) is
    drawn as "x" and value 0 (any other method) as "square".

    Parameters
    ----------
    fig
        Figure whose traces and layout are modified in place.
    """
    displayed_clusters = set()
    for trace in fig.data:
        cluster = trace.name.split(',')[0]
        if cluster in displayed_clusters:
            # A trace of this cluster is already listed; hide the duplicate.
            trace["showlegend"] = False
        else:
            trace["name"] = "Cluster " + cluster
            displayed_clusters.add(cluster)

    # Placeholder traces (no data points) that only contribute legend entries.
    # "x" marks electronic-check rows and "square" marks the others, matching
    # the symbol_map passed by the callers.
    fig.add_trace(go.Scatter(y=[None], mode="markers",
                             marker=dict(symbol="x", color="grey"),
                             name="Payment by electronic check",
                             ))
    fig.add_trace(go.Scatter(y=[None], mode="markers",
                             marker=dict(symbol="square", color="grey"),
                             name="Payment with other methods",
                             ))

    fig.update_layout(legend_title_text="Clusters and payment methods")

In [33]:
# Tenure vs. monthly charges, coloured by the standardized-feature clusters;
# the marker symbol encodes the PaymentMethod_Electronic check flag.
fig_eucl = px.scatter(
    df_clusters.rename(columns={"cluster_eucl": "cluster"}),
    x="MonthlyCharges",
    y="tenure",
    color="cluster",
    symbol="PaymentMethod_Electronic check",
    symbol_map={0: "square", 1: "x"},
)

get_distinct_legend_elements_for_cluster_and_payement(fig_eucl)

fig_eucl.show()

In [34]:
# Tenure vs. monthly charges, coloured by the importance-weighted clusters;
# the marker symbol encodes the PaymentMethod_Electronic check flag.
fig_fi = px.scatter(
    df_clusters.rename(columns={"cluster_fi": "cluster"}),
    x="MonthlyCharges",
    y="tenure",
    color="cluster",
    symbol="PaymentMethod_Electronic check",
    symbol_map={0: "square", 1: "x"},
)

get_distinct_legend_elements_for_cluster_and_payement(fig_fi)

fig_fi.show()

In [35]:
# Tenure vs. monthly charges, coloured by the random-forest-distance clusters;
# the marker symbol encodes the PaymentMethod_Electronic check flag.
fig_rf = px.scatter(
    df_clusters.rename(columns={"cluster_rf": "cluster"}),
    x="MonthlyCharges",
    y="tenure",
    color="cluster",
    symbol="PaymentMethod_Electronic check",
    symbol_map={0: "square", 1: "x"},
)

get_distinct_legend_elements_for_cluster_and_payement(fig_rf)

fig_rf.show()