Decision Tree

The document analyzes customer churn data with machine learning models. It loads the customer data, cleans it, and explores it through visualizations, comparing features such as credit score, tenure, and number of products against the churn label. The number-of-products and balance columns are engineered into categorical features, and decision tree and random forest classifiers are then tuned with grid search and fitted on the data.


In [1]: # importing libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score

In [2]: # loading dataset


df = pd.read_csv('Churn_Modelling.csv')

Data Inspection
In [3]: df.head()

Out[3]:    RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  Tenure    Balance  ...
        0          1    15634602  Hargrave          619    France  Female   42       2       0.00  ...
        1          2    15647311      Hill          608     Spain  Female   41       1   83807.86  ...
        2          3    15619304      Onio          502    France  Female   42       8  159660.80  ...
        3          4    15701354      Boni          699    France  Female   39       1       0.00  ...
        4          5    15737888  Mitchell          850     Spain  Female   43       2  125510.82  ...

In [4]: df.shape

Out[4]: (10000, 14)

In [5]: df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   RowNumber        10000 non-null  int64
 1   CustomerId       10000 non-null  int64
 2   Surname          10000 non-null  object
 3   CreditScore      10000 non-null  int64
 4   Geography        10000 non-null  object
 5   Gender           10000 non-null  object
 6   Age              10000 non-null  int64
 7   Tenure           10000 non-null  int64
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64
 10  HasCrCard        10000 non-null  int64
 11  IsActiveMember   10000 non-null  int64
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB

In [6]: df.isnull().sum()

Out[6]: RowNumber          0
        CustomerId         0
        Surname            0
        CreditScore        0
        Geography          0
        Gender             0
        Age                0
        Tenure             0
        Balance            0
        NumOfProducts      0
        HasCrCard          0
        IsActiveMember     0
        EstimatedSalary    0
        Exited             0
        dtype: int64

In [7]: df[df.duplicated()]

Out[7]:    RowNumber  CustomerId  Surname  CreditScore  Geography  Gender  Age  Tenure  Balance  NumOfProducts  ...
        (no rows: the dataset contains no duplicates)

In [8]: df.describe(include='all')

Out[8]:           RowNumber    CustomerId Surname   CreditScore Geography Gender           Age        Tenure  ...
        count   10000.00000  1.000000e+04   10000  10000.000000     10000  10000  10000.000000  10000.000000  ...
        unique          NaN           NaN    2932           NaN         3      2           NaN           NaN  ...
        top             NaN           NaN   Smith           NaN    France   Male           NaN           NaN  ...
        freq            NaN           NaN      32           NaN      5014   5457           NaN           NaN  ...
        mean     5000.50000  1.569094e+07     NaN    650.528800       NaN    NaN     38.921800      5.012800  ...
        std      2886.89568  7.193619e+04     NaN     96.653299       NaN    NaN     10.487806      2.892174  ...
        min         1.00000  1.556570e+07     NaN    350.000000       NaN    NaN     18.000000      0.000000  ...
        25%      2500.75000  1.562853e+07     NaN    584.000000       NaN    NaN     32.000000      3.000000  ...
        50%      5000.50000  1.569074e+07     NaN    652.000000       NaN    NaN     37.000000      5.000000  ...
        75%      7500.25000  1.575323e+07     NaN    718.000000       NaN    NaN     44.000000      7.000000  ...
        max     10000.00000  1.581569e+07     NaN    850.000000       NaN    NaN     92.000000     10.000000  ...

Data Wrangling
In [9]: df.drop(columns=['RowNumber','CustomerId','Surname'],inplace=True)

In [10]: df.rename(columns={"Exited":"Churned"},inplace=True)
df["Churned"].replace({0:"No",1:"Yes"},inplace=True)

In [11]: df.head()

Out[11]:    CreditScore Geography  Gender  Age  Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  ...
         0          619    France  Female   42       2       0.00              1          1               1  ...
         1          608     Spain  Female   41       1   83807.86              1          0               1  ...
         2          502    France  Female   42       8  159660.80              3          1               0  ...
         3          699    France  Female   39       1       0.00              2          0               0  ...
         4          850     Spain  Female   43       2  125510.82              1          1               1  ...

Exploratory Data Analysis (EDA)


In [12]: sns.set_style('whitegrid')
sns.countplot(x='Churned',data=df)
plt.xlabel('Churned')

Out[12]: Text(0.5, 0, 'Churned')

In [13]: sns.set_style('whitegrid')
sns.countplot(x='Geography',hue='Churned',data=df)

Out[13]: <AxesSubplot:xlabel='Geography', ylabel='count'>

In [14]: sns.set_style('whitegrid')
sns.countplot(x='Gender',hue='Churned',data=df)

Out[14]: <AxesSubplot:xlabel='Gender', ylabel='count'>

In [15]: sns.set_style('whitegrid')
sns.countplot(x='NumOfProducts',hue='Churned',data=df)

Out[15]: <AxesSubplot:xlabel='NumOfProducts', ylabel='count'>

In [16]: sns.set_style('whitegrid')
sns.countplot(x='HasCrCard',hue='Churned',data=df)

Out[16]: <AxesSubplot:xlabel='HasCrCard', ylabel='count'>

In [17]: sns.set_style('whitegrid')
sns.countplot(x='IsActiveMember',hue='Churned',data=df)

Out[17]: <AxesSubplot:xlabel='IsActiveMember', ylabel='count'>

In [18]: plt.figure(figsize=(8,8))
sns.set_style('whitegrid')
sns.countplot(x='Tenure',hue='Churned',data=df)

Out[18]: <AxesSubplot:xlabel='Tenure', ylabel='count'>

In [19]: sns.boxplot(x='Churned',y='CreditScore',data=df)

Out[19]: <AxesSubplot:xlabel='Churned', ylabel='CreditScore'>

In [20]: sns.distplot(df['Age'])

C:\Users\Admin\AppData\Local\Temp\ipykernel_8808\3255828239.py:1: UserWarning:

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

sns.distplot(df['Age'])
Out[20]: <AxesSubplot:xlabel='Age', ylabel='Density'>

In [21]: sns.boxplot(x='Churned',y='EstimatedSalary',data=df)

Out[21]: <AxesSubplot:xlabel='Churned', ylabel='EstimatedSalary'>

Feature Engineering
In [22]: def products(col):
             # col is a one-row Series; return a label for NumOfProducts
             for i in col:
                 if i == 1:
                     return 'One product'
                 if i == 2:
                     return 'Two product'
                 if i > 2:
                     return 'More Than 2 Products'

In [23]: df['Product']=df[["NumOfProducts"]].apply(products,axis=1)

In [24]: df.drop(columns='NumOfProducts',inplace=True)

In [25]: sns.countplot(x='Product',hue='Churned',data=df)

Out[25]: <AxesSubplot:xlabel='Product', ylabel='count'>

In [26]: def balance(col):
             # col is a one-row Series; bucket Balance into two categories
             for i in col:
                 if i == 0:
                     return 'Zero'
                 if i > 0:
                     return 'More Than zero '

In [27]: df['Account_Balance']=df[['Balance']].apply(balance,axis=1)

In [28]: df.drop(columns='Balance',inplace=True)
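
(Both helper functions above can be replaced by vectorised NumPy selections. A minimal, hypothetical sketch that operates on the original NumOfProducts and Balance columns with the same labels; it is not part of the executed notebook:)

In [ ]: import numpy as np

        df['Product'] = np.select(
            [df['NumOfProducts'] == 1, df['NumOfProducts'] == 2],
            ['One product', 'Two product'],
            default='More Than 2 Products')
        df['Account_Balance'] = np.where(df['Balance'] == 0, 'Zero', 'More Than zero ')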

Data Preparation
In [29]: df = pd.get_dummies(columns=["Geography","Gender","Product","Account_Balance"],data=df)
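
(LabelEncoder and OneHotEncoder are imported in In [1] but never used; pd.get_dummies produces the same one-hot encoding. For comparison only, a hypothetical equivalent with OneHotEncoder, assuming scikit-learn >= 1.0 for get_feature_names_out:)

In [ ]: enc = OneHotEncoder(sparse_output=False)  # on older scikit-learn, use sparse=False
        cat_cols = ['Geography', 'Gender', 'Product', 'Account_Balance']
        encoded = pd.DataFrame(enc.fit_transform(df[cat_cols]),
                               columns=enc.get_feature_names_out(cat_cols),
                               index=df.index)
        df = pd.concat([df.drop(columns=cat_cols), encoded], axis=1)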

In [30]: df["Churned"].replace({"No":0,"Yes":1},inplace=True)

In [31]: import numpy as np


df["Age"] = np.log(df["Age"])

In [32]: X = df.drop(columns=["Churned"])
y = df["Churned"]

In [33]: x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)
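
(The classes are imbalanced, as the first countplot shows; stratifying the split keeps the churn ratio identical in train and test. An optional variant, not used in this run:)

In [ ]: x_train, x_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=0, stratify=y)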

Implementing Decision Tree Classifier


In [34]: model = DecisionTreeClassifier()

In [35]: parameters = {"max_depth": [3,4,5,6,7,8,9,10],
                       "min_samples_split": [2,3,4,5,6,7,8],
                       "min_samples_leaf": [1,2,3,4,5,6,7,8],
                       "criterion": ["gini","entropy"],
                       "splitter": ["best","random"],
                       # "auto" is deprecated (and removed in newer scikit-learn);
                       # "sqrt" or None are the usual choices there
                       "max_features": ["auto", None],
                       "random_state": [0,42]}

In [36]: decision_tree = GridSearchCV(model, parameters, cv=5, n_jobs=-1)

In [37]: decision_tree.fit(x_train,y_train)

Out[37]: GridSearchCV
         estimator: DecisionTreeClassifier

In [38]: decision_tree.best_params_

Out[38]: {'criterion': 'gini',
          'max_depth': 7,
          'max_features': None,
          'min_samples_leaf': 3,
          'min_samples_split': 8,
          'random_state': 42,
          'splitter': 'best'}

In [39]: decision_tree.best_score_

Out[39]: 0.8561250000000001

Note that best_score_ is the mean cross-validated accuracy across the 5 folds for the best parameter combination, not a test-set score.

In [40]: y_train_pred = decision_tree.predict(x_train)

Accuracy
In [41]: round(accuracy_score(y_train,y_train_pred)*100,2)

Out[41]: 86.9

In [42]: y_test_pred = decision_tree.predict(x_test)

In [43]: round(accuracy_score(y_test,y_test_pred)*100,2)

Out[43]: 85.75

In [44]: print("F1 Score of the Model is =>", f1_score(y_test, y_test_pred, average="micro"))
         print("Recall Score of the Model is =>", recall_score(y_test, y_test_pred, average="micro"))
         print("Precision Score of the Model is =>", precision_score(y_test, y_test_pred, average="micro"))
F1 Score of the Model is => 0.8575
Recall Score of the Model is => 0.8575
Precision Score of the Model is => 0.8575
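
With average="micro" on a single-label problem, F1, recall, and precision all collapse to plain accuracy, which is why all three print 0.8575. The churn-class view, together with the confusion_matrix and roc_auc_score imported in In [1] but never used, can be sketched as follows (hypothetical, not part of the executed run):

In [ ]: print("F1 (churn class) =>", f1_score(y_test, y_test_pred))
        print("Recall (churn class) =>", recall_score(y_test, y_test_pred))
        print("Precision (churn class) =>", precision_score(y_test, y_test_pred))
        print(confusion_matrix(y_test, y_test_pred))
        # GridSearchCV delegates predict_proba to the refitted best estimator
        y_proba = decision_tree.predict_proba(x_test)[:, 1]
        print("ROC AUC =>", roc_auc_score(y_test, y_proba))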

Finding Importance of Features in DecisionTreeClassifier.

In [45]: features = x_train.columns
         importances = decision_tree.best_estimator_.feature_importances_

In [46]: feat_imp = pd.Series(importances,index= features).sort_values()

In [47]: plt.figure(figsize=(10,10))
feat_imp.plot(kind='barh')
plt.xlabel("Gini Importance")
plt.ylabel("Feature")
plt.title("Feature Importance");
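
(Impurity-based importances can be biased toward features with many distinct values; a hypothetical cross-check with permutation importance on the held-out set, assuming scikit-learn >= 0.22 for sklearn.inspection:)

In [ ]: from sklearn.inspection import permutation_importance

        result = permutation_importance(decision_tree.best_estimator_,
                                        x_test, y_test, n_repeats=10, random_state=0)
        perm_imp = pd.Series(result.importances_mean, index=x_test.columns).sort_values()
        perm_imp.plot(kind='barh');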

Implementing Random Forest Classifier
In [48]: model = RandomForestClassifier()

In [49]: parameter = {"max_depth": [3,4,5,6,7,8],
                      "min_samples_split": [3,4,5,6,7,8],
                      "min_samples_leaf": [3,4,5,6,7,8],
                      "n_estimators": [50,70,90,100],
                      "criterion": ["gini","entropy"]}

In [50]: forest = GridSearchCV(model, parameter, cv=5, n_jobs=-1)
         forest.fit(x_train,y_train)

Out[50]: GridSearchCV
         estimator: RandomForestClassifier

In [51]: forest.best_params_

Out[51]: {'criterion': 'gini',
          'max_depth': 8,
          'min_samples_leaf': 4,
          'min_samples_split': 4,
          'n_estimators': 70}

In [52]: forest.best_score_

Out[52]: 0.8618750000000001

In [53]: # Note: this cell and the next call decision_tree.predict rather than
         # forest.predict, so the metrics below simply repeat the decision-tree
         # results; a corrected sketch follows the metrics.
         y_train_pred = decision_tree.predict(x_train)

In [54]: y_test_pred = decision_tree.predict(x_test)

Accuracy
In [55]: round(accuracy_score(y_train,y_train_pred)*100,2)

Out[55]: 86.9

In [56]: round(accuracy_score(y_test,y_test_pred)*100,2)

Out[56]: 85.75

In [57]: print("F1 Score of the Model is =>", f1_score(y_test, y_test_pred, average="micro"))
         print("Recall Score of the Model is =>", recall_score(y_test, y_test_pred, average="micro"))
         print("Precision Score of the Model is =>", precision_score(y_test, y_test_pred, average="micro"))

F1 Score of the Model is => 0.8575
Recall Score of the Model is => 0.8575
Precision Score of the Model is => 0.8575
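
(As flagged above, these numbers come from the decision-tree predictions. A corrected, hypothetical sketch that evaluates the random forest itself:)

In [ ]: y_test_pred_rf = forest.predict(x_test)
        print("Test accuracy =>", round(accuracy_score(y_test, y_test_pred_rf) * 100, 2))
        print("F1 (churn class) =>", f1_score(y_test, y_test_pred_rf))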

Finding Importance of Features in Random Forest Classifier.

In [58]: features = x_train.columns
         importances = forest.best_estimator_.feature_importances_

In [59]: feat_imp = pd.Series(importances,index= features).sort_values()

In [60]: plt.figure(figsize=(10,10))
feat_imp.plot(kind='barh')
plt.xlabel("Gini Importance")
plt.ylabel("Feature")
plt.title("Feature Importance");
