Week 6 - SWI - MLP - LogisticRegression.ipynb - Colaboratory
Introduction
Logistic Regression (also called Logit Regression) is commonly used to estimate the probability
that an instance belongs to a particular class.
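Formally, for a feature vector $\mathbf{x}$ with learned weights $\boldsymbol{\theta}$, the model passes the weighted sum through the sigmoid (logistic) function and predicts the positive class when the estimated probability reaches 0.5:

$$\hat{p} = \sigma(\boldsymbol{\theta}^{\top}\mathbf{x}) = \frac{1}{1 + e^{-\boldsymbol{\theta}^{\top}\mathbf{x}}}, \qquad \hat{y} = \begin{cases} 1 & \text{if } \hat{p} \geq 0.5 \\ 0 & \text{otherwise.} \end{cases}$$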
Attribute Information:
The Cleveland heart disease data contains 14 attributes: age, sex, cp (chest pain type), trestbps (resting blood pressure), chol (serum cholesterol), fbs (fasting blood sugar > 120 mg/dl), restecg (resting ECG results), thalach (maximum heart rate achieved), exang (exercise-induced angina), oldpeak (ST depression induced by exercise), slope (slope of the peak exercise ST segment), ca (number of major vessels colored by fluoroscopy), thal, and num (the diagnosis target).
Reference: UCI Machine Learning Repository, Heart Disease Data Set (https://archive.ics.uci.edu/ml/datasets/heart+disease)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
#Define the column names
cols = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']
# Load the dataset
heart_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data', names=cols, na_values='?')
heart_data
      age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  slope  ...
0    63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3    3.0  ...
1    67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5    2.0  ...
2    67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6    2.0  ...
3    37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5    3.0  ...
4    41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4    1.0  ...
..    ...  ...  ...       ...    ...  ...      ...      ...    ...      ...    ...  ...
298  45.0  1.0  1.0     110.0  264.0  0.0      0.0    132.0    0.0      1.2    2.0  ...
299  68.0  1.0  4.0     144.0  193.0  1.0      0.0    141.0    0.0      3.4    2.0  ...
300  57.0  1.0  4.0     130.0  131.0  0.0      0.0    115.0    1.0      1.2    2.0  ...
301  57.0  0.0  2.0     130.0  236.0  0.0      2.0    174.0    0.0      0.0    2.0  ...
302  38.0  1.0  3.0     138.0  175.0  0.0      0.0    173.0    0.0      0.0    1.0  ...

[303 rows x 14 columns]
# to check the type of data variable
type(heart_data)
pandas.core.frame.DataFrame
# Display first five rows of the dataset
heart_data.head() #head is first 5 rows
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  slope   ca  ...
0  63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3    3.0  0.0  ...
1  67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5    2.0  3.0  ...
2  67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6    2.0  2.0  ...
3  37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5    3.0  0.0  ...
4  41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4    1.0  0.0  ...
# Display last five rows of the dataset
heart_data.tail() #tail is last 5 rows
      age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  slope  ...
298  45.0  1.0  1.0     110.0  264.0  0.0      0.0    132.0    0.0      1.2    2.0  ...
299  68.0  1.0  4.0     144.0  193.0  1.0      0.0    141.0    0.0      3.4    2.0  ...
300  57.0  1.0  4.0     130.0  131.0  0.0      0.0    115.0    1.0      1.2    2.0  ...
301  57.0  0.0  2.0     130.0  236.0  0.0      2.0    174.0    0.0      0.0    2.0  ...
302  38.0  1.0  3.0     138.0  175.0  0.0      0.0    173.0    0.0      0.0    1.0  ...
for feature in cols:
    # drop missing values (ca and thal contain NaNs) so hist() doesn't fail
    plt.hist(heart_data[feature].dropna())
    plt.title(feature)
    # display histogram
    plt.show()
# The target 'num' originally takes values 0-4; map classes 2, 3 and 4 to 1
# so the task becomes binary (0 = no disease, 1 = disease present)
heart_data = heart_data.replace({"num": {2: 1, 3: 1, 4: 1}})
# Visualize the label
heart_data
      age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  slope  ...
0    63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3    3.0  ...
1    67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5    2.0  ...
2    67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6    2.0  ...
3    37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5    3.0  ...
4    41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4    1.0  ...
..    ...  ...  ...       ...    ...  ...      ...      ...    ...      ...    ...  ...
298  45.0  1.0  1.0     110.0  264.0  0.0      0.0    132.0    0.0      1.2    2.0  ...
299  68.0  1.0  4.0     144.0  193.0  1.0      0.0    141.0    0.0      3.4    2.0  ...
300  57.0  1.0  4.0     130.0  131.0  0.0      0.0    115.0    1.0      1.2    2.0  ...
301  57.0  0.0  2.0     130.0  236.0  0.0      2.0    174.0    0.0      0.0    2.0  ...
302  38.0  1.0  3.0     138.0  175.0  0.0      0.0    173.0    0.0      0.0    1.0  ...

[303 rows x 14 columns]
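To confirm that the relabelling produced a binary target, a quick check such as the following can be run (a minimal sketch; this cell is not part of the original run):
# Count each class in the now-binary target: 0 = no disease, 1 = disease present
print(heart_data['num'].value_counts())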
imputer = SimpleImputer(missing_values = np.nan, strategy ='mean')
imputer = imputer.fit(heart_data)
heart_imputed = imputer.transform(heart_data)
heart_data_imputed = pd.DataFrame(heart_imputed, columns = cols)
heart_data_imputed
      age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  slope  ...
0    63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3    3.0  ...
1    67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5    2.0  ...
2    67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6    2.0  ...
3    37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5    3.0  ...
4    41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4    1.0  ...
..    ...  ...  ...       ...    ...  ...      ...      ...    ...      ...    ...  ...
298  45.0  1.0  1.0     110.0  264.0  0.0      0.0    132.0    0.0      1.2    2.0  ...
299  68.0  1.0  4.0     144.0  193.0  1.0      0.0    141.0    0.0      3.4    2.0  ...
300  57.0  1.0  4.0     130.0  131.0  0.0      0.0    115.0    1.0      1.2    2.0  ...
301  57.0  0.0  2.0     130.0  236.0  0.0      2.0    174.0    0.0      0.0    2.0  ...
302  38.0  1.0  3.0     138.0  175.0  0.0      0.0    173.0    0.0      0.0    1.0  ...

[303 rows x 14 columns]
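In this dataset only the ca and thal columns contain missing values (recorded as '?' in the raw file and parsed to NaN when loading). A quick check makes that visible (a minimal sketch, not part of the original run):
# Number of missing values per column; only ca and thal should be non-zero
print(heart_data.isna().sum())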
# Assign a new variable y as target
y = heart_data_imputed['num']
y = np.array(y)
# display the label array
y
array([0., 1., 1., 0., 0., 0., 1., 0., 1., 1., 0., 0., 1., 0., 0., 0., 1.,
0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0.,
0., 0., 1., 1., 1., 0., 1., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0.,
0., 1., 0., 1., 1., 1., 1., 0., 0., 1., 0., 1., 0., 1., 1., 1., 0.,
1., 1., 0., 1., 1., 1., 1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0.,
0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0.,
0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 0., 0., 1.,
1., 1., 1., 0., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
1., 1., 1., 0., 0., 1., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0.,
1., 0., 1., 0., 1., 1., 0., 1., 0., 0., 1., 1., 0., 0., 1., 0., 0.,
1., 1., 1., 0., 1., 1., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.,
0., 1., 1., 1., 0., 1., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 1., 1.,
0., 0., 0., 0., 0., 1., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0.,
0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 0.,
1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0., 1., 1., 1., 0., 0.,
0., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0.])
# Remove the target variable from heart_data
del heart_data_imputed['num']
heart_data_imputed.describe()
plt.figure(figsize=[15,15])
sns.heatmap(heart_data_imputed.corr(),annot = True, square = True)
plt.show()
# Let us split the data for training and testing (75/25: 227 training rows, 76 test rows)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(heart_data_imputed, y, test_size=0.25)
print('Shape of training data',X_train.shape)
print('Shape of training labels', y_train.shape)
print('Shape of testing data', X_test.shape)
print('Shape of testing labels',y_test.shape)
X_train
      age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  slope  ...
173  62.0  0.0  4.0     140.0  394.0  0.0      2.0    157.0    0.0      1.2    2.0  ...
261  58.0  0.0  2.0     136.0  319.0  1.0      2.0    152.0    0.0      0.0    1.0  ...
37   57.0  1.0  4.0     150.0  276.0  0.0      2.0    112.0    1.0      0.6    2.0  ...
101  34.0  1.0  1.0     118.0  182.0  0.0      2.0    174.0    0.0      0.0    1.0  ...
166  52.0  1.0  3.0     138.0  223.0  0.0      0.0    169.0    0.0      0.0    1.0  ...
..    ...  ...  ...       ...    ...  ...      ...      ...    ...      ...    ...  ...
251  58.0  1.0  4.0     146.0  218.0  0.0      0.0    105.0    0.0      2.0    2.0  ...
192  43.0  1.0  4.0     132.0  247.0  1.0      2.0    143.0    1.0      0.1    2.0  ...
117  35.0  0.0  4.0     138.0  183.0  0.0      0.0    182.0    0.0      1.4    1.0  ...
47   50.0  1.0  4.0     150.0  243.0  0.0      2.0    128.0    0.0      2.6    2.0  ...
172  59.0  0.0  4.0     174.0  249.0  0.0      0.0    143.0    1.0      0.0    2.0  ...
[227 rows x 13 columns]

As there is a wide variation among the numerical values between features, it is best practice to normalize the features before training.
# Instantiate the scaler to a variable and fit the train and test data
ss = StandardScaler()
X_train_norm = ss.fit_transform(X_train)
X_test_norm = ss.transform(X_test)
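StandardScaler learns each feature's mean and standard deviation from the training split only and reuses those statistics on the test split, so no information from the test set leaks into training. For illustration, a minimal sketch of the equivalent manual computation (the X_train_manual name is hypothetical):
# Column-wise standardization: z = (x - mean_train) / std_train;
# ddof=0 matches StandardScaler's population standard deviation
X_train_manual = (X_train - X_train.mean()) / X_train.std(ddof=0)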
Perform Classification
LR = LogisticRegression()
classifier=LR.fit(X_train_norm, y_train)
score = LR.score(X_train_norm, y_train)
print("Training score: ", score)
#Make the prediction
y_pred = LR.predict(X_test_norm)
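Because logistic regression estimates class probabilities (as noted in the introduction), the probabilities behind these hard predictions can also be inspected; a minimal sketch:
# Class probabilities for the first five test samples:
# column 0 is P(num = 0), column 1 is P(num = 1)
print(LR.predict_proba(X_test_norm[:5]))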
# Import the libraries
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
Confusion Matrix
# visualizing the confusion matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
class_names = ["0", "1"]
plot_confusion_matrix(classifier, X_test_norm, y_test, display_labels=class_names, cmap=plt.cm.Blues)
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names, rotation=0)
plt.yticks(tick_marks, class_names)
plt.title('Confusion matrix')
plt.show()
/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning
warnings.warn(msg, category=FutureWarning)
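The FutureWarning appears because plot_confusion_matrix (and plot_roc_curve below) are deprecated and were removed in scikit-learn 1.2. On newer versions the equivalent display classes produce the same plot; a minimal sketch:
# Replacement API available since scikit-learn 1.0
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(classifier, X_test_norm, y_test,
                                      display_labels=class_names, cmap=plt.cm.Blues)
plt.title('Confusion matrix')
plt.show()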
CR = classification_report(y_test, y_pred)
print('Classification report \n')
print(CR)
Classification report

    accuracy                           0.82        76
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
RandomizedSearchCV
# Create a hyperparameter grid for LogisticRegression
log_reg_grid_rs = {"C": np.logspace(-4, 4, 20),
"solver": ["liblinear"]}
# Tune LogisticRegression
np.random.seed(42)
# Setup random hyperparameter search for LogisticRegression with cross-validation
RS_log_reg = RandomizedSearchCV(LogisticRegression(),
param_distributions=log_reg_grid_rs,
cv=5,
n_iter=20,
verbose=True)
# Fit random hyperparameter search model for LogisticRegression
RS_log_reg.fit(X_train_norm, y_train)
# Inspect the LogisticRegression documentation
?LogisticRegression
# Preview the 20 log-spaced candidate values for C (from 1e-4 to 1e4)
np.logspace(-4, 4, 20)
# Find the best hyperparameters
RS_log_reg.best_params_
RS_log_reg.score(X_train_norm, y_train)
0.8678414096916299
# Make predictions with tuned model
y_preds = RS_log_reg.predict(X_test_norm)
# Confusion matrix
print(confusion_matrix(y_test, y_preds))
[[36 4]
[10 26]]
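From this confusion matrix, the tuned model classifies 36 + 26 = 62 of the 76 test samples correctly, an accuracy of roughly 0.816. The same figure can be computed directly; a minimal sketch:
# (TN + TP) / total = (36 + 26) / 76 ≈ 0.816
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_preds))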
?plot_roc_curve
# Plot ROC curve and calculate AUC metric
plot_roc_curve(RS_log_reg, X_test_norm, y_test)
/usr/local/lib/python3.7/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning
warnings.warn(msg, category=FutureWarning)
<sklearn.metrics._plot.roc_curve.RocCurveDisplay at 0x7f8a13f6d190>
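The AUC shown on the plot can also be computed directly from the tuned model's positive-class probabilities; a minimal sketch:
# AUC from predicted probabilities of the positive class
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_test, RS_log_reg.predict_proba(X_test_norm)[:, 1]))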
GridSearchCV
# Setup grid hyperparameter search for LogisticRegression
log_reg_grid_gs = {"C": np.logspace(-4, 4, 30),
"solver": ["liblinear"]}
GS_log_reg = GridSearchCV(LogisticRegression(),
param_grid=log_reg_grid_gs,
cv=5,
verbose=True)
# Fit grid hyperparameter search model
GS_log_reg.fit(X_train_norm, y_train);
# Preview the 30 log-spaced candidate values for C
np.logspace(-4, 4, 30)
array([1.00000000e-04, ..., 5.29831691e+03, 1.00000000e+04])
# Check the best hyperparameters
GS_log_reg.best_params_
# Evaluate the grid search LogisticRegression model on the training data
GS_log_reg.score(X_train_norm, y_train)
0.8634361233480177
# Make predictions with tuned model
y_preds = GS_log_reg.predict(X_test_norm)
# Confusion matrix
print(confusion_matrix(y_test, y_preds))
[[36 4]
[10 26]]