

Logistic Regression
Peter Caya
April 25, 2017

# This is a code block


library(ggplot2) # Data visualization
library(readr) # CSV file I/O, e.g. the read_csv function
library(caret)

## Loading required package: lattice

library(dplyr)

##
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
##
##     filter, lag

## The following objects are masked from 'package:base':
##
##     intersect, setdiff, setequal, union

[1] Working with the Data


a. Cleaning the Data
The data was loaded. While there were no missing values, the data was cleaned in order to address the following:
* PAY_# values of -2, which do not appear in the data dictionary
* Negative bill amounts
* EDUCATION codes outside the documented categories (graduate school, university, high school)
* MARRIAGE values other than married or single


# This R environment comes with all of CRAN preinstalled, as well as many other helpful packages.
# The environment is defined by the kaggle/rstats docker image: https://github.com/kaggle/docker-rstats
# For example, here are several helpful packages to load in.

library(ggplot2) # Data visualization


library(readr) # CSV file I/O, e.g. the read_csv function
library(dplyr)
library(caret)
library(reshape2)
library(knitr)
set.seed(13)

raw_data <- read.csv("UCI_Credit_Card.csv")

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory.

# The purpose of this script is to do the following:
# [1] Identify any duplications or missing observations in the dataset.
# [2] Observe the way that the payment and limit variables change based on the variables of
#     - Sex
#     - Education
#     - Marriage
#     - Age groups
# [3] Observe the amount of defaults that occur based on the previously mentioned variables.
# [4] Identify patterns in the Pay, Bill and Pay_AMT variables for the defaulted and non-defaulted datasets.
# [5] Get percentiles for different variables.

# [1] ---------------------------------------------------------------------
# 25 variables with 30,000 observations. No duplicates were observed. There were also
# no missing values, so this appears to be a rather clean dataset.

print(dim(raw_data))

## [1] 30000 25

print(sum(duplicated(raw_data)))

## [1] 0


count_missing <- apply(X = raw_data,MARGIN = 2,FUN = function(X){sum(is.na(X))})


sum(count_missing)

## [1] 0

apply(X = raw_data, MARGIN = 2,FUN = class)

## ID LIMIT_BAL
## "numeric" "numeric"
## SEX EDUCATION
## "numeric" "numeric"
## MARRIAGE AGE
## "numeric" "numeric"
## PAY_0 PAY_2
## "numeric" "numeric"
## PAY_3 PAY_4
## "numeric" "numeric"
## PAY_5 PAY_6
## "numeric" "numeric"
## BILL_AMT1 BILL_AMT2
## "numeric" "numeric"
## BILL_AMT3 BILL_AMT4
## "numeric" "numeric"
## BILL_AMT5 BILL_AMT6
## "numeric" "numeric"
## PAY_AMT1 PAY_AMT2
## "numeric" "numeric"
## PAY_AMT3 PAY_AMT4
## "numeric" "numeric"
## PAY_AMT5 PAY_AMT6
## "numeric" "numeric"
## default.payment.next.month
## "numeric"

Note that the PAY_# variables contain values of -2 that are not mentioned in the data dictionary. Looking at the data, you can further see that there are fewer instances of 1 than we would expect; in the case of PAY_5 and PAY_6 there are no 1s at all. My guess is that these were mislabeled somehow, so I correct the mistake below along with some other transformations of the data.
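
A quick way to see both issues is to tabulate each repayment-status column; the snippet below is a small sketch of my own and is not part of the original script.

# Sketch (not in the original script): tabulate every PAY_# column to show the
# undocumented -2 values and the missing 1s in PAY_5 and PAY_6.
lapply(raw_data[, grep(names(raw_data), pattern = "PAY_\\d")], table)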

pay_names <- grep(names(raw_data),pattern = "PAY_\\d")

for(i in pay_names){
raw_data[,i] <- ifelse(raw_data[,i]==-2,1,raw_data[,i])
}


bill_names <- grep(names(raw_data),pattern = "BILL_AMT\\d")

for(i in bill_names){
raw_data[,i] <- ifelse(raw_data[,i]<0, -raw_data[,i] ,raw_data[,i])
}

exp_data <- raw_data %>% mutate(SEX = factor(ifelse(SEX == 1, "Male", "Female"))) %>%
             filter(EDUCATION != 0 & EDUCATION != 4 & EDUCATION != 5 & EDUCATION != 6) %>%
             mutate(EDUCATION = factor(
               ifelse(EDUCATION == 1, "GS", ifelse(EDUCATION == 2, "Uni", "HS")))) %>%
             filter(MARRIAGE == 1 | MARRIAGE == 2) %>%
             mutate(MARRIAGE = factor(ifelse(MARRIAGE == 1, "Married", "Single"))) %>%
             mutate(Default =
               factor(ifelse(default.payment.next.month == 1, "Default", "Reg"))) %>%
             mutate_at(.cols = grep(names(raw_data), pattern = "PAY_\\d"), .funs = factor)

exp_data <- exp_data %>% mutate(AGE_Range = factor(
                           ifelse(AGE >= 20 & AGE <= 30, "20-30",
                           ifelse(AGE >= 30 & AGE <= 40, "30-40",
                           ifelse(AGE >= 40 & AGE <= 50, "40-50",
                           ifelse(AGE >= 50 & AGE <= 60, "50-60", "70-80"))))))

raw_data <- raw_data %>%
             # filter(EDUCATION != 0 & EDUCATION != 4 & EDUCATION != 5 & EDUCATION != 6) %>%
             mutate(EDUCATION = ifelse(EDUCATION == 0, 4, ifelse(EDUCATION >= 4, 4, EDUCATION))) %>%
             filter(MARRIAGE == 1 | MARRIAGE == 2) %>%
             mutate(MARRIAGE = as.factor(MARRIAGE)) %>%
             mutate(SEX = as.factor(SEX)) %>%
             mutate(EDUCATION = as.factor(EDUCATION)) %>%
             select(-ID) %>%
             mutate(Default = as.factor(default.payment.next.month)) %>%
             select(-default.payment.next.month)

for_testing <- raw_data

NOTE TO SELF: Create one variable from the payment data showing the number of months that were missed as
of payment 4!
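
One way that feature could be built is sketched below; this is my own illustration (treating a repayment status greater than 0 as a missed month) and is not used in the analysis that follows.

# Sketch (not part of the original analysis): count how many of the first four
# repayment-status columns (PAY_0, PAY_2, PAY_3, PAY_4) show a delay (> 0).
delay_cols <- c("PAY_0", "PAY_2", "PAY_3", "PAY_4")
months_missed <- rowSums(raw_data[, delay_cols] > 0)
table(months_missed)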


bill_data <- exp_data %>% select( SEX, Default, contains("BILL") )

pmt_data <- exp_data %>% select( SEX, Default, contains("PAY_AMT") )

ggplot(melt(bill_data, id.vars = c("SEX","Default"))) + geom_boxplot(aes(x = variable, y = value)) +
             xlab("Gender") + ylab("Amount") + facet_grid(SEX ~ Default)

ggplot(melt(pmt_data, id.vars = c("SEX","Default"))) + geom_boxplot(aes(x = variable, y = value)) +
             xlab("Gender") + ylab("Amount") + facet_grid(SEX ~ Default)


bill_data <- exp_data %>% select( EDUCATION, Default, contains("BILL") )

pmt_data <- exp_data %>% select( EDUCATION, Default, contains("PAY_AMT") )

ggplot(melt(bill_data, id.vars = c("EDUCATION","Default"))) + geom_boxplot(aes(x = variable, y = value)) +
             xlab("Education") + ylab("Amount") + facet_grid(EDUCATION ~ Default)


ggplot(melt(pmt_data, id.vars = c("EDUCATION","Default"))) + geom_boxplot(aes(x = variable, y = value)) +
             xlab("Education") + ylab("Amount") + facet_grid(EDUCATION ~ Default)


bill_data <- exp_data %>% select( MARRIAGE, Default, contains("BILL") )

pmt_data <- exp_data %>% select( MARRIAGE, Default, contains("PAY_AMT") )

ggplot(melt(bill_data, id.vars = c("MARRIAGE","Default"))) + geom_boxplot(aes(x = variable, y = value)) +
             xlab("MARRIAGE") + ylab("Amount") + facet_grid(MARRIAGE ~ Default)


ggplot(melt(pmt_data, id.vars = c("MARRIAGE","Default"))) + geom_boxplot(aes(x = variable, y = value)) +
             xlab("MARRIAGE") + ylab("Amount") + facet_grid(MARRIAGE ~ Default)


# Summarize age:
summary(exp_data$AGE)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##   21.00   28.00   34.00   35.39   41.00   79.00

ggplot(exp_data) + geom_histogram(aes(AGE), bins = 50) + xlab("Age") + ylab("Observations")


bill_data <- exp_data %>% select( AGE_Range, Default, contains("BILL") )

pmt_data <- exp_data %>% select( AGE_Range, Default, contains("PAY_AMT") )

ggplot(melt(bill_data, id.vars = c("AGE_Range","Default"))) + geom_boxplot(aes(x = variable, y = value)) +
             xlab("AGE_Range") + ylab("Amount") + facet_grid(AGE_Range ~ Default)


ggplot(melt(pmt_data, id.vars = c("AGE_Range","Default"))) + geom_boxplot(aes(x = variable, y = value)) +
             xlab("AGE_Range") + ylab("Amount") + facet_grid(AGE_Range ~ Default)


# [3] ---------------------------------------------------------------------
# We see that defaults make up ~22% of the data.
sum(exp_data$default.payment.next.month==1)/nrow(exp_data)

## [1] 0.2233652

exp_data %>% group_by(SEX, Default) %>%
             summarise("Default Status Breakdown" = n()) %>%
             mutate( `P(Default|Sex)` = `Default Status Breakdown`/sum(`Default Status Breakdown`)) %>%
             filter(Default == "Default")


## Source: local data frame [2 x 4]


## Groups: SEX [2]
##
## SEX Default `Default Status Breakdown` `P(Default|Sex)`
## <fctr> <fctr> <int> <dbl>
## 1 Female Default 3697 0.2102001
## 2 Male Default 2817 0.2433693

exp_data %>% group_by(EDUCATION, Default) %>%
             summarise("Default Status Breakdown" = n()) %>%
             mutate( `P(Default|Education)` = `Default Status Breakdown`/sum(`Default Status Breakdown`)) %>%
             filter(Default == "Default")

## Source: local data frame [3 x 4]


## Groups: EDUCATION [3]
##
## EDUCATION Default `Default Status Breakdown` `P(Default|Education)`
## <fctr> <fctr> <int> <dbl>
## 1 GS Default 2023 0.1920995
## 2 HS Default 1206 0.2528302
## 3 Uni Default 3285 0.2369788

exp_data %>% group_by(MARRIAGE, Default) %>%
             summarise("Default Status Breakdown" = n()) %>%
             mutate( `P(Default|MARRIAGE)` = `Default Status Breakdown`/sum(`Default Status Breakdown`)) %>%
             filter(Default == "Default")

## Source: local data frame [2 x 4]


## Groups: MARRIAGE [2]
##
## MARRIAGE Default `Default Status Breakdown` `P(Default|MARRIAGE)`
## <fctr> <fctr> <int> <dbl>
## 1 Married Default 3189 0.2375419
## 2 Single Default 3325 0.2112721

ggplot(exp_data)+geom_bar(aes(Default))+ coord_flip()


ggplot(exp_data) + geom_bar(aes(Default)) + facet_grid( SEX ~. ) + coord_flip() +
             ylab("Observations") + xlab("Status")


ggplot(exp_data) + geom_bar(aes(Default)) + facet_grid( EDUCATION ~. ) + coord_flip() +
             ylab("Observations") + xlab("Status")


ggplot(exp_data) + geom_bar(aes(Default)) + facet_grid( MARRIAGE ~. ) + coord_flip() +
             ylab("Observations") + xlab("Status")


ggplot(exp_data) + geom_bar(aes(Default)) + facet_grid( PAY_0 ~. ) + coord_flip() +
             ylab("Observations") + xlab("Status")


ggplot(exp_data) + geom_bar(aes(Default)) + facet_grid( PAY_2 ~. ) + coord_flip() +
             ylab("Observations") + xlab("Status")


ggplot(exp_data) + geom_bar(aes(Default)) + facet_grid( PAY_3 ~. ) + coord_flip() +
             ylab("Observations") + xlab("Status")


ggplot(exp_data) + geom_bar(aes(Default)) + facet_grid( PAY_4 ~. ) + coord_flip() +
             ylab("Observations") + xlab("Status")


ggplot(exp_data) + geom_bar(aes(Default)) + facet_grid( PAY_5 ~. ) + coord_flip() +
             ylab("Observations") + xlab("Status")


ggplot(exp_data) + geom_bar(aes(Default)) + facet_grid( PAY_6 ~. ) + coord_flip() +
             ylab("Observations") + xlab("Status")


# [4] ---------------------------------------------------------------------

# Create quantile helper functions (20th, 40th, 60th and 80th percentiles):

q1 <- function(x){quantile(x = x, probs = .2)}
q2 <- function(x){quantile(x = x, probs = .4)}
q3 <- function(x){quantile(x = x, probs = .6)}
q4 <- function(x){quantile(x = x, probs = .8)}

hold <- exp_data %>% group_by(Default) %>%
             summarise_if(.predicate = is.numeric, funs(min, q1, q2, median, q3, q4, max))
hold <- melt(hold, id.vars = "Default")

pay_names <- names(exp_data)[grep(names(exp_data), pattern = "PAY_\\d")]

# exp_data %>% group_by_(i,"Default") %>% summarise(Observations = n()) %>% group_by_(i) %>% mutate(pct = Observations/sum(Observations))

k <- 1
pay_table <- list()
for(i in pay_names){
  cur_table <- exp_data %>% group_by_(i, "Default") %>%
                 summarise(Observations = n()) %>%
                 group_by_(i) %>%
                 mutate(pct = Observations/sum(Observations)) %>%
                 filter(Default == "Default")
  names(cur_table)[1] <- c("Category")
  pay_table[[k]] <- cur_table[c(1,4)]
  k <- k + 1
}

for(i in 1:length(pay_table)){
  if(i == 1){
    disp_table <- pay_table[[i]]
    names(disp_table)[ncol(disp_table)] <- pay_names[i]
  } else {
    disp_table <- merge(disp_table, pay_table[[i]], by = "Category", all = TRUE)
    names(disp_table)[ncol(disp_table)] <- pay_names[i]
  }
}

kable(disp_table)

Category      PAY_0      PAY_2      PAY_3      PAY_4      PAY_5      PAY_6

       0  0.1285844  0.1598030  0.1759120  0.1846231  0.1896783  0.1896923
       1  0.2537885  0.1853156  0.1881761  0.1956470  0.1992302  0.2032040
      -1  0.1691203  0.1619289  0.1581315  0.1622744  0.1647538  0.1718219
       2  0.6968187  0.5585492  0.5159659  0.5220470  0.5427911  0.5089974
       3  0.7667732  0.6172840  0.5751073  0.6193182  0.6363636  0.6448087
       4  0.6842105  0.5052632  0.5866667  0.6666667  0.6071429  0.6250000
       5  0.5416667  0.6000000  0.5714286  0.5142857  0.5882353  0.5384615
       6  0.5454545  0.7500000  0.6086957  0.4000000  0.7500000  0.7222222
       7  0.7777778  0.6000000  0.8076923  0.8214286  0.8214286  0.8222222
       8  0.5789474         NA  0.6666667  0.5000000  1.0000000  1.0000000

b. Exploratory Analysis
In the exploratory analysis I will attempt to answer a few questions I have regarding this dataset.

How strongly do the payment status variables determine default behavior?

The variable description documentation states that PAY_0 is: "Repayment status in September, 2005 (-1 = pay duly, 1 = payment delay for one month, 2 = payment delay for two months, 8 = payment delay for eight months, 9 = payment delay for nine months and above)."

PAY_2, PAY_3, PAY_4, PAY_5, and PAY_6 represent the same metric for the months of August, July, June, May,
and April respectively. How does the number of defaults change as we move from the earliest month to the latest?
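
One rough way to look at this question, assuming that a repayment status greater than 0 indicates a delayed payment, is to count the delayed accounts in each month from PAY_6 (April) to PAY_0 (September). The snippet below is a sketch of my own and is not part of the original script.

# Sketch: number of accounts with a reported payment delay (status > 0) in each
# month; pay_names was defined above.
sapply(exp_data[, pay_names], function(x) sum(as.numeric(as.character(x)) > 0))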

[2] Applying Logistic Regression.


train_portion = .8

train_index <- createDataPartition(for_testing$Default, p = train_portion, list = FALSE, times = 1)
training_data <- for_testing[train_index,]
testing_data <- for_testing[-train_index,]
# training_data <- upSample(x = training_data[-24], training_data[,24], list = FALSE, yname = "Default")

To start this analysis I will apply logistic regression using R's caret package. In this step the model is fit with repeated 10-fold cross-validation.


fitControl <- trainControl(method = "repeatedcv",
                           number = 10,
                           repeats = 10)

training_data$AGE <- as.numeric( training_data$AGE )

def <- training_data$Default

# fact <- training_data$BILL_AMT1
# fact2 <- training_data$BILL_AMT2
# fact3 <- training_data$SEX
# fact4 <- training_data$EDUCATION
#
# hold1 = glm(data = training_data, formula = Default ~ . , family = binomial(link = "logit") )
#
# hold2 = glm(data = training_data, formula = Default ~ . , family = binomial(link = "logit") )
#
# predict(object = hold, )

log_model <- train(Default ~ . , data = training_data,
                   method = "glm", family = binomial(link = "logit"),
                   trControl = fitControl)

# vars <- training_data[1:11]
# y = training_data$Default
# nb_model <- train(vars, y, 'nb', trControl = trainControl(method = 'cv', number = 10))
# log_model <- train( ,
#                    method = "nb", trControl = fitControl)

[3] Model Accuracy.



library(mlbench)

train_preds <- predict(log_model, newdata = training_data)
test_preds <- predict(log_model, newdata = testing_data)

confusionMatrix(train_preds, training_data$Default)

## Confusion Matrix and Statistics


##
## Reference
## Prediction 0 1
## 0 17998 4057
## 1 463 1181
##
## Accuracy : 0.8093
## 95% CI : (0.8042, 0.8143)
## No Information Rate : 0.779
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2657
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9749
## Specificity : 0.2255
## Pos Pred Value : 0.8161
## Neg Pred Value : 0.7184
## Prevalence : 0.7790
## Detection Rate : 0.7594
## Detection Prevalence : 0.9306
## Balanced Accuracy : 0.6002
##
## 'Positive' Class : 0
##

confusionMatrix(test_preds,testing_data$Default)


## Confusion Matrix and Statistics


##
## Reference
## Prediction 0 1
## 0 4493 1005
## 1 122 304
##
## Accuracy : 0.8098
## 95% CI : (0.7995, 0.8197)
## No Information Rate : 0.779
## P-Value [Acc > NIR] : 3.697e-09
##
## Kappa : 0.2714
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9736
## Specificity : 0.2322
## Pos Pred Value : 0.8172
## Neg Pred Value : 0.7136
## Prevalence : 0.7790
## Detection Rate : 0.7584
## Detection Prevalence : 0.9281
## Balanced Accuracy : 0.6029
##
## 'Positive' Class : 0
##

[4] Summary of Results
