Change the working directory appropriately
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(reshape2)
#setwd("e:/temp/1")
setwd("C:/Senthil/MSDataAnalytics/Semester3/608/Project")
options(scipen = 999)
Data cleansing
Change data types to numeric for columns that are used in the computation
rename columns to proper names for easy understanding of data
change lookup values of gender, education, marital status to actual string values
dataLoadAndCleanup <- function(dfDefault)
{
names(dfDefault) <- tolower(names(dfDefault))
dfDefault <- transform(dfDefault, id <- as.numeric(id),
limit_bal <- as.numeric(limit_bal),
age <- as.numeric(age),
education <- as.numeric(education),
pay_0 <- as.numeric(pay_0),
pay_2 <- as.numeric(pay_2),
pay_3 <- as.numeric(pay_3),
pay_4 <- as.numeric(pay_4),
pay_5 <- as.numeric(pay_5),
pay_6 <- as.numeric(pay_5),
bill_amt1 <- as.numeric(bill_amt1),
bill_amt2 <- as.numeric(bill_amt2),
bill_amt3 <- as.numeric(bill_amt3),
bill_amt4 <- as.numeric(bill_amt4),
bill_amt5 <- as.numeric(bill_amt5),
bill_amt6 <- as.numeric(bill_amt6),
pay_amt1 <- as.numeric(pay_amt1),
pay_amt2 <- as.numeric(pay_amt2),
pay_amt3 <- as.numeric(pay_amt3),
pay_amt4 <- as.numeric(pay_amt4),
pay_amt5 <- as.numeric(pay_amt5),
pay_amt6 <- as.numeric(pay_amt6)
)
names(dfDefault)[13:18] <- c('sep_bill_amt', 'aug_bill_amt', 'jul_bill_amt', 'jun_bill_amt', 'may_bill_amt', 'apr_bill_amt')
names(dfDefault)[19:24] <- c('sep_pay_amt', 'aug_pay_amt', 'jul_pay_amt', 'jun_pay_amt', 'may_pay_amt', 'apr_pay_amt')
names(dfDefault)[25] <- c('default.expected')
#Convert numbers to actual values for sex
dfDefault$sex[which(dfDefault$sex==1)] <- 'Male'
dfDefault$sex[which(dfDefault$sex==2)] <- 'Female'
#Convert numbers to actual values for education
dfDefault$education[which(dfDefault$education==1)] <- 'Graduate'
dfDefault$education[which(dfDefault$education==2)] <- 'University'
dfDefault$education[which(dfDefault$education==3)] <- 'HighSchool'
dfDefault$education[which(dfDefault$education==4)] <- 'Others'
dfDefault$education[which(dfDefault$education==5)] <- 'Others'
dfDefault$education[which(dfDefault$education==6)] <- 'Others'
dfDefault$education[which(dfDefault$education==0)] <- 'Others'
#Convert numbers to actual values for marital status
dfDefault$marriage[which(dfDefault$marriage==1)] <- 'Married'
dfDefault$marriage[which(dfDefault$marriage==2)] <- 'Single'
dfDefault$marriage[which(dfDefault$marriage==3)] <- 'Others'
dfDefault$marriage[which(dfDefault$marriage==0)] <- 'Others'
return (dfDefault)
}
Condense function excludes columns not needed for this project
It re-arranges the columns for easy reading and working with this dataset
condense_dfDefault <- function(dfDefault)
{
excludecolumns <- paste("pay_", 2:6, sep="")
excludecolumns <- c('pay_0', excludecolumns)
excludevars <- names(dfDefault) %in% excludecolumns
dfDefault <- dfDefault[!excludevars]
rearrangedcols <- c('id', 'limit_bal', 'sex', 'education', 'marriage', 'age',
'apr_bill_amt', 'may_bill_amt', 'jun_bill_amt', 'jul_bill_amt',
'aug_bill_amt', 'sep_bill_amt',
'apr_pay_amt', 'may_pay_amt', 'jun_pay_amt',
'jul_pay_amt', 'aug_pay_amt', 'sep_pay_amt',
'default.expected')
dfDefault <- dfDefault[,rearrangedcols]
excludecolumns <- c('sex','education','marriage','age')
excludevars <- names(dfDefault) %in% excludecolumns
dfDefault <- dfDefault[!excludevars]
excludecolumns <- c('apr_pay_amt','sep_bill_amt')
excludevars <- names(dfDefault) %in% excludecolumns
dfDefault <- dfDefault[!excludevars]
#write.csv(dfDefault,'./condensedfile.csv')
return (dfDefault)
}
sample rows of data before and after cleansing
dfDefault <- read.csv(file='default of credit card clients.csv', stringsAsFactors = FALSE)
head(dfDefault)
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
## 1 1 20000 2 2 1 24 2 2 -1 -1 -2
## 2 2 120000 2 2 2 26 -1 2 0 0 0
## 3 3 90000 2 2 2 34 0 0 0 0 0
## 4 4 50000 2 2 1 37 0 0 0 0 0
## 5 5 50000 1 2 1 57 -1 0 -1 0 0
## 6 6 50000 1 1 2 37 0 0 0 0 0
## PAY_6 BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6
## 1 -2 3913 3102 689 0 0 0
## 2 2 2682 1725 2682 3272 3455 3261
## 3 0 29239 14027 13559 14331 14948 15549
## 4 0 46990 48233 49291 28314 28959 29547
## 5 0 8617 5670 35835 20940 19146 19131
## 6 0 64400 57069 57608 19394 19619 20024
## PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6
## 1 0 689 0 0 0 0
## 2 0 1000 1000 1000 0 2000
## 3 1518 1500 1000 1000 1000 5000
## 4 2000 2019 1200 1100 1069 1000
## 5 2000 36681 10000 9000 689 679
## 6 2500 1815 657 1000 1000 800
## default.payment.next.month
## 1 1
## 2 1
## 3 0
## 4 0
## 5 0
## 6 0
dfDefault <- dataLoadAndCleanup(dfDefault)
dfDefault <- condense_dfDefault(dfDefault)
head(dfDefault)
## id limit_bal apr_bill_amt may_bill_amt jun_bill_amt jul_bill_amt
## 1 1 20000 0 0 0 689
## 2 2 120000 3261 3455 3272 2682
## 3 3 90000 15549 14948 14331 13559
## 4 4 50000 29547 28959 28314 49291
## 5 5 50000 19131 19146 20940 35835
## 6 6 50000 20024 19619 19394 57608
## aug_bill_amt may_pay_amt jun_pay_amt jul_pay_amt aug_pay_amt sep_pay_amt
## 1 3102 0 0 0 689 0
## 2 1725 0 1000 1000 1000 0
## 3 14027 1000 1000 1000 1500 1518
## 4 48233 1069 1100 1200 2019 2000
## 5 5670 689 9000 10000 36681 2000
## 6 57069 1000 1000 657 1815 2500
## default.expected
## 1 1
## 2 1
## 3 0
## 4 0
## 5 0
## 6 0
Create limit group column
This column groups the credit card limits to discrete groups
dfDefault['limitgrp'] <- as.numeric(0)
dfDefault$limitgrp[dfDefault$limit_bal<=100000] <- 100000
dfDefault$limitgrp[dfDefault$limit_bal>100000 & dfDefault$limit_bal<=200000] <- 200000
dfDefault$limitgrp[dfDefault$limit_bal>200000 & dfDefault$limit_bal<=300000] <- 300000
dfDefault$limitgrp[dfDefault$limit_bal>300000 & dfDefault$limit_bal<=400000] <- 400000
dfDefault$limitgrp[dfDefault$limit_bal>400000 & dfDefault$limit_bal<=500000] <- 500000
dfDefault$limitgrp[dfDefault$limit_bal>500000 & dfDefault$limit_bal<=600000] <- 600000
dfDefault$limitgrp[dfDefault$limit_bal>600000 & dfDefault$limit_bal<=700000] <- 700000
dfDefault$limitgrp[dfDefault$limit_bal>700000 & dfDefault$limit_bal<=800000] <- 800000
dfDefault$limitgrp[dfDefault$limit_bal>800000 & dfDefault$limit_bal<=900000] <- 900000
dfDefault$limitgrp[dfDefault$limit_bal>900000 & dfDefault$limit_bal<=1000000] <- 1000000
This chart shows how many card holders are there given a credit card limit group and how many of them are defaulters.
This will help us identify if the limit should be reduced or increased to reduce the defaulters.
dfGroupedOnLimitGrp <- dfDefault %>% group_by(limitgrp) %>% summarise(Members = n(), Defaulters = sum(default.expected))
ggplot(dfGroupedOnLimitGrp, aes(x=limitgrp)) + geom_bar(aes(y=Members),stat = 'identity') +
geom_line(aes(y=Defaulters)) + geom_point(aes(y=Defaulters)) + xlab('CreditCardLimit') + ylab('Members vs Defaulters') +
ggtitle('Bar represents Members in the group and Line represents Defaulters') + theme(plot.title=element_text(family="Times", face="bold")) + scale_x_continuous(breaks = seq(100000,1000000,100000)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
This chart shows how many people paid how much of their due amount in terms of percentage. It breaks down that info by the 5 months
that are being observed.
dfDefault['apr_pay_percent'] <- as.numeric(0)
dfDefault['may_pay_percent'] <- as.numeric(0)
dfDefault['jun_pay_percent'] <- as.numeric(0)
dfDefault['jul_pay_percent'] <- as.numeric(0)
dfDefault['aug_pay_percent'] <- as.numeric(0)
#dfDefault$apr_pay_percent[dfDefault$apr_bill_amt == 0] <- 0
#dfDefault$apr_pay_percent[dfDefault$apr_bill_amt >= 0] <- dfDefault$apr_bill_amt/dfDefault$may_pay_amt
for(i in 1:length(dfDefault$id))
{
#i <-1
if(dfDefault$apr_bill_amt[i] > 0)
{
dfDefault$apr_pay_percent[i] <- (dfDefault$may_pay_amt[i]/dfDefault$apr_bill_amt[i])*100
}
if(dfDefault$may_bill_amt[i] > 0)
{
dfDefault$may_pay_percent[i] <- (dfDefault$jun_pay_amt[i]/dfDefault$may_bill_amt[i])*100
}
if(dfDefault$jun_bill_amt[i] > 0)
{
dfDefault$jun_pay_percent[i] <- (dfDefault$jul_pay_amt[i]/dfDefault$jun_bill_amt[i])*100
}
if(dfDefault$jul_bill_amt[i] > 0)
{
dfDefault$jul_pay_percent[i] <- (dfDefault$aug_pay_amt[i]/dfDefault$jul_bill_amt[i])*100
}
if(dfDefault$aug_bill_amt[i] > 0)
{
dfDefault$aug_pay_percent[i] <- (dfDefault$sep_pay_amt[i]/dfDefault$aug_bill_amt[i])*100
}
if(dfDefault$apr_pay_percent[i] >=0 & dfDefault$apr_pay_percent[i]<=20) dfDefault$apr_pay_percent[i] <- 20
if(dfDefault$apr_pay_percent[i] >20 & dfDefault$apr_pay_percent[i]<=40) dfDefault$apr_pay_percent[i] <- 40
if(dfDefault$apr_pay_percent[i] >40 & dfDefault$apr_pay_percent[i]<=60) dfDefault$apr_pay_percent[i] <- 60
if(dfDefault$apr_pay_percent[i] >60 & dfDefault$apr_pay_percent[i]<=80) dfDefault$apr_pay_percent[i] <- 80
if(dfDefault$apr_pay_percent[i] >80 ) dfDefault$apr_pay_percent[i] <- 100
if(dfDefault$may_pay_percent[i] >=0 & dfDefault$may_pay_percent[i]<=20) dfDefault$may_pay_percent[i] <- 20
if(dfDefault$may_pay_percent[i] >20 & dfDefault$may_pay_percent[i]<=40) dfDefault$may_pay_percent[i] <- 40
if(dfDefault$may_pay_percent[i] >40 & dfDefault$may_pay_percent[i]<=60) dfDefault$may_pay_percent[i] <- 60
if(dfDefault$may_pay_percent[i] >60 & dfDefault$may_pay_percent[i]<=80) dfDefault$may_pay_percent[i] <- 80
if(dfDefault$may_pay_percent[i] >80 ) dfDefault$may_pay_percent[i] <- 100
if(dfDefault$may_pay_percent[i] >=0 & dfDefault$jun_pay_percent[i]<=20) dfDefault$jun_pay_percent[i] <- 20
if(dfDefault$jun_pay_percent[i] >20 & dfDefault$jun_pay_percent[i]<=40) dfDefault$jun_pay_percent[i] <- 40
if(dfDefault$jun_pay_percent[i] >40 & dfDefault$jun_pay_percent[i]<=60) dfDefault$jun_pay_percent[i] <- 60
if(dfDefault$jun_pay_percent[i] >60 & dfDefault$jun_pay_percent[i]<=80) dfDefault$jun_pay_percent[i] <- 80
if(dfDefault$jun_pay_percent[i] >80 ) dfDefault$jun_pay_percent[i] <- 100
if(dfDefault$may_pay_percent[i] >=0 & dfDefault$jul_pay_percent[i]<=20) dfDefault$jul_pay_percent[i] <- 20
if(dfDefault$jul_pay_percent[i] >20 & dfDefault$jul_pay_percent[i]<=40) dfDefault$jul_pay_percent[i] <- 40
if(dfDefault$jul_pay_percent[i] >40 & dfDefault$jul_pay_percent[i]<=60) dfDefault$jul_pay_percent[i] <- 60
if(dfDefault$jul_pay_percent[i] >60 & dfDefault$jul_pay_percent[i]<=80) dfDefault$jul_pay_percent[i] <- 80
if(dfDefault$jul_pay_percent[i] >80 ) dfDefault$jul_pay_percent[i] <- 100
if(dfDefault$may_pay_percent[i] >=0 & dfDefault$aug_pay_percent[i]<=20) dfDefault$aug_pay_percent[i] <- 20
if(dfDefault$aug_pay_percent[i] >20 & dfDefault$aug_pay_percent[i]<=40) dfDefault$aug_pay_percent[i] <- 40
if(dfDefault$aug_pay_percent[i] >40 & dfDefault$aug_pay_percent[i]<=60) dfDefault$aug_pay_percent[i] <- 60
if(dfDefault$aug_pay_percent[i] >60 & dfDefault$aug_pay_percent[i]<=80) dfDefault$aug_pay_percent[i] <- 80
if(dfDefault$aug_pay_percent[i] >80 ) dfDefault$aug_pay_percent[i] <- 100
}
myvars <- c('id',paste(c('apr','may','jun','jul','aug'), '_pay_percent', sep=""))
newdata <- dfDefault[myvars]
dfMelted1 <- melt(newdata, id('id'))
#head(dfMelted1)
dfMelted1$variable <- gsub('_pay_percent','',dfMelted1$variable)
ggplot(dfMelted1,aes(x=factor(value))) + geom_bar() + facet_grid(~variable) + xlab('Percentage of Payment to Bill Amt')
#head(dfMelted1[which(dfMelted1$value <=0),])