Machine Learning

2021-11-17

This week we use more about model selection and we will add more variables from other datasets to our model

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5purrr   0.3.4
## ✓ tibble  3.1.2dplyr   1.0.7
## ✓ tidyr   1.1.4stringr 1.4.0
## ✓ readr   2.0.1forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(sf)
## Linking to GEOS 3.8.1, GDAL 3.2.1, PROJ 7.2.1
library(Matrix)
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(tmap)
## Registered S3 methods overwritten by 'stars':
##   method             from
##   st_bbox.SpatRaster sf  
##   st_crs.SpatRaster  sf
library(formattable)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
mortgage <-read_csv(here::here('dataset','dcr_clean.csv'))
## Rows: 619660 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): state_orig_time, month, day
## dbl (29): id, time, orig_time, first_time, mat_time, res_time, balance_time,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
mortgage$age<- mortgage$time-mortgage$orig_time
mortgage$TTM<-mortgage$mat_time-mortgage$orig_time
mortgage$date=paste(as.character(mortgage$year),as.character(mortgage$month),as.character(mortgage$day),sep='-')
mortgage_first<-mortgage[match(unique(mortgage$id), mortgage$id),]
mortgage_first$first_time_date<-mortgage_first$date
mortgage_first$first_time<-mortgage_first$time
first <- mortgage_first %>% select(id, first_time, orig_time,mat_time,rate_time,REtype_CO_orig_time,REtype_PU_orig_time,REtype_SF_orig_time,investor_orig_time,balance_orig_time:Interest_Rate_orig_time,state_orig_time,hpi_orig_time,first_time_date, age, TTM)

get_last <- mortgage %>% select(id,date,time,default_time,payoff_time,status_time) %>% group_by(id) %>% summarise_all(last)
get_last$last_time_date<-get_last$date
get_last$last_time<-get_last$time
get_last$status_last<-get_last$status_time
atlast <- get_last %>% select(id,last_time_date:status_last,default_time)
meanvalue <- mortgage %>% group_by(id) %>% summarise(interest_rate_mean = mean(interest_rate_time),gdp_mean = mean(gdp_time), risk_free_mean = mean(rate_time),hpi_mean = mean(hpi_time),uer_mean = mean(uer_time))
mortgage_all <- first %>% left_join(atlast, by = 'id') %>% left_join(meanvalue, by = 'id')
mortgage_all$time_to_GFC <- 37-mortgage_all$first_time
## to increase the accuracy, I use Lasso regression with machine learning
mortgage_ml <- na.omit(mortgage_all)
set.seed(123)
training.samples <- mortgage_ml$default_time %>% 
  createDataPartition(p = 0.8, list = FALSE)
train.data  <- mortgage_ml[training.samples, ]
test.data <- mortgage_ml[-training.samples, ]
library(glmnet)  
## Loaded glmnet 4.1-3
library(faraway)
## 
## Attaching package: 'faraway'
## The following object is masked from 'package:lattice':
## 
##     melanoma
x <- model.matrix(default_time ~ age + TTM+ last_time+ status_last+ interest_rate_mean+gdp_mean*uer_mean+ risk_free_mean+ hpi_mean+  FICO_orig_time+ hpi_mean,
                         data = train.data)[,-1]

y <- ifelse(train.data$default_time == 1, 1, 0)
glmnet(x, y, family = "binomial", alpha = 1, lambda = NULL)
## 
## Call:  glmnet(x = x, y = y, family = "binomial", alpha = 1, lambda = NULL) 
## 
##    Df  %Dev   Lambda
## 1   0  0.00 0.194000
## 2   1  2.45 0.176800
## 3   1  4.47 0.161100
## 4   1  6.15 0.146700
## 5   2  7.58 0.133700
## 6   2  9.59 0.121800
## 7   2 11.30 0.111000
## 8   2 12.76 0.101100
## 9   2 14.00 0.092160
## 10  3 15.51 0.083970
## 11  3 17.01 0.076510
## 12  4 20.71 0.069720
## 13  4 24.04 0.063520
## 14  4 26.98 0.057880
## 15  4 29.58 0.052740
## 16  4 31.90 0.048050
## 17  4 33.96 0.043780
## 18  5 35.98 0.039890
## 19  5 37.88 0.036350
## 20  5 39.58 0.033120
## 21  6 41.09 0.030180
## 22  6 42.47 0.027500
## 23  7 43.71 0.025050
## 24  8 44.82 0.022830
## 25  8 45.81 0.020800
## 26  8 46.68 0.018950
## 27  8 47.45 0.017270
## 28  8 48.11 0.015740
## 29  8 48.70 0.014340
## 30 10 49.26 0.013060
## 31 10 50.15 0.011900
## 32 10 50.97 0.010850
## 33 11 51.72 0.009882
## 34 11 52.66 0.009004
## 35 10 53.45 0.008204
## 36 10 54.07 0.007475
## 37 10 54.62 0.006811
## 38 10 55.11 0.006206
## 39 10 55.54 0.005655
## 40 10 55.92 0.005153
## 41 10 56.24 0.004695
## 42 10 56.53 0.004278
## 43 10 56.78 0.003898
## 44 10 57.00 0.003551
## 45 10 57.18 0.003236
## 46 10 57.35 0.002948
## 47 10 57.49 0.002687
## 48 10 57.61 0.002448
## 49 10 57.71 0.002230
## 50 10 57.80 0.002032
## 51 10 57.88 0.001852
## 52 10 57.94 0.001687
## 53 10 58.00 0.001537
## 54 10 58.05 0.001401
## 55 10 58.09 0.001276
## 56 11 58.15 0.001163
## 57 11 58.37 0.001060
## 58 11 58.55 0.000966
## 59 11 58.71 0.000880
## 60 11 58.84 0.000802
## 61 11 58.95 0.000730
## 62 11 59.04 0.000665
## 63 11 59.11 0.000606
## 64 11 59.17 0.000553
## 65 11 59.23 0.000503
## 66 11 59.27 0.000459
## 67 11 59.31 0.000418
## 68 11 59.34 0.000381
## 69 11 59.36 0.000347
## 70 11 59.38 0.000316
## 71 11 59.40 0.000288
## 72 11 59.42 0.000262
## 73 11 59.43 0.000239
## 74 11 59.44 0.000218
## 75 11 59.45 0.000199
## 76 11 59.46 0.000181
## 77 11 59.46 0.000165
## 78 11 59.47 0.000150
## 79 11 59.47 0.000137
## 80 11 59.48 0.000125
## 81 11 59.48 0.000114
## 82 11 59.48 0.000104
## 83 11 59.48 0.000094
## 84 11 59.48 0.000086
## 85 11 59.49 0.000078
## 86 11 59.49 0.000071
set.seed(123) 
cv.lasso <- cv.glmnet(x, y, alpha = 1, family = "binomial")
model <- glmnet(x, y, alpha = 1, family = "binomial",
                lambda = cv.lasso$lambda.min)



coef(model)
## 12 x 1 sparse Matrix of class "dgCMatrix"
##                              s0
## (Intercept)        -4.576481886
## age                -0.041744705
## TTM                 0.021109214
## last_time          -0.444417226
## status_last        -5.352373253
## interest_rate_mean  0.075824269
## gdp_mean           -7.199292425
## uer_mean            1.662571752
## risk_free_mean     -0.357088532
## hpi_mean            0.117828076
## FICO_orig_time     -0.003016795
## gdp_mean:uer_mean   0.774048073
x.test <- model.matrix(default_time ~ age + TTM+ last_time+ status_last+ interest_rate_mean+gdp_mean*uer_mean+ risk_free_mean+ hpi_mean+  FICO_orig_time+ hpi_mean, test.data)[,-1]
probabilities <- model %>% predict(newx = x.test)
predicted.classes <- ifelse(probabilities > 0.5, "default", "payback")
library(glmnet)
set.seed(123)
cv.lasso <- cv.glmnet(x, y, alpha = 1, family = "binomial")
plot(cv.lasso)

cv.lasso$lambda.min
## [1] 7.135674e-05
cv.lasso$lambda.1se
## [1] 0.0008015674
coef(cv.lasso, cv.lasso$lambda.min)
## 12 x 1 sparse Matrix of class "dgCMatrix"
##                              s1
## (Intercept)        -4.538261067
## age                -0.041706498
## TTM                 0.021097993
## last_time          -0.443585296
## status_last        -5.346884481
## interest_rate_mean  0.075829728
## gdp_mean           -7.170121997
## uer_mean            1.658933176
## risk_free_mean     -0.357121952
## hpi_mean            0.117502011
## FICO_orig_time     -0.003014254
## gdp_mean:uer_mean   0.769563666
##Using lambda.1se as the best lambda, gives the following regression coefficients:
coef(cv.lasso, cv.lasso$lambda.1se)
## 12 x 1 sparse Matrix of class "dgCMatrix"
##                              s1
## (Intercept)        -1.774807825
## age                -0.035866922
## TTM                 0.019528984
## last_time          -0.372909642
## status_last        -4.943609179
## interest_rate_mean  0.067435512
## gdp_mean           -4.050056636
## uer_mean            1.474202368
## risk_free_mean     -0.352693658
## hpi_mean            0.089116206
## FICO_orig_time     -0.002741908
## gdp_mean:uer_mean   0.272358675
# Final model with lambda.min
lasso.model <- glmnet(x, y, alpha = 1, family = "binomial",
                      lambda = cv.lasso$lambda.min)
# Make prediction on test data
x.test <- model.matrix(default_time ~ age + TTM+ last_time+ status_last+ interest_rate_mean+gdp_mean*uer_mean+ risk_free_mean+ hpi_mean+  FICO_orig_time+ hpi_mean, test.data)[,-1]
probabilities <- lasso.model %>% predict(newx = x.test)
predicted.classes <- ifelse(probabilities > 0.5, 1, 0)
# Model accuracy
observed.classes <- test.data$default_time
mean(predicted.classes == observed.classes)
## [1] 0.9334605
# Final model with lambda.1se
lasso.model <- glmnet(x, y, alpha = 1, family = "binomial",
                      lambda = cv.lasso$lambda.1se)
# Make prediction on test data
x.test <- model.matrix(default_time ~ age + TTM+ last_time+ status_last+ interest_rate_mean+gdp_mean*uer_mean+ risk_free_mean+ hpi_mean+  FICO_orig_time+ hpi_mean,  test.data)[,-1]
probabilities <- lasso.model %>% predict(newx = x.test)
predicted.classes <- ifelse(probabilities > 0.5, 1, 0)
# Model accuracy rate
observed.classes <- test.data$default_time
mean(predicted.classes == observed.classes)
## [1] 0.9356684
# Fit the model

 
full.model <- glm(default_time ~ age + TTM+ last_time+ status_last+ interest_rate_mean+gdp_mean*uer_mean+ risk_free_mean+ hpi_mean+  FICO_orig_time+ hpi_mean, data = train.data, family = binomial)
# Make predictions
probabilities <- full.model %>% predict(test.data, type = "response")

predicted.classes <- ifelse(probabilities > 0.5, 1, 0)
# Model accuracy
observed.classes <- test.data$default_time
mean(predicted.classes == observed.classes)
## [1] 0.9420915
library(modelr)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
full.model
## 
## Call:  glm(formula = default_time ~ age + TTM + last_time + status_last + 
##     interest_rate_mean + gdp_mean * uer_mean + risk_free_mean + 
##     hpi_mean + FICO_orig_time + hpi_mean, family = binomial, 
##     data = train.data)
## 
## Coefficients:
##        (Intercept)                 age                 TTM           last_time  
##          -5.012578           -0.042306            0.021325           -0.454664  
##        status_last  interest_rate_mean            gdp_mean            uer_mean  
##          -5.415270            0.076664           -7.604302            1.698916  
##     risk_free_mean            hpi_mean      FICO_orig_time   gdp_mean:uer_mean  
##          -0.360264            0.121955           -0.003055            0.836962  
## 
## Degrees of Freedom: 39855 Total (i.e. Null);  39844 Residual
## Null Deviance:       48820 
## Residual Deviance: 19770     AIC: 19800
mortgage_all <- mortgage_all %>% add_predictions(full.model, type = 'response')
roc(mortgage_all$default_time,mortgage_all$pred,plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

## 
## Call:
## roc.default(response = mortgage_all$default_time, predictor = mortgage_all$pred,     plot = TRUE)
## 
## Data: mortgage_all$pred in 34703 controls (mortgage_all$default_time 0) < 15117 cases (mortgage_all$default_time 1).
## Area under the curve: 0.949