Further Model Selection and Combining Dataset

2021-11-15

This week we will do more about model selection and we will add more variables from other datasets to our model.

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(sf)
## Linking to GEOS 3.8.1, GDAL 3.2.1, PROJ 7.2.1
library(tmap)
## Registered S3 methods overwritten by 'stars':
##   method             from
##   st_bbox.SpatRaster sf  
##   st_crs.SpatRaster  sf
library(formattable)
mortgage <-read_csv(here::here('dataset','dcr_clean.csv'))
## Rows: 619660 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): state_orig_time, month, day
## dbl (29): id, time, orig_time, first_time, mat_time, res_time, balance_time,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
mortgage$age<- mortgage$time-mortgage$orig_time
mortgage$TTM<-mortgage$mat_time-mortgage$orig_time
mortgage$date=paste(as.character(mortgage$year),as.character(mortgage$month),as.character(mortgage$day),sep='-')
mortgage_first<-mortgage[match(unique(mortgage$id), mortgage$id),]
mortgage_first$first_time_date<-mortgage_first$date
mortgage_first$first_time<-mortgage_first$time
first <- mortgage_first %>% select(id, first_time, orig_time,mat_time,rate_time,REtype_CO_orig_time,REtype_PU_orig_time,REtype_SF_orig_time,investor_orig_time,balance_orig_time:Interest_Rate_orig_time,state_orig_time,hpi_orig_time,first_time_date, age, TTM)

get_last <- mortgage %>% select(id,date,time,default_time,payoff_time,status_time) %>% group_by(id) %>% summarise_all(last)
get_last$last_time_date<-get_last$date
get_last$last_time<-get_last$time
get_last$status_last<-get_last$status_time
atlast <- get_last %>% select(id,last_time_date:status_last,default_time)
meanvalue <- mortgage %>% group_by(id) %>% summarise(interest_rate_mean = mean(interest_rate_time),gdp_mean = mean(gdp_time), risk_free_mean = mean(rate_time),hpi_mean = mean(hpi_time),uer_mean = mean(uer_time))
mortgage_all <- first %>% left_join(atlast, by = 'id') %>% left_join(meanvalue, by = 'id')
mortgage_all$time_to_GFC <- 37-mortgage_all$first_time
library(tidyverse)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
mortgage_select <- mortgage_all %>% select(default_time,rate_time,first_time,gdp_mean,gdp_mean,hpi_orig_time,FICO_orig_time,TTM,interest_rate_mean,age,risk_free_mean)
## 普通stepwise
fullmod = glm(default_time ~.,data = mortgage_select,family=binomial)
summary(fullmod)
## 
## Call:
## glm(formula = default_time ~ ., family = binomial, data = mortgage_select)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.6717  -0.7796  -0.4586   0.8650   3.2044  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -3.2067426  0.2414469 -13.281  < 2e-16 ***
## rate_time           1.6947815  1.6173371   1.048    0.295    
## first_time         -0.0223649  0.0043960  -5.088 3.63e-07 ***
## gdp_mean           -0.7985498  0.0130908 -61.001  < 2e-16 ***
## hpi_orig_time       0.0132781  0.0009223  14.396  < 2e-16 ***
## FICO_orig_time     -0.0036078  0.0001836 -19.650  < 2e-16 ***
## TTM                 0.0176525  0.0009626  18.339  < 2e-16 ***
## interest_rate_mean  0.2832708  0.0079373  35.688  < 2e-16 ***
## age                -0.0070443  0.0046052  -1.530    0.126    
## risk_free_mean     -1.7326169  1.6169385  -1.072    0.284    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 61154  on 49819  degrees of freedom
## Residual deviance: 49256  on 49810  degrees of freedom
## AIC: 49276
## 
## Number of Fisher Scoring iterations: 5
nothing <- glm(default_time ~ 1,data = mortgage_select,family=binomial)
summary(nothing)
## 
## Call:
## glm(formula = default_time ~ 1, family = binomial, data = mortgage_select)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8504  -0.8504  -0.8504   1.5444   1.5444  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.831006   0.009745  -85.27   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 61154  on 49819  degrees of freedom
## Residual deviance: 61154  on 49819  degrees of freedom
## AIC: 61156
## 
## Number of Fisher Scoring iterations: 4
redmod1 = glm(default_time ~.,data = mortgage_select,family=binomial) 
backwards = step(fullmod)
## Start:  AIC=49276.05
## default_time ~ rate_time + first_time + gdp_mean + hpi_orig_time + 
##     FICO_orig_time + TTM + interest_rate_mean + age + risk_free_mean
## 
##                      Df Deviance   AIC
## - rate_time           1    49257 49275
## - risk_free_mean      1    49257 49275
## <none>                     49256 49276
## - age                 1    49258 49276
## - first_time          1    49284 49302
## - hpi_orig_time       1    49497 49515
## - TTM                 1    49637 49655
## - FICO_orig_time      1    49646 49664
## - interest_rate_mean  1    50612 50630
## - gdp_mean            1    53496 53514
## 
## Step:  AIC=49275.41
## default_time ~ first_time + gdp_mean + hpi_orig_time + FICO_orig_time + 
##     TTM + interest_rate_mean + age + risk_free_mean
## 
##                      Df Deviance   AIC
## - risk_free_mean      1    49259 49275
## <none>                     49257 49275
## - age                 1    49260 49276
## - first_time          1    49285 49301
## - hpi_orig_time       1    49499 49515
## - TTM                 1    49639 49655
## - FICO_orig_time      1    49648 49664
## - interest_rate_mean  1    50615 50631
## - gdp_mean            1    53498 53514
## 
## Step:  AIC=49275.34
## default_time ~ first_time + gdp_mean + hpi_orig_time + FICO_orig_time + 
##     TTM + interest_rate_mean + age
## 
##                      Df Deviance   AIC
## <none>                     49259 49275
## - age                 1    49264 49278
## - first_time          1    49285 49299
## - hpi_orig_time       1    49521 49535
## - TTM                 1    49639 49653
## - FICO_orig_time      1    49652 49666
## - interest_rate_mean  1    50678 50692
## - gdp_mean            1    53852 53866
formula(backwards)
## default_time ~ first_time + gdp_mean + hpi_orig_time + FICO_orig_time + 
##     TTM + interest_rate_mean + age
summary(backwards)
## 
## Call:
## glm(formula = default_time ~ first_time + gdp_mean + hpi_orig_time + 
##     FICO_orig_time + TTM + interest_rate_mean + age, family = binomial, 
##     data = mortgage_select)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.6487  -0.7791  -0.4580   0.8653   3.2055  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -3.3179189  0.2274555 -14.587  < 2e-16 ***
## first_time         -0.0206038  0.0041977  -4.908 9.19e-07 ***
## gdp_mean           -0.7933188  0.0125210 -63.359  < 2e-16 ***
## hpi_orig_time       0.0128981  0.0008786  14.681  < 2e-16 ***
## FICO_orig_time     -0.0036184  0.0001834 -19.727  < 2e-16 ***
## TTM                 0.0176275  0.0009622  18.319  < 2e-16 ***
## interest_rate_mean  0.2805760  0.0076677  36.592  < 2e-16 ***
## age                -0.0089871  0.0043646  -2.059   0.0395 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 61154  on 49819  degrees of freedom
## Residual deviance: 49259  on 49812  degrees of freedom
## AIC: 49275
## 
## Number of Fisher Scoring iterations: 5
back2=glm (default_time ~. - rate_time,data = mortgage_select,family=binomial)
summary(back2)
## 
## Call:
## glm(formula = default_time ~ . - rate_time, family = binomial, 
##     data = mortgage_select)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.6727  -0.7797  -0.4587   0.8651   3.2046  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -3.2051155  0.2414624 -13.274  < 2e-16 ***
## first_time         -0.0224262  0.0043961  -5.101 3.37e-07 ***
## gdp_mean           -0.7985790  0.0130907 -61.003  < 2e-16 ***
## hpi_orig_time       0.0132916  0.0009224  14.410  < 2e-16 ***
## FICO_orig_time     -0.0036093  0.0001836 -19.659  < 2e-16 ***
## TTM                 0.0176600  0.0009626  18.345  < 2e-16 ***
## interest_rate_mean  0.2833832  0.0079369  35.704  < 2e-16 ***
## age                -0.0069701  0.0046050  -1.514    0.130    
## risk_free_mean     -0.0387074  0.0278969  -1.388    0.165    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 61154  on 49819  degrees of freedom
## Residual deviance: 49257  on 49811  degrees of freedom
## AIC: 49275
## 
## Number of Fisher Scoring iterations: 5
forwards = step(nothing, scope=list(default=formula(nothing),upper=formula(fullmod)), direction="forward")
## Start:  AIC=61155.51
## default_time ~ 1
## 
##                      Df Deviance   AIC
## + gdp_mean            1    53746 53750
## + hpi_orig_time       1    57460 57464
## + interest_rate_mean  1    59151 59155
## + rate_time           1    60008 60012
## + risk_free_mean      1    60010 60014
## + TTM                 1    60416 60420
## + first_time          1    60431 60435
## + FICO_orig_time      1    60559 60563
## + age                 1    60564 60568
## <none>                     61154 61156
## 
## Step:  AIC=53749.83
## default_time ~ gdp_mean
## 
##                      Df Deviance   AIC
## + interest_rate_mean  1    51336 51342
## + FICO_orig_time      1    52213 52219
## + hpi_orig_time       1    53167 53173
## + TTM                 1    53240 53246
## + age                 1    53261 53267
## + rate_time           1    53591 53597
## + risk_free_mean      1    53592 53598
## + first_time          1    53729 53735
## <none>                     53746 53750
## 
## Step:  AIC=51342.26
## default_time ~ gdp_mean + interest_rate_mean
## 
##                  Df Deviance   AIC
## + hpi_orig_time   1    50229 50237
## + TTM             1    50584 50592
## + age             1    50739 50747
## + FICO_orig_time  1    51016 51024
## + first_time      1    51318 51326
## <none>                 51336 51342
## + risk_free_mean  1    51335 51343
## + rate_time       1    51335 51343
## 
## Step:  AIC=50237.27
## default_time ~ gdp_mean + interest_rate_mean + hpi_orig_time
## 
##                  Df Deviance   AIC
## + TTM             1    49786 49796
## + FICO_orig_time  1    49871 49881
## + first_time      1    50070 50080
## + age             1    50095 50105
## <none>                 50229 50237
## + risk_free_mean  1    50227 50237
## + rate_time       1    50227 50237
## 
## Step:  AIC=49795.51
## default_time ~ gdp_mean + interest_rate_mean + hpi_orig_time + 
##     TTM
## 
##                  Df Deviance   AIC
## + FICO_orig_time  1    49458 49470
## + first_time      1    49655 49667
## + age             1    49672 49684
## + risk_free_mean  1    49782 49794
## + rate_time       1    49782 49794
## <none>                 49786 49796
## 
## Step:  AIC=49470.42
## default_time ~ gdp_mean + interest_rate_mean + hpi_orig_time + 
##     TTM + FICO_orig_time
## 
##                  Df Deviance   AIC
## + first_time      1    49264 49278
## + age             1    49285 49299
## + risk_free_mean  1    49456 49470
## + rate_time       1    49456 49470
## <none>                 49458 49470
## 
## Step:  AIC=49277.54
## default_time ~ gdp_mean + interest_rate_mean + hpi_orig_time + 
##     TTM + FICO_orig_time + first_time
## 
##                  Df Deviance   AIC
## + age             1    49259 49275
## + risk_free_mean  1    49260 49276
## + rate_time       1    49260 49276
## <none>                 49264 49278
## 
## Step:  AIC=49275.34
## default_time ~ gdp_mean + interest_rate_mean + hpi_orig_time + 
##     TTM + FICO_orig_time + first_time + age
## 
##                  Df Deviance   AIC
## <none>                 49259 49275
## + risk_free_mean  1    49257 49275
## + rate_time       1    49257 49275
formula(redmod1)
## default_time ~ rate_time + first_time + gdp_mean + hpi_orig_time + 
##     FICO_orig_time + TTM + interest_rate_mean + age + risk_free_mean
formula(backwards)
## default_time ~ first_time + gdp_mean + hpi_orig_time + FICO_orig_time + 
##     TTM + interest_rate_mean + age
formula(forwards)
## default_time ~ gdp_mean + interest_rate_mean + hpi_orig_time + 
##     TTM + FICO_orig_time + first_time + age
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(tidyverse)
library(modelr)
mortgage_all <- mortgage_all %>% add_predictions(backwards, type = 'response')
roc(mortgage_all$default_time,mortgage_all$pred,plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

## 
## Call:
## roc.default(response = mortgage_all$default_time, predictor = mortgage_all$pred,     plot = TRUE)
## 
## Data: mortgage_all$pred in 34703 controls (mortgage_all$default_time 0) < 15117 cases (mortgage_all$default_time 1).
## Area under the curve: 0.7948
library(pROC)
model <- glm(default_time ~ age + first_time+REtype_SF_orig_time+interest_rate_mean+gdp_mean+rate_time+hpi_orig_time+uer_mean+investor_orig_time+FICO_orig_time+LTV_orig_time+TTM , data = mortgage_all, family = binomial) 
mortgage_all <- mortgage_all %>% add_predictions(model, type = 'response')
roc(mortgage_all$default_time,mortgage_all$pred,plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

## 
## Call:
## roc.default(response = mortgage_all$default_time, predictor = mortgage_all$pred,     plot = TRUE)
## 
## Data: mortgage_all$pred in 34703 controls (mortgage_all$default_time 0) < 15117 cases (mortgage_all$default_time 1).
## Area under the curve: 0.8203
fit <-  leaps::regsubsets(default_time ~ age + first_time+REtype_SF_orig_time+interest_rate_mean+gdp_mean+rate_time+hpi_orig_time+uer_mean+investor_orig_time+FICO_orig_time+LTV_orig_time+TTM, method = "forward",data= mortgage_all, nvmax = 10)
summary(fit)$which
##    (Intercept)   age first_time REtype_SF_orig_time interest_rate_mean gdp_mean
## 1         TRUE FALSE      FALSE               FALSE              FALSE     TRUE
## 2         TRUE FALSE      FALSE               FALSE              FALSE     TRUE
## 3         TRUE FALSE      FALSE               FALSE              FALSE     TRUE
## 4         TRUE FALSE      FALSE               FALSE              FALSE     TRUE
## 5         TRUE  TRUE      FALSE               FALSE              FALSE     TRUE
## 6         TRUE  TRUE      FALSE               FALSE               TRUE     TRUE
## 7         TRUE  TRUE      FALSE               FALSE               TRUE     TRUE
## 8         TRUE  TRUE       TRUE               FALSE               TRUE     TRUE
## 9         TRUE  TRUE       TRUE               FALSE               TRUE     TRUE
## 10        TRUE  TRUE       TRUE                TRUE               TRUE     TRUE
##    rate_time hpi_orig_time uer_mean investor_orig_time FICO_orig_time
## 1      FALSE         FALSE    FALSE              FALSE          FALSE
## 2      FALSE         FALSE     TRUE              FALSE          FALSE
## 3      FALSE         FALSE     TRUE              FALSE           TRUE
## 4      FALSE         FALSE     TRUE              FALSE           TRUE
## 5      FALSE         FALSE     TRUE              FALSE           TRUE
## 6      FALSE         FALSE     TRUE              FALSE           TRUE
## 7      FALSE         FALSE     TRUE              FALSE           TRUE
## 8      FALSE         FALSE     TRUE              FALSE           TRUE
## 9      FALSE         FALSE     TRUE               TRUE           TRUE
## 10     FALSE         FALSE     TRUE               TRUE           TRUE
##    LTV_orig_time   TTM
## 1          FALSE FALSE
## 2          FALSE FALSE
## 3          FALSE FALSE
## 4           TRUE FALSE
## 5           TRUE FALSE
## 6           TRUE FALSE
## 7           TRUE  TRUE
## 8           TRUE  TRUE
## 9           TRUE  TRUE
## 10          TRUE  TRUE
summary(fit)$bic
##  [1]  -7596.947 -11847.708 -13043.437 -13474.537 -13727.848 -13937.529
##  [7] -14207.649 -14355.493 -14408.570 -14411.618
summary(fit)$cp
##  [1] 7408.00466 2718.60252 1463.64526 1012.82920  746.09844  524.84629
##  [7]  243.82847   86.69563   24.75533   12.89255
summary(fit)$rsq
##  [1] 0.1418034 0.2121611 0.2310118 0.2378028 0.2418330 0.2451811 0.2494256
##  [8] 0.2518121 0.2527710 0.2529789