R Machine Learning Codebook (Part 1)
- Linear Regression
- Used when the response variable is numerical
getwd()
## [1] "/Users/jungyulyang/programming/school/r_blog/content/post"
library(readr)
# read in the used-car data
usedcar2 <- read_csv("data/usedcar2.csv")
## Parsed with column specification:
## cols(
## Price = col_double(),
## Odometer = col_double(),
## Color = col_character()
## )
usedcar2
## # A tibble: 100 x 3
## Price Odometer Color
## <dbl> <dbl> <chr>
## 1 14636 37388 white
## 2 14122 44758 white
## 3 14470 45854 white
## 4 15072 40149 white
## 5 14802 40237 white
## 6 14660 43533 white
## 7 15612 32744 white
## 8 14632 41350 white
## 9 15008 35781 white
## 10 14666 48613 white
## # … with 90 more rows
Encoding the categorical variable as indicator (dummy) columns
usedcar2$Ind1 <- as.numeric(usedcar2$Color == 'white')
usedcar2$Ind2 <- as.numeric(usedcar2$Color == 'silver')
usedcar2
## # A tibble: 100 x 5
## Price Odometer Color Ind1 Ind2
## <dbl> <dbl> <chr> <dbl> <dbl>
## 1 14636 37388 white 1 0
## 2 14122 44758 white 1 0
## 3 14470 45854 white 1 0
## 4 15072 40149 white 1 0
## 5 14802 40237 white 1 0
## 6 14660 43533 white 1 0
## 7 15612 32744 white 1 0
## 8 14632 41350 white 1 0
## 9 15008 35781 white 1 0
## 10 14666 48613 white 1 0
## # … with 90 more rows
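As an aside, R can build these indicator columns itself if Color is converted to a factor; the coefficients are then a reparameterization of the manual encoding, with the first factor level as the baseline. A minimal sketch (ColorF and lm_factor are illustrative names):
# let lm() generate the dummy variables from a factor column
usedcar2$ColorF <- factor(usedcar2$Color)
lm_factor <- lm(Price ~ Odometer + ColorF, data=usedcar2)
summary(lm_factor)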
Regression Fitting
lm_used <- lm(Price ~ Odometer + Ind1 + Ind2, data=usedcar2)
summary(lm_used)
##
## Call:
## lm(formula = Price ~ Odometer + Ind1 + Ind2, data = usedcar2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -664.58 -193.42 -5.78 197.38 639.46
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.670e+04 1.843e+02 90.600 < 2e-16 ***
## Odometer -5.554e-02 4.737e-03 -11.724 < 2e-16 ***
## Ind1 9.048e+01 6.817e+01 1.327 0.187551
## Ind2 2.955e+02 7.637e+01 3.869 0.000199 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 284.5 on 96 degrees of freedom
## Multiple R-squared: 0.698, Adjusted R-squared: 0.6886
## F-statistic: 73.97 on 3 and 96 DF, p-value: < 2.2e-16
Plot
plot(lm_used,which=2)
plot(lm_used,which=1)
Things to check for a linear regression (see the diagnostic sketch below):
1. Independence
2. Normality
3. Homoscedasticity
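The plots above cover Normality (which=2, the Q-Q plot) and Homoscedasticity (which=1, residuals vs. fitted) visually. A minimal numeric companion sketch, using only base-R functions:
# Shapiro-Wilk test for normality of the residuals
shapiro.test(residuals(lm_used))
# scale-location plot as a further homoscedasticity check
plot(lm_used, which=3)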
X Variable Selection
# direction: 'both' = stepwise selection, 'backward' = backward elimination, 'forward' = forward selection
step_used <- step(lm_used, direction='both')
## Start: AIC=1134.09
## Price ~ Odometer + Ind1 + Ind2
##
## Df Sum of Sq RSS AIC
## - Ind1 1 142641 7915205 1133.9
## <none> 7772564 1134.1
## - Ind2 1 1211971 8984535 1146.6
## - Odometer 1 11129137 18901701 1221.0
##
## Step: AIC=1133.91
## Price ~ Odometer + Ind2
##
## Df Sum of Sq RSS AIC
## <none> 7915205 1133.9
## + Ind1 1 142641 7772564 1134.1
## - Ind2 1 1090245 9005450 1144.8
## - Odometer 1 11056747 18971952 1219.3
summary(step_used)
##
## Call:
## lm(formula = Price ~ Odometer + Ind2, data = usedcar2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -713.81 -199.46 0.95 183.30 683.18
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.674e+04 1.826e+02 91.683 < 2e-16 ***
## Odometer -5.533e-02 4.753e-03 -11.640 < 2e-16 ***
## Ind2 2.489e+02 6.809e+01 3.655 0.000417 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 285.7 on 97 degrees of freedom
## Multiple R-squared: 0.6925, Adjusted R-squared: 0.6861
## F-statistic: 109.2 on 2 and 97 DF, p-value: < 2.2e-16
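Note that direction='both' starts from the full model; forward selection instead grows from a null model, so step() needs an explicit scope. A minimal sketch (null_used and forward_used are illustrative names):
# forward selection: start from the intercept-only model
null_used <- lm(Price ~ 1, data=usedcar2)
forward_used <- step(null_used, scope = ~ Odometer + Ind1 + Ind2, direction='forward')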
Interaction
int_used <- lm(Price ~ Odometer+Ind1+Ind2+Ind1:Odometer+Ind2:Odometer,data=usedcar2)
summary(int_used)
##
## Call:
## lm(formula = Price ~ Odometer + Ind1 + Ind2 + Ind1:Odometer +
## Ind2:Odometer, data = usedcar2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -632.76 -173.28 -7.88 183.62 712.02
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.708e+04 2.776e+02 61.515 < 2e-16 ***
## Odometer -6.563e-02 7.292e-03 -9.001 2.45e-14 ***
## Ind1 -8.153e+02 4.180e+02 -1.950 0.0541 .
## Ind2 -3.014e+01 4.061e+02 -0.074 0.9410
## Odometer:Ind1 2.399e-02 1.093e-02 2.195 0.0306 *
## Odometer:Ind2 8.445e-03 1.169e-02 0.723 0.4717
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 280.4 on 94 degrees of freedom
## Multiple R-squared: 0.7129, Adjusted R-squared: 0.6976
## F-statistic: 46.68 on 5 and 94 DF, p-value: < 2.2e-16
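The same model can be written more compactly with the * operator, which expands a*b to a + b + a:b. A sketch (int_used2 is an illustrative name):
# equivalent interaction model using the * shorthand
int_used2 <- lm(Price ~ Odometer*Ind1 + Odometer*Ind2, data=usedcar2)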
Train/Test Split
set.seed(1234)
nobs=nrow(usedcar2)
#train:test = 70:30
i = sample(1:nobs, round(nobs*0.7))
train = usedcar2[i,]
test= usedcar2[-i,]
nrow(train);nrow(test)
## [1] 70
## [1] 30
Prediction and Correlation
model = lm(Price ~ Odometer + Ind1 + Ind2, data=train)
pred = predict(model, newdata=test, type = 'response')
pred
## 1 2 3 4 5 6 7 8
## 14676.73 15410.82 14222.10 14947.62 14239.45 14428.65 14392.39 14780.56
## 9 10 11 12 13 14 15 16
## 14637.97 14690.80 14919.48 14576.19 15154.04 15568.29 15190.30 15879.67
## 17 18 19 20 21 22 23 24
## 15118.95 15847.80 15429.82 14124.72 14957.29 14633.57 14539.03 15226.01
## 25 26 27 28 29 30
## 14735.85 14372.92 14047.14 14946.89 14493.71 14658.32
cor(test$Price,pred)^2
## [1] 0.6832043
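The squared correlation above is a quick out-of-sample R²-style summary; error metrics on the original price scale are often easier to interpret. A minimal sketch:
# root mean squared error and mean absolute error on the test set
sqrt(mean((test$Price - pred)^2))  # RMSE
mean(abs(test$Price - pred))       # MAE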
- Logistic Regression
- Used when the response variable is categorical
directmail <- read_csv("data/directmail.csv")
## Parsed with column specification:
## cols(
## RESPOND = col_double(),
## AGE = col_double(),
## BUY18 = col_double(),
## CLIMATE = col_double(),
## FICO = col_double(),
## INCOME = col_double(),
## MARRIED = col_double(),
## OWNHOME = col_double(),
## GENDER = col_character()
## )
# remove all rows containing NA values
complete = complete.cases(directmail)
directmail <- directmail[complete,]
directmail
## # A tibble: 9,727 x 9
## RESPOND AGE BUY18 CLIMATE FICO INCOME MARRIED OWNHOME GENDER
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 0 71 1 10 719 67 1 0 M
## 2 0 53 0 10 751 72 1 0 M
## 3 0 53 1 10 725 70 1 0 F
## 4 0 45 1 10 684 56 0 0 F
## 5 0 32 0 10 651 66 0 0 F
## 6 0 35 0 10 691 48 0 1 F
## 7 0 43 0 10 694 49 0 1 F
## 8 0 39 0 10 659 64 0 0 M
## 9 0 66 0 10 692 65 0 0 M
## 10 0 52 2 10 705 58 1 1 M
## # … with 9,717 more rows
nobs=nrow(directmail)
#train:test = 70:30
i = sample(1:nobs, round(nobs*0.7))
train = directmail[i,]
test= directmail[-i,]
nrow(train);nrow(test)
## [1] 6809
## [1] 2918
Logistic Fitting
full_model = glm(RESPOND ~ AGE+BUY18+CLIMATE+FICO+INCOME+MARRIED+OWNHOME+GENDER,
family='binomial',data=directmail)
summary(full_model)
##
## Call:
## glm(formula = RESPOND ~ AGE + BUY18 + CLIMATE + FICO + INCOME +
## MARRIED + OWNHOME + GENDER, family = "binomial", data = directmail)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1544 -0.4275 -0.3582 -0.2941 2.9353
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.713421 0.948186 2.862 0.00421 **
## AGE -0.037951 0.004469 -8.491 < 2e-16 ***
## BUY18 0.461354 0.058442 7.894 2.92e-15 ***
## CLIMATE -0.020629 0.006271 -3.289 0.00100 **
## FICO -0.004988 0.001326 -3.760 0.00017 ***
## INCOME -0.001425 0.002499 -0.570 0.56851
## MARRIED 0.534643 0.090075 5.936 2.93e-09 ***
## OWNHOME -0.421534 0.090408 -4.663 3.12e-06 ***
## GENDERM -0.076648 0.079758 -0.961 0.33655
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5179.6 on 9726 degrees of freedom
## Residual deviance: 4977.5 on 9718 degrees of freedom
## AIC: 4995.5
##
## Number of Fisher Scoring iterations: 5
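INCOME and GENDERM are not significant here, and the same AIC-based step() search used for the linear model also works on a glm fit. A sketch (it would likely drop those two terms, though the exact path depends on the data):
# stepwise selection on the logistic model
step_logit <- step(full_model, direction='both')
summary(step_logit)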
Predict Probability
prob_pred = predict(full_model, newdata=test, type="response")
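Note that full_model was fit on all of directmail rather than on train, so this test-set evaluation is somewhat optimistic; refitting on train would give a strictly held-out assessment. The probabilities can also be thresholded into class labels. A minimal sketch with the default 0.5 cutoff (class_pred is an illustrative name; with a rare positive class such as RESPOND, a lower cutoff is often more useful):
# classify as 1 when the predicted probability exceeds 0.5
class_pred <- as.numeric(prob_pred > 0.5)
table(actual = test$RESPOND, predicted = class_pred)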
ROC Curve
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roccurve <- roc(test$RESPOND ~ prob_pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roccurve,col="red",print.auc=TRUE, print.auc.adj=c(1,-7),auc.polygon=TRUE)
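The AUC printed on the plot can also be extracted numerically with pROC's auc():
# area under the ROC curve as a single number
auc(roccurve)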
- Decision Trees
- Used regardless of whether the response variable is numerical or categorical
library(readr)
titanic <- read_csv("data/titanic.csv")
## Parsed with column specification:
## cols(
## Class = col_character(),
## Age = col_character(),
## Sex = col_character(),
## Survived = col_character()
## )
titanic
## # A tibble: 2,201 x 4
## Class Age Sex Survived
## <chr> <chr> <chr> <chr>
## 1 First Adult Male Yes
## 2 First Adult Male Yes
## 3 First Adult Male Yes
## 4 First Adult Male Yes
## 5 First Adult Male Yes
## 6 First Adult Male Yes
## 7 First Adult Male Yes
## 8 First Adult Male Yes
## 9 First Adult Male Yes
## 10 First Adult Male Yes
## # … with 2,191 more rows
Tree Fitting
# method='class' fits a classification tree; method='anova' fits a regression tree
library(rpart)
library(rpart.plot)
D_tree <- rpart(Survived ~ Class+Sex+Age, data=titanic, method='class')
print(D_tree)
## n= 2201
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 2201 711 No (0.6769650 0.3230350)
## 2) Sex=Male 1731 367 No (0.7879838 0.2120162)
## 4) Age=Adult 1667 338 No (0.7972406 0.2027594) *
## 5) Age=Child 64 29 No (0.5468750 0.4531250)
## 10) Class=Third 48 13 No (0.7291667 0.2708333) *
## 11) Class=First,Second 16 0 Yes (0.0000000 1.0000000) *
## 3) Sex=Female 470 126 Yes (0.2680851 0.7319149)
## 6) Class=Third 196 90 No (0.5408163 0.4591837) *
## 7) Class=Crew,First,Second 274 20 Yes (0.0729927 0.9270073) *
prp(D_tree, type=4, extra=2, digits=3)
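As the comment above notes, the same interface fits a regression tree when the response is numeric, via method='anova'. A minimal sketch reusing the used-car data from the linear-regression section (R_tree is an illustrative name):
# regression tree for a numeric response
R_tree <- rpart(Price ~ Odometer + Color, data=usedcar2, method='anova')
prp(R_tree, type=4, extra=1, digits=3)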
More parameters to control the tree
# xval = number of cross-validations; cp and minsplit control how large the tree grows (and what can be pruned later)
my.control <- rpart.control(xval=10,cp=0.001,minsplit=3)
tree <- rpart(Survived ~ Class+Sex+Age, data=titanic, method='class', control = my.control)
plot(tree, uniform=T, compress = T, margin=0.05)
printcp(tree)
##
## Classification tree:
## rpart(formula = Survived ~ Class + Sex + Age, data = titanic,
## method = "class", control = my.control)
##
## Variables actually used in tree construction:
## [1] Age Class Sex
##
## Root node error: 711/2201 = 0.32303
##
## n= 2201
##
## CP nsplit rel error xerror xstd
## 1 0.306610 0 1.00000 1.00000 0.030857
## 2 0.022504 1 0.69339 0.69339 0.027510
## 3 0.011252 2 0.67089 0.69620 0.027549
## 4 0.001000 4 0.64838 0.64838 0.026850
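With cp=0.001 the tree is grown deliberately large; it can then be pruned back to the subtree with the smallest cross-validated error. A minimal sketch using the cptable above (best_cp and pruned are illustrative names; in this particular run the largest tree already minimizes xerror, so pruning changes nothing, but the recipe generalizes):
# pick the cp value with the lowest cross-validated error, then prune
best_cp <- tree$cptable[which.min(tree$cptable[, "xerror"]), "CP"]
pruned <- prune(tree, cp=best_cp)
prp(pruned, type=4, extra=2, digits=3)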