Implementing Ridge Regression and the Lasso with the glmnet Function
# The Hitters (baseball) data set from the ISLR package
library(ISLR)
names(Hitters)
#[1] "AtBat"? ???"Hits"? ?? ?"HmRun"? ?
# [4] "Runs"? ?? ?"RBI"? ?? ? "Walks"? ?
#[7] "Years"? ???"CAtBat"? ? "CHits"? ?
#[10] "CHmRun"? ? "CRuns"? ???"CRBI"? ???
#[13] "CWalks"? ? "League"? ? "Division"
#[16] "PutOuts"? ?"Assists"? ?"Errors"? ?
#[19] "Salary"? ? "NewLeague"
sum(is.na(Hitters$Salary))
Hitters=na.omit(Hitters)    # drop the rows with missing values first
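# Quick sanity check: Salary has 59 NAs in the raw data, so 263 of the 322
# players should remain once the incomplete cases are dropped.
dim(Hitters)
sum(is.na(Hitters))   # 0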
####Ridge Regression and the Lasso####
x=model.matrix(Salary~.,Hitters)[,-1]   # drop the intercept column; factors are dummy-coded
y=Hitters$Salary
library(glmnet)
grid=10^seq(10,-2,length=100)   # lambda grid from 10^10 down to 10^-2
ridge.mod=glmnet(x,y,alpha=0,lambda=grid)
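# Optional visualisation of the fit above: each curve is one predictor's ridge
# coefficient as log(lambda) varies.
plot(ridge.mod,xvar="lambda",label=TRUE)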
grid[50]   # 11497.57
grid[60]   # 705.4802
dim(coef(ridge.mod))   # 20 x 100: one row per coefficient, one column per lambda
ridge.mod$lambda[50]   # lambda = 11497.57
coef(ridge.mod)[,50]   # coefficient vector at this lambda
ridge.mod$lambda[60]   # lambda = 705.48
coef(ridge.mod)[,60]   # coefficient vector at this lambda
# Comparing the two coefficient vectors: the smaller the lambda, the larger the ridge coefficients
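# A quick check of that claim: the l2 norm of the slope coefficients is
# noticeably larger at the smaller lambda.
sqrt(sum(coef(ridge.mod)[-1,50]^2))   # norm at lambda = 11497.57
sqrt(sum(coef(ridge.mod)[-1,60]^2))   # larger norm at lambda = 705.48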
# Ridge coefficients at lambda = 50, obtained with predict()
predict(ridge.mod,s=50,type="coefficients")[1:20,]   # the indexing turns the coefficient matrix into a vector
# Split the data into a training set and a test set
set.seed(1)
train=sample(1:nrow(x),nrow(x)/2)
test=(-train)
y.test=y[test]
ridge.mod=glmnet(x[train,],y[train],alpha=0,lambda=grid,thresh=1e-12)   # thresh tightens the convergence tolerance
ridge.pred=predict(ridge.mod,s=4,newx=x[test,])
mean((ridge.pred-y.test)^2)        # test MSE = 101036.8 at lambda = 4
mean((mean(y[train])-y.test)^2)    # test MSE of the intercept-only model: 193253.1
ridge.pred=predict(ridge.mod,s=1e10,newx=x[test,])
mean((ridge.pred-y.test)^2)        # test MSE = 193253.1 at lambda = 10^10
# With a very large lambda, the test error is essentially the same as that of the intercept-only model
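# Checking the coefficients directly: at lambda = 10^10 every slope estimate is
# essentially zero, so the prediction is just the training mean of Salary.
predict(ridge.mod,s=1e10,type="coefficients")[1:20,]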
set.seed(1)
cv.out=cv.glmnet(x[train,],y[train],alpha=0)   # ten-fold cross-validation by default
par(mfrow=c(1,1))
plot(cv.out)
bestlam=cv.out$lambda.min   # lambda chosen by cross-validation: 211.7416
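# cv.glmnet also records a more conservative choice, the largest lambda whose
# CV error is within one standard error of the minimum:
cv.out$lambda.1se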
ridge.pred=predict(ridge.mod,s=bestlam,newx=x[test,])
mean((ridge.pred-y.test)^2)   # test MSE = 96015.51 at lambda = 211.74
out=glmnet(x,y,alpha=0)   # refit ridge regression on the full data set
predict(out,type="coefficients",s=bestlam)[1:20,]
# Ridge regression does not shrink any coefficient exactly to zero
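# A quick check of that statement: none of the 19 ridge slope estimates is exactly zero.
ridge.coef=predict(out,type="coefficients",s=bestlam)[1:20,]
sum(ridge.coef[-1]==0)   # 0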
###
# Ordinary least squares is just ridge regression with lambda = 0
# (newer glmnet releases require the original x and y again when exact = TRUE)
ridge.pred=predict(ridge.mod,s=0,newx=x[test,],exact=TRUE,x=x[train,],y=y[train])
mean((ridge.pred-y.test)^2)   # test MSE = 114783
lm.fit=lm(y~x,subset=train)   # the same model fitted with lm() for comparison
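# The lm() coefficients should essentially match the ridge fit at lambda = 0
# (again passing x and y because exact = TRUE refits the model):
coef(lm.fit)
predict(ridge.mod,s=0,exact=TRUE,type="coefficients",x=x[train,],y=y[train])[1:20,]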
lasso.mod=glmnet(x[train,],y[train],alpha=1,lambda=grid)   # alpha=1 fits the lasso
plot(lasso.mod)   # coefficient paths against the l1 norm
set.seed(1)
cv.out=cv.glmnet(x[train,],y[train],alpha=1)
plot(cv.out)
bestlam=cv.out$lambda.min   # lambda chosen by cross-validation: 35.32063
lasso.pred=predict(lasso.mod,s=bestlam,newx=x[test,])
mean((lasso.pred-y.test)^2)   # test MSE = 100743.4 at lambda = 35.32
out=glmnet(x,y,alpha=1)   # refit the lasso on the full data set
predict(out,type="coefficients",s=bestlam)[1:20,]
# The lasso coefficient estimates are sparse: several of them are exactly zero
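# Keeping only the non-zero estimates shows which predictors the lasso retains:
lasso.coef=predict(out,type="coefficients",s=bestlam)[1:20,]
lasso.coef[lasso.coef!=0]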





