生存分析随机森林实验与代码Word文档下载推荐.docx
《生存分析随机森林实验与代码Word文档下载推荐.docx》由会员分享,可在线阅读,更多相关《生存分析随机森林实验与代码Word文档下载推荐.docx(5页珍藏版)》请在冰豆网上搜索。
Analysis:
RSF
Family:
surv
Splittingrule:
logrank
Errorrate:
19.87%
发现直接使用随机森林获得的模型,预测误差很年夜,到达了19.8%,进一步考虑使用随机森林模型进行变量选择,结果如下:
Samplesize:
52
19
500
Minimumterminalnodesize:
2
9
Splittingrule:
logrank*random*
Numberofrandomsplitpoints:
10
11.4%
>
our.rf$topvars
[1]"
213821_s_at"
"
219778_at"
204690_at"
220788_s_at"
202202_s_at"
[6]"
211603_s_at"
213055_at"
219336_s_at"
37892_at"
一共选取了9个变量,同时误差只有11.4%
接下来,使用这些变量做cox回归,剔除模型中不显著(>
0.01)的变量,最终介入模型建立的变量共有4个.模型结果如下:
exp(coef)exp(-coef)lower.95upper.95
最后选取六个变量拟合生存模型,绘制生存曲线如下:
下面绘制ROC曲线,分别在训练集和测试集上绘制ROC曲线,结果如下:
训练集:
测试集:
由于测试集上的样本过少,所以获得的AUC值摆荡年夜,考虑使用bootstrap屡次计算训练集上的AUC值并求平均来测试模型的效果:
由此可以看到,随机森林通过删除贡献较低的变量,完成变量选择的工作,在测试集上具有较高的AUC值,可是比lasso-cox模型获得的AUC略低.
附录:
load("
~/R/brea.rda"
)
library(survival)
set.seed(10)
i<
-sample(1:
77,52)
train<
-dat[i,]
test<
-dat[-i,]
library(randomForestSRC)
disease.rf<
-rfsrc(Surv(time,status)~.,data=train,
ntree=800,mtry=3,
nodesize=3,splitrule="
logrank"
our.rf<
-var.select(object=disease.rf,vdv,
method="
vh.vimp"
nrep=50)
our.rf$topvars
index<
-numeric(var.rf$modelsize)
for(iin1:
var.rf$modelsize){
index[i]<
-which(names(dat)==var.rf$topvars[i])
}
data<
-dat[,c(1,2,index)]
-data[i,]
-data[-i,]
mod.brea<
-coxph(Surv(time,status)~.,data=train)
train_data<
-train[,c(1,2,which(summary(mod.brea)$coefficients[,5]<
=0.1)+2)]
tset_data<
-test[,c(1,2,which(summary(mod.brea)$coefficients[,5]<
mod.brea1<
-coxph(Surv(time,status)~.,data=train_data)
summary(mod.brea1)
names(coef(mod.brea1))
plot(survfit(mod.brea1),xlab="
Time"
ylab="
Proportion"
main="
CoxModel"
conf.int=TRUE,col=c("
black"
"
red"
),ylim=c(0.6,1))
index0<
-numeric(length(coef(mod.brea1)))
coefficients<
-coef(mod.brea1)
name<
-gsub("
\`"
"
names(coefficients))
for(jin1:
length(index0)){
index0[j]<
-which(names(dat)==name[j])
library(survivalROC)
riskscore<
-as.matrix(dat[i,index0])%*%as.matrix(coefficients)
y1<
-survivalROC(Stime=train$time,status=train$status,marker=riskscore,predict.time=1,span=0.25*(nrow(train))^(-0.20))
y3<
-survivalROC(Stime=train$time,status=train$status,marker=riskscore,predict.time=3,span=0.25*(nrow(train))^(-0.20))
y5<
-survivalROC(Stime=train$time,status=train$status,marker=riskscore,predict.time=5,span=0.25*(nrow(train))^(-0.20))
a<
-matrix(data=c("
y1"
y3"
y5"
y1$AUC,y3$AUC,y5$AUC),nrow=3,ncol=2);
a
plot(y1$FP,y1$TP,type="
l"
xlab="
FalsePositiveRate"
TruePositiveRate"
Time-dependentROCcurve"
col="
green"
)
lines(y3$FP,y3$TP,col="
lty=2)
lines(y5$FP,y5$TP,col="
blue"
lty=3)
legend("
bottomright"
bty="
n"
legend=c("
AUCat1year:
0.9271"
AUCat3years:
0.8621"
AUCat5years:
0.8263"
),col=c("
),lty=c(1,2,3),cex=0.9)
abline(0,1)
-as.matrix(dat[-i,index0])%*%as.matrix(coefficients)
-survivalROC(Stime=test$time,status=test$status,marker=riskscore,predict.time=1,span=0.25*(nrow(train))^(-0.20))
-survivalROC(Stime=test$time,status=test$status,marker=riskscore,predict.time=3,span=0.25*(nrow(train))^(-0.20))
-survivalROC(Stime=test$time,status=test$status,marker=riskscore,predict.time=5,span=0.25*(nrow(train))^(-0.20))
0.8761"
0.7611"
-matrix(0,30,3)
for(cin1:
30){
i<
train<
test<
mod.brea<
train_data<
tset_data<
mod.brea1<
names(coef(mod.brea1))
index0<
coefficients<
name<