实验报告聚类分析.docx
《实验报告聚类分析.docx》由会员分享,可在线阅读,更多相关《实验报告聚类分析.docx(26页珍藏版)》请在冰豆网上搜索。
实验报告聚类分析
实验报告聚类分析
实验原理:
K均值聚类、中心点聚类、系统聚类和EM算法聚类分析技术。
实验题目:
用鸢尾花的数据集,进行聚类挖掘分析。
实验要求:
探索鸢尾花数据的基本特征,利用不同的聚类挖掘方法,获得基本结论并简明解释。
实验题目--分析报告:
data(iris)
>rm(list=ls())
>gc()
used(Mb)gctrigger(Mb)maxused(Mb)
Ncells431730929718607591
Vcells78760583886081592403
>data(iris)
>data<-iris
>head(data)
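  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
6          5.4         3.9          1.7         0.4  setosa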
#K-means聚类分析
>newiris<-iris
>newiris$Species<-NULL
>(kc<-kmeans(newiris,3))
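K-means clustering with 3 clusters of sizes 62, 50, 38
Cluster means:
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1     5.901613    2.748387     4.393548    1.433871
2     5.006000    3.428000     1.462000    0.246000
3     6.850000    3.073684     5.742105    2.071053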
Clusteringvector:
[1]2222222222222222222222222222222222222222
[41]2222222222113111111111111111111111111311
[81]1111111111111111111131333313333331133331
[121]313133113333313333133313331331
Within cluster sum of squares by cluster:
[1] 39.82097 15.15100 23.87947
 (between_SS / total_SS =  88.4 %)
Available components:
[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"
>table(iris$Species,kc$cluster)
123
setosa0500
versicolor4802
virginica14036
>plot(newiris[c("Sepal.Length","Sepal.Width")],col=kc$cluster)
>points(kc$centers[,c("Sepal.Length","Sepal.Width")],col=1:3,pch=8,cex=2)
#K-Medoids进行聚类分析
>install.packages("cluster")
>library(cluster)
>iris.pam<-pam(iris,3)
>table(iris$Species,iris.pam$clustering)
            1  2  3
setosa     50  0  0
versicolor  0  3 47
virginica   0 49  1
>layout(matrix(c(1,2),1,2))
>plot(iris.pam)
>layout(matrix(1))
#hc
>iris.hc<-hclust(dist(iris[,1:4]))
>plot(iris.hc,hang=-1)
>plclust(iris.hc,labels=FALSE,hang=-1)
>re<-rect.hclust(iris.hc,k=3)
>iris.id<-cutree(iris.hc,3)
#利用剪枝函数cutree()参数h控制输出height=18时的系谱类别
>sapply(unique(iris.id),
+function(g)iris$Species[iris.id==g])
[[1]]
[1]setosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosa
[12]setosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosa
[23]setosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosa
[34]setosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosa
[45]setosasetosasetosasetosasetosasetosa
Levels:
setosaversicolorvirginica
[[2]]
[1]versicolorversicolorversicolorversicolorversicolorversicolorversicolor
[8]versicolorversicolorversicolorversicolorversicolorversicolorversicolor
[15]versicolorversicolorversicolorversicolorversicolorversicolorversicolor
[22]versicolorversicolorvirginicavirginicavirginicavirginicavirginica
[29]virginicavirginicavirginicavirginicavirginicavirginicavirginica
[36]virginicavirginicavirginicavirginicavirginicavirginicavirginica
[43]virginicavirginicavirginicavirginicavirginicavirginicavirginica
[50]virginicavirginicavirginicavirginicavirginicavirginicavirginica
[57]virginicavirginicavirginicavirginicavirginicavirginicavirginica
[64]virginicavirginicavirginicavirginicavirginicavirginicavirginica
[71]virginicavirginica
Levels:
setosaversicolorvirginica
[[3]]
[1]versicolorversicolorversicolorversicolorversicolorversicolorversicolor
[8]versicolorversicolorversicolorversicolorversicolorversicolorversicolor
[15]versicolorversicolorversicolorversicolorversicolorversicolorversicolor
[22]versicolorversicolorversicolorversicolorversicolorversicolorvirginica
Levels:
setosaversicolorvirginica
>plot(iris.hc)
>rect.hclust(iris.hc,k=4,border="lightgrey")#用浅灰色矩形框出4分类聚类结果
>rect.hclust(iris.hc,k=3,border="darkgrey")#用深灰色矩形框出3分类聚类结果
>rect.hclust(iris.hc,k=7,which=c(2,6),border="darkgrey")
#DBSCAN#基于密度的聚类
>install.packages("fpc")
>library(fpc)
>ds1=dbscan(iris[,1:4],eps=1,MinPts=5)#半径参数为1,密度阈值为5
>ds1
dbscanPts=150MinPts=5eps=1
12
border01
seed5099
total50100
>ds2=dbscan(iris[,1:4],eps=4,MinPts=5)
>ds3=dbscan(iris[,1:4],eps=4,MinPts=2)
>ds4=dbscan(iris[,1:4],eps=8,MinPts=2)
>par(mfcol=c(2,2))
>plot(ds1,iris[,1:4],main="1: MinPts=5 eps=1")
>plot(ds3,iris[,1:4],main="3: MinPts=2 eps=4")
>plot(ds2,iris[,1:4],main="2: MinPts=5 eps=4")
>plot(ds4,iris[,1:4],main="4: MinPts=2 eps=8")
>d=dist(iris[,1:4])#计算数据集的距离矩阵d
>max(d);min(d)#计算数据集样本的距离的最值
[1] 7.085196
[1] 0
>install.packages("ggplot2")
>library(ggplot2)
>interval=cut_interval(d,30)
>table(interval)
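interval
    [0,0.236] (0.236,0.472] (0.472,0.709] (0.709,0.945]  (0.945,1.18]   (1.18,1.42]
           88           585           876           891           831           688
  (1.42,1.65]   (1.65,1.89]   (1.89,2.13]   (2.13,2.36]    (2.36,2.6]    (2.6,2.83]
          543           369           379           339           335           406
  (2.83,3.07]   (3.07,3.31]   (3.31,3.54]   (3.54,3.78]   (3.78,4.01]   (4.01,4.25]
          458           459           465           480           468           505
  (4.25,4.49]   (4.49,4.72]   (4.72,4.96]    (4.96,5.2]    (5.2,5.43]   (5.43,5.67]
          349           385           321           291           187           138
   (5.67,5.9]    (5.9,6.14]   (6.14,6.38]   (6.38,6.61]   (6.61,6.85]   (6.85,7.09]
           97            92            78            50            18             4
>which.max(table(interval))
(0.709,0.945]
            4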
>for(i in 3:5)
+{for(j in 1:10)
+{ds=dbscan(iris[,1:4],eps=i,MinPts=j)
+print(ds)
+}
+}
dbscanPts=150MinPts=1eps=3
1
seed150
total150
dbscanPts=150MinPts=2eps=3
1
seed150
total150
dbscanPts=150MinPts=3eps=3
1
seed150
total150
dbscanPts=150MinPts=4eps=3
1
seed150
total150
dbscanPts=150MinPts=5eps=3
1
seed150
total150
dbscanPts=150MinPts=6eps=3
1
seed150
total150
dbscanPts=150MinPts=7eps=3
1
seed150
total150
dbscanPts=150MinPts=8eps=3
1
seed150
total150
dbscanPts=150MinPts=9eps=3
1
seed150
total150
dbscanPts=150MinPts=10eps=3
1
seed150
total150
dbscanPts=150MinPts=1eps=4
1
seed150
total150
dbscanPts=150MinPts=2eps=4
1
seed150
total150
dbscanPts=150MinPts=3eps=4
1
seed150
total150
dbscanPts=150MinPts=4eps=4
1
seed150
total150
dbscanPts=150MinPts=5eps=4
1
seed150
total150
dbscanPts=150MinPts=6eps=4
1
seed150
total150
dbscanPts=150MinPts=7eps=4
1
seed150
total150
dbscanPts=150MinPts=8eps=4
1
seed150
total150
dbscanPts=150MinPts=9eps=4
1
seed150
total150
dbscanPts=150MinPts=10eps=4
1
seed150
total150
dbscanPts=150MinPts=1eps=5
1
seed150
total150
dbscanPts=150MinPts=2eps=5
1
seed150
total150
dbscanPts=150MinPts=3eps=5
1
seed150
total150
dbscanPts=150MinPts=4eps=5
1
seed150
total150
dbscanPts=150MinPts=5eps=5
1
seed150
total150
dbscanPts=150MinPts=6eps=5
1
seed150
total150
dbscanPts=150MinPts=7eps=5
1
seed150
total150
dbscanPts=150MinPts=8eps=5
1
seed150
total150
dbscanPts=150MinPts=9eps=5
1
seed150
total150
dbscanPts=150MinPts=10eps=5
1
seed150
total150
#30次dbscan的聚类结果
>ds5=dbscan(iris[,1:4],eps=3,MinPts=2)
>ds6=dbscan(iris[,1:4],eps=4,MinPts=5)
>ds7=dbscan(iris[,1:4],eps=5,MinPts=9)
>par(mfcol=c(1,3))
>plot(ds5,iris[,1:4],main="1: MinPts=2 eps=3")
>plot(ds6,iris[,1:4],main="3: MinPts=5 eps=4")
>plot(ds7,iris[,1:4],main="2: MinPts=9 eps=5")
#EM期望最大化聚类
>install.packages("mclust")
>library(mclust)
>fit_EM=Mclust(iris[,1:4])
fitting...
|===========================================================================|100%
>summary(fit_EM)
----------------------------------------------------
GaussianfinitemixturemodelfittedbyEMalgorithm
----------------------------------------------------
MclustVEV(ellipsoidal,equalshape)modelwith2components:
 log.likelihood   n df       BIC       ICL
       -215.726 150 26 -561.7285 -561.7289
Clusteringtable:
12
50100
>summary(fit_EM,parameters=TRUE)
----------------------------------------------------
GaussianfinitemixturemodelfittedbyEMalgorithm
----------------------------------------------------
MclustVEV(ellipsoidal,equalshape)modelwith2components:
 log.likelihood   n df       BIC       ICL
       -215.726 150 26 -561.7285 -561.7289
Clusteringtable:
12
50100
Mixingprobabilities:
12
Means:
[,1][,2]
Variances:
[,,1]
0.0.
0.0.
[,,2]
0.0.
0.
0.0.
0.
>plot(fit_EM)#对EM聚类结果作图
Model-basedclusteringplots:
1:
BIC
2:
classification
3:
uncertainty
4:
density
Selection:
(下面显示选项)
#选1
#选2
#选3
#选4
Selection:
0
>iris_BIC=mclustBIC(iris[,1:4])
fitting...
|===========================================================================|100%
>iris_BICsum=summary(iris_BIC,data=iris[,1:4])
>iris_BICsum#获取数据集iris在各模型和类别数下的BIC值
BestBICvalues:
VEV,2VEV,3VVV,2
BIC
BICdiff
Classificationtableformodel(VEV,2):
12
50100
>iris_BIC
BayesianInformationCriterion(BIC):
EIIVIIEEIVEIEVIVVIEEE
1
2
3
4
5
6
7
8
9
EVEVEEVVEEEVVEVEVVVVV
1
2
3
4
5NANA
6NA
7NA
8
9NA
Top3modelsbasedontheBICcriterion:
VEV,2VEV,3VVV,2
>par(mfcol=c(1,1))
>plot(iris_BIC,G=1:7,col="yellow")
>mclust2Dplot(iris[,1:2],
+classification=iris_BICsum$classification,
+parameters=iris_BICsum$parameters,col="yellow")
>iris_Dens=densityMclust(iris[,1:2])#对每一个样本进行密度估计
fitting...
|===========================================================================|100%
>iris_Dens
'densityMclust'modelobject:
(VEV,2)
Availablecomponents:
[1]"call""data""modelName""n"
[5]"d""G""BIC""bic"
[9]"loglik""df""hypvol""parameters"
[13]"z""classification""uncertainty""density"
>plot(iris_Dens,iris[,1:2],col="yellow",nlevels=55)##输入1或2
Model-baseddensityestimationplots:
1:
BIC
2:
density
Selection:
(下面显示选项)
#选1
#选2
Selection:
0
>plot(iris_Dens,type="persp",col=grey(0.8))
Model-baseddensityestimationplots:
1:
BIC
2:
density
Selection:
(下面显示选项)
#选1
#选2
Selection:
0