完整word版离群点检测基于距离实验报告Word文档下载推荐.docx

资源描述

完整word版离群点检测基于距离实验报告Word文档下载推荐.docx

《完整word版离群点检测基于距离实验报告Word文档下载推荐.docx》由会员分享，可在线阅读，更多相关《完整word版离群点检测基于距离实验报告Word文档下载推荐.docx（17页珍藏版）》请在冰豆网上搜索。

完整word版离群点检测基于距离实验报告Word文档下载推荐.docx

C++

五、算法描述

K-means算法是很典型的基于距离的聚类算法，采用距离作为相似性的评价指标，即认为两个对象的距离越近，其相似度就越大。

该算法认为簇是由距离靠近的对象组成的，因此把得到紧凑且独立的簇作为最终目标。

1、算法思路

K-means算法

先随机选取K个对象作为初始的聚类中心。

然后计算每个对象与各个种子聚类中心之间的距离，把每个对象分配给距离它最近的聚类中心。

聚类中心以及分配给它们的对象就代表一个聚类。

一旦全部对象都被分配了，每个聚类的聚类中心会根据聚类中现有的对象被重新计算。

这个过程将不断重复直到满足某个终止条件。

终止条件可以是以下任何一个：

1）没有（或最小数目）对象被重新分配给不同的聚类。

2）没有（或最小数目）聚类中心再发生变化。

3）误差平方和局部最小。

2、算法步骤

a.从数据集中随机挑K个数据当簇心；

b.对数据中的所有点求到这K个簇心的距离，假如点Pi离簇心Sj最近，那么Pi属于Sj对应的簇；

c.根据每个簇的数据，更新簇心，使得簇心位于簇的中心；

d.重复步骤b和步骤c，直到簇心不再移动（或满足其他条件，如前后两次距离和之差不超过特定值），再继续下一步；

e.计算每个簇的正常半径，即阈值（此程序阈值为每个簇的平均距离与1.5倍标准差之和）；

f.从每个簇中，找出到簇心的距离大于阈值的点，即离群点。

六、数据结构

Node类，定义了二维空间中的一个点，pos_x、pos_y两个成员变量分别为x轴、y轴的值，且均为double型。

Node类作为基本数据结构，使用在KMean类里。

KMean类封装了一系列成员变量和函数，实现了KMean算法。

具体成员变量和函数详细说明如下：

classKMean

{

private:

intcluster_num;

//生成的簇的数量。

vector<

Node>

mean_nodes;

//均值点

data;

//所有的数据点

*clusters;

//簇,key为簇的下标，value为该簇中所有点

intcount;

//记录迭代次数

*cutData;

double*radio;

//初始化函数（首先随机生成代表点）

voidInit_Means（）;

//聚类过程，将空间中的点分到不同的簇中

voidClusterProcess（）;

//获取当前结点的簇下标

intgetIndexOfCluster（vector<

means,Nodeactive）;

//获取每个点到各自簇中心的距离和

doublegetSumOfDist（vector<

*clusters,vector<

mean_nodes）;

//生成均值

NodegetMeans（intcluster_index）;

//获取两个点之间的距离

doublegetDistance（Nodeactive,Nodeother）;

public:

//构造函数，c_num为簇个数，node_vector为原始数据

KMean（intc_num,vector<

node_vector）;

~KMean（）;

//找出离群点：只要距离大于平均距离+1.5倍标准差，则视为离群点

voidcut（）;

//显示剪枝结果

voidshowCutResult（）;

};

程序代码图

注：

代码图中相关函数的说明见KMean类的方法说明。

七、程序截图

随机生成50个数据，随机选取4个簇心，如上图所示。

经过聚类，簇1、簇2的中心已改变，算出的阈值、检测到的离群点如上图所示。

簇3、簇4聚类后，正常点和离群点如图所示。

八、实验总结

实验程序，是在聚类完成之后，基于距离筛选出了离群点。

在数据挖掘过程中，将离群点数据丢弃，更有利于分析获取有用的数据。

从实验结果看，部分离群点的距离远大于正常距离，丢弃这些数据，避免无效数据干扰，显得非常有意义。

九、附件

1.程序源码

main.cpp主程序入口

#include<

iostream>

vector>

#include"

k-mean.h"

ctime>

usingnamespacestd;

//输入数据

voidinput（vector<

vecData,intnum）;

intmain（）

srand（（int）time（0））;

intnum,k;

cout<

请依次输入数据量、聚类个数（数据随机产生）\n"

;

cin>

num>

input（data,num）;

KMeankmean（k,data）;

kmean.cut（）;

kmean.showCutResult（）;

system（"

pause"

）;

return0;

}

vecData,intnum）

for（inti=0;

num;

i++）

{

Nodenode;

node.pos_x=（rand（）%5000）;

node.pos_y=（rand（）%5000）;

vecData.push_back（node）;

}

k-mean.hkmean类和Node类声明

//k-mean.h

#pragmaonce

//空间点的定义

classNode

doublepos_x;

doublepos_y;

Node（）

pos_x=0.0;

pos_y=0.0;

friendbooloperator<

（constNode&

first,constNode&

second）

//对x轴的比较

if（first.pos_x<

second.pos_x）

{

returntrue;

}

elseif（first.pos_x>

returnfalse;

//对y轴的比较

else

if（first.pos_y<

second.pos_y）

{

returntrue;

}

else

returnfalse;

}

friendbooloperator==（constNode&

if（first.pos_x==second.pos_x&

first.pos_y==second.pos_y）

else

k-mean.cppkmean类的成员函数具体定义

cstdlib>

algorithm>

cmath>

iomanip>

KMean:

KMean（intc_num,vector<

node_vector）

cluster_num=c_num;

data=node_vector;

clusters=newvector<

[cluster_num];

cutData=newvector<

radio=newdouble[cluster_num];

Init_Means（）;

ClusterProcess（）;

//进行聚类过程

~KMean（）

delete[]clusters;

delete[]cutData;

delete[]radio;

voidKMean:

Init_Means（）//初始化函数（首先随即生成代表点）

intnum=data.size（）;

srand（（int）time（0））;

for（inti=0;

cluster_num;

）

intpos=rand（）%num;

boolinsert_flag=true;

//首先判断选中的点是否是中心点

for（unsignedintj=0;

mean_nodes.size（）;

j++）

if（mean_nodes[j]==data[pos]）

insert_flag=false;

break;

if（insert_flag）

mean_nodes.push_back（data[pos]）;

i++;

cout.setf（ios:

fixed）;

setprecision

（1）;

随机产生的数据如下：

\n"

for（inti=0;

num;

i++）

cout<

（"

data[i].pos_x<

data[i].pos_y<

）\t\t"

\n随机产生的"

cluster_num<

个簇中心如下：

cluster_num;

mean_nodes[i].pos_x<

mean_nodes[i].pos_y<

）\t"

endl<

endl;

ClusterProcess（）//聚类过程，将空间中的点分到不同的簇中

//下面是聚类过程

inti;

doublenewVar=3,oldVar=-1;

//新旧距离和

do{

for（i=0;

data.size（）;

i++）//找到每个点当前最近的中心点，并放进对应的簇

intindex=getIndexOfCluster（mean_nodes,data[i]）;

clusters[index].push_back（data[i]）;

for（i=0;

i++）//更新每个簇的中心点

mean_nodes[i]=getMeans（i）;

//获取簇中心

oldVar=newVar;

count++;

newVar=getSumOfDist（clusters,mean_nodes）;

if（abs（newVar-oldVar）>

=1）

for（inti=0;

clusters[i].clear（）;

}while（abs（newVar-oldVar）>

=1）;

//当前后两次距离和相差不大时，则认为达到分类要求

doubleKMean:

getDistance（Nodeactive,Nodeother）

returnsqrt（pow（（active.pos_x-other.pos_x）,2）+pow（（active.pos_y-other.pos_y）,2））;

NodeKMean:

getMeans（intcluster_index）

//求出簇中所有点的均值

NodetmpNode;

intnum=clusters[cluster_index].size（）;

for（intj=0;

tmpNode.pos_x+=clusters[cluster_index][j].pos_x;

tmpNode.pos_y+=clusters[cluster_index][j].pos_y;

tmpNode.pos_x=tmpNode.pos_x/num;

tmpNode.pos_y=tmpNode.pos_y/num;

returntmpNode;

intKMean:

getIndexOfCluster（vector<

means,Nodeactive）//获取当前结点的簇下标

intnum=means.size（）;

intindex=0;

doubletmpDist,minDist=getDistance（means[0],active）;

tmpDist=getDistance（means[i],active）;

if（tmpDist<

minDist）

minDist=tmpDist;

index=i;

returnindex;

getSumOfDist（vector<

mean_nodes）

doublesum=0;

intm_size=mean_nodes.size（）;

intc_size;

m_size;

c_size=clusters[i].size（）;

for（intj=0;

c_size;

j++）

sum+=getDistance（mean_nodes[i],clusters[i][j]）;

returnsum;

cut（）

doubleavgDist;

doublesum=0;

intc_size=clusters[i].size（）;

j++）//计算每个簇的平均值

avgDist=sum/c_size;

//计算每个簇的正常半径：

平均值+1.5倍标准差

sum=0;

doubled=getDistance（mean_nodes[i],clusters[i][j]）-avgDist;

sum+=pow（d,2）;

radio[i]=1.5*sqrt（sum/c_size）+avgDist;

clusters[i].size（）;

doubled=getDistance（mean_nodes[i],clusters[i][j]）;

if（d>

radio[i]）

vector<

iteratorit=clusters[i].begin（）;

for（intk=0;

k++,it++）

{

}

cutData[i].push_back（*it）;

clusters[i].erase（it）;

showCutResult（）

\n\n******************离群检测结果********************************************"

\n*离群点基于距离进行局部检测，当距离大于平均值与1.5倍标准差的和，则算离群点*"

\n***************************************************************************\n"

intsizeOfCluster=clusters[i].size（）;

\n\n簇"

i+1<

簇心：

正常半径：

radio[i];

sizeOfCluster;

cout<

\n正常点（"

clusters[i][j].pos_x<

clusters[i][j].pos_y<

）\t\t半径：

getDistance（mean_nodes[i],clusters[i][j]）;

intcu_size=cutData[i].size（）;

cu_size;

\n离群点（"

cutData[i][j].pos_x<

cutData[i][j].pos_y<

getDistance（mean_nodes[i],cutData[i][j]）<

超过正常半径，离群！

endl;

2.数据集随机产生

展开阅读全文