K-Means概念
K均值聚类算法(k-means clustering algorithm)是一种迭代求解的聚类分析算法,其步骤是,预将数据分为K组,则随机选取K个对象作为初始的聚类中心,然后计算每个对象与各个种子聚类中心之间的距离,把每个对象分配给距离它最近的聚类中心。聚类中心以及分配给它们的对象就代表一个聚类。每分配一个样本,聚类的聚类中心会根据聚类中现有的对象被重新计算。这个过程将不断重复直到满足某个终止条件。终止条件可以是没有(或最小数目)对象被重新分配给不同的聚类,没有(或最小数目)聚类中心再发生变化,误差平方和局部最小。
概述
此篇代码为Python生成指定数量的随机点,同时内定几个初始随机点,遍历计算各点到这些随机点的距离(坐标系下两点间距离公式),最终将各点归类,此即K-Means算法。最后通过Python的PLT库进行可视化。
参数
K-Means的分类数K(小于随机点数目即可); 需要的随机点数目(尽可能多,最好超过K的20倍); 可视化坐标轴刻度(一般取随机点数的一半,不固定)。
运行过程提示
运行结果可视化
图中三个区域中心点为三个不同颜色,与其相邻且同色的点归为一簇。
代码
import math
import time
import random
import matplotlib. pyplot as pltclass KMeans : def __init__ ( self, k, pointsAcount, dataRange) : self. k = int ( k) self. pointsAcount = int ( pointsAcount) self. dataRange = int ( dataRange) self. dataset = [ ] self. startPooints = [ ] def generateDataset ( self) : Xs = [ ] Ys = [ ] a = 0 while a < self. pointsAcount: Xs. append( random. randint( 0 , self. dataRange) ) Ys. append( random. randint( 0 , self. dataRange) ) a += 1 self. dataset. append( Xs) self. dataset. append( Ys) return self. datasetdef distanceFormula ( self, point1, point2) : distance = math. sqrt( ( point1[ 0 ] - point2[ 0 ] ) ** 2 + ( point1[ 1 ] - point2[ 1 ] ) ** 2 ) return distancedef getMin ( self, compareList) : minNum = compareList[ 0 ] minIndex = 0 for i in range ( len ( compareList) ) : if compareList[ i] < minNum: minNum = compareList[ i] minIndex = ireturn minNum, minIndexdef randomStart ( self) : newDataset = self. dataset. copy( ) newDataset. append( [ ] ) a = 0 print ( ) while a < self. k: positon = random. randint( 0 , self. pointsAcount - 1 ) newDataset[ 2 ] . append( [ newDataset[ 0 ] [ positon] , newDataset[ 1 ] [ positon] ] ) a += 1 print ( '起始中心点 {} 为: ( {:.5f} , {:.5f} )' . format ( str ( a) , newDataset[ 0 ] [ positon] , newDataset[ 1 ] [ positon] ) ) return newDatasetdef generateClusters ( self, newDataset) : a = 0 clusterList = [ ] while a < self. k: clusterList. append( [ ] ) a += 1 distanceDict = { } startPointIndex = 0 for startPoint in newDataset[ 2 ] : distanceDict. setdefault( str ( startPointIndex) , [ ] ) for p in range ( self. pointsAcount) : if [ newDataset[ 0 ] [ p] , newDataset[ 1 ] [ p] ] in newDataset[ 2 ] : pass else : anotherPoint = [ newDataset[ 0 ] [ p] , newDataset[ 1 ] [ p] ] distance = self. distanceFormula( startPoint, anotherPoint) distanceDict[ str ( startPointIndex) ] . append( distance) startPointIndex += 1 for p in range ( self. pointsAcount- self. k) : compareList = [ ] for k in range ( self. k) : compareList. append( distanceDict[ str ( k) ] [ p] ) minDistance, minIndex = self. getMin( compareList) clusterList[ minIndex] . append( p) newDataset[ 2 ] . clear( ) a = 0 for cluster in clusterList: Xs = 0 Ys = 0 for p in cluster: Xs += newDataset[ 0 ] [ p] Ys += newDataset[ 1 ] [ p] newStartPoint = [ Xs/ len ( cluster) , Ys/ len ( cluster) ] newDataset[ 2 ] . append( newStartPoint) a += 1 print ( '新的中心点 {} 为: ( {:.5f} , {:.5f} )' . format ( str ( a) , Xs/ len ( cluster) , Ys/ len ( cluster) ) ) return newDataset[ 2 ] , clusterListdef randomColor ( self) : colorArr = [ '1' , '2' , '3' , '4' , '5' , '6' , '7' , '8' , '9' , 'A' , 'B' , 'C' , 'D' , 'E' , 'F' ] color = '' for i in range ( 6 ) : color += colorArr[ random. randint( 0 , 14 ) ] return '#' + colordef drawDataset ( self, startPointList, clusterList) : plt. title( 'K-Means Clustering Results' ) plt. xlabel( 'X' ) plt. ylabel( 'Y' ) for startPoint in startPointList: startPointColor = self. randomColor( ) plt. plot( startPoint[ 0 ] , startPoint[ 1 ] , color= startPointColor, marker= 'o' ) for cluster in clusterList: pointColor = self. randomColor( ) for p in cluster: plt. plot( self. dataset[ 0 ] [ p] , self. dataset[ 1 ] [ p] , color= pointColor, marker= '.' ) plt. show( ) def runKMeans ( self) : startTime = time. time( ) self. generateDataset( ) newDataset = self. randomStart( ) lastStartPointList = [ ] print ( '\n开始第 1 次迭代...' ) newStartPointList, clusterList = self. generateClusters( newDataset) iteration = 2 while lastStartPointList != newStartPointList: print ( '\n开始第 ' + str ( iteration) + ' 次迭代...' ) newDataset = self. dataset. copy( ) newDataset. append( newStartPointList) lastStartPointList = newStartPointList. copy( ) newStartPointList, clusterList = self. generateClusters( newDataset) iteration += 1 print ( ) k = 0 while k < self. k: print ( '迭代完毕,最终中心点 {} 为: ( {:.5f} , {:.5f} )' . format ( str ( k+ 1 ) , newStartPointList[ k] [ 0 ] , newStartPointList[ k] [ 1 ] ) ) k += 1 endTime = time. time( ) print ( '\n共耗费时间: ' + str ( endTime- startTime) ) self. drawDataset( newStartPointList, clusterList) if __name__ == '__main__' : KMeans = KMeans( k= input ( '请输入K值:' ) , pointsAcount= input ( '请输入随机点个数:' ) , dataRange= input ( '请输入坐标轴最大刻度:' ) ) KMeans. runKMeans( )