k-近邻(k-Nearest Neighbor,kNN)分类算法是数据挖掘分类技术中最简单的方法之一。k-近邻算法通过测量不同特征值之间的距离进行分类。基本思路是:如果一个样本在特征空间中的 k 个最邻近样本中的大多数属于某一个类别,则该样本也属于这个类别。kNN 算法中,所选择的邻居都是已经正确分类的对象。该方法在分类决策上只依据最邻近的一个或几个样本的类别来决定待分类样本所属的类别。
- 不喜欢的人-代码1
- 魅力一般的人-代码2
- 极具魅力的人-代码3
0x00 收集/准备数据
序号 | 飞行公里数 | 玩游戏所占时间百分比 | 吃冰淇淋公升数 | 样本标签 |
1 | 40920 | 8.3 | 0.9 | 3 |
2 | 14488 | 7.1 | 1.6 | 2 |
3 | 26052 | 1.4 | 0.8 | 1 |
4 | 75136 | 13.1 | 0.42 | 1 |
0x01 设计算法分析数据
- 二维数组:存储每个样本的前三列特征,依次为飞行公里数、玩游戏所占时间百分比、吃冰淇淋公升数
#!/usr/bin/python
# -*- coding: utf-8 -*-
#__author__ : stray_camel
#pip_source : https://mirrors.aliyun.com/pypi/simple
"""k-Nearest Neighbor (KNN) classifier demo."""
import operator  # Operator module
import os
import sys

import numpy as np


class KNN_test():
    """k-NN helper whose data files are looked up relative to ``absPath``."""

    def __init__(self,
        absPath:dict(type = str, help="Directory of the current file") = os.path.dirname(os.path.abspath(__file__)),
        ):
        # Directory that every file name passed to file2matrix is resolved against.
        self.absPath = absPath

    # Open file by file name to get test data
    def file2matrix(self,
        filename : dict(type = str, help="Relative path and name of the file"),
        )->dict(type=(np.array,list), help=('The two-dimensional array stores the first three digits of the sample data', 'and the list stores the tags of the sample data (type of person)')):
        """Parse a tab-separated sample file into a feature matrix and a label list.

        Each non-blank line must hold at least three numeric feature columns
        followed by an integer label in the last column.

        Args:
            filename: File name relative to ``self.absPath``. A leading slash
                is tolerated and ignored, so ``'x.txt'`` and ``'/x.txt'`` name
                the same file.

        Returns:
            ``(returnMat, classLabelVector)``: an ``(n, 3)`` float array of the
            first three columns, and a list with the ``n`` integer labels.

        Raises:
            ValueError: If a non-blank line has fewer than four tab-separated
                fields, or a field cannot be converted to a number.
            OSError: If the file cannot be opened.
        """
        # The path was previously built as absPath + '/' + filename, so the name
        # was always relative; strip leading separators so os.path.join does not
        # treat it as an absolute path.
        path = os.path.join(self.absPath, filename.lstrip('/\\'))

        rows = []
        classLabelVector = []
        # The context manager closes the handle even if parsing fails.
        with open(path, encoding="utf-8") as fr:
            for lineNumber, line in enumerate(fr, start=1):
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) carry no sample.
                    continue
                listFromLine = line.split('\t')
                if len(listFromLine) < 4:
                    raise ValueError(
                        "%s:%d: expected 3 features and a label, got %d field(s)"
                        % (path, lineNumber, len(listFromLine)))
                # The first three columns are the features.
                rows.append([float(value) for value in listFromLine[0:3]])
                # The last column is the class label.
                classLabelVector.append(int(listFromLine[-1]))

        # reshape keeps the (0, 3) shape for a file with no samples.
        returnMat = np.array(rows, dtype=float).reshape(-1, 3)
        return returnMat, classLabelVector
# Script entry point: build the classifier only when run directly.
if __name__ == "__main__":
    test = KNN_test()
    # print(test.file2matrix.__annotations__)
4.0920000e+04 是科学计数法,表示 $4.0920000 \times 10^{4}$,即 40920
# get Euclidean distance
def get_Euclidean_distance(self,
    a : dict(type = np.array,help = "Normalized data set"),
    dataSet : dict(type = np.array, help = "The two-dimensional array stores the first three digits of the sample data"),
    labels : dict(type=list, help="Actual labeling of test sample data"),
    k : dict(type = int, help = "Number of nearest neighbors") ):
    """Classify the sample ``a`` by majority vote among its ``k`` nearest rows.

    Args:
        a: The single sample to classify; it is compared against every row
            of ``dataSet``.
        dataSet: Two-dimensional array with one known sample per row.
        labels: Label of each row of ``dataSet``, in the same order.
        k: How many nearest rows take part in the vote.

    Returns:
        The label that occurs most often among the ``k`` rows closest to
        ``a`` by Euclidean distance. When several labels share the top count,
        the one whose first vote came from the closer row is returned.
    """
    row_total = dataSet.shape[0]
    # Repeat the sample once per row so the subtraction lines up row by row.
    offsets = np.tile(a, (row_total, 1)) - dataSet
    # Sum the squared per-feature offsets along each row, then take the root.
    squared_totals = (offsets ** 2).sum(axis =1)
    # Row indices ordered from the closest sample to the farthest.
    nearest_first = (squared_totals ** 0.5).argsort()

    # Count the labels of the k closest rows.
    tally = {}
    for rank in range(k):
        neighbour_label = labels[nearest_first[rank]]
        if neighbour_label in tally:
            tally[neighbour_label] += 1
        else:
            tally[neighbour_label] = 1

    # The descending sort is stable, so tied labels keep their insertion order.
    by_votes = sorted(tally.items(), key = lambda pair: pair[1], reverse = True)
    return by_votes[0][0]
# Normalize feature values (归一化特征值)
# Normalization formula: (current value - minimum value) / range
def autoNorm(self,
    dataSet : dict(type = np.array, help="The two-dimensional array stores the first three digits of the sample data"),
    )->dict(type=(np.ndarray, np.ndarray, np.ndarray),help=("Normalized data matrix with the same shape as dataSet", "per-column maximum minus minimum", "per-column minimum of the data set")):
    """Rescale every column of ``dataSet`` with min-max normalization.

    Each value becomes ``(value - column minimum) / column range``, so a
    column with a non-zero range ends up spanning 0 to 1.

    Args:
        dataSet: Two-dimensional array with one sample per row and one
            feature per column.

    Returns:
        ``(normDataSet, ranges, minVals)``: the normalized array (same shape
        as ``dataSet``), the per-column range (maximum minus minimum), and
        the per-column minimum.
    """
    # Axis 0 reduces over the rows, giving one minimum/maximum per column.
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # A constant column has range 0; divide it by 1 so it normalizes to 0
    # rather than NaN. The returned ranges are left untouched.
    divisors = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / divisors
    return normDataSet, ranges, minVals
0x02 测试/使用算法
# Test Dating Site Classification Results Code
def datingClassTest(self):
    """Measure the classifier's error rate on a held-out slice of the data.

    Loads and normalizes the dating data set, uses the first 10% of the rows
    as test samples and the remaining rows as the known samples, classifies
    each test sample with its 3 nearest neighbours, and prints every
    prediction followed by the overall error rate and error count.
    """
    held_out_share = 0.10  # fraction of rows reserved for testing
    features, true_labels = self.file2matrix('/datingTestSet2.txt')
    scaled, _ranges, _minimums = self.autoNorm(features)

    row_total = scaled.shape[0]
    test_total = int(row_total * held_out_share)
    # Everything after the held-out rows serves as the known samples.
    known_rows = scaled[test_total:row_total, :]
    known_labels = true_labels[test_total:row_total]

    misses = 0.0
    for row in range(test_total):
        predicted = self.get_Euclidean_distance(scaled[row, :], known_rows, known_labels, 3)
        expected = true_labels[row]
        print("分类器返回的值: %s, 正确的值: %s" % (predicted, expected))
        if predicted != expected:
            misses += 1.0

    print("总的错误率是: %f" % (misses / float(test_total)))
    print("错误的个数:%f" % misses)
# Script entry point: build the classifier only when run directly.
if __name__ == "__main__":
    test = KNN_test()
    # print(test.file2matrix.__annotations__)
分类器返回的值: 3, 正确的值: 3
分类器返回的值: 3, 正确的值: 1
总的错误率是: 0.050000
# Dating-site prediction: given a person's three feature values, predict how much the user will like them
def classifyPerson(self):
    """Ask for one person's three features and print the predicted category.

    Reads the game-time percentage, yearly flight distance and yearly ice
    cream consumption from standard input, scales them with the minimum and
    range of the stored data set, classifies the result with the 3 nearest
    neighbours, and prints the matching entry of ``resultList``.

    Raises:
        ValueError: If an entered value cannot be parsed as a float.
    """
    # Indexed with label - 1 below.
    # NOTE(review): assumes the labels in the data file are 1, 2 and 3 - confirm.
    resultList = ['不喜欢', '一般喜欢', '特别喜欢']
    percentTats = float(input("玩游戏占的百分比"))
    ffMiles = float(input("每年坐飞机多少公里"))
    iceCream = float(input("每年吃多少公升的冰淇淋"))
    # The helpers are methods of this class, so they must be reached via self.
    datingDataMat, datingLabels = self.file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = self.autoNorm(datingDataMat)
    # Same column order as the data file: flight distance, game time, ice cream.
    inArr = np.array([ffMiles, percentTats, iceCream])
    # Scale the new sample exactly like the stored samples before comparing.
    classifierResult = self.get_Euclidean_distance((inArr - minVals) / ranges, normMat, datingLabels, 3)
    print("你将有可能对这个人是:", resultList[int(classifierResult) - 1])
