計算已知類別數據集中的點 與 當前點之間的距離;
選取與當前點距離最小的k個點;
確定 前k個點所在類別的出現頻率;
返回 前k個點出現頻率最高的類別作為當前點的預測分類。
步驟1用到歐式距離公式:$d = \sqrt{\sum_{i=1}^{n} (x_i - y_i)^2}$,其中 $x$ 為當前點,$y$ 為數據集中的一個已知類別的點,$n$ 為特徵的個數。
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    The distance between ``inX`` and every row of ``dataSet`` is the
    Euclidean distance.

    Args:
        inX: Feature vector to classify, with one value per column of
            ``dataSet``.
        dataSet: 2-D numpy array of known samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` is the class of
            row ``i`` of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the k nearest rows.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number
            of rows in ``dataSet``.
    """
    dataSetSize = dataSet.shape[0]  # number of rows in the numpy array
    if not 1 <= k <= dataSetSize:
        raise ValueError(
            "k must be between 1 and the number of samples (%d), got %r"
            % (dataSetSize, k)
        )

    # Distance computation: repeat inX once per row so the two matrices can
    # be subtracted element-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDiffMat = diffMat ** 2
    # axis=1 adds along each row, giving one squared distance per sample.
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances ** 0.5
    # argsort returns the row indices ordered from nearest to farthest.
    sortedDistIndicies = distances.argsort()

    # Count the labels of the k nearest rows.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Order the (label, count) pairs by count, largest first. items() is
    # used instead of iteritems() so the code runs on Python 2 and 3.
    sortedClassCount = sorted(
        classCount.items(), key=operator.itemgetter(1), reverse=True
    )
    return sortedClassCount[0][0]
def createDataSet():
    """Return a four-point toy data set together with its class labels."""
    labels = ['A', 'A', 'B', 'B']
    group = array([
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ])
    return group, labels


group, labels = createDataSet()
# Final output: classify the origin with its 3 nearest neighbours (prints B).
print(classify0([0, 0], group, labels, 3))
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and labels.

    The first three tab-separated fields of every line are stored as one
    row of the matrix; the last field is converted to ``int`` and stored
    as the label of that row. Blank lines are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)`` where ``returnMat`` is a
        numpy array with one row of three values per data line and
        ``classLabelVector`` is the list of integer labels in file order.
    """
    featureRows = []
    classLabelVector = []
    # One pass over the file; the context manager closes the handle even if
    # a line fails to parse.
    with open(filename) as fr:
        for line in fr:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines, e.g. at the end of the file
            listFromLine = line.split('\t')
            featureRows.append([float(value) for value in listFromLine[0:3]])
            classLabelVector.append(int(listFromLine[-1]))

    # Allocate once the row count is known, so an empty file still yields a
    # (0, 3) matrix.
    returnMat = zeros((len(featureRows), 3))
    for index, row in enumerate(featureRows):
        returnMat[index, :] = row
    return returnMat, classLabelVector
>>>datingDataMat, datingLabels = kNN.file2matrix('datingTestSet2.txt') >>>print datingDataMat [[ 4.09200000e+04 8.32697600e+00 9.53952000e-01] [ 1.44880000e+04 7.15346900e+00 1.67390400e+00] [ 2.60520000e+04 1.44187100e+00 8.05124000e-01] ..., [ 2.65750000e+04 1.06501020e+01 8.66627000e-01] [ 4.81110000e+04 9.13452800e+00 7.28045000e-01] [ 4.37570000e+04 7.88260100e+00 1.33244600e+00]] >>>print datingLabels[0:20] [3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3]
import matplotlib
import matplotlib.pyplot as plt
from numpy import *

# One figure holding a single subplot (1 row, 1 column, first panel).
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
# Plot every row: second column on the x axis, third column on the y axis.
ax.scatter(x=datingDataMat[:, 1], y=datingDataMat[:, 2])
plt.show()
# Same scatter plot, with marker size and colour both derived from the labels.
ax.scatter(
    datingDataMat[:, 1],
    datingDataMat[:, 2],
    s=15.0 * array(datingLabels),
    c=15.0 * array(datingLabels),
)
plt.show()
newValue = (oldValue - min) / (max - min)
def autoNorm(dataSet):
    """Min-max scale every column of ``dataSet``.

    Each value is mapped with ``newValue = (oldValue - min) / (max - min)``
    using the minimum and maximum of its own column.

    Args:
        dataSet: 2-D numpy array, one sample per row and one feature per
            column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``:
        ``normDataSet`` is the scaled array with the shape of ``dataSet``;
        ``ranges`` is the per-column ``max - min``, with a zero range
        replaced by 1 so that it is always safe to divide by;
        ``minVals`` is the per-column minimum.
    """
    minVals = dataSet.min(0)  # smallest value of each column
    maxVals = dataSet.max(0)  # largest value of each column
    ranges = maxVals - minVals
    # A constant column has a zero range; dividing by it would fill the
    # column with NaN. Using 1 instead maps such a column to all zeros and
    # keeps the returned ranges usable by callers that normalise new samples.
    ranges[ranges == 0] = 1
    # Broadcasting applies the per-column vectors to every row, which is the
    # same element-wise operation as tiling them to the shape of dataSet.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals
>>>normMat, ranges, minVals = kNN.autoNorm(datingDataMat) >>>normMat [[ 0.44832535 0.39805139 0.56233353] [ 0.15873259 0.34195467 0.98724416] [ 0.28542943 0.06892523 0.47449629] ..., [ 0.29115949 0.50910294 0.51079493] [ 0.52711097 0.43665451 0.4290048 ] [ 0.47940793 0.3768091 0.78571804]] >>>ranges [ 9.12730000e+04 2.09193490e+01 1.69436100e+00] >>>minVals [ 0. 0. 0.001156]
def datingClassTest(hoRatio=0.50, filename='datingTestSet2.txt', k=3):
    """Measure the kNN error rate on a held-out slice of the dating data.

    The first ``hoRatio`` fraction of the rows is used as test samples and
    the remaining rows as the known samples. Every prediction, the error
    rate and the error count are printed.

    Args:
        hoRatio: Fraction of the rows held out for testing (default 0.50,
            i.e. 50%).
        filename: Data file passed to ``file2matrix``.
        k: Number of neighbours passed to ``classify0``.

    Raises:
        ValueError: If the split leaves no test rows or no training rows.
    """
    # Load the data and scale every feature to a comparable range.
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    # Number of rows used as test samples.
    numTestVecs = int(m * hoRatio)
    if not 0 < numTestVecs < m:
        raise ValueError(
            "hoRatio=%r leaves %d test rows out of %d; need at least one "
            "test row and one training row" % (hoRatio, numTestVecs, m)
        )

    # The training slice is the same for every test row, so build it once.
    trainingMat = normMat[numTestVecs:m, :]
    trainingLabels = datingLabels[numTestVecs:m]

    errorCount = 0.0
    for i in range(numTestVecs):
        # Core kNN step: classify test row i against the training rows.
        classifierResult = classify0(normMat[i, :], trainingMat, trainingLabels, k)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        # Count the misclassifications.
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print('errorCount: ' + str(errorCount))
>>>datingClassTest() ... the classifier came back with: 2, the real answer is: 1 the classifier came back with: 2, the real answer is: 2 the classifier came back with: 1, the real answer is: 1 the classifier came back with: 1, the real answer is: 1 the classifier came back with: 2, the real answer is: 2 the total error rate is: 0.064000 errorCount: 32.0
def classifyPerson(file):
    """Ask for three feature values on the console and print a prediction.

    The answers are normalised with the minimum and range computed from the
    data file and classified with ``classify0`` using 3 neighbours.

    Args:
        file: Path of the data file passed to ``file2matrix``.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']

    # raw_input exists only on Python 2; Python 3 calls the same function
    # input.
    try:
        readLine = raw_input
    except NameError:
        readLine = input

    percentTats = float(readLine(
        "percentage of time spent playing video games?"))
    ffMiles = float(readLine("frequent flier miles earned per year?"))
    iceCream = float(readLine("liters of ice cream consumed per year?"))

    datingDataMat, datingLabels = file2matrix(file)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # NOTE(review): the order here presumably mirrors the column order of the
    # data file -- confirm against the file.
    inArr = array([ffMiles, percentTats, iceCream])
    # Scale the new sample exactly like the known samples before classifying.
    classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    # Labels start at 1, list indices at 0. The "%s %s" form reproduces the
    # space the Python 2 print statement put between its two items.
    print("%s %s" % ("You will probably like this person: ",
                     resultList[classifierResult - 1]))


kNN.classifyPerson('..\\datingTestSet2.txt')
percentage of time spent playing video games?10 frequent flier miles earned per year?10000 liters of ice cream consumed per year?0.5 You will probably like this person: in small doses
trainingDigits 2000個訓練樣本
testDigits 900個測試數據
00000000000001111000000000000000 00000000000011111110000000000000 00000000001111111111000000000000 00000001111111111111100000000000 00000001111111011111100000000000 00000011111110000011110000000000 00000011111110000000111000000000 00000011111110000000111100000000 00000011111110000000011100000000 00000011111110000000011100000000 00000011111100000000011110000000 00000011111100000000001110000000 00000011111100000000001110000000 00000001111110000000000111000000 00000001111110000000000111000000 00000001111110000000000111000000 00000001111110000000000111000000 00000011111110000000001111000000 00000011110110000000001111000000 00000011110000000000011110000000 00000001111000000000001111000000 00000001111000000000011111000000 00000001111000000000111110000000 00000001111000000001111100000000 00000000111000000111111000000000 00000000111100011111110000000000 00000000111111111111110000000000 00000000011111111111110000000000 00000000011111111111100000000000 00000000001111111110000000000000 00000000000111110000000000000000 00000000000011000000000000000000
把一個32 * 32的二進制圖像矩陣轉換為1 * 1024的向量
def img2vector(filename):
    """Flatten a 32 x 32 text image of digits into a 1 x 1024 vector.

    The first 32 characters of each of the first 32 lines are converted to
    ``int``; the character in row ``i``, column ``j`` is stored at index
    ``32 * i + j``.

    Args:
        filename: Path of the text file holding the image.

    Returns:
        A numpy array of shape ``(1, 1024)``.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even if a character fails to parse.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
>>>testVector = kNN.img2vector('testDigits/0_13.txt') >>>testVector[0, 0:31] [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] >>>testVector[0, 32:63] [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
import os


def _digitFromFileName(fileNameStr):
    """Return the integer before the first '_' of a name such as '0_13.txt'."""
    fileStr = fileNameStr.split('.')[0]  # take off .txt
    return int(fileStr.split('_')[0])


def handwritingClassTest(trainingDir='trainingDigits', testDir='testDigits', k=3):
    """Classify every test digit with kNN and print the error statistics.

    Every file in ``trainingDir`` becomes one row of the training matrix;
    its label is the integer before the first '_' of the file name. Every
    file in ``testDir`` is then classified with ``classify0``; each
    prediction, the number of errors and the error rate are printed.

    Args:
        trainingDir: Directory holding the training image files.
        testDir: Directory holding the test image files.
        k: Number of neighbours passed to ``classify0``.

    Raises:
        ValueError: If ``testDir`` contains no files.
    """
    # Build the training matrix and its labels.
    hwLabels = []
    trainingFileList = os.listdir(trainingDir)  # load the training set
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(_digitFromFileName(fileNameStr))
        trainingMat[i, :] = img2vector(os.path.join(trainingDir, fileNameStr))

    testFileList = os.listdir(testDir)  # iterate through the test set
    mTest = len(testFileList)
    if mTest == 0:
        # Without test files the error rate below would divide by zero.
        raise ValueError("no test files found in %r" % (testDir,))

    # Classify each test file and compare with the label in its name.
    errorCount = 0.0
    for fileNameStr in testFileList:
        classNumStr = _digitFromFileName(fileNameStr)
        vectorUnderTest = img2vector(os.path.join(testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))
>>>handwritingClassTest() ... the classifier came back with: 9, the real answer is: 9 the classifier came back with: 9, the real answer is: 9 the classifier came back with: 9, the real answer is: 9 the classifier came back with: 9, the real answer is: 9 the classifier came back with: 9, the real answer is: 9 the classifier came back with: 9, the real answer is: 9 the classifier came back with: 9, the real answer is: 9 the total number of errors is: 11 the total error rate is: 0.011628
