K-近鄰算法:
# coding: utf-8
"""k-nearest-neighbours (kNN) classifier with small data helpers.

Provides the classifier itself (``classify0``), a loader for
tab-separated data files (``file2matrix``), min-max feature scaling
(``auto_norm``) and a hold-out evaluation routine (``data_class_test``).
"""
from numpy import *  # noqa: F401,F403 -- kept from the original script
import operator


def classify0(inx, dataset, lables, k):
    """Classify ``inx`` by majority vote among its ``k`` nearest rows.

    Args:
        inx: Feature vector to classify (same length as a ``dataset`` row).
        dataset: 2-D array of training samples, one sample per row.
        lables: Sequence of class labels; ``lables[i]`` belongs to
            ``dataset[i]``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label with the highest vote count among the ``k`` neighbours.

    Raises:
        IndexError: If ``k`` is larger than the number of rows in
            ``dataset`` or smaller than 1 (no vote can be taken).
    """
    row_size = dataset.shape[0]
    # Per-feature differences (x1 - x2, y1 - y2, ...) against every row.
    diff_mat = tile(inx, (row_size, 1)) - dataset
    # Square, sum along each row, then take the square root:
    # the Euclidean distance from ``inx`` to every training sample.
    sq_distances = (diff_mat ** 2).sum(axis=1)
    distances = sq_distances ** 0.5
    # Row indices ordered from the smallest distance to the largest.
    sort_distance = argsort(distances)

    # Count how many of the k nearest rows fall in each class.
    class_count = {}
    for i in range(k):
        vote_lable = lables[sort_distance[i]]
        class_count[vote_lable] = class_count.get(vote_lable, 0) + 1

    # Sort the (label, count) pairs by count, largest first.
    # ``items()`` works on both Python 2 and 3 (``iteritems`` is Python 2 only).
    sort_class_count = sorted(class_count.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sort_class_count[0][0]


def file2matrix(filrname):
    """Load a tab-separated data file into a feature matrix and labels.

    Each line must hold at least three numeric feature columns; the last
    column is read as an integer class label.

    Args:
        filrname: Path of the text file to read.

    Returns:
        A tuple ``(mat, class_labels)``: ``mat`` is an ``(n_lines, 3)``
        float array with the first three columns of every line and
        ``class_labels`` is the list of integer labels.
    """
    # The context manager closes the file even if parsing fails.
    with open(filrname) as fr:
        arraylines = fr.readlines()

    # Zero-filled matrix, one row per line of the file.
    mat = zeros((len(arraylines), 3))
    # Class label of every row.
    class_labels = []
    for index, line in enumerate(arraylines):
        # Columns are separated by a real tab character ('\t'); the
        # original split on the two-character string '/t'.
        list_line = line.strip().split('\t')
        mat[index, :] = [float(value) for value in list_line[0:3]]
        class_labels.append(int(list_line[-1]))
    return mat, class_labels


def auto_norm(dataset):
    """Min-max scale every column of ``dataset``.

    Applies ``new_value = (old_value - min_value) / range`` column-wise,
    where ``range = max_value - min_value``.

    Args:
        dataset: 2-D array, one sample per row.

    Returns:
        A tuple ``(norm_data, ranges, min_value)``: the scaled data, the
        per-column range and the per-column minimum.
    """
    min_value = dataset.min(0)
    max_value = dataset.max(0)
    # Named ``value_range`` so the builtin ``range`` is not shadowed.
    value_range = max_value - min_value
    row_count = dataset.shape[0]
    # old - min
    norm_data = dataset - tile(min_value, (row_count, 1))
    # Divide by the range (the original divided by the maximum, which
    # contradicts the formula above).
    norm_data = norm_data / tile(value_range, (row_count, 1))
    return norm_data, value_range, min_value


def data_class_test(filename='datingTestSet2.txt', ho_ratio=0.10, k=3):
    """Evaluate ``classify0`` on a hold-out slice of a data file.

    The first ``ho_ratio`` fraction of the rows is used as the test set
    and the remaining rows as the training set.

    NOTE(review): the original snippet stopped after computing each
    prediction and never used ``error_count``; tallying mismatches and
    returning the error rate is a completion of that apparent intent --
    confirm it matches what callers expect.

    Args:
        filename: Data file understood by ``file2matrix``.
        ho_ratio: Fraction of rows (taken from the top) held out for testing.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        The fraction of held-out rows that were misclassified, or ``0.0``
        when the hold-out set is empty.
    """
    data_mat, class_lables = file2matrix(filename)
    norm_data, ranges, min_value = auto_norm(data_mat)
    row_count = norm_data.shape[0]
    numTest_vecs = int(row_count * ho_ratio)
    if numTest_vecs == 0:
        return 0.0

    error_count = 0.0
    for i in range(numTest_vecs):
        result = classify0(norm_data[i, :],
                           norm_data[numTest_vecs:row_count, :],
                           class_lables[numTest_vecs:row_count],
                           k)
        if result != class_lables[i]:
            error_count += 1.0
    return error_count / float(numTest_vecs)


# Code excerpted from: "Machine Learning in Action" (page footer text: "News Hotspots").
疑難解答