<-- home

kNN Intro

February 4, 2019

kNN 算法

邻近算法，或者说K最近邻(kNN，k-NearestNeighbor)分类算法是数据挖掘分类技术中最简单的方法之一。所谓K最近邻，就是k个最近的邻居的意思，说的是每个样本都可以用它最接近的k个邻居来代表。

应用简单实现

根据下述过程可以看到，可以说kNN是一个不需要训练过程的算法。

k近邻算法是非常特殊的，可以被认为是没有模型的算法。

为了和其他算法统一，可以认为训练数据集就是模型本身。

首先我们自己造一个数据集

import numpy as np
import matplotlib.pyplot as plt

# Toy training set: ten samples, one [feature_0, feature_1] pair per row.
raw_data_X = [
    [3.393, 2.331],
    [3.110, 1.782],
    [1.343, 3.368],
    [3.582, 4.679],
    [2.280, 2.967],
    [7.423, 4.697],
    [5.745, 3.534],
    [9.172, 2.511],
    [7.792, 3.424],
    [7.940, 0.792],
]

# Label of each row above: the first five are class 0, the last five class 1.
raw_data_y = [0] * 5 + [1] * 5

# NumPy arrays built from the lists, so masks and broadcasting can be used.
X_train, y_train = np.array(raw_data_X), np.array(raw_data_y)

# The single sample whose class is to be predicted.
x = np.array([8.094, 3.366])

画散点图

# Draw each class in its own colour: class 0 in green, class 1 in red.
for label, colour in ((0, 'g'), (1, 'r')):
    members = X_train[y_train == label]
    plt.scatter(members[:, 0], members[:, 1], color=colour)

# The sample to classify is drawn in blue on top of the training points.
plt.scatter(*x, color='b')
plt.show()

png

我们封装一个我们自己的 kNN 算法

import numpy as np
from math import sqrt
from collections import Counter

def kNN_classify(k, X_train, y_train, x):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean. Inputs may be NumPy arrays or any array-like
    (e.g. nested lists); they are converted with ``np.asarray``.

    Args:
        k: Number of neighbours that vote; ``1 <= k <= n_samples``.
        X_train: Training samples, shape ``(n_samples, n_features)``.
        y_train: Labels aligned with the rows of ``X_train``.
        x: The sample to classify, ``n_features`` values.

    Returns:
        The most common label among the ``k`` training samples closest
        to ``x``.

    Raises:
        AssertionError: If ``k`` is out of range or the shapes disagree.
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    x = np.asarray(x)

    # NOTE: validation uses ``assert`` (kept so callers still see
    # AssertionError); these checks are skipped when Python runs with -O.
    assert 1 <= k <= X_train.shape[0], "k must be valid"
    assert X_train.shape[0] == y_train.shape[0], (
        "the size of X_train must be equal to the size of y_train")
    assert X_train.shape[1] == x.shape[0], (
        "the feature number of x must be equal to X_train")

    # Broadcasting subtracts x from every training row in one step.
    distances = np.sqrt(np.sum((X_train - x) ** 2, axis=1))
    # A stable sort keeps equally distant samples in training-set order,
    # so the result does not depend on the sort implementation.
    nearest = np.argsort(distances, kind="stable")

    votes = Counter(y_train[nearest[:k]])
    return votes.most_common(1)[0][0]

使用我们自己封装的 kNN 算法

# Classify x by a vote among its 3 nearest training samples.
predict_y = kNN_classify(k=3, X_train=X_train, y_train=y_train, x=x)
# Bare expression; presumably here so a notebook cell displays the label.
predict_y

使用scikit-learn中的kNN

from sklearn.neighbors import KNeighborsClassifier

# Build a scikit-learn kNN classifier configured with 6 neighbours.
kNN_classifier = KNeighborsClassifier(n_neighbors=6)
# Fit it on the toy training arrays X_train / y_train.
kNN_classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=1, n_neighbors=6, p=2,
                     weights='uniform')

# Put the sample to predict into a one-row matrix.
X_predict = x[np.newaxis, :]
y_predict = kNN_classifier.predict(X_predict)
# Only one sample was passed in, so its label is the first element.
y_predict[0]

重新整理kNN代码

我们模仿 sklearn 中的kNN实现，改造我们自己的 kNN 算法实现

- 面向对象
- 批量预测

import numpy as np
from math import sqrt
from collections import Counter

class kNNClassifier:
    """k-nearest-neighbours classifier with a fit/predict interface.

    ``fit`` only stores the training data; all distance computation happens
    in ``predict``. Distances are Euclidean and the predicted label is the
    most common one among the ``k`` closest training samples.

    NOTE: validation uses ``assert`` (kept so callers still see
    AssertionError); these checks are skipped when Python runs with -O.
    """

    def __init__(self, k):
        """Create a classifier that votes among ``k`` neighbours (k >= 1)."""
        assert k >= 1, "k must be valid"
        self.k = k
        # Both stay None until fit() is called; predict() checks for that.
        self._X_train = None
        self._y_train = None

    def fit(self, X_train, y_train):
        """Store the training set and return ``self``.

        Args:
            X_train: Training samples, shape ``(n_samples, n_features)``;
                any array-like is accepted.
            y_train: Labels aligned with the rows of ``X_train``.

        Raises:
            AssertionError: If the sizes disagree or there are fewer than
                ``k`` training samples.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        assert X_train.shape[0] == y_train.shape[0], (
            "the size of X_train must be equal to the size of y_train")
        assert self.k <= X_train.shape[0], (
            "the size of X_train must be at least k")
        self._X_train = X_train
        self._y_train = y_train
        return self

    def predict(self, X_predict):
        """Predict a label for every row of ``X_predict``.

        Args:
            X_predict: Samples to classify, shape
                ``(n_queries, n_features)``; any array-like is accepted.

        Returns:
            A NumPy array holding one predicted label per row.

        Raises:
            AssertionError: If called before ``fit`` or if the feature
                count differs from the training data.
        """
        assert (self._X_train is not None
                and self._y_train is not None), "must fit before predict"
        X_predict = np.asarray(X_predict)
        assert X_predict.shape[1] == self._X_train.shape[1], (
            "the feature number of X_predict must be equal to X_train")
        return np.array([self._predict(x) for x in X_predict])

    def _predict(self, x):
        """Return the predicted label for the single sample ``x``."""
        # Broadcasting subtracts x from every training row in one step.
        distances = np.sqrt(np.sum((self._X_train - x) ** 2, axis=1))
        # A stable sort keeps equally distant samples in training-set
        # order, so the result does not depend on the sort implementation.
        nearest = np.argsort(distances, kind="stable")

        votes = Counter(self._y_train[nearest[:self.k]])
        return votes.most_common(1)[0][0]

    def __repr__(self):
        return "kNN(k=%d)" % self.k

# Same setup as the scikit-learn example, using our own class with k=6.
kNN_classifier_re = kNNClassifier(6)
# fit() returns the classifier itself, whose repr is "kNN(k=6)".
kNN_classifier_re.fit(X_train, y_train)

kNN(k=6)

# Batch prediction on the one-row matrix X_predict built earlier.
y_predict_re = kNN_classifier_re.predict(X_predict)
# Only one sample was passed in, so its label is the first element.
y_predict_re[0]