Vectorization

March 26, 2019

向量化运算

\(a = \frac{\sum_{i=1}^m(x^{(i)} - \bar{x})(y^{(i)} - \bar{y})}{\sum_{i=1}^m(x^{(i)} - \bar{x})^2}\) 的分子和分母都符合形式： \(\sum_{i=1}^m w^{(i)} \cdot v^{(i)}\)，因此可以使用向量点乘 \(w \cdot v\) 来提高运算效率。

import numpy as np
import matplotlib.pyplot as plt

# Synthetic data set: one million points scattered around y = 2x + 3.
m = 1000000
x = np.random.random(size=m)
# size=m draws independent noise for every sample; without it a single
# scalar is drawn and merely shifts the whole line by a constant.
y = x * 2.0 + 3.0 + np.random.normal(size=m)

import numpy as np

class SimpleLinearRegressionV1:
    """Single-feature linear regression fitted by ordinary least squares.

    The slope and intercept are computed with element-wise NumPy
    arithmetic reduced by ``np.sum``.
    """

    def __init__(self):
        # Slope (a_) and intercept (b_) of the fitted line; None until fit().
        self.a_ = None
        self.b_ = None

    def fit(self, x_train, y_train):
        """Estimate the slope ``a_`` and intercept ``b_`` from training data.

        Args:
            x_train: 1-D array of feature values.
            y_train: 1-D array of targets, same length as ``x_train``.

        Returns:
            This instance, so calls can be chained.

        Raises:
            AssertionError: if either input is not 1-D or the lengths differ.
        """
        assert x_train.ndim == 1 and y_train.ndim == 1,\
            "needs single feature train data"
        assert len(x_train) == len(y_train),\
            "the size of x and y must be equals"
        x_mean = np.mean(x_train)
        y_mean = np.mean(y_train)
        # Centre x once and reuse it in both the numerator and denominator.
        x_dev = x_train - x_mean
        self.a_ = np.sum(x_dev * (y_train - y_mean)) / np.sum(x_dev ** 2)
        self.b_ = y_mean - self.a_ * x_mean
        return self

    def predict(self, x_predict):
        """Return ``a_ * x_predict + b_``.

        Args:
            x_predict: a scalar number (Python or NumPy) or a 1-D array.

        Raises:
            AssertionError: if the input is neither a scalar number nor
                1-D, or if the model has not been fitted yet.
        """
        # np.number also admits NumPy scalars such as np.int64, which are
        # not instances of int/float and have ndim == 0.
        assert isinstance(x_predict, (int, float, np.number)) or x_predict.ndim == 1,\
            "needs single feature data to predict"
        assert self.a_ is not None and self.b_ is not None,\
            "must be fit before predict"
        return self.a_ * x_predict + self.b_

    def __repr__(self):
        return "SimpleLinearRegressionV1()"

class SimpleLinearRegressionV2:
    """Single-feature linear regression fitted by ordinary least squares.

    The slope is computed with vector dot products: both the numerator
    and the denominator of the least-squares formula are sums of
    element-wise products.
    """

    def __init__(self):
        # Slope (a_) and intercept (b_) of the fitted line; None until fit().
        self.a_ = None
        self.b_ = None

    def fit(self, x_train, y_train):
        """Estimate the slope ``a_`` and intercept ``b_`` from training data.

        Args:
            x_train: 1-D array of feature values.
            y_train: 1-D array of targets, same length as ``x_train``.

        Returns:
            This instance, so calls can be chained.

        Raises:
            AssertionError: if either input is not 1-D or the lengths differ.
        """
        assert x_train.ndim == 1 and y_train.ndim == 1,\
            "needs single feature train data"
        assert len(x_train) == len(y_train),\
            "the size of x and y must be equals"
        x_mean = np.mean(x_train)
        y_mean = np.mean(y_train)
        # Centre x once and reuse it in both dot products.
        x_dev = x_train - x_mean
        self.a_ = x_dev.dot(y_train - y_mean) / x_dev.dot(x_dev)
        self.b_ = y_mean - self.a_ * x_mean
        return self

    def predict(self, x_predict):
        """Return ``a_ * x_predict + b_``.

        Args:
            x_predict: a scalar number (Python or NumPy) or a 1-D array.

        Raises:
            AssertionError: if the input is neither a scalar number nor
                1-D, or if the model has not been fitted yet.
        """
        # np.number also admits NumPy scalars such as np.int64, which are
        # not instances of int/float and have ndim == 0.
        assert isinstance(x_predict, (int, float, np.number)) or x_predict.ndim == 1,\
            "needs single feature data to predict"
        assert self.a_ is not None and self.b_ is not None,\
            "must be fit before predict"
        return self.a_ * x_predict + self.b_

    def score(self, x_test, y_test):
        """Return the R^2 score of the model on the given test data.

        Computed as ``1 - MSE(y_test, prediction) / Var(y_test)``.

        Raises:
            AssertionError: if ``x_test`` and ``y_test`` differ in length.
        """
        assert len(x_test) == len(y_test),\
            "the size of x_test must be equal to the size of y_test"
        residuals = y_test - self.predict(x_test)
        return 1 - np.sum(residuals ** 2) / len(y_test) / np.var(y_test)

    def __repr__(self):
        return "SimpleLinearRegressionV2()"

%%time
slr1 = SimpleLinearRegressionV1()
slr1.fit(x, y)
print(slr1.predict(6))

14.47465267558519 Wall time: 269 ms

%%time
slr2 = SimpleLinearRegressionV2()
slr2.fit(x, y)
print(slr2.predict(6))

14.474652675585194 Wall time: 43 ms

其中v2是向量化之后的，v1和上一小节一样，没变，可以看到效率得到了不错的提升。