首页手记【学习打卡】第12天 Python3入门机器学习

【学习打卡】第12天 Python3入门机器学习

标签：

Python 机器学习

课程介绍

课程名称：Python3入门机器学习 经典算法与应用 入行人工智能
课程章节：5-5；5-6；5-7
主讲老师：liuyubobobo

内容导读

第一部分衡量线性回归准确性的四种办法
第二部分多元线性回归和正规方程解
第三部分代码展示

课程详细

- 第一部分衡量线性回归准确性的四种办法

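1.MSE均方误差，m分之一乘以（y的预测值-y的实际值）的平方的和：$MSE = \frac{1}{m}\sum_{i=1}^{m}(\hat{y}^{(i)} - y^{(i)})^2$
2.RMSE均方根误差，即对MSE开根号，使误差的量纲与y保持一致，用于消除平方对量纲的影响：$RMSE = \sqrt{\frac{1}{m}\sum_{i=1}^{m}(\hat{y}^{(i)} - y^{(i)})^2}$
3.MAE平均绝对误差，m分之一乘以（y的预测值-y的实际值）的绝对值的和：$MAE = \frac{1}{m}\sum_{i=1}^{m}|\hat{y}^{(i)} - y^{(i)}|$
4.R Squared，$R^2 = 1 - \frac{MSE}{Var(y)}$，详见下文 R Squared 一节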

这节要引入一个实际的数据集，分别用三种方法衡量预测结果的准确性

# Load the Boston housing data bundled with scikit-learn and keep a single feature.
# NOTE(review): load_boston was removed in scikit-learn 1.2 — confirm the installed version still provides it.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
boston = datasets.load_boston()
x = boston.data[:,5]  # use only column 5 (the number-of-rooms feature)
print(type(x.ndim))
y = boston.target

plt.scatter(x,y)
plt.show()
# The plot shows some odd-looking points along the top edge. Real-world data often looks
# like this, e.g. because of a measuring instrument's upper limit or capped survey options.

np.max(y)
# This confirms that the maximum target value is 50.
# Redefine the data without those capped points so they do not disturb the fit.
x = x[y < 50.0]
y = y[y < 50.0]

plt.scatter(x,y)
plt.show()

# Split the Boston housing data into training and test sets.
from nike.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, train_ratio=0.8, seed=666)

# Fit the hand-written single-feature linear regression, predict on the test set and plot:
# training points as a scatter, predictions on the test inputs as a red line.
from nike.SimpleLinearRegression import SimpleLinearRegression2
reg2 = SimpleLinearRegression2()
reg2.fit(x_train, y_train)
y_predict = reg2.predict(x_test)
plt.scatter(x_train, y_train)
plt.plot(x_test, y_predict,color="red")
plt.show()

那么接下来就是使用这几种方法来衡量预测结果

MSE

# Mean squared error: sum of squared residuals divided by the number of test samples.
mse_test = np.sum((y_test - y_predict) ** 2) / len(y_test)
mse_test
# recorded output: 28.418315342489713

RMSE

# Root mean squared error: the square root of the MSE expression above.
from math import sqrt
rmse_test = sqrt(np.sum((y_test - y_predict) ** 2) / len(y_test))
rmse_test
# recorded output: 5.330883167214389

MAE

# Mean absolute error: sum of absolute residuals divided by the number of test samples.
mae_test = np.sum(np.absolute(y_test - y_predict)) / len(y_test)
mae_test
# recorded output: 3.8540656979860923

封装代码并调用

# Call the packaged versions of the three metrics from the project's own metrics module.
from nike.metrics import mse_error
from nike.metrics import rmse_error
from nike.metrics import mae_error

mse_error(y_test,y_predict)
# recorded output: 28.418315342489713
rmse_error(y_test,y_predict)
# recorded output: 5.330883167214389
mae_error(y_test,y_predict)
# recorded output: 3.8540656979860923

sklearn 中调用MSE和MAE

# Cross-check the hand-written metrics against scikit-learn's implementations.
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

mean_squared_error(y_test,y_predict)
# recorded output: 28.418315342489713
mean_absolute_error(y_test,y_predict)
# recorded output: 3.8540656979860923
# (The value above was previously a bare expression statement; as the last expression
# of a notebook cell it would be displayed in place of the mean_absolute_error result.)

R Squared

上一节，学习了三种评价线性回归的评价方法，但sklearn中采用的是R Squared，这节就要着重学习这种评价方式

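分类问题判断准确度的方式很简单，1就是好（真值与预测值相同），0就是差（真值与预测值不同），然后看看0有多少个
但是线性回归不大一样，MSE、RMSE、MAE的数值大小依赖于y本身的量纲，不同问题之间无法直接比较；R Squared 用1减去MSE与y的方差之比，即 $R^2 = 1 - \frac{MSE(\hat{y}, y)}{Var(y)}$，最好的结果为1，数值越大说明模型越好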

# Core formula: R^2 = 1 - MSE / variance of the true values.
1 - mean_squared_error(y_test, y_predict) / np.var(y_test)
# R^2 from the hand-written implementation.
from nike.metrics import r2_score

r2_score(y_test, y_predict)
# R^2 from scikit-learn (this import rebinds the name r2_score imported above).
from sklearn.metrics import r2_score

r2_score(y_test,y_predict)

- 第二部分多元线性回归和正规方程解

多元线性回归就是，x拥有多个特征量，多元线性回归更加符合实际情况

# Imports
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

# Load the Boston housing data.
# NOTE(review): load_boston was removed in scikit-learn 1.2 — confirm the installed version still provides it.
boston = datasets.load_boston()

X = boston.data
y = boston.target

# Keep only the samples whose target is below 50; this time all feature columns are used.
X = X[y < 50]
y = y[y < 50]

# Split the data into training and test sets.
from nike.model_selection import train_test_split
# X = np.arange(100).reshape(20, 5)
# y = np.random.randint(0,10,20).reshape(-1,1)

X_train, X_test, y_train, y_test = train_test_split(X, y,train_ratio=0.8,seed=666)

# Load the hand-written LinearRegression class and fit its parameters.
# %run is an IPython magic, so this cell only works inside IPython/Jupyter.
# from nike.LinearRegression import LinearRegression
%run nike\LinearRegression.py
reg = LinearRegression()
reg.fit_normal(X_train, y_train)

# R^2 score of the fitted normal-equation solution on the test set.
reg.score(X_test, y_test)
# recorded output: 0.7788559306186865

- 第三部分代码展示

四种线性回归准确率检测代码的编写

def mse_error(y_true, y_predict):
    """Return the mean squared error (MSE) between true and predicted values.

    Args:
        y_true: Array-like of ground-truth target values.
        y_predict: Array-like of predicted values, same length as ``y_true``.

    Returns:
        The sum of squared differences divided by ``len(y_true)``.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    assert len(y_true) == len(y_predict),\
        'the size of y_test must be equal to size of the y_predict'
    # Convert up front so plain Python sequences work too; ``list - list``
    # would otherwise raise TypeError.
    residuals = np.asarray(y_true) - np.asarray(y_predict)
    return np.sum(residuals ** 2) / len(y_true)

def rmse_error(y_true, y_predict):
    """Return the root mean squared error (RMSE) between true and predicted values.

    Args:
        y_true: Array-like of ground-truth target values.
        y_predict: Array-like of predicted values, same length as ``y_true``.

    Returns:
        The square root of the mean squared error, as a Python float.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    # Imported locally so the function does not depend on a module-level
    # ``from math import sqrt`` being present.
    from math import sqrt

    assert len(y_true) == len(y_predict),\
        'the size of y_test must be equal to size of the y_predict'
    # Convert up front so plain Python sequences work too.
    residuals = np.asarray(y_true) - np.asarray(y_predict)
    return sqrt(np.sum(residuals ** 2) / len(y_true))


def mae_error(y_true, y_predict):
    """Return the mean absolute error (MAE) between true and predicted values.

    Args:
        y_true: Array-like of ground-truth target values.
        y_predict: Array-like of predicted values, same length as ``y_true``.

    Returns:
        The sum of absolute differences divided by ``len(y_true)``.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    assert len(y_true) == len(y_predict),\
        'the size of y_test must be equal to size of the y_predict'
    # Convert up front so plain Python sequences work too.
    residuals = np.asarray(y_true) - np.asarray(y_predict)
    return np.sum(np.absolute(residuals)) / len(y_true)
def r2_score(y_true, y_predict):
    """Return the R^2 score of the predictions.

    Computed as ``1 - MSE(y_true, y_predict) / Var(y_true)``.

    Args:
        y_true: Array-like of ground-truth target values.
        y_predict: Array-like of predicted values, same length as ``y_true``.

    Returns:
        The R^2 score. The zero-variance case is not guarded: a constant
        ``y_true`` makes the denominator 0.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    assert len(y_true) == len(y_predict), \
        'the size of y_test must be equal to size of the y_predict'
    # Convert here so plain Python sequences can be passed on to mse_error,
    # which subtracts its two arguments element-wise.
    y_true = np.asarray(y_true)
    y_predict = np.asarray(y_predict)
    return 1 - mse_error(y_true, y_predict) / np.var(y_true)

线性回归正规方程解的代码编写

class LinearRegression:
    """Multiple linear regression fitted in closed form via the normal equation."""

    def __init__(self):
        """Create an unfitted model; every learned parameter starts as None."""
        # coef_ holds the feature coefficients and intercept_ the intercept;
        # _theta is the private full parameter vector both are sliced from.
        self._theta = None
        self.intercept_ = None
        self.coef_ = None

    @staticmethod
    def _with_ones_column(X):
        """Return X with a column of ones prepended (the intercept term)."""
        ones_column = np.ones((X.shape[0], 1))
        return np.concatenate((ones_column, X), axis=1)

    def fit_normal(self, X_train, y_train):
        """Fit the model on X_train / y_train and return self.

        Solves theta = pinv(X_b^T X_b) X_b^T y, where X_b is X_train with a
        leading column of ones.
        """
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"

        design = self._with_ones_column(X_train)
        gram_pinv = np.linalg.pinv(design.T.dot(design))
        theta = gram_pinv.dot(design.T).dot(y_train)

        self._theta = theta
        # The first entry pairs with the ones column; the rest are per-feature weights.
        self.intercept_, self.coef_ = theta[0], theta[1:]
        return self

    def predict(self, X_predict):
        """Return the predictions for every row of X_predict."""
        assert self.intercept_ is not None and self.coef_ is not None, \
            'must fit before predict'
        assert X_predict.shape[1] == len(self.coef_), \
            'the feature number of X_predict must be equal to X_train'

        design = self._with_ones_column(X_predict)
        return design.dot(self._theta)

    def score(self, X_test, y_test):
        """Return r2_score of the model's predictions on X_test against y_test."""
        return r2_score(y_test, self.predict(X_test))

    def __repr__(self):
        return "LinearRegression()"