import numpy as np
from sklearn import datasets
# NOTE(review): datasets.load_boston was deprecated in scikit-learn 1.0 and
# removed in 1.2; this line requires scikit-learn < 1.2 to run.
boston = datasets.load_boston()
# Feature matrix of the Boston housing dataset.
X = boston.data
# Regression targets (house prices).
y = boston.target
def mse_score(y_predict, y_test):
    """Mean squared error between predictions and ground truth.

    Args:
        y_predict (ndarray): predicted values.
        y_test (ndarray): true values.

    Returns:
        float: the MSE loss value, mean((y_predict - y_test)^2).
    """
    mse = np.mean((y_predict - y_test) ** 2)
    return mse
class LinearRegression:
    """Linear regression solved in closed form via the normal equation."""

    def __init__(self):
        """Initialize an unfitted model."""
        # Parameter vector [intercept, coef_1, ..., coef_n]; set by fit_normal.
        self.theta = None

    def fit_normal(self, train_data, train_label):
        """Fit the model with the normal equation: theta = (X^T X)^-1 X^T y.

        A bias column of ones is prepended to the data so theta[0] is the
        intercept.

        Args:
            train_data (ndarray): training samples, shape (n_samples, n_features).
            train_label (ndarray): training targets, shape (n_samples,).

        Returns:
            ndarray: the fitted parameter vector theta.
        """
        ones = np.ones((train_data.shape[0], 1))
        X = np.hstack((ones, train_data))
        XT = X.T
        # NOTE: np.linalg.inv raises LinAlgError when X^T X is singular
        # (e.g. perfectly collinear features).
        self.theta = np.dot(np.dot(np.linalg.inv(np.dot(XT, X)), XT), train_label)
        return self.theta

    def predict(self, test_data):
        """Predict targets for test samples using the fitted theta.

        Args:
            test_data (ndarray): test samples, shape (n_samples, n_features).

        Returns:
            ndarray: predicted values, shape (n_samples,).
        """
        ones = np.ones((test_data.shape[0], 1))
        X = np.hstack((ones, test_data))
        y_predict = np.dot(X, self.theta)
        return y_predict
# Naive Bayes model implementation (朴素贝叶斯模型实现)
import numpy as np
class NaiveBayesClassifier(object):
    """Naive Bayes classifier for discrete features with Laplace smoothing."""

    def __init__(self):
        # Prior probability of each class label, e.g. {0: 0.333, 1: 0.667}
        # means class 0 appears with probability 0.333 and class 1 with 0.667.
        self.label_prob = {}
        # Conditional probability of each feature value per class and column:
        # self.condition_prob[label][column][value] = P(value | label).
        # For example, for features [[2,1,1],[1,2,2],[2,2,2],[2,1,2],[1,2,3]]
        # and labels [1,0,1,0,1] (without smoothing):
        # {0: {0: {1: 0.5, 2: 0.5}, 1: {1: 0.5, 2: 0.5}, 2: {1: 0, 2: 1, 3: 0}},
        #  1: {0: {1: 0.333, 2: 0.666}, 1: {1: 0.333, 2: 0.666},
        #      2: {1: 0.333, 2: 0.333, 3: 0.333}}}
        self.condition_prob = {}

    def fit(self, feature, label):
        """Train the model, filling self.label_prob and self.condition_prob.

        Args:
            feature (ndarray): training feature matrix, discrete values.
            label (ndarray): training labels.

        Returns:
            None
        """
        row_num = len(feature)
        col_num = len(feature[0])
        unique_label_count = len(set(label))

        # Count occurrences of each class.
        for c in label:
            self.label_prob[c] = self.label_prob.get(c, 0) + 1

        for key in self.label_prob.keys():
            # Laplace-smoothed class prior: (count + 1) / (n_samples + n_classes).
            self.label_prob[key] += 1
            self.label_prob[key] /= (unique_label_count + row_num)
            # Initialize every (column, value) count to 1 — this "+1" is the
            # Laplace smoothing term in the numerator.
            self.condition_prob[key] = {}
            for i in range(col_num):
                self.condition_prob[key][i] = {}
                for k in np.unique(feature[:, i], axis=0):
                    self.condition_prob[key][i][k] = 1

        # Accumulate per-class value counts. (Bug fix: the original guarded
        # this with `if feature[i][j] in self.condition_prob[label[i]]`, which
        # tested the feature VALUE against the COLUMN-INDEX keys, silently
        # skipping counts for values outside {0..col_num-1}. All values were
        # pre-initialized above, so the increment is unconditional.)
        for i in range(len(feature)):
            for j in range(len(feature[i])):
                self.condition_prob[label[i]][j][feature[i][j]] += 1

        # Normalize counts into probabilities. (Bug fix: the original added
        # len(keys) to the denominator on top of counts already seeded at 1,
        # double-counting the smoothing term; the correct Laplace denominator
        # is just the sum of the smoothed counts = class_count + n_values.)
        for label_key in self.condition_prob.keys():
            for k in self.condition_prob[label_key].keys():
                total = sum(self.condition_prob[label_key][k].values())
                for kk in self.condition_prob[label_key][k].keys():
                    self.condition_prob[label_key][k][kk] /= total

    def predict(self, feature):
        """Predict a class label for every row of the test feature matrix.

        Args:
            feature (ndarray): test feature matrix.

        Returns:
            ndarray: predicted labels, one per row.
        """
        result = []
        # Score every class for each test sample and keep the argmax.
        for f in feature:
            prob = np.zeros(len(self.label_prob.keys()))
            ii = 0
            for label, label_prob in self.label_prob.items():
                # Posterior (up to a constant): prior * product of likelihoods.
                prob[ii] = label_prob
                for j in range(len(feature[0])):
                    prob[ii] *= self.condition_prob[label][j][f[j]]
                ii += 1
            result.append(list(self.label_prob.keys())[np.argmax(prob)])
        return np.array(result)
# KMeans implementation (kmeans实现)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Toy 2-D dataset: two groups of points around x=1 and x=4.
X = np.array([[1, 2], [1, 4], [1, 0],
              [4, 2], [4, 4], [4, 0]])
k = 2  # number of clusters
kmeans = KMeans(n_clusters=k)
# fit_predict returns the cluster index assigned to each sample.
y_pred = kmeans.fit_predict(X)

# Visualize the samples colored by cluster assignment.
plt.scatter(X[:, 0], X[:, 1], c=y_pred)
plt.title('KMeans Clustering')
plt.xlabel('X1')
plt.ylabel('X2')
plt.show()
# PCA implementation (PCA实现)
import numpy as np
from sklearn.decomposition import PCA

X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
# Project the 3-D samples onto their 2 leading principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
print(X_pca)
# Expected result (the second component is numerically ~0 because the
# rows are collinear, so all variance lies along one direction):
# [[-1.34690654e+00  1.11022302e-16]
#  [ 0.00000000e+00  0.00000000e+00]
#  [ 1.34690654e+00 -1.11022302e-16]]
# Decision tree implementation (决策树实现)
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# NOTE: X and y are rebound here — this snippet is independent of the
# Boston-housing variables defined earlier in the file.
iris = datasets.load_iris()
X = iris.data
y = iris.target
# Hold out 30% of the samples for testing (random split each run).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("训练集得分:", clf.score(X_train, y_train))
print("测试集得分:", clf.score(X_test, y_test))
# Typical output (test score varies with the random split):
# 训练集得分: 1.0
# 测试集得分: 0.9555555555555556