1. Loading the datasets bundled with sklearn (datasets)
```python
from sklearn import datasets

iris = datasets.load_iris()      # iris flower dataset
digits = datasets.load_digits()  # handwritten digits as 8x8-pixel arrays
```

Inspect the data:

```python
print iris.data[:4]      # the first few feature rows
print iris.data.shape    # dimensions of the feature matrix
print iris.target_names  # the text names behind each label
print iris.target[:4]    # numeric labels (setosa: 0, ...)
```

```
[[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]
 [ 4.6  3.1  1.5  0.2]]
(150L, 4L)
['setosa' 'versicolor' 'virginica']
[0 0 0 0]
```

```python
print digits.data[0]
print digits.data.shape
print digits.target[0]
print digits.data[0].reshape((8,8))  # reshape the flat 64-vector into an 8x8 pixel array
```

```
[  0.   0.   5.  13.   9.   1.   0.   0.   0.   0.  13.  15.  10.  15.
   5.   0.   0.   3.  15.   2.   0.  11.   8.   0.   0.   4.  12.   0.
   0.   8.   8.   0.   0.   5.   8.   0.   0.   9.   8.   0.   0.   4.
  11.   0.   1.  12.   7.   0.   0.   2.  14.   5.  10.  12.   0.   0.
   0.   0.   6.  13.  10.   0.   0.   0.]
(1797L, 64L)
0
[[  0.   0.   5.  13.   9.   1.   0.   0.]
 [  0.   0.  13.  15.  10.  15.   5.   0.]
 [  0.   3.  15.   2.   0.  11.   8.   0.]
 [  0.   4.  12.   0.   0.   8.   8.   0.]
 [  0.   5.   8.   0.   0.   9.   8.   0.]
 [  0.   4.  11.   0.   1.  12.   7.   0.]
 [  0.   2.  14.   5.  10.  12.   0.   0.]
 [  0.   0.   6.  13.  10.   0.   0.   0.]]
```
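Each bundled dataset object also carries a plain-text description of its features, labels, and provenance in the `DESCR` attribute. A minimal, optional way to inspect it:

```python
# Print the opening of the dataset's built-in description
print(iris.DESCR[:500])
```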
2. Splitting the data into training and test sets

```python
from sklearn.model_selection import train_test_split

X = digits.data    # feature matrix
y = digits.target  # label vector

# Randomly split the data into a training set and a test set:
# test_size sets the fraction held out for testing;
# random_state is a seed that makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3., random_state=8)
print X_train.shape
print X_test.shape
```

```
(1198L, 64L)
(599L, 64L)
```
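For classification tasks it is often worth keeping the class proportions the same in both halves of the split; `train_test_split` supports this through its `stratify` parameter. A minimal sketch:

```python
# Stratified split: each class appears in the training and test sets
# in (roughly) the same proportion as in the full label vector y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/3., random_state=8, stratify=y)
```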
3. Normalizing feature values

When the numeric ranges of the features differ widely, normalize them so that no single attribute dominates simply because of its scale. (Some models and datasets do not need this step.)
```python
from sklearn import preprocessing
import numpy as np

x1 = np.random.randint(0, 10, 1000).reshape((1000, 1))
x2 = np.random.randint(0, 100, 1000).reshape((1000, 1))
x3 = np.random.randint(0, 10000, 1000).reshape((1000, 1))
X = np.concatenate([x1, x2, x3], axis=1)
print X[:4]
```

```
[[   0   78 3423]
 [   7   35  963]
 [   7   63 9945]
 [   3   60 6516]]
```

```python
# Standardize each feature to zero mean and unit variance
print preprocessing.scale(X)[:4]
```

```
[[-1.59232736  0.96948157 -0.56718281]
 [ 0.83236081 -0.55950092 -1.40672255]
 [ 0.83236081  0.43611559  1.65862133]
 [-0.55317529  0.32944239  0.48838484]]
```

Verifying why normalization matters:

```python
# Generate classification data for the experiment
from sklearn import datasets
import matplotlib.pyplot as plt
%matplotlib inline

# make_classification parameters:
# n_samples: number of samples; n_features: number of features
X, y = datasets.make_classification(n_samples=300, n_features=2, n_redundant=0,
                                    n_informative=2, random_state=25,
                                    n_clusters_per_class=1, scale=100)
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.show()
```
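`preprocessing.scale` standardizes the whole array at once. When a train/test split is involved, a common pattern is to fit the scaling statistics on the training portion only with `StandardScaler` and reuse them on the test portion, so no test-set information leaks into the preprocessing. A minimal sketch using the `X`, `y` just generated (`X_tr`/`X_te` are illustrative names):

```python
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=1/3., random_state=7)

scaler = StandardScaler()
# Learn the mean and std from the training data only...
X_tr_scaled = scaler.fit_transform(X_tr)
# ...then apply the same transformation to the test data.
X_te_scaled = scaler.transform(X_te)
```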
Using an SVM model (without feature normalization):

```python
from sklearn import svm
# X = preprocessing.scale(X)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3., random_state=7)
svm_classifier = svm.SVC()
# Train the model
svm_classifier.fit(X_train, y_train)
# Score the model on the test set
svm_classifier.score(X_test, y_test)
```

```
0.52000000000000002
```

Using an SVM model (with feature normalization):

```python
from sklearn import svm

X = preprocessing.scale(X)
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3., random_state=7)
svm_classifier = svm.SVC()
svm_classifier.fit(X_train, y_train)
svm_classifier.score(X_test, y_test)
```

```
0.97999999999999998
```
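The scale-then-fit sequence can also be bundled into a single estimator with sklearn's pipeline utilities, so the scaling learned from the training data is applied automatically at prediction time. A minimal sketch of the pattern:

```python
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# The pipeline standardizes the features, then fits the SVM;
# predict/score apply the same training-set scaling first.
model = make_pipeline(StandardScaler(), SVC())
model.fit(X_train, y_train)
print(model.score(X_test, y_test))
```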
4. Training, predicting with, and saving a model

Using a linear regression model as the example:

```python
iris = datasets.load_iris()  # iris flower dataset
X = iris.data
y = iris.target
print X[:3]
print y[:3]
```

```
[[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]]
[0 0 0]
```

```python
# Choose a linear regression model
from sklearn.linear_model import LinearRegression

# Create a model (with default parameters)
iris_model = LinearRegression()
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3., random_state=7)
# Train the model
iris_model.fit(X_train, y_train)
# Show the model's parameter settings
iris_model.get_params()
```

```
{'copy_X': True, 'fit_intercept': True, 'n_jobs': 1, 'normalize': False}
```

```python
# Score the model on the training set
iris_model.score(X_train, y_train)
```

```
0.94505820275667418
```

```python
# Score the model on the test set
iris_model.score(X_test, y_test)
```

```
0.89618390663189063
```

```python
# Predict with the model
y_pred = iris_model.predict(X_test)
print 'predicted labels:', y_pred[:3]
print 'true labels:', y_test[:3]
```

```
predicted labels: [ 1.66080893  1.39414184 -0.02450645]
true labels: [2 1 0]
```

Note that regression predicts continuous values, which is why the predicted labels are not integers like the true labels.

```python
# Save the model with pickle
import cPickle as pickle

with open('LR_model.pkl', 'w') as f:
    pickle.dump(iris_model, f)

# Reload the model and predict with it
with open('LR_model.pkl', 'r') as f:
    model = pickle.load(f)

model.predict(X_test)[:3]
```

```
array([ 1.66080893,  1.39414184, -0.02450645])
```
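For estimators that hold large numpy arrays, joblib serializes more efficiently than plain pickle. A minimal sketch, assuming the standalone `joblib` package is installed (older sklearn versions exposed it as `sklearn.externals.joblib`):

```python
import joblib

# Dump the fitted estimator to disk and load it back
joblib.dump(iris_model, 'LR_model.joblib')
model = joblib.load('LR_model.joblib')
print(model.predict(X_test)[:3])
```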
5. Cross-validation

After splitting off a test set, the test set is generally reserved for scoring the finally chosen model. While tuning a model's hyperparameters, however, we still need a way to evaluate candidate models, so a portion of the training set is held out as a validation set and the remaining samples are used to fit the model.

To remove the effect of any particular choice of validation samples, every part of the training set takes a turn as the validation set while the rest is used for training. If the training set is divided into N folds, this yields N scores for the model; their mean is the cross-validation score.

Cross-validation is typically used to tune a model's hyperparameters.
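The procedure just described can be written out by hand with `KFold`, which makes explicit what `cross_val_score` automates below. A minimal, self-contained sketch:

```python
from sklearn import datasets
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

iris = datasets.load_iris()
X, y = iris.data, iris.target

kf = KFold(n_splits=10, shuffle=True, random_state=0)
scores = []
for train_idx, val_idx in kf.split(X):
    # Train on 9 folds, evaluate on the held-out fold
    knn = KNeighborsClassifier(5)
    knn.fit(X[train_idx], y[train_idx])
    scores.append(knn.score(X[val_idx], y[val_idx]))
print(np.mean(scores))  # mean of the N fold scores = the CV score
```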
Cross-validation in sklearn
The `cross_val_score` function performs cross-validation and returns the evaluation score for each validation fold.
```python
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
%matplotlib inline

iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3., random_state=5)
```

Using a KNN model as the example:

```python
# Predict with a KNN model (K is the hyperparameter)
from sklearn.neighbors import KNeighborsClassifier

# Range of the hyperparameter K
k_range = range(1, 31)
# Collected cross-validation scores
cv_scores = []
for k in k_range:
    knn = KNeighborsClassifier(k)  # build the model
    # Split the training set into 10 folds
    # cv: the cross-validation splitting strategy
    scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')  # for classification
    # scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='neg_mean_squared_error')  # for regression
    cv_scores.append(scores.mean())
# print cv_scores

# Plot the mean cross-validation score for each value of k
plt.plot(k_range, cv_scores)
plt.xlabel('k')
plt.ylabel('Accuracy')
plt.show()
```
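The same hyperparameter sweep can be delegated to `GridSearchCV`, which runs the cross-validation loop internally and keeps the best setting. A minimal sketch:

```python
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Search n_neighbors over 1..30 with 10-fold CV on the training set
param_grid = {'n_neighbors': list(range(1, 31))}
search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=10, scoring='accuracy')
search.fit(X_train, y_train)
print(search.best_params_)  # the best K found
print(search.best_score_)   # its mean cross-validation score
```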
```python
# Choose the best hyperparameter K
best_knn = KNeighborsClassifier(15)
best_knn.fit(X_train, y_train)
print best_knn.score(X_test, y_test)
print best_knn.predict(X_test)
```

```
1.0
[1 2 2 0 2 1 0 1 0 1 1 2 2 2 0 0 2 2 0 0 1 2 0 1 1 2 1 1 1 2 0 1 1 0 1 0 0
 2 0 2 2 1 0 0 1 2 1 2 2 0]
```

6. Overfitting and underfitting
Overfitting: the model fits the training data too closely, matching the training set so well that it fails to generalize. That is, the trained model scores well on the training set but poorly on the validation/test set.
Underfitting: the model performs poorly on both the training set and the test set.
```python
# Load the data
from sklearn.model_selection import learning_curve
from sklearn.svm import SVC
import numpy as np

digits = datasets.load_digits()
X = digits.data
y = digits.target
```

The underfitting case:

```python
# Plot a learning curve (a learning curve scores the trained model
# at different amounts of training data)
# gamma = 0.001
train_sizes, train_scores, val_scores = learning_curve(
    SVC(gamma=0.001), X, y, cv=10, scoring='accuracy',
    train_sizes=[0.1, 0.25, 0.5, 0.75, 1])
# train_sizes: number of training samples used at each step
# train_scores: the model's score on the training set at each step
# val_scores: the model's cross-validation score at each step

# Average the scores over the folds for each training size
train_scores_mean = np.mean(train_scores, axis=1)  # row means
val_scores_mean = np.mean(val_scores, axis=1)

# Plot the learning curve
plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='training')
plt.plot(train_sizes, val_scores_mean, '*-', color='g', label='cross validation')
plt.xlabel('training sample size')
plt.ylabel('accuracy')
plt.legend(loc='best')
plt.show()
```
As the figure shows: with little training data, the model's cross-validation score is low, which can be seen as a form of underfitting; as the amount of training data grows, the mean cross-validation score rises as well.

The overfitting case:

```python
train_sizes, train_scores, val_scores = learning_curve(
    SVC(gamma=0.1), X, y, cv=10, scoring='accuracy',
    train_sizes=[0.1, 0.25, 0.5, 0.75, 1])

# Average the scores over the folds for each training size
train_scores_mean = np.mean(train_scores, axis=1)  # row means
val_scores_mean = np.mean(val_scores, axis=1)

# Plot the learning curve
plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='training')
plt.plot(train_sizes, val_scores_mean, '*-', color='g', label='cross validation')
plt.xlabel('training sample size')
plt.ylabel('accuracy')
plt.legend(loc='best')
plt.show()
```
As the figure shows: with little training data the cross-validation score is again low, but this time the mean cross-validation score does not improve as the amount of training data grows. This is overfitting.

Observing overfitting with a validation curve:

```python
# Plot a validation curve (a validation curve scores the model
# at different values of a hyperparameter)
from sklearn.model_selection import validation_curve

# Range of values for the SVC hyperparameter gamma
gamma_range = np.arange(1, 10) / 3000.
train_scores, val_scores = validation_curve(
    SVC(), X, y, param_name='gamma', param_range=gamma_range,
    cv=5, scoring='accuracy')
# train_scores: the model's score on the training set for each gamma
# val_scores: the model's cross-validation score for each gamma

# Average over the folds
train_scores_mean = np.mean(train_scores, axis=1)
val_scores_mean = np.mean(val_scores, axis=1)

# Plot the validation curve
plt.plot(gamma_range, train_scores_mean, 'o-', color='r', label='training')
plt.plot(gamma_range, val_scores_mean, '*-', color='g', label='cross validation')
plt.xlabel('gamma')
plt.ylabel('accuracy')
plt.legend(loc='best')
plt.show()
```
As the figure shows: once gamma > 0.0015, overfitting sets in; the model's score on the training set keeps rising while its score on the validation folds falls.
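The curve can also be read off numerically: the gamma with the highest mean cross-validation score is the natural choice. A minimal sketch, reusing `gamma_range` and `val_scores_mean` from the block above:

```python
import numpy as np

# Index of the gamma value with the best mean cross-validation score
best_idx = np.argmax(val_scores_mean)
print('best gamma: %s' % gamma_range[best_idx])
print('best CV accuracy: %s' % val_scores_mean[best_idx])
```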