機器學習實戰:基于Scikit-Learn和TensorFlow的筆記
參考:作者的Jupyter Notebook
Chapter 3 – Classification(分類,使用MNIST資料集)
-
獲取MNIST資料集的代碼:
import numpy as np
from sklearn.datasets import fetch_openml


def sort_by_target(mnist, n_train=60000):
    """Sort an MNIST bunch in place by label, separately per split.

    Args:
        mnist: Object exposing NumPy arrays ``data`` and ``target``.
        n_train: Number of leading samples forming the training split;
            the remaining samples form the test split.

    Samples with equal labels keep their original relative order.
    """
    # A stable argsort gives the same order as sorting (target, index) pairs
    # and also copes with an empty split.
    reorder_train = np.argsort(mnist.target[:n_train], kind="stable")
    reorder_test = np.argsort(mnist.target[n_train:], kind="stable") + n_train
    # Fancy indexing copies, so the right-hand sides are unaffected by the
    # in-place writes on the left.
    mnist.data[:n_train] = mnist.data[reorder_train]
    mnist.target[:n_train] = mnist.target[reorder_train]
    mnist.data[n_train:] = mnist.data[reorder_test]
    mnist.target[n_train:] = mnist.target[reorder_test]


# as_frame=False requests plain NumPy arrays; newer scikit-learn releases
# return pandas objects by default, which break the positional slice
# assignments above. (Requires a scikit-learn version that accepts as_frame.)
mnist = fetch_openml('mnist_784', version=1, cache=True, as_frame=False)
mnist.target = mnist.target.astype(np.int8)  # fetch_openml() returns targets as strings
sort_by_target(mnist)  # fetch_openml() returns an unsorted dataset
查看這些陣列
# matplotlib is used from here on but is not imported anywhere in these notes.
import matplotlib as mpl
import matplotlib.pyplot as plt

# Split the bunch into the feature matrix X and the label vector y.
#print(mnist["data"], mnist["target"])
#print(mnist.data.shape)
X, y = mnist["data"], mnist["target"]
#print(X.shape)
#print(y.shape)

# Pick one sample and view its flat pixel vector as a 28x28 image.
some_digit = X[36000]
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap=mpl.cm.binary, interpolation="nearest")
plt.axis("off")
#plt.show()
#print(y[36000])
MNIST資料集中的部分數字影像
def plot_digits(instances, images_per_row=10, **options):
    """Draw a grid of 28x28 digit images on the current matplotlib axes.

    Args:
        instances: Sequence of flat pixel vectors, each reshapeable to 28x28.
        images_per_row: Maximum number of images per grid row.
        **options: Extra keyword arguments forwarded to ``plt.imshow``.

    An incomplete last row is padded with blank (zero) tiles. An empty
    ``instances`` draws nothing instead of raising ZeroDivisionError.
    """
    size = 28
    n_images = len(instances)
    if n_images == 0:
        # Nothing to tile; the row computation below would divide by zero.
        plt.axis("off")
        return
    images_per_row = min(n_images, images_per_row)
    n_rows = (n_images - 1) // images_per_row + 1
    n_empty = n_rows * images_per_row - n_images

    tiles = np.asarray(instances).reshape(n_images, size, size)
    if n_empty:
        tiles = np.concatenate([tiles, np.zeros((n_empty, size, size))], axis=0)
    # (row, col, y, x) -> (row, y, col, x) -> one 2-D image of the whole grid.
    image = (
        tiles.reshape(n_rows, images_per_row, size, size)
        .transpose(0, 2, 1, 3)
        .reshape(n_rows * size, images_per_row * size)
    )
    plt.imshow(image, cmap=mpl.cm.binary, **options)
    plt.axis("off")


# Sample images at regular strides through the label-sorted data set.
plt.figure(figsize=(9, 9))
example_images = np.r_[X[:12000:600], X[13000:30600:600], X[30600:60000:590]]
plot_digits(example_images, images_per_row=10)
#save_fig("more_digits_plot")
#plt.show()
給資料集洗牌
# The first 60000 samples form the training set, the rest the test set.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

# Apply one random permutation to features and labels so they stay aligned.
shuffle_index = np.random.permutation(60000)
X_train = X_train[shuffle_index]
y_train = y_train[shuffle_index]
訓練一個二元分類器,為此分類任務創建目標向量:
# Boolean targets for the binary "is it a 5?" task:
# True for all 5s, False for all other digits.
y_train_5 = y_train == 5
y_test_5 = y_test == 5
創建一個SGDClassifier(隨機梯度下降分類器)并在整個訓練集上進行訓練:
from sklearn.linear_model import SGDClassifier

# tol=None disables the convergence check so exactly max_iter epochs run.
# The former tol=-np.infty fails twice over: np.infty was removed in
# NumPy 2.0, and recent scikit-learn releases reject a negative tol.
sgd_clf = SGDClassifier(max_iter=5, tol=None, random_state=42)
sgd_clf.fit(X_train, y_train_5)
#print(sgd_clf.fit(X_train, y_train_5))

# The classifier can now be used to detect images of the digit 5:
sgd_clf.predict([some_digit])
#print(sgd_clf.predict([some_digit]))
交叉驗證
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold, cross_val_score

# 3-fold cross-validated accuracy of the binary classifier (computed once;
# the previous duplicate call whose result was discarded is dropped).
print(cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy"))

# Hand-rolled version of roughly what cross_val_score() does above; it prints
# one accuracy per fold.
# random_state is omitted: it has no effect unless shuffle=True, and recent
# scikit-learn releases raise an error for that combination.
skfolds = StratifiedKFold(n_splits=3)
for train_index, test_index in skfolds.split(X_train, y_train_5):
    # Train a fresh, unfitted copy on each fold.
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train_5[test_index]

    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    n_correct = (y_pred == y_test_fold).sum()
    print(n_correct / len(y_pred))
一個蠢笨的分類器(不是我說的),它將每張圖都分類成“非5”:
from sklearn.base import BaseEstimator


class Never5Classifier(BaseEstimator):
    """Baseline classifier that labels every sample as "not 5"."""

    def fit(self, X, y=None):
        """Learn nothing; return self as scikit-learn estimators are expected to."""
        return self

    def predict(self, X):
        """Return an all-False boolean column with one row per sample in X."""
        return np.zeros((len(X), 1), dtype=bool)


# Accuracy of the always-"not 5" baseline.
never_5_clf = Never5Classifier()
print(cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring="accuracy"))
混淆矩陣:評估分類器性能的更好方法是混淆矩陣。
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

# cross_val_predict() also runs K-fold cross-validation, but instead of
# evaluation scores it returns the prediction made for each instance by a
# model that did not see that instance during training.
y_train_pred = cross_val_predict(
    sgd_clf,
    X_train,
    y_train_5,
    cv=3,
)
#confusion_matrix(y_train_5, y_train_pred)
#print(confusion_matrix(y_train_5, y_train_pred))

# Using the true labels as predictions models a perfect classifier.
y_train_perfect_predictions = y_train_5
#print(confusion_matrix(y_train_5, y_train_perfect_predictions))
精度和召回率
from sklearn.metrics import f1_score, precision_score, recall_score

# precision = TP / (TP + FP): TP = true positives, FP = false positives.
# recall    = TP / (TP + FN): FN = false negatives.
# The figures in the trailing comments are from the author's run.
print(precision_score(y_train_5, y_train_pred))  # precision: 4344 / (4344 + 1307)
print(recall_score(y_train_5, y_train_pred))  # recall: 4344 / (4344 + 1077)

# F1 score: F1 = 2 / (1/precision + 1/recall) = TP / (TP + (FN + FP) / 2)
print(f1_score(y_train_5, y_train_pred))
精度/召回率權衡:閾值
# Raw decision score of the sample digit; thresholding it gives the label.
y_scores = sgd_clf.decision_function([some_digit])
print(y_scores)

threshold = 0
y_some_digit_pred = y_scores > threshold
print(y_some_digit_pred)

# Raise the threshold and classify the same score again.
threshold = 200000
y_some_digit_pred_a = y_scores > threshold
print(y_some_digit_pred_a)
決定使用什么閾值
from sklearn.metrics import precision_recall_curve, precision_score, recall_score

# Decision scores for every training instance, obtained by cross-validation.
y_scores = cross_val_predict(
    sgd_clf, X_train, y_train_5, cv=3, method="decision_function"
)

# Precision and recall for all candidate thresholds.
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)


def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall against the decision threshold.

    The last element of ``precisions`` and ``recalls`` is dropped before
    plotting against ``thresholds``.
    """
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for values, line_style, name in curves:
        plt.plot(thresholds, values[:-1], line_style, label=name)
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0, 1])


plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.xlim([-700000, 700000])
plt.show()

#print((y_train_pred == (y_scores > 0)).all())

# Predictions at a stricter threshold. NOTE(review): the "_90" suffix
# presumably refers to a ~90% precision target; the threshold that reaches
# it depends on the trained model.
y_train_pred_90 = y_scores > 70000
print(precision_score(y_train_5, y_train_pred_90))  # precision
print(recall_score(y_train_5, y_train_pred_90))  # recall
精度和召回率的函式圖PR
def plot_precision_vs_recall(precisions, recalls):
    """Plot precision (y axis) against recall (x axis) on the unit square."""
    label_style = {"fontsize": 16}
    plt.plot(recalls, precisions, "b-", linewidth=2)
    plt.xlabel("Recall", **label_style)
    plt.ylabel("Precision", **label_style)
    plt.axis([0, 1, 0, 1])


plt.figure(figsize=(8, 6))
plot_precision_vs_recall(precisions, recalls)
plt.show()
ROC曲線(接收者操作特徵曲線):真正類率(TPR)相對於假正類率(FPR)
from sklearn.metrics import roc_auc_score, roc_curve

# False/true positive rates over all thresholds of the cross-validated scores.
fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)


def plot_roc_curve(fpr, tpr, label=None):
    """Plot a ROC curve together with the diagonal reference line.

    Args:
        fpr: False positive rates (x axis).
        tpr: True positive rates (y axis).
        label: Optional legend label for the curve.
    """
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)


# A stray, never-closed ''' used to sit here and turned everything after it
# into an unterminated string literal; it has been removed.
plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
plt.show()

# Area under the ROC curve.
print(roc_auc_score(y_train_5, y_scores))
訓練一個RandomForestClassifier分類器,并比較它和SGDClassifier分類器的ROC曲線和ROC AUC分數。
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

forest_clf = RandomForestClassifier(n_estimators=10, random_state=42)

# Cross-validated class probabilities stand in for decision scores.
y_probas_forest = cross_val_predict(
    forest_clf, X_train, y_train_5, cv=3, method="predict_proba"
)
y_scores_forest = y_probas_forest[:, 1]  # score = proba of positive class
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)

# Overlay both ROC curves for comparison.
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, "b:", linewidth=2, label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plt.legend(loc="lower right", fontsize=16)
plt.show()

print(roc_auc_score(y_train_5, y_scores_forest))
多類別分類器,用SGDClassifier試試
from sklearn.multiclass import OneVsOneClassifier

# Try SGDClassifier on the full multiclass problem (labels 0-9):
sgd_clf.fit(X_train, y_train)
sgd_clf.predict([some_digit])
#print(sgd_clf.predict([some_digit]))
some_digit_scores = sgd_clf.decision_function([some_digit])
#print(some_digit_scores)
#print(np.argmax(some_digit_scores))
#print(sgd_clf.classes_)
#print(sgd_clf.classes_[5])

# Build a multiclass classifier from SGDClassifier using the OvO strategy.
# tol=None disables the convergence check so exactly max_iter epochs run;
# the former tol=-np.infty relied on an alias removed in NumPy 2.0 and on a
# negative tol that recent scikit-learn releases reject.
ovo_clf = OneVsOneClassifier(SGDClassifier(max_iter=5, tol=None, random_state=42))
ovo_clf.fit(X_train, y_train)
ovo_clf.predict([some_digit])
len(ovo_clf.estimators_)
#print(ovo_clf.predict([some_digit]))
#print(len(ovo_clf.estimators_))
訓練RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

# Fit the random forest on the multiclass labels.
forest_clf.fit(X_train, y_train)
#print(forest_clf.predict([some_digit]))
#print(forest_clf.predict_proba([some_digit]))  # list of class probabilities
#print(cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy"))  # accuracy

# Apply simple scaling to the inputs.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(
    X_train.astype(np.float64)
)
#print(cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy"))
使用Matplotlib的matshow()函式來查看混淆矩陣
# Cross-validated multiclass predictions on the scaled inputs, and the
# resulting confusion matrix.
y_train_pred = cross_val_predict(
    sgd_clf,
    X_train_scaled,
    y_train,
    cv=3,
)
conf_mx = confusion_matrix(y_train, y_train_pred)
#print(conf_mx)

# matshow() renders the confusion matrix as an image:
#plt.matshow(conf_mx, cmap=plt.cm.gray)
#save_fig("confusion_matrix_plot", tight_layout=False)
你需要將混淆矩陣中的每個值除以相應類別中的圖片數量,這樣你比較的就是錯誤率而不是錯誤的絕對值
# Divide each row by its total so that cells hold rates rather than counts.
row_sums = conf_mx.sum(
    axis=1,
    keepdims=True,
)
norm_conf_mx = conf_mx / row_sums

# Zero the diagonal so only the errors remain, then plot the result.
np.fill_diagonal(norm_conf_mx, 0)
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
#save_fig("confusion_matrix_errors_plot", tight_layout=False)
看看數字3和數字5的例子:
# Compare two classes: X_<true><predicted>, where a is cl_a and b is cl_b.
cl_a, cl_b = 3, 5
X_aa = X_train[(y_train == cl_a) & (y_train_pred == cl_a)]
X_ab = X_train[(y_train == cl_a) & (y_train_pred == cl_b)]
X_ba = X_train[(y_train == cl_b) & (y_train_pred == cl_a)]
X_bb = X_train[(y_train == cl_b) & (y_train_pred == cl_b)]

# 2x2 grid of up to 25 examples each: rows are the true class, columns the
# predicted class.
plt.figure(figsize=(8, 8))
panels = ((221, X_aa), (222, X_ab), (223, X_ba), (224, X_bb))
for position, digits in panels:
    plt.subplot(position)
    plot_digits(digits[:25], images_per_row=5)
#save_fig("error_analysis_digits_plot")
多標簽分類
from sklearn.neighbors import KNeighborsClassifier

# Build y_multilabel with two target labels per digit image: the first says
# whether the digit is large (7, 8 or 9), the second whether it is odd.
y_train_large = y_train >= 7
y_train_odd = y_train % 2 == 1
y_multilabel = np.c_[y_train_large, y_train_odd]

# KNeighborsClassifier supports multilabel classification (not every
# classifier does), so it is trained on the two-column target array.
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)
#print(knn_clf.fit(X_train, y_multilabel))

# The prediction has two labels; for a 5 the expected output is
# not large (False) and odd (True).
knn_clf.predict([some_digit])
#print(knn_clf.predict([some_digit]))
下面這段代碼計算所有標簽的平均F1分數:
from sklearn.metrics import f1_score

# Cross-validated multilabel predictions (n_jobs=-1 uses all CPU cores),
# then the F1 score averaged across labels with equal weight ("macro").
y_train_knn_pred = cross_val_predict(
    knn_clf,
    X_train,
    y_multilabel,
    cv=3,
    n_jobs=-1,
)
f1_score(y_multilabel, y_train_knn_pred, average="macro")
#print(f1_score(y_multilabel, y_train_knn_pred, average="macro"))
多輸出分類(多輸出-多類別分類)
# Build the training and test sets by adding random noise from NumPy's
# randint() to the pixel intensities; the goal is to recover the originals.
noise = np.random.randint(0, 100, (len(X_train), 784))
X_train_mod = X_train + noise

noise = np.random.randint(0, 100, (len(X_test), 784))
X_test_mod = X_test + noise

# The targets are the clean images themselves.
y_train_mod, y_test_mod = X_train, X_test

some_index = 5500
#plt.subplot(121); plot_digit(X_test_mod[some_index])
#plt.subplot(122); plot_digit(y_test_mod[some_index])
#save_fig("noisy_digit_example_plot")
清洗這張圖片:
# Train on noisy images with the clean images as targets, then predict the
# clean version of one noisy test image.
knn_clf.fit(X_train_mod, y_train_mod)
clean_digit = knn_clf.predict([X_test_mod[some_index]])

# plot_digit() and save_fig() are not defined anywhere in these notes, so
# the image is drawn and saved with matplotlib directly.
plt.imshow(clean_digit.reshape(28, 28), cmap="binary", interpolation="nearest")
plt.axis("off")
plt.savefig("cleaned_digit_example_plot.png", format="png", dpi=300)
plt.show()
轉載請註明出處,本文鏈接:https://www.uj5u.com/houduan/164053.html
標籤:Python
下一篇:Python- 裝飾器
