例外檢測
參考資料:https://github.com/fengdu78/Coursera-ML-AndrewNg-Notes
先看資料:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy.io import loadmat
import math
data = loadmat('data/ex8data1.mat') # Xval,yval,X
X = data['X']
Xval = data['Xval']
yval = data['yval']
plt.scatter(X[:,0], X[:,1])
plt.show()

其中有幾個點明顯偏離中心,應該為例外資料點,下面我們讓機器學會檢測這些例外資料點:
首先根據兩個特征建立兩個高斯函式:
def estimate_gaussian(X):
mu = X.mean(axis=0) # X.mean()所有數取均值,X.mean(axis=0)取所有列的均值
sigma = X.var(axis=0) # 方差
return mu, sigma
mu, sigma = estimate_gaussian(X) # X每一個特征的均值(mu)和方差(sigma)
p = np.zeros((X.shape[0], X.shape[1])) # X.shape = (307,2)
p0 = stats.norm(mu[0], sigma[0])
p1 = stats.norm(mu[1], sigma[1])
p[:,0] = p0.pdf(X[:,0])
p[:,1] = p1.pdf(X[:,1])
stats.norm根據傳入的均值和方差自動創建高斯分布函式(正態分布),p0.pdf(x)將x帶入建立好的高斯分布函式中得到輸出,
嚴謹起見筆者手動創建了一個高斯分布的函式:
def gussian(x,mu=mu[0], sigma=sigma[0]):
s = 1/(math.sqrt(2*math.pi)*sigma)
ss = np.exp(-1*(np.power((x-mu),2))/(2*np.power(sigma,2)))
return s*ss
結果不能說一摸一樣,也可以說是完全相同,(注:因為有兩個特征,所以有兩個高斯函式,這里只畫出一個舉例)


根據以下函式通過驗證集與F1(查準率與查全率之間的權衡,可參考吳恩達學習筆記v5.51,166~168頁)來尋找最佳閾值
def select_threshold(pval, yval):
best_epsilon = 0
best_f1 = 0
step = (pval.max() - pval.min()) / 1000
for epsilon in np.arange(pval.min(), pval.max(), step):
preds = pval < epsilon
# np.logical_and邏輯與
tp = np.sum(np.logical_and(preds == 1, yval == 1)).astype(float)
fp = np.sum(np.logical_and(preds == 1, yval == 0)).astype(float)
fn = np.sum(np.logical_and(preds == 0, yval == 1)).astype(float)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = (2 * precision * recall) / (precision + recall)
if f1 > best_f1:
best_f1 = f1
best_epsilon = epsilon
return best_epsilon, best_f1
然后再區分例外資料就很容易了,將X帶入建立好的兩個高斯函式得到例外概率,找到小于閾值的資料,標記,完成
p = np.zeros((X.shape[0], X.shape[1])) # X.shape = (307,2)
p0 = stats.norm(mu[0], sigma[0])
p1 = stats.norm(mu[1], sigma[1])
p[:,0] = p0.pdf(X[:,0])
p[:,1] = p1.pdf(X[:,1])
pval = np.zeros((Xval.shape[0], Xval.shape[1])) # Xval.shape = (307,2)
pval[:,0] = stats.norm(mu[0], sigma[0]).pdf(Xval[:,0]) # pval的第0列為驗證集的第一個特征帶入根據X第一個特征建立的高斯函式值
pval[:,1] = stats.norm(mu[1], sigma[1]).pdf(Xval[:,1])
epsilon, f1 = select_threshold(pval, yval) # 用pval得到最佳閾值,再用閾值epsilon尋找p中的例外值
outliers = np.where(p < epsilon)
plt.scatter(X[:,0], X[:,1])
plt.scatter(X[outliers[0],0], X[outliers[0],1], s=50, color='r', marker='o')
plt.show()

預測新資料時可以將兩個特征的均值與方差及其閾值保存,在新代碼中構建高斯分布函式,新特征帶入小于閾值例外,
協同過濾
即推薦系統,太難了我giao,這一塊弄了好久,能想出這個演算法的人

代碼如下:
import numpy as np
import scipy.optimize as opt
from scipy.io import loadmat
def serialize(X, theta): # 展成一維再銜接
return np.concatenate((X.ravel(), theta.ravel()))
def deserialize(param, n_movie, n_user, n_features): # 再reshape成向量
"""into ndarray of X(1682, 10), theta(943, 10)"""
return param[:n_movie * n_features].reshape(n_movie, n_features), \
param[n_movie * n_features:].reshape(n_user, n_features)
def cost(param, Y, R, n_features):
"""compute cost for every r(i, j)=1
Args:
param: serialized X, theta
Y (movie, user), (1682, 943): (movie, user) rating
R (movie, user), (1682, 943): (movie, user) has rating
"""
# theta (user, feature), (943, 10): user preference
# X (movie, feature), (1682, 10): movie features
n_movie, n_user = Y.shape
X, theta = deserialize(param, n_movie, n_user, n_features)
inner = np.multiply(X @ theta.T - Y, R)
return np.power(inner, 2).sum() / 2
def gradient(param, Y, R, n_features):
# theta (user, feature), (943, 10): user preference
# X (movie, feature), (1682, 10): movie features
n_movies, n_user = Y.shape
X, theta = deserialize(param, n_movies, n_user, n_features)
inner = np.multiply(X @ theta.T - Y, R) # (1682, 943)
# X_grad (1682, 10)
X_grad = inner @ theta
# theta_grad (943, 10)
theta_grad = inner.T @ X
# roll them together and return
return serialize(X_grad, theta_grad)
def regularized_cost(param, Y, R, n_features, l=1):
reg_term = np.power(param, 2).sum() * (l / 2)
return cost(param, Y, R, n_features) + reg_term
def regularized_gradient(param, Y, R, n_features, l=1):
grad = gradient(param, Y, R, n_features)
reg_term = l * param
return grad + reg_term
data = loadmat('data/ex8_movies.mat')
# params_data = loadmat('data/ex8_movieParams.mat')
Y = data['Y'] # 評的幾顆星,沒評為0 1682款電影,943位用戶
R = data['R'] # 評分為1,沒評為0 shape=(1682,943)
# X = params_data['X'] # 電影引數.shape=(1682,10)
# theta = params_data['Theta'] # 用戶引數.shape(943,10)
movie_list = []
with open('data/movie_ids.txt') as f:
for line in f:
tokens = line.strip().split(' ') #去除兩端空白再分
movie_list.append(' '.join(tokens[1:])) # 去掉數字,僅將電影名稱存入串列
movie_list = np.array(movie_list)
# 假設我給1682部電影中的某些評了分
ratings = np.zeros(1682)
ratings[0] = 4
ratings[6] = 3
ratings[11] = 5
ratings[53] = 4
ratings[63] = 5
ratings[65] = 3
ratings[68] = 5
ratings[97] = 2
ratings[182] = 4
ratings[225] = 5
ratings[354] = 5
# 插入后ratting將在0的位置(按列插入)Y(評的幾顆星),R(評沒評分)
# 自己為用戶0
Y = np.insert(Y, 0, ratings, axis=1) # now I become user 0
R = np.insert(R, 0, ratings != 0, axis=1)
movies, users = Y.shape # (1682,944)
n_features = 10
X = np.random.random(size=(movies, n_features))
theta = np.random.random(size=(users, n_features))
lam = 1
Ymean = np.zeros((movies, 1))
Ynorm = np.zeros((movies, users))
param = serialize(X, theta)
# 均值歸一化
for i in range(movies): #R.shape = (1682,944),movies = 1682
idx = np.where(R[i,:] == 1)[0] # 第i部電影,用戶0,找到用戶0評過分的電影
Ymean[i] = Y[i,idx].mean() # 第i行,評過分的求平均
Ynorm[i,idx] = Y[i,idx] - Ymean[i] # 評過分的減去他們的均值
res = opt.minimize(fun=regularized_cost,
x0=param,
args=(Ynorm, R, n_features,lam),
method='TNC',
jac=regularized_gradient)
X = np.mat(np.reshape(res.x[:movies * n_features], (movies, n_features)))
theta = np.mat(np.reshape(res.x[movies * n_features:], (users, n_features)))
predictions = X * theta.T
my_preds = predictions[:, 0] + Ymean
sorted_preds = np.sort(my_preds, axis=0)[::-1]
idx = np.argsort(my_preds, axis=0)[::-1]
print("Top 10 movie predictions:")
for i in range(10):
j = int(idx[i])
print('Predicted rating of {} for movie {}.'.format(str(float(my_preds[j])), movie_list[j]))
每次運行結果不可能完全一致,我們的目的是是根據所擁有的評分建模,找到你可能喜歡的,再加上隨機初始化的問題,結果不可能每次都一樣,但總體來說是可行的,舉個栗子,Return of the Jedi (1983),Star Wars (1977)等電影經常排在前10,甚至前5.
當然也可以讓特征值n_feature=50,可能會更準確一些,但運算量直接上升好幾個數量級,粗略掐了下表,結果非常amazing啊,給pycharm 4G的運行記憶體算了13分鐘多,
轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/292852.html
標籤:AI
