




import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn import datasets
# Demo: compare k-means and DBSCAN on data that is not made of simple blobs.
# Build the data set: two concentric noisy rings plus one tight blob
# centred at (1.2, 1.2).
x1, y1 = datasets.make_circles(n_samples=2000, factor=0.5, noise=0.05)
x2, y2 = datasets.make_blobs(n_samples=1000, centers=[[1.2, 1.2]], cluster_std=0.1)
x = np.concatenate((x1, x2))

# Cluster with k-means (3 clusters) and plot the assignment on the
# current (implicitly created) figure.
model = KMeans(n_clusters=3)
y_pred = model.fit_predict(x)
plt.scatter(x[:, 0], x[:, 1], c=y_pred)

# Cluster with DBSCAN and plot the assignment on a second figure.
# fit_predict() already fits the model, so no separate fit() call is needed.
model = DBSCAN(eps=0.2, min_samples=50)
y_pred = model.fit_predict(x)
plt.figure()
plt.scatter(x[:, 0], x[:, 1], c=y_pred)
plt.show()

k-means 聚類方法有一個很大的缺陷:它對於簡單成團的資料樣本聚類效果較好,但對於複雜的樣本資料分佈(例如環形分佈的樣本資料)就無能為力了。
應用

先看一下上網開始時間的直方圖:資料主要集中在 7、8、20、21、22、23 點,共 6 類。

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn import metrics
def loadData(filePath, time_scale=12000):
    """Load online-session records and return them as an (n, 2) array.

    Each non-blank line of the file is a comma-separated record, e.g.::

        2c92...,U201215025,A417314EEA7B,10.12.49.26,2014-07-20 22:44:18.540000000,2014-07-20 23:10:16.540000000,1558,15,...

    Only three fields are read: index 2 (MAC address), index 4 (start
    timestamp, from which the hour is taken) and index 6 (online duration
    as an integer number of seconds).

    Args:
        filePath: Path of the text file to read.
        time_scale: Divisor applied to the online duration so that it is on
            a scale comparable to the start hour. Must be non-zero.

    Returns:
        A NumPy array with one row per distinct MAC address and two columns:
        ``[start_hour, online_seconds / time_scale]``. When a MAC address
        occurs more than once, the last record wins. An empty file yields an
        array of shape ``(0, 2)``.

    Raises:
        IndexError, ValueError: If a non-blank line does not have the
            expected field layout.
    """
    mac2id = {}        # MAC address -> row index in online_times
    online_times = []  # one (start_hour, scaled_duration) tuple per MAC

    # The context manager guarantees the file is closed even on a parse error.
    with open(filePath) as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            fields = line.split(',')
            mac = fields[2]
            online_time = int(fields[6])
            # "2014-07-20 22:44:18.540000000" -> "22:44:18.540000000" -> 22
            start_time = int(fields[4].split(' ')[1].split(':')[0])

            record = (start_time, online_time / time_scale)
            if mac not in mac2id:
                mac2id[mac] = len(online_times)
                online_times.append(record)
            else:
                # Same MAC seen again: the latest record replaces the earlier
                # one, stored in the same shape and scale as every other row.
                online_times[mac2id[mac]] = record

    # Debug output: dump the parsed records before conversion.
    print(online_times)

    # reshape((-1, 2)) forces two columns; -1 lets NumPy infer the row count.
    real_X = np.array(online_times).reshape((-1, 2))
    return real_X
# Load the (start hour, scaled online duration) samples.
# A raw string keeps the Windows backslashes literal, so sequences such as
# "\D" or "\p" are not treated as (invalid) escape sequences.
X = loadData(r"E:\Desktop\python_code\sklearn\課程資料\聚類\time2.txt")

# Cluster the samples with DBSCAN using Euclidean distance.
db = DBSCAN(eps=0.5, min_samples=20, metric='euclidean').fit(X)
labels = db.labels_
print('Labels:', labels)

# DBSCAN labels noise samples with -1; report their share of all samples.
raito = len(labels[labels[:] == -1]) / len(labels)
print('Noise raito: ', format(raito, '.2%'))

# The noise label is not a cluster, so exclude it from the count.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print(' Est imated number of clusters: %d' % n_clusters_)

# silhouette_score() needs at least two distinct labels; skip it otherwise
# instead of letting it raise.
if len(set(labels)) > 1:
    print(" Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))
else:
    print(" Silhouette Coefficient: undefined (fewer than 2 distinct labels)")

# List the start hours (column 0) of the samples in each cluster.
for i in range(n_clusters_):
    print('Cluster', i, ':')
    print(list(X[labels == i, 0].flatten()))

# Scatter plot of all samples, coloured by cluster label.
plt.scatter(X[:, 0], X[:, 1], c=labels)
plt.show()

# Optional: histogram of the start hours in 24 bins.
# plt.hist(X[:,0],24)
# plt.show()
課程中的代碼有問題:使用對應的引數並不能得出課程中的結果。
我不斷調參、嘗試,
經過思考後發現了問題所在:橫軸(開始時間,以小時計)的間距太小,如果調大 eps,必然會在橫向上把相鄰的小時誤判為同一類;
那麼,如果把縱向(上網時長)縮小一些呢?
於是嘗試進行了歸一化(代碼中將上網時長除以 12000),結果與課程相符。
不過課程後面提到使用對數變換,那貌似是針對第二個例子的,用在這裡應該也可以。

轉載請註明出處,本文鏈接:https://www.uj5u.com/qita/294857.html
標籤:AI
上一篇:Python OpenCV3:OpenCV 幾何變換
下一篇:機器學習吃瓜教程學習筆記1
