0. Load the required modules
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
# Enable inline plotting in Jupyter
%matplotlib inline
1. Load the data
1.1 Data preview
# Load the dataset
fruits_df = pd.read_table('fruit_data_with_colors.txt')

# Preview the data
fruits_df.head()

   fruit_label fruit_name fruit_subtype  mass  width  height  color_score
0            1      apple  granny_smith   192    8.4     7.3         0.55
1            1      apple  granny_smith   180    8.0     6.8         0.59
2            1      apple  granny_smith   176    7.4     7.2         0.60
3            2   mandarin      mandarin    86    6.2     4.7         0.80
4            2   mandarin      mandarin    84    6.0     4.6         0.79
print('Number of samples:', len(fruits_df))
Number of samples: 59
# Count the number of samples per fruit
sns.countplot(fruits_df['fruit_name'], label="count")

1.2 Data processing
# Build a dictionary mapping target labels to fruit names
fruit_name_dict = dict(zip(fruits_df['fruit_label'], fruits_df['fruit_name']))
print(fruit_name_dict)
# Split the dataset
x = fruits_df[['mass', 'width', 'height', 'color_score']]
y = fruits_df['fruit_label']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1/4, random_state=0)
print('Total samples: {}, training samples: {}, test samples: {}'.format(len(x), len(x_train), len(x_test)))
Total samples: 59, training samples: 44, test samples: 15
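The split above is purely random, so with only 59 samples the class proportions in the training and test sets can drift apart. As a quick check, not part of the original notebook, a minimal sketch that compares the label distribution on both sides and, optionally, passes stratify=y to keep the proportions aligned (the _s variable names are illustrative):

# Compare class proportions in the random split
print(y_train.value_counts(normalize=True).sort_index())
print(y_test.value_counts(normalize=True).sort_index())

# A stratified split keeps the label proportions roughly equal in both sets
x_train_s, x_test_s, y_train_s, y_test_s = train_test_split(
    x, y, test_size=1/4, random_state=0, stratify=y)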
2. Visualize the feature variables
# Pairwise scatter plots of the features, colored by fruit name
sns.pairplot(data=fruits_df, hue='fruit_name', vars=['mass', 'width', 'height', 'color_score'])
%matplotlib notebook
from mpl_toolkits.mplot3d import Axes3D

# Map each fruit label to a plotting color.
# The original dictionary values were lost in extraction; these colors are illustrative.
label_color_dict = {1: 'red', 2: 'orange', 3: 'green', 4: 'yellow'}
colors = list(map(lambda label: label_color_dict[label], y_train))

# 3D scatter plot of three features, colored by fruit label
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x_train['width'], x_train['height'], x_train['color_score'], c=colors, marker='o', s=100)
ax.set_xlabel('width')
ax.set_ylabel('height')
ax.set_zlabel('color_score')
plt.show()

3. Build/select the model
from sklearn.neighbors import KNeighborsClassifier

# Create the kNN model
knn = KNeighborsClassifier(n_neighbors=5)
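For intuition, kNN has no real training phase: to classify a point it finds the k closest training samples (Euclidean distance by default, i.e. the Minkowski metric with p=2) and takes a majority vote among their labels. A minimal sketch of that idea with NumPy (illustrative only; the helper predict_one is not part of scikit-learn or the original notebook):

from collections import Counter

def predict_one(query, x_train_vals, y_train_vals, k=5):
    # Euclidean distance from the query point to every training sample
    dists = np.sqrt(((x_train_vals - query) ** 2).sum(axis=1))
    # Indices of the k nearest neighbors
    nearest = np.argsort(dists)[:k]
    # Majority vote among the neighbors' labels
    return Counter(y_train_vals[nearest]).most_common(1)[0][0]

# Example: classify the first test sample by hand
# predict_one(x_test.values[0], x_train.values, y_train.values, k=5)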
4. Train the model
knn.fit(x_train, y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
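Once fitted, the model can classify a single new fruit, and fruit_name_dict from step 1.2 maps the numeric label back to a name. A minimal sketch (the measurement values below are made up for illustration and do not come from the original notebook):

# Hypothetical measurements for one unseen fruit: mass, width, height, color_score
new_fruit = pd.DataFrame([[150, 7.0, 7.5, 0.70]],
                         columns=['mass', 'width', 'height', 'color_score'])
pred_label = knn.predict(new_fruit)[0]
print(fruit_name_dict[pred_label])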
5. Test the model
y_pred = knn.predict(x_test)
print('Predicted labels:', y_pred)
Predicted labels: [3 1 4 4 1 1 3 3 1 4 2 1 3 1 4]

print('True labels:', y_test.values)
True labels: [3 3 4 3 1 1 3 4 3 1 2 1 3 3 3]

from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred)
print('Accuracy:', acc)
Accuracy: 0.5333333333333333
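An accuracy of 0.53 is low, which is what one would expect when kNN is run on raw features: mass is in the hundreds while color_score stays below 1, so the Euclidean distance is dominated by mass. A minimal sketch of rescaling the features before fitting (not in the original notebook; the variable names are illustrative and the resulting accuracy is not claimed here):

from sklearn.preprocessing import MinMaxScaler

# Rescale every feature to [0, 1] so no single feature dominates the distance
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

knn_scaled = KNeighborsClassifier(n_neighbors=5)
knn_scaled.fit(x_train_scaled, y_train)
print('Accuracy with scaled features:', knn_scaled.score(x_test_scaled, y_test))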
6. Examine how the value of k affects the results
k_range = range(1, 20)
acc_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train)
    # The line recording the test accuracy was lost in extraction; knn.score is one way to restore it
    acc_scores.append(knn.score(x_test, y_test))
plt.figure()
plt.xlabel('k')
plt.ylabel('accuracy')
plt.plot(k_range, acc_scores, marker='o')
plt.xticks([0, 5, 11, 15, 21])
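The curve above evaluates each k on the single 15-sample test set, so it is quite noisy. A more stable way to compare k values is cross-validation on the training data; a minimal sketch, not part of the original notebook:

from sklearn.model_selection import cross_val_score

# Average 5-fold cross-validation accuracy on the training set for each k
cv_scores = [cross_val_score(KNeighborsClassifier(n_neighbors=k),
                             x_train, y_train, cv=5).mean()
             for k in k_range]
best_k = k_range[int(np.argmax(cv_scores))]
print('Best k by cross-validation:', best_k)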
# Look at only the width and height features
from ml_visualization import plot_fruit_knn

# Plot the kNN decision boundaries for k = 1, 5, and 10
plot_fruit_knn(x_train, y_train, 1)
plot_fruit_knn(x_train, y_train, 5)
plot_fruit_knn(x_train, y_train, 10)