#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import time
import cv2
from sklearn.cross_validation import train_test_split
def get_hog_features(trainset):
    """Extract a HOG descriptor for every image in *trainset* (784 -> 324).

    Each row is reshaped to a 28x28 image, cast to ``uint8`` and passed to an
    OpenCV HOG descriptor whose parameters are read from ``./hog.xml``.

    Args:
        trainset: Iterable of flattened images; each item must support
            ``reshape(28, 28)`` (i.e. hold 784 pixel values).

    Returns:
        ``numpy.ndarray`` of shape ``(n_images, 324)`` holding one descriptor
        per input image.
    """
    # Descriptor settings come from the configuration file.
    hog = cv2.HOGDescriptor('./hog.xml')
    features = []
    for image in trainset:
        # uint8 spans 0-255, the same range as the pixel values.
        cv_img = image.reshape(28, 28).astype(np.uint8)
        features.append(hog.compute(cv_img))
    # First dimension is free; the second is the 324-element descriptor.
    return np.array(features).reshape(-1, 324)
def train(trainset, train_labels):
    """Train a binary perceptron by sampling one random example per step.

    Labels equal to the module-level ``object_num`` are treated as the
    negative class (-1); every other label is the positive class (+1).
    Training stops once 100000 consecutive samples are classified correctly,
    or once more than 100000 corrective updates have been applied.

    Args:
        trainset: Indexable collection of feature vectors, each of length
            ``feature_length`` and supporting ``dot``/``reshape``.
        train_labels: Labels aligned with *trainset*.

    Returns:
        Tuple ``(w, b)``: weight array of shape ``(feature_length, 1)`` and
        the scalar bias.

    Raises:
        ValueError: If *train_labels* is empty (nothing to sample from).
    """
    trainset_size = len(train_labels)
    if trainset_size == 0:
        raise ValueError('train() requires at least one training sample')

    # Initialise w and b.
    w = np.zeros((feature_length, 1))  # shape (d, 1)
    b = 0
    # Number of updates; only grows when a sample is misclassified.
    study_count = 0
    # Length of the current run of correctly classified samples.
    nochange_count = 0
    # Once this many samples in a row are correct, training is considered done.
    nochange_upper_limit = 100000

    while True:
        nochange_count += 1
        if nochange_count > nochange_upper_limit:
            break

        # Pick one random sample; randint's upper bound is exclusive.
        index = np.random.randint(0, trainset_size)
        image = trainset[index]  # shape (d,)
        label = train_labels[index]

        # Functional margin yi * (w . xi + b) with yi in {-1, +1}.
        yi = int(label != object_num) * 2 - 1
        result = yi * (image.dot(w) + b)

        # A non-positive margin means misclassification: update w and b.
        if result <= 0:
            # Make the sample a column vector so it lines up with w.
            image = image.reshape(feature_length, 1)
            w += learning_rate * yi * image
            b += learning_rate * yi
            study_count += 1
            # NOTE(review): the update cap reuses nochange_upper_limit; the
            # module-level study_total is never consulted. Confirm which
            # limit is intended.
            if study_count > nochange_upper_limit:
                break
            nochange_count = 0
    return w, b
def predict(test_set, w, b):
    """Classify every sample in *test_set* with the linear model ``(w, b)``.

    Args:
        test_set: 2-D array-like of shape ``(n_samples, d)``.
        w: Weight array of shape ``(d, 1)`` (or ``(d,)``).
        b: Scalar bias.

    Returns:
        Boolean ``numpy.ndarray`` with one entry per sample: ``True`` where
        ``x . w + b > 0`` and ``False`` otherwise (a score of exactly 0 is
        ``False``). The shape is ``(n_samples, 1)`` for a column-vector *w*
        and ``(n_samples,)`` for a 1-D *w*.
    """
    samples = np.asarray(test_set)
    if samples.size == 0:
        # Nothing to classify; avoid a shape mismatch in the dot product.
        return np.zeros(0, dtype=bool)
    # One matrix product replaces the per-sample Python loop.
    return samples.dot(w) + b > 0
feature_length = 324 # dimensionality of the HOG feature vector
learning_rate = 0.0001 # learning rate: step size of each perceptron update
object_num = 0 # label value that train() maps to the negative class (-1); any other label maps to +1
study_total = 10000 # intended maximum number of iterations; NOTE(review): not referenced anywhere in the visible code - confirm whether train() should use it
if __name__ == '__main__':
    # Script entry point: load the data, extract HOG features, train the
    # perceptron, then report timing and accuracy on a held-out split.
    print('start reading data:')
    time1 = time.time()

    # raw_data is a pandas DataFrame. header=0 treats the first CSV row as
    # the header, so the data starts on the following row.
    # Each row is: label in the first column, then 28*28=784 pixel columns.
    # Per the original author, labels > 0 were set to 1 and label 0 is
    # unchanged in this file.
    raw_data = pd.read_csv('./data/train_binary.csv', header=0)
    # .values yields a numpy ndarray (shape (42000, 785) for the sample run).
    data = raw_data.values
    img = data[:, 1:]  # pixel data starts at the second column
    labels = data[:, 0]  # first column is the label
    print(img.shape)
    print(labels.shape)

    # Extract HOG features, 784 -> 324.
    features = get_hog_features(img)
    print(features.shape)

    # NOTE(review): train_test_split is imported at the top of the file from
    # sklearn.cross_validation, which was removed in scikit-learn 0.20; the
    # function now lives in sklearn.model_selection.
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.33, random_state=11111)
    print(train_features.shape)
    print(test_features.shape)
    time2 = time.time()
    print('read data cost %f seconds' % (time2 - time1))

    print('starting training:')
    w, b = train(train_features, train_labels)
    time3 = time.time()
    print('training cost %f seconds' % (time3 - time2))

    print('starting predicting:')
    test_predict = predict(test_features, w, b)
    time4 = time.time()
    print('predicting cost %f seconds' % (time4 - time3))

    # label = 0 -> negative class (-1); label = 1 -> positive class (+1).
    # predict() returns booleans, which compare equal to the 0/1 labels.
    hits = test_labels == test_predict.reshape(len(test_labels))
    accuracy = np.sum(hits) / len(test_labels)
    print('the accuracy is: %f!' % accuracy)
'''output:
start reading data:
(42000, 784)
(42000,)
(42000, 324)
(28140, 324)
(13860, 324)
read data cost 6.194034 seconds
starting training:
training cost 46.450333 seconds
starting predicting:
predicting cost 0.081242 seconds
the accuracy is: 0.996609!
'''
感知機 統計學習方法
一、感知機適用問題以及它的輸入、輸出、求解方法：1 感知機（perceptron）適用於二類分類問題，該分類問題是線性可分問題；2 感知機模型是線性分類模型；3 感知機的幾何解釋：感知機對應的是一個超平面；4 輸入：例項的特徵向量；5 輸出：例項的類別，取 +1 和 -1 二值；6 求解方法：有監督學習，給...
《統計學習方法》 感知機
最近終於有開始看《統計學習方法》了,畢竟無腦調參確實沒有什麼意義。一方面是作為看書的筆記,一方面作為比部落格或許能起到一點參考作用吧。希望可以日更。由輸入空間到輸出空間的函式 $f(x) = \mathrm{sign}(w \cdot x + b)$ 稱為感知機。感知機是...
統計學習方法 感知機 python實現
感知機是二類分類的線性分類模型,利用隨機梯度下降法對基於誤分類的損失函式進行極小化。書中演算法可以將所有樣本和係數向量寫成增廣向量的形式,並將所有負樣本乘以 -1,統一形式,方便計算。1 訓練資料集線性可分時,感知機學習演算法原始形式迭代收斂 2 演算法存在許多解 感知機學習演算法的對偶形式使得訓練過...