# -*- coding: utf-8 -*- #
# author neu_lightbulb-----zhangj
import numpy as np
import pandas as pd
class score:
    """Binary-classification metrics for predicted scores against 0/1 labels.

    The confusion-matrix counters ``tp``, ``fp``, ``tn`` and ``fn`` are filled
    in once, at construction time, by comparing every score with
    ``threshold``: a score ``>= threshold`` counts as a positive prediction.

    Args:
        pre_score: Predicted scores, one per sample (list, array or Series).
        rel_label: True labels aligned with ``pre_score``; 1 marks a positive
            sample and 0 a negative one.
        threshold: Decision threshold applied to ``pre_score``.
        beta: Weight of recall relative to precision, used by ``get_f1``.
    """

    def __init__(self, pre_score, rel_label, threshold, beta):
        self.tn = 0
        self.fn = 0
        self.fp = 0
        self.tp = 0
        self.pre_score = pre_score
        self.rel_label = rel_label
        self.threshold = threshold
        self.beta = beta
        for pre, rel in zip(self.pre_score, self.rel_label):
            self.__getcm_count(pre, rel)

    def __getcm(self, pre, rel):
        """Return the confusion-matrix cell name of a single sample.

        Returns:
            ``'tn'``, ``'fn'``, ``'fp'`` or ``'tp'``; ``None`` when ``rel`` is
            neither 0 nor 1 or when ``pre`` does not compare with the
            threshold (e.g. NaN).
        """
        if pre < self.threshold:
            if rel == 0:
                return 'tn'
            if rel == 1:
                return 'fn'
        if pre >= self.threshold:
            if rel == 0:
                return 'fp'
            if rel == 1:
                return 'tp'
        return None

    def get_cm(self):
        """Return the per-sample confusion-matrix cell names as a list."""
        return [self.__getcm(pre, rel)
                for pre, rel in zip(self.pre_score, self.rel_label)]

    def __getcm_count(self, pre, rel):
        """Increment the counter (tn/fn/fp/tp) matching a single sample."""
        # Reuse __getcm so classification and counting can never disagree.
        cell = self.__getcm(pre, rel)
        if cell is not None:
            setattr(self, cell, getattr(self, cell) + 1)

    def get_f1(self):
        """Return the F-beta score computed from the stored counters.

        Returns 0.0 when there are no true positives; precision and/or recall
        are then zero or undefined, and dividing would raise.
        """
        if self.tp == 0:
            return 0.0
        p = self.tp / (self.tp + self.fp)
        r = self.tp / (self.tp + self.fn)
        beta_sq = self.beta * self.beta
        return (beta_sq + 1) * p * r / (beta_sq * p + r)

    # Method 2: histogram counting; ``precision`` is the score resolution.
    def get_auc_by_count(self, precision=100):
        """Approximate the AUC by bucketing scores into ``precision + 1`` bins.

        Scores are expected to lie in [0, 1]. A (positive, negative) pair
        counts 1 when the positive falls in a higher bin than the negative and
        0.5 when both share a bin. Samples whose label is 1 are positives;
        every other sample is treated as a negative.

        Args:
            precision: Number of bins per unit of score (default 100).

        Raises:
            ValueError: If ``precision`` is below 1, a score lies outside
                [0, 1], or only one class is present.
        """
        if precision < 1:
            raise ValueError("precision must be a positive integer")
        # Per-bin sample counters for positives and negatives.
        pos_histogram = [0] * (precision + 1)
        neg_histogram = [0] * (precision + 1)
        bin_width = 1.0 / precision
        # zip() keeps this independent of how the inputs are indexed
        # (e.g. a pandas Series with a non-default index).
        for pre, rel in zip(self.pre_score, self.rel_label):
            if not 0.0 <= pre <= 1.0:
                # A negative bin index would silently wrap around the list.
                raise ValueError(
                    "scores must lie in [0, 1], got %r" % (pre,))
            nth_bin = min(int(pre / bin_width), precision)
            if rel == 1:
                pos_histogram[nth_bin] += 1
            else:
                neg_histogram[nth_bin] += 1
        # Total number of (positive, negative) pairs to compare.
        total_case = sum(pos_histogram) * sum(neg_histogram)
        if total_case == 0:
            raise ValueError(
                "AUC is undefined unless both classes are present")
        accumulated_neg = 0
        satisfied_pair = 0.0
        for pos_count, neg_count in zip(pos_histogram, neg_histogram):
            # Negatives in lower bins count fully, same-bin ones count half.
            satisfied_pair += pos_count * (accumulated_neg + 0.5 * neg_count)
            accumulated_neg += neg_count
        return satisfied_pair / float(total_case)

    # Method 3: rank-sum formula.
    def get_auc_by_rank(self):
        """Compute the AUC from the rank sum of the positive samples.

        Scores are ranked in ascending order, starting at 1, with tied scores
        sharing their average rank. With ``m`` positives and ``n`` negatives
        the result is ``(rank_sum_of_positives - m * (m + 1) / 2) / (m * n)``.

        Raises:
            ValueError: If only one class is present.
        """
        df = pd.DataFrame({'pre_score': list(self.pre_score),
                           'rel_label': list(self.rel_label)})
        is_positive = df['rel_label'] == 1
        m = int(is_positive.sum())
        n = len(df) - m
        if m == 0 or n == 0:
            raise ValueError(
                "AUC is undefined unless both classes are present")
        # Average ranks handle ties without a sentinel row or manual counting.
        ranks = df['pre_score'].rank(method='average')
        rank_sum = float(ranks[is_positive].sum())
        return (rank_sum - m * (m + 1) / 2) / (m * n)
if __name__ == '__main__':
    # Demo data: predicted scores and their true labels
    # (12 negative samples followed by 8 positive ones).
    learn_data_l2 = [0.2, 0.3, 0.4, 0.35, 0.6, 0.55, 0.2, 0.57, 0.3, 0.15,
                     0.77, 0.33, 0.9, 0.49, 0.45, 0.41, 0.66, 0.43, 0.7, 0.4]
    learn_data_r2 = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
    # The frame must carry the 'learn' and 'real' columns read just below.
    learn_data2 = pd.DataFrame({'learn': learn_data_l2,
                                'real': learn_data_r2})
    score2 = score(learn_data2['learn'], learn_data2['real'], 0.5, 1)
    print(score2.get_cm())
    print(score2.get_f1())
    print(score2.get_auc_by_count())
    print(score2.get_auc_by_rank())
# Python 計算 AUC 的方法
# 1. 安裝 scikit-learn。1.1 scikit-learn 依賴：分別檢視上述三個依賴的版本。`python -V` 結果：Python 2.7.3；`python -c 'import scipy; print scipy.version.version'`，scipy 版本結果：0.9.0；`python -c ...`
# Python 計算 AUC 指標例項
# 1. 安裝 scikit-learn。1.1 scikit-learn 依賴：Python 2.6 or 3.3、NumPy 1.6.1、SciPy 0.9。分別檢視上述三個依賴的版本：`python -V` 結果：Python 2.7.3；`python -c 'import scipy; print scipy.v...'`
# 使用 R 和 Python 計算 AUC
# 某日重新灑下的分割線，無奈的我又用回了 Python。原因有兩個：第一，Python 用了好久了，不想再去用 R 了，雖然 R 的 ggplot 畫圖很好看，不過今天安裝了 Python 的 ggplot 庫，好激動！第二，也是 R 的一大缺憾，就是迴圈簡直是慢得離譜。所以又用回了 Python，於是，就硬著頭皮來分析之前的程式裡...