from numpy import *
def loaddataset():
    """Return a small hard-coded transaction dataset for demonstration.

    Returns:
        A list of four transactions, each a list of integer item ids.
    """
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
def createc1(dataset):
    """Build the list of candidate 1-itemsets (C1) from a dataset.

    Args:
        dataset: An iterable of transactions; each transaction is an
            iterable of hashable, mutually orderable items.

    Returns:
        A list with one single-element frozenset per distinct item,
        ordered by ascending item value. Frozensets are immutable and
        hashable, so callers can use them as dictionary keys.
    """
    # A set de-duplicates in O(1) per item instead of scanning a list.
    distinct_items = {item for transaction in dataset for item in transaction}
    return [frozenset([item]) for item in sorted(distinct_items)]
# scand takes the dataset d, the list of candidate itemsets ck, and the minimum support minsupport for itemsets of interest
def scand(d, ck, minsupport):
    """Count candidate itemsets over the dataset and keep the frequent ones.

    Args:
        d: A sequence of transactions, each usable as the argument of
            ``frozenset.issubset`` (e.g. a set of items).
        ck: An iterable of candidate itemsets (frozensets).
        minsupport: Minimum support, as a fraction of ``len(d)``, that a
            candidate needs in order to be returned as frequent.

    Returns:
        A ``(retlist, supportdata)`` tuple. ``retlist`` holds the candidates
        whose support is at least ``minsupport``, in reverse order of first
        occurrence. ``supportdata`` maps every candidate that occurred in at
        least one transaction to its support.
    """
    sscnt = {}
    for tid in d:
        for can in ck:
            if can.issubset(tid):
                sscnt[can] = sscnt.get(can, 0) + 1

    # sscnt is empty whenever d is empty, so the division below never
    # sees a zero denominator.
    numitems = float(len(d))
    retlist = []
    supportdata = {}
    for key, count in sscnt.items():
        support = count / numitems
        if support >= minsupport:
            retlist.append(key)
        supportdata[key] = support
    # Same ordering as inserting each key at the front, without the
    # O(n) shift per insertion.
    retlist.reverse()
    return retlist, supportdata
#~ dataset=loaddataset()
#~ c1=createc1(dataset)
#~ print(c1)
#~ d=list(map(set,dataset))
#~ l1,suppdata0=scand(d,c1,0.5)
#~ print(l1)
# apriorigen() takes the list of frequent itemsets lk and the itemset size k, and outputs the candidate list ck
def apriorigen(lk, k):
    """Generate candidate k-itemsets by joining frequent (k-1)-itemsets.

    Two itemsets are joined when their first ``k - 2`` items, taken in
    sorted order, are equal; the candidate is the union of the pair.

    Args:
        lk: A sequence of frozensets whose items are mutually orderable.
        k: Size of the itemsets to generate.

    Returns:
        A list of frozensets, one per joinable pair, in pair order.
    """
    # Sort before slicing: slicing list(frozenset) first would make the
    # prefix depend on set iteration order. Computed once per itemset.
    prefixes = [sorted(itemset)[:k - 2] for itemset in lk]
    retlist = []
    for i, first in enumerate(lk):
        for j in range(i + 1, len(lk)):
            if prefixes[i] == prefixes[j]:
                retlist.append(first | lk[j])
    return retlist
def apriori(dataset, minsupport = 0.5):
    """Run the Apriori algorithm and return all frequent itemsets.

    Args:
        dataset: An iterable of transactions, each an iterable of items.
        minsupport: Minimum support (fraction of transactions) an itemset
            needs in order to count as frequent.

    Returns:
        A ``(l, supportdata)`` tuple. ``l[i]`` is the list of frequent
        itemsets of size ``i + 1``; the final entry is an empty list,
        which is what ends the search. ``supportdata`` accumulates the
        support dictionaries returned by ``scand`` at every level.
    """
    c1 = createc1(dataset)
    d = list(map(set, dataset))
    l1, supportdata = scand(d, c1, minsupport)
    levels = [l1]
    k = 2
    # Keep growing itemsets until a level yields nothing frequent.
    while levels[k - 2]:
        ck = apriorigen(levels[k - 2], k)
        lk, supk = scand(d, ck, minsupport)  # scan the dataset to get lk
        supportdata.update(supk)
        # Without storing lk the loop condition would never change.
        levels.append(lk)
        k += 1
    return levels, supportdata
#~ dataset=loaddataset()
#~ l,suppdata=apriori(dataset)
#~ print(l)
#~ print(apriorigen(l[0],2))
# generaterules() takes 3 arguments: the list of frequent itemsets, a dict holding support data for those itemsets, and the minimum confidence threshold
def generaterules(l,supportdata,minconf=0.7):
    """Generate association rules from the frequent itemsets.

    Args:
        l: List of frequent-itemset lists; ``l[0]`` is skipped, so rules
            are only generated from ``l[1]`` onwards.
        supportdata: Dict mapping itemsets (frozensets) to their support.
        minconf: Minimum confidence threshold for a rule.

    Returns:
        The list that ``calcconf`` and ``rulesfromconseq`` fill with the
        rules that satisfy ``minconf``.
    """
    bigrulelist = []
    for i in range(1, len(l)):
        for freqset in l[i]:
            h1 = [frozenset([item]) for item in freqset]
            if i > 1:
                # Test the single-item consequents first; handing h1
                # straight to rulesfromconseq would merge them before any
                # confidence check and skip those rules.
                h1 = calcconf(freqset, h1, supportdata, bigrulelist, minconf)
                # At least two surviving consequents are needed to merge.
                if len(h1) > 1:
                    rulesfromconseq(freqset, h1, supportdata, bigrulelist, minconf)
            else:
                calcconf(freqset, h1, supportdata, bigrulelist, minconf)
    return bigrulelist
def calcconf(freqset, h, supportdata, brl, minconf=0.7):
    """Evaluate candidate consequents of a frequent itemset by confidence.

    For each consequent ``conseq`` in ``h`` the confidence of the rule
    ``(freqset - conseq) --> conseq`` is
    ``supportdata[freqset] / supportdata[freqset - conseq]``.

    Args:
        freqset: The frequent itemset (frozenset) the rules are built from.
        h: Iterable of candidate consequents (frozensets).
        supportdata: Dict mapping itemsets to their support.
        brl: List extended in place with one
            ``(antecedent, consequent, confidence)`` tuple per kept rule.
        minconf: Minimum confidence a rule needs in order to be kept.

    Returns:
        The consequents from ``h`` whose rule met ``minconf``.

    Raises:
        KeyError: If ``freqset`` or an antecedent is missing from
            ``supportdata``.
    """
    prunedh = []  # consequents that passed the confidence threshold
    for conseq in h:
        antecedent = freqset - conseq
        conf = supportdata[freqset] / supportdata[antecedent]  # calc confidence
        if conf >= minconf:
            print(antecedent,'-->',conseq,'conf:',conf)
            brl.append((antecedent, conseq, conf))
            prunedh.append(conseq)
    return prunedh
def rulesfromconseq(freqset, h, supportdata, brl, minconf=0.7):
    """Recursively merge consequents to build rules with larger right sides.

    Args:
        freqset: The frequent itemset (frozenset) the rules are built from.
        h: List of candidate consequents (frozensets); the size of the
            first element is taken as the current consequent size.
        supportdata: Dict mapping itemsets to their support.
        brl: Rule list passed through to ``calcconf``.
        minconf: Minimum confidence threshold passed to ``calcconf``.

    Returns:
        None.
    """
    if not h:
        # Nothing to merge; also avoids an IndexError on h[0].
        return
    m = len(h[0])  # size of the first (i.e. any) consequent in h
    print('m=',m)
    # The right-hand side of a rule may hold two or more items. Only try
    # merging further when the itemset is large enough to leave a
    # non-empty antecedent after growing the consequent by one.
    if len(freqset) > (m + 1):
        hmp1 = apriorigen(h, m+1)  # create the size m+1 candidates
        hmp1 = calcconf(freqset, hmp1, supportdata, brl, minconf)
        if len(hmp1) > 1:  # need at least two sets to merge
            rulesfromconseq(freqset, hmp1, supportdata, brl, minconf)
#~ dataset=loaddataset()
#~ l,suppdata=apriori(dataset,minsupport=0.5)
#~ rules=generaterules(l,suppdata,minconf=0.5)
#~ print(rules)
# Read mushroom.dat: each line is split on whitespace into one transaction.
# The with-block closes the file once it has been read.
with open('mushroom.dat') as mushfile:
    mushdatset = [line.split() for line in mushfile]
l,suppdata=apriori(mushdatset,minsupport=0.3)
# Print every itemset in l[2] that contains the item '2'.
for item in l[2]:
    if item.intersection('2'):
        print(item)
機器學習之Apriori
1 幾個概念 1 關聯分析 一種在大規模資料中尋找有趣關係的任務。這種有趣關係一般有兩種形式:頻繁項集或者關聯規則。2 頻繁項集 經常,頻繁出現在一起的物品集合,通常用一對 來表示。3 關聯規則 兩種物品之間存在的關聯關係,通常用 4 支援度 這是用來衡量頻繁項集的因子。一個項集的支援度即為一個資料...
機器學習演算法 之Apriori
apriori演算法不同於以前接觸過的機器學習演算法,這種演算法用於在資料集中尋找有趣的關係。這些關係可以有兩種形式 頻繁項集或者關聯規則。關於演算法的詳細介紹參見 def apriori dataset,minsupport 0.5 c1 createc1 dataset d map set,da...
機器學習之Apriori演算法python實現
coding utf 8 created on sun dec 23 15 50 25 2018 author muli from future import print function import pandas as pd 自定義連線函式,用於實現l 到c k的連線 def connect s...