Apriori 算法
Apriori 算法是一种挖掘关联规则的频繁项集算法,其核心思想是通过"候选集生成"和"向下封闭性检测(剪枝)"两个阶段来挖掘频繁项集。
Apriori 算法应用广泛,可用于消费市场价格分析,猜测顾客的消费习惯;网络安全领域中的入侵检测技术;可用在用于高校管理中,根据挖掘规则可以有效地辅助学校管理部门有针对性的开展贫困助学工作;也可用在移动通信领域中,指导运营商的业务运营和辅助业务提供商的决策制定。
1. 依据支持度找出所有频繁项集(频度)
2. 依据置信度产生关联规则(强度)
Apriori 使用一种称作逐层搜索的迭代方法,"K-1 项集" 用于搜索 "K 项集"。
首先,扫描整个事务,找出频繁 1 - 项集的集合,该集合记作 L1。L1 用于找频繁 "2 项集" 的集合 L2,而 L2 用于找 L3。如此下去,直到不能找到 "K 项集"。找每个 Lk 都需要一次数据库扫描。
核心思想是:连接步和剪枝步。连接步是自连接,原则是保证前 k-2 项相同,并按照字典顺序连接。剪枝步,是使任一频繁项集的所有非空子集也必须是频繁的。反之,如果某个候选的非空子集不是频繁的,那么该候选肯定不是频繁的,从而可以将其从 CK 中删除。
简单的讲,1、发现频繁项集,过程为(1)扫描(2)计数(3)比较(4)产生频繁项集(5)连接、剪枝,产生候选项集 重复步骤(1)~(5)直到不能发现更大的频集
根据前面提到的置信度的定义,关联规则的产生如下:
(1)对于每个频繁项集 L,产生 L 的所有非空子集;
(2)对于 L 的每个非空子集 S,如果 support_count(L) / support_count(S) >= min_confidence,
则输出规则 "S => (L - S)"。
注:L-S 表示在项集 L 中除去 S 子集的项集
- 伪代码描述:
- // 找出频繁 1 项集
- L1 =find_frequent_1-itemsets(D);
- For(k=2;Lk-1 !=null;k++){
- // 产生候选,并剪枝
- Ck =apriori_gen(Lk-1 );
- // 扫描 D 进行候选计数
- For each 事务t in D{
- Ct =subset(Ck,t); // 得到 t 的子集
- For each 候选 c 属于 Ct
- c.count++;
- }
- //返回候选项集中不小于最小支持度的项集
- Lk ={c 属于 Ck | c.count>=min_sup}
- }
- Return L= 所有的频繁集;
- 第一步:连接(join)
- Procedure apriori_gen (Lk-1 :frequent(k-1)-itemsets)
- For each 项集 l1 属于 Lk-1
- For each 项集 l2 属于 Lk-1
- If( (l1 [1]=l2 [1])&&( l1 [2]=l2 [2])&& ……&& (l1 [k-2]=l2 [k-2])&&(l1 [k-1]<l2 [k-1]) )
- then{
- c = l1 连接 l2 // 连接步:产生候选
- // 若 c 存在某个 (k-1) 子集不是频繁的(不在 Lk-1 中),则进行剪枝
- if has_infrequent_subset(c, Lk-1 ) then
- delete c; // 剪枝步:删除非频繁候选
- else add c to Ck;
- }
- Return Ck;
- 第二步:剪枝(prune)
- Procedure has_infrequent_sub (c:candidate k-itemset; Lk-1 :frequent(k-1)-itemsets)
- For each (k-1)-subset s of c
- If s 不属于 Lk-1 then
- Return true;
- Return false;
# coding: utf-8

# Transaction database: each row is one transaction (the classic
# Han & Kamber sample data set).
samples = [
    ["I1", "I2", "I5"],
    ["I2", "I4"],
    ["I2", "I3"],
    ["I1", "I2", "I4"],
    ["I1", "I3"],
    ["I2", "I3"],
    ["I1", "I3"],
    ["I1", "I2", "I3", "I5"],
    ["I1", "I2", "I3"],
]

# Mining thresholds: support is an absolute transaction count,
# confidence is a ratio in [0, 1].
min_support = 2
min_confidence = 0.6

# Current level's frequent itemsets; rewritten as each level is mined.
fre_list = list()
def get_c1():
    """Build C1 (candidate 1-itemsets), prune by min_support, record L1.

    Side effects:
      - fills the global ``fre_list`` with the sorted frequent 1-items
        (infrequent items are removed after pruning);
      - sets ``record_list`` to ``fre_list`` and ``record_dict`` to the
        pruned item -> support-count mapping.
    """
    global record_list
    global record_dict
    counts = dict()
    for row in samples:
        for item in row:
            if item not in fre_list:
                fre_list.append(item)
                counts[item] = 1
            else:
                counts[item] = counts[item] + 1
    fre_list.sort()
    print("candidate set:")
    print_dict(counts)
    # Prune infrequent items from the counts AND from fre_list.
    # BUG FIX: the original left infrequent items inside fre_list, which
    # would poison the later self-joins whenever an infrequent 1-item
    # exists (the sample data happens to mask this).
    # Iterate over a copy since we mutate fre_list in the loop.
    for key in list(fre_list):
        if counts[key] < min_support:
            del counts[key]
            fre_list.remove(key)
    print("after pruning:")
    print_dict(counts)
    record_list = fre_list
    # BUG FIX: original read `record_dict = record_dict` (a no-op
    # self-assignment); store the pruned counts as clearly intended.
    record_dict = counts
def get_candidateset():
    """Self-join the current frequent list and return pruned Ck+1 candidates.

    Returns a sorted list of comma-joined candidate itemset strings whose
    every (k-1)-subset is frequent (Apriori pruning).
    """
    candidates = list()
    # Self-join. Unordered pairs (j > i) suffice: has_samesubitem is
    # symmetric and the sorted union of a pair is order-independent, so the
    # original's ordered scan over all i != j did exactly twice the work and
    # relied on dedup to discard the mirror results.
    for i in range(0, len(fre_list)):
        for j in range(i + 1, len(fre_list)):
            # Two k-itemsets may join only if they share k-1 items.
            if has_samesubitem(fre_list[i], fre_list[j]):
                merged = (fre_list[i] + ',' + fre_list[j]).split(",")
                merged = sorted(set(merged))
                candidate = ','.join(merged)
                # A candidate is kept only if all its subsets are frequent
                # and it has not been produced already.
                if has_infresubset(candidate) == False and already_constains(candidate, candidates) == False:
                    candidates.append(candidate)
    candidates.sort()
    return candidates
def has_samesubitem(str1, str2):
    """Return True iff the two comma-joined itemsets have equal length and
    share exactly all-but-one of their items (the Apriori join condition).

    Note: identical itemsets return False (they share all items, not k-1).
    """
    left = str1.split(",")
    right = str2.split(",")
    if len(left) != len(right):
        return False
    # Count common elements as a multiset intersection, consuming matches
    # from a working copy so duplicates are not counted twice.
    remaining = list(right)
    shared = 0
    for element in left:
        if element in remaining:
            remaining.remove(element)
            shared += 1
    return shared == len(left) - 1
def judge(candidatelist):
    """Count support for each candidate and prune those below min_support.

    Side effect: replaces the global ``fre_list`` with the surviving
    itemsets. Returns the dict of surviving itemset -> support count.
    """
    # Declare the global up front (the original declared it mid-function,
    # which works but obscures the side effect).
    global fre_list
    support_dict = dict()
    for item in candidatelist:
        support_dict[item] = get_support(item)
    print("candidate set:")
    print_dict(support_dict)
    # Prune: a frequent itemset's support must reach the minimum support.
    survivors = list()
    for item in candidatelist:
        if support_dict[item] < min_support:
            del support_dict[item]
        else:
            survivors.append(item)
    fre_list = survivors
    print("after pruning:")
    print_dict(support_dict)
    return support_dict
def has_infresubset(item):
    """Return True if any (k-1)-subset of the comma-joined itemset is not
    in the current global frequent list.

    Because the search is level-wise, checking only the (k-1)-subsets of a
    candidate Ck is sufficient.
    """
    return any(
        not already_constains(subset, fre_list)
        for subset in get_subset(item.split(","))
    )
def get_support(item, splitetag=True):
    """Count the transactions in ``samples`` containing every element of item.

    ``item`` is a ","-joined itemset string by default, or a "^"-joined one
    when ``splitetag`` is False (rule strings use "^" as separator).
    """
    if splitetag:
        items = item.split(",")
    else:
        items = item.split("^")
    support = 0
    for row in samples:
        contains_all = True
        for curitem in items:
            if curitem not in row:
                contains_all = False
                # FIX: original used `continue`, needlessly testing the
                # remaining items after a miss; `break` gives the same
                # result without the wasted scans.
                break
        if contains_all:
            support += 1
    return support
def get_fullpermutation(arr):
    """Return every permutation of arr, each as a list.

    Order matches the original hand-rolled recursion (pick index i first,
    then recurse on the rest), which is exactly itertools.permutations
    order. The original's empty-input result ([]) is preserved.
    """
    from itertools import permutations
    if not arr:
        return []  # matches the original recursion: empty input -> no results
    if len(arr) == 1:
        return [arr]
    return [list(p) for p in permutations(arr)]
def get_subset(arr):
    """Return all (len-1)-element subsets of arr, each as a sorted list,
    in lexicographic order.

    Equivalent to the original generate-all-permutations-then-dedup
    approach (which built all (k-1)! orderings of every subset before
    discarding duplicates), but via itertools.combinations in O(k)
    subsets. Assumes arr holds distinct items, as itemsets always do here.
    """
    from itertools import combinations
    if len(arr) < 2:
        # Matches the original: a 0- or 1-item input yields no subsets.
        return []
    return sorted(sorted(combo) for combo in combinations(arr, len(arr) - 1))
def remove_dumplicate(arr):
    """Return arr with later entries dropped when an earlier entry has the
    same item set (set-equality dedup, first occurrence wins)."""
    unique = list()
    for entry in arr:
        if not already_constains(entry, unique):
            unique.append(entry)
    return unique
def already_constains(item, curlist):
    """Return True if curlist already holds an entry with the same item set
    as ``item``.

    Both ``item`` and the entries of ``curlist`` may be either a
    comma-joined string or a list of items.

    FIX: the original tested ``type(x) is types.StringType``; the ``types``
    module lost StringType in Python 3 — ``isinstance(x, str)`` is the
    portable equivalent.
    """
    items = item.split(",") if isinstance(item, str) else item
    target = set(items)
    for entry in curlist:
        entry_items = entry.split(",") if isinstance(entry, str) else entry
        # Same semantics as the original comparison: equal sizes and no
        # element of target missing from the entry.
        if len(target) == len(entry_items) and not target.difference(entry_items):
            return True
    return False
def print_dict(curdict):
    """Print one "key:value" line per entry, in ascending key order.

    FIX for Python 3: ``dict.keys()`` returns a view with no ``.sort()``
    method, and ``print`` is a function; ``sorted(curdict)`` plus
    ``print(...)`` keeps the original output byte-for-byte.
    """
    for curkey in sorted(curdict):
        print("%s:%s" % (curkey, curdict[curkey]))
# ---- Association-rule computation ----

def get_all_subset(s):
    """Return every non-empty subset of ``s`` as a tuple, smaller subsets
    first, in itertools.combinations order within each size.

    FIX: the source defined ``get_all_subset`` twice; the first definition
    was dead code (immediately shadowed) and, had it run, would loop
    forever for any input longer than two items because it re-derived the
    same subset list on every ``while`` iteration. Only the working
    combinations-based version is kept.
    """
    from itertools import combinations
    return [combo for size in range(1, len(s) + 1) for combo in combinations(s, size)]
def cal_associative_rule(frelist):
    """Generate candidate rules A->B from every frequent itemset and print
    those whose confidence reaches ``min_confidence``.

    Rule strings use "^" to join items on each side of "->".
    """
    rule_list = list()
    rule_dict = dict()
    for fre_item in frelist:
        fre_items = fre_item.split(",")
        for subitem in get_all_subset(fre_items):
            # Skip the improper subset: the itemset itself yields no rule.
            if len(subitem) == len(fre_items):
                continue
            difference = set(fre_items).difference(subitem)
            # FIX: joining a raw set produced a nondeterministic item order
            # on the rule's right-hand side; sorting makes the rule strings
            # stable (support counting is order-independent either way).
            rule_list.append("^".join(subitem) + "->" + "^".join(sorted(difference)))
    print("The rule is:")
    for rule in rule_list:
        conf = cal_rule_confidency(rule)
        print(rule, conf)
        if conf >= min_confidence:
            rule_dict[rule] = conf
    print("The associative rule is:")
    for key in rule_list:
        # Membership test on the dict itself; `.keys()` added nothing.
        if key in rule_dict:
            print(key, ":", rule_dict[key])
def cal_rule_confidency(rule):
    """Return confidence(A->B) = support(A ∪ B) / support(A).

    ``rule`` looks like "I1^I2->I5"; rejoining both sides with "^" gives
    the full itemset string. Returns 0 when the antecedent never occurs.
    """
    parts = rule.split("->")
    whole_support = get_support("^".join(parts), False)
    antecedent_support = get_support(parts[0], False)
    if antecedent_support == 0:
        return 0
    return float(whole_support) / float(antecedent_support)
if __name__ == '__main__':
    record_list = list()
    record_dict = dict()
    get_c1()
    # Keep self-joining and pruning until a level produces no frequent
    # itemsets; the last non-empty level is the final frequent set.
    while True:
        record_list = fre_list
        new_list = get_candidateset()
        judge_dict = judge(new_list)
        if len(judge_dict) == 0:
            break
        else:
            record_dict = judge_dict
    # FIX for Python 3: print statements converted to print() calls.
    print("The final frequency set is:")
    print(record_list)
    # Derive association rules from the final frequent itemsets.
    cal_associative_rule(record_list)
来源: