用java实现FP-growth算法

2022-08-06 08:38:31

首先我们得了解一下什么是FP-growth算法,如下:

FP-Growth算法是韩嘉炜等人在2000年提出的关联分析算法,它采取如下分治策略:将提供频繁项集的数据库压缩到一棵频繁模式树(FP-tree),但仍保留项集关联信息。

在算法中使用了一种称为频繁模式树(Frequent Pattern Tree)的数据结构。FP-tree是一种特殊的前缀树,由频繁项头表和项前缀树构成。FP-Growth算法基于以上的结构加快整个挖掘过程。

我们看看FP-growth算法的python实现,如下:

#树的节点对象
class treeNode:
    def __init__(self, nameValue, numOccur, parentNode):
        self.name = nameValue
        self.count = numOccur
        self.nodeLink = None
        self.parent = parentNode      #needs to be updated
        self.children = {} 
    
    def inc(self, numOccur):
        self.count += numOccur
        
    def disp(self, ind=1):
        print '  '*ind, self.name, ' ', self.count
        for child in self.children.values():
            child.disp(ind+1)

#创建树的方法
def createTree(dataSet, minSup=1): #create FP-tree from dataset but don't mine
    headerTable = {}
    #go over dataSet twice
    for trans in dataSet:#first pass counts frequency of occurance
        for item in trans:
            headerTable[item] = headerTable.get(item, 0) + dataSet[trans]
    for k in headerTable.keys():  #remove items not meeting minSup
        if headerTable[k] < minSup: 
            del(headerTable[k])
    freqItemSet = set(headerTable.keys())
    #print 'freqItemSet: ',freqItemSet
    if len(freqItemSet) == 0: return None, None  #if no items meet min support -->get out
    for k in headerTable:
        headerTable[k] = [headerTable[k], None] #reformat headerTable to use Node link 
    #print 'headerTable: ',headerTable
    retTree = treeNode('Null Set', 1, None) #create tree
    for tranSet, count in dataSet.items():  #go through dataset 2nd time
        localD = {}
        for item in tranSet:  #put transaction items in order
            if item in freqItemSet:
                localD[item] = headerTable[item][0]
        if len(localD) > 0:
            orderedItems = [v[0] for v in sorted(localD.items(), key=lambda p: p[1], reverse=True)]
            updateTree(orderedItems, retTree, headerTable, count)#populate tree with ordered freq itemset
    return retTree, headerTable #return tree and header table

#递归构建FP树和更新头部映射表
def updateTree(items, inTree, headerTable, count):
    if items[0] in inTree.children:#check if orderedItems[0] in retTree.children
        inTree.children[items[0]].inc(count) #incrament count
    else:   #add items[0] to inTree.children
        inTree.children[items[0]] = treeNode(items[0], count, inTree)
        if headerTable[items[0]][1] == None: #update header table 
            headerTable[items[0]][1] = inTree.children[items[0]]
        else:
            updateHeader(headerTable[items[0]][1], inTree.children[items[0]])
    if len(items) > 1:#call updateTree() with remaining ordered items
        updateTree(items[1::], inTree.children[items[0]], headerTable, count)

#更新头部映射表        
def updateHeader(nodeToTest, targetNode):   #this version does not use recursion
    while (nodeToTest.nodeLink != None):    #Do not use recursion to traverse a linked list!
        nodeToTest = nodeToTest.nodeLink
    nodeToTest.nodeLink = targetNode

#根据后缀上溯FP树        
def ascendTree(leafNode, prefixPath): #ascends from leaf node to root
    if leafNode.parent != None:
        prefixPath.append(leafNode.name)
        ascendTree(leafNode.parent, prefixPath)

#遍历整颗树,找出该后缀所有条件模式基
def findPrefixPath(basePat, treeNode): #treeNode comes from header table
    condPats = {}
    while treeNode != None:
        prefixPath = []
        ascendTree(treeNode, prefixPath)
        if len(prefixPath) > 1: 
            condPats[frozenset(prefixPath[1:])] = treeNode.count
        treeNode = treeNode.nodeLink
    return condPats

#遍历所有后缀
def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
    bigL = [v[0] for v in sorted(headerTable.items(), key=lambda p: p[1])]#(sort header table)
    for basePat in bigL:  #start from bottom of header table
        newFreqSet = preFix.copy()
        newFreqSet.add(basePat)
        #print 'finalFrequent Item: ',newFreqSet    #append to set
        freqItemList.append(newFreqSet)
        condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
        #print 'condPattBases :',basePat, condPattBases
        #2. construct cond FP-tree from cond. pattern base
        myCondTree, myHead = createTree(condPattBases, minSup)
        #print 'head from conditional tree: ', myHead
        if myHead != None: #3. mine cond. FP-tree
            #print 'conditional tree for: ',newFreqSet
            #myCondTree.disp(1)            
            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)

现在我们开始用java实现,首先得定义好树节点类

package com.algorithm;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class FPNode {
	
	
	private String name;
	
	private double count;
	
	private FPNode nodeLink;
	
	private FPNode parent;
	 
	private Map<String,FPNode> children = new HashMap<String,FPNode>();
	
	FPNode(String name,double count){
		this.name = name;
		this.count = count;
	}
	
	FPNode(String name,double count,FPNode parent){
		this.name = name;
		this.count = count;
		this.parent = parent;
	}
	
	
	public void inc(double numOccur) {
		this.count += numOccur;
	}
	
    public void disp(int ind) {
    	for(int i=0;i<ind;i++)
    		System.out.print(" ");
    	System.out.println(this.name+" "+this.count);
    	
    	for (Map.Entry<String,FPNode> child : this.children.entrySet()) { 
    		child.getValue().disp(ind+1);
		}
    	
    }
 

	public String getName() {
		return name;
	}

	public void setName(String name) {
		this.name = name;
	}

	public double getCount() {
		return count;
	}

	public void setCount(double count) {
		this.count = count;
	}

	public FPNode getNodeLink() {
		return nodeLink;
	}

	public void setNodeLink(FPNode nodeLink) {
		this.nodeLink = nodeLink;
	}

	public FPNode getParent() {
		return parent;
	}

	public void setParent(FPNode parent) {
		this.parent = parent;
	}

	public Map<String, FPNode> getChildren() {
		return children;
	}

	public void setChildren(Map<String, FPNode> children) {
		this.children = children;
	}
	
	

}

然后是树的类

package com.algorithm;

import java.util.List;
import java.util.Map;

public class FPTreeInfo {
	
	
	private FPNode retTree;
	
	private Map<String,HeaderTable> headerTable;
	
	FPTreeInfo(FPNode retTree,Map<String,HeaderTable> headerTable){
		this.retTree = retTree;
		this.headerTable = headerTable;
	}

	public FPNode getRetTree() {
		return retTree;
	}

	public void setRetTree(FPNode retTree) {
		this.retTree = retTree;
	}

	public Map<String, HeaderTable> getHeaderTable() {
		return headerTable;
	}

	public void setHeaderTable(Map<String, HeaderTable> headerTable) {
		this.headerTable = headerTable;
	}


	
	
	
	

}

头部映射表的类

package com.algorithm;

import java.util.List;
import java.util.Map;

public class HeaderTable {
	
	
	private double hTCount;
	
	private FPNode fpNode;
	
	HeaderTable(double hTCount,FPNode fpNode){
		this.hTCount = hTCount;
		this.fpNode = fpNode;
	}

    

	public double gethTCount() {
		return hTCount;
	}



	public void sethTCount(double hTCount) {
		this.hTCount = hTCount;
	}



	public FPNode getFpNode() {
		return fpNode;
	}

	public void setFpNode(FPNode fpNode) {
		this.fpNode = fpNode;
	}
	
	

}

然后是初始化基础字典

	private static Map<List<String>,Double> createInitSet(List<List<String>> dataSet){
		Map<List<String>,Double> retDict = new HashMap<List<String>,Double>();
		
		for(int i=0;i<dataSet.size();i++) {
			retDict.put(dataSet.get(i), (double)1);
		}

		return retDict;
	}

更新头部映射表的方法

private static void updateHeader(FPNode nodeToTest,FPNode targetNode) {
		
		if(nodeToTest.getNodeLink() != null) {
			nodeToTest = nodeToTest.getNodeLink();
		}
		nodeToTest.setNodeLink(targetNode);
	}

递归构建FP树核心方法


	private static void updateTree(List<String> items,FPNode inTree,Map<String,HeaderTable> headerTable,double count) {
		
		if(inTree.getChildren().containsKey(items.get(0))) {
			inTree.getChildren().get(items.get(0)).inc(count);
		}else {
			inTree.getChildren().put(items.get(0), new FPNode(items.get(0), count, inTree));

			if(headerTable.get(items.get(0)).getFpNode() == null) {
				headerTable.get(items.get(0)).setFpNode(inTree.getChildren().get(items.get(0)));
			}else {
				updateHeader(headerTable.get(items.get(0)).getFpNode(),inTree.getChildren().get(items.get(0)));
			}
	            
		}
		
		if(items.size() > 1) {
			updateTree(items.subList(1, items.size()),inTree.getChildren().get(items.get(0)), headerTable, count);
		}
        
		
	}

构建FP树

public static FPTreeInfo createTree(Map<List<String>,Double> dataSet,double minSup) {
		
		Map<String,Double> tmp = new HashMap<String,Double>();
		
		for (Map.Entry<List<String>,Double> trans : dataSet.entrySet()) { 
	        for(String item : trans.getKey()) {
	        	if(tmp.containsKey(item)) {
	        		tmp.replace(item, tmp.get(item)+trans.getValue());
	        	}else {
	        		tmp.put(item,trans.getValue());
	        	}
	        }    
		}
		
		Map<String,HeaderTable> ht = new HashMap<String,HeaderTable>();
		
		for (Map.Entry<String,Double> trans : tmp.entrySet()) { 
			if(trans.getValue() >= minSup) {
				List<String> list = new ArrayList<String>();
					list.add(trans.getKey());
				ht.put(trans.getKey(),new HeaderTable(trans.getValue(),null));
			}
		}
		
		Set<String> freqItemSet = ht.keySet();
		
		//System.out.println("freqItemSet: "+freqItemSet.toString());
		
		if(freqItemSet.size() == 0) {
			return new FPTreeInfo(null,null);
		}
    
		FPNode retTree = new FPNode("Null Set", 1);
		
		for (Map.Entry<List<String>,Double> trans : dataSet.entrySet()) { 
	        List<String> localD = new ArrayList<String>();
	        
	        List<Map.Entry<String,HeaderTable>> list = new ArrayList<Map.Entry<String,HeaderTable>>(ht.entrySet());
	        Collections.sort(list,new Comparator<Map.Entry<String,HeaderTable>>() {
	            public int compare(Entry<String,HeaderTable> o1,
	                    Entry<String,HeaderTable> o2) {
	            	
	            	if(o1.getValue().gethTCount() < o2.getValue().gethTCount()) {
	            		return 1;
	            	}else if (o1.getValue().gethTCount() < o2.getValue().gethTCount()) {
	            		return 0;
	            	}else {
	            		return -1;
	            	}
	                
	            }
	            
	        });
	        
	        for(Map.Entry<String,HeaderTable> mapping:list){ 
	        	if(trans.getKey().contains(mapping.getKey()) && freqItemSet.contains(mapping.getKey())) {
	        		localD.add(mapping.getKey());
	        	}
	        } 
	        
	        if(localD.size() > 0) {
	            updateTree(localD, retTree, ht, trans.getValue());
	        }
	        

		}

    	return new FPTreeInfo(retTree,ht);
	}

我们先构建FP树看看效果,如下:

List<String[]> datas = new ArrayList<String[]>();
		
			datas.add(new String[]{"r", "z", "h", "j", "p"});
			datas.add(new String[]{"z", "y", "x", "w", "v", "u", "t", "s"});
			datas.add(new String[]{"z"});
			datas.add(new String[]{"r", "x", "n", "o", "s"});
			datas.add(new String[]{"y", "r", "x", "z", "q", "t", "p"});
			datas.add(new String[]{"y", "z", "x", "e", "q", "s", "t", "m"});
		
		List<List<String>> dataIns = new ArrayList<List<String>>();	
		
		for(String[] item : datas) {
			dataIns.add(Arrays.asList(item));
		}
		
		Map<List<String>,Double> initSet = createInitSet(dataIns);
		
		System.out.println(initSet.toString());
		
		FPTreeInfo fpt = createTree(initSet,3);
		
		fpt.getRetTree().disp(1);

构建FP树成功

然后我们用FP树寻找条件模式基和频繁项

首先是根据前缀递归上溯方法

	private static void ascendTree(FPNode leafNode,List<String> prefixPath) {
	    if(leafNode.getParent() != null) {
	    	prefixPath.add(leafNode.getName());
	    	ascendTree(leafNode.getParent(), prefixPath);
	    }     
	}

根据该前缀遍历整颗树找出所有条件模式基的方法

private static Map<List<String>,Double> findPrefixPath(String basePat,FPNode treeNode){
		Map<List<String>,Double> condPats = new HashMap<List<String>,Double>();
	    while(treeNode != null) {
	    	List<String> prefixPath = new ArrayList<String>();
		    ascendTree(treeNode, prefixPath);
		    if(prefixPath.size() > 1) {
		    	condPats.put(prefixPath.subList(1, prefixPath.size()), treeNode.getCount());
		    }     
		    treeNode = treeNode.getNodeLink();
	    } 
	    return condPats;
	}

最后是遍历所有的前缀

private static void mineTree(FPNode  inTree,Map<String,HeaderTable> headerTable,double minSup,List<String> preFix,List<List<String>> freqItemList) {
		
		List<String> bigL = new ArrayList<String>();
        
        List<Map.Entry<String,HeaderTable>> list = new ArrayList<Map.Entry<String,HeaderTable>>(headerTable.entrySet());
        Collections.sort(list,new Comparator<Map.Entry<String,HeaderTable>>() {
            public int compare(Entry<String,HeaderTable> o1,
                    Entry<String,HeaderTable> o2) {
            	
            	if(o1.getValue().gethTCount() < o2.getValue().gethTCount()) {
            		return -1;
            	}else if (o1.getValue().gethTCount() < o2.getValue().gethTCount()) {
            		return 0;
            	}else {
            		return 1;
            	}
                
            }
            
        });
        
        for(Map.Entry<String,HeaderTable> mapping:list){ 
        	if(!bigL.contains(mapping.getKey()))
        		bigL.add(mapping.getKey());
        } 
        
        for(String basePat : bigL) {
        	//System.out.println(basePat);
	        List<String> newFreqSet = new ArrayList<>(preFix);
	        newFreqSet.add(basePat);
	        freqItemList.add(newFreqSet);
	        Map<List<String>,Double> condPattBases = findPrefixPath(basePat, headerTable.get(basePat).getFpNode());
	       // System.out.println(condPattBases.toString());
	        FPTreeInfo ct = createTree(condPattBases,minSup);

	        if(ct.getHeaderTable() != null) {
	        	System.out.println("conditional tree for: "+newFreqSet);
	        	ct.getRetTree().disp(1);   
	            mineTree(ct.getRetTree(), ct.getHeaderTable(), minSup, newFreqSet, freqItemList);
	        }          
        }

	}

测试结果如下:

  • 作者:路边草随风
  • 原文链接:https://blog.csdn.net/luohualiushui1/article/details/87554812
    更新时间:2022-08-06 08:38:31