JAVA实现FP_Growth算法（附详细注释）

2022-08-08 13:17:37

JAVA实现FP_Growth算法（附详细注释）
在学习数据挖掘的过程中，必然会接触到FP_Growth算法。
而FP_Growth算法需要使用大量的递归调用，比较难以理解和实践，作者在网上多番搜寻也没有找到一个完整详细解释的FP_Growth代码，可能是高手大多不屑于解释自己的代码，无奈只能自己动手，有不足之处请多谅解。
程序编写过程中部分参考了JAVA实现FP-GROWTH算法https://www.cnblogs.com/liguangsunls/p/6892482.html，数据集使用的是FP-Tree算法的实现https://www.cnblogs.com/zhangchaoyang/articles/2198946.html中的trolley.txt。

具体实现
作者编写了两个类来实现FP_Growth算法的相关功能，其中TreeNode类用以保存树结点的信息，FPTree类用以实现需要用到的各种方法。

TreeNode类

import java.util.ArrayList;import java.util.Comparator;publicclassTreeNode{private String name;//项名称privateint count;//支持度private TreeNode parent;//父结点private ArrayList<TreeNode> childs;//子结点（列表）private TreeNode nextSameNode;//指向下一个相同结点publicTreeNode(){super();}publicTreeNode(String name,int count){super();this.name= name;this.count= count;this.childs=newArrayList<TreeNode>();this.parent= null;this.nextSameNode= null;}publicTreeNode(String name,int count, TreeNode parent){super();this.name= name;this.count= count;this.childs=newArrayList<TreeNode>();this.parent= parent;this.nextSameNode= null;}public StringgetName(){return name;}publicvoidsetName(String name){this.name= name;}publicintgetCount(){return count;}publicvoidsetCount(int count){this.count= count;}public TreeNodegetParent(){returnthis.parent;}publicbooleanhasParent(){returnthis.parent== null?false:true;}publicvoidsetParent(TreeNode parent){this.parent= parent;}publicvoidaddChild(TreeNode child){this.childs.add(child);}publicvoidaddChilds(ArrayList<TreeNode> childs){this.childs.addAll(childs);}publicvoiddelChild(TreeNode child){this.childs.remove(child);}public TreeNodegetNextSameNode(){returnthis.nextSameNode;}publicbooleanhasNextSameNode(){returnthis.nextSameNode== null?false:true;}publicvoidsetNextSameNode(TreeNode nextSameNode){this.nextSameNode= nextSameNode;}public ArrayList<TreeNode>getChilds(){return childs;}publicvoidinc(int i){this.count+= i;}@Overridepublic StringtoString(){returnthis.getName()+":"+this.getCount();}}/*
 * @function：为实现使用Collections.sort方法对泛型为TreeNode的List进行排序，需要实现Comparator接口
 */classTreeNodeSortimplementsComparator<TreeNode>{@Overridepublicintcompare(TreeNode o1, TreeNode o2){int result= Integer.compare(o2.getCount(), o1.getCount());//实现降序排列//int result = Integer.compare(o1.getCount(), o2.getCount());	//实现升序排列if(result!=0)return result;return o1.getName().compareTo(o2.getName());}}

FPTree类

import java.io.File;import java.io.FileNotFoundException;import java.util.ArrayList;import java.util.Arrays;import java.util.Collections;import java.util.HashMap;import java.util.LinkedList;import java.util.Scanner;publicclassFPTree{private String splitChar;//分隔符privatedouble support;//支持度阈值（比例值）privatedouble supportNum;//支持度阈值（绝对值）/*
	 * @param1	:指定支持度阈值
	 * @return	:无
	 * @function:构造一个FPTree实例，以调用相关方法，分隔符默认为","，
	 * 			 判断输入支持度阈值为比例值或绝对值
	 */publicFPTree(double support){super();this.splitChar=",";if(support>=1)this.supportNum= support;elsethis.support= support;}public StringgetSplitChar(){returnthis.splitChar;}/*
	 * @param1	:给定文件分隔符
	 * @function:文件分隔符仅可通过此方法进行修改
	 */publicvoidsetSplitChar(String splitChar){this.splitChar= splitChar;}/*
	 * @param1	:输入文件路径
	 * @return	:所有事务组成的二维列表
	 * @function:将文件内容按行读出，一行为一条事务，按指定的分隔符分隔成项，存入列表
	 * 			 文件内容要求为购物篮形式，如：牛奶，啤酒，咖啡
	 * 			 					      啤酒，咖啡
	 */public ArrayList<ArrayList<String>>readBacketFile(String path){
		ArrayList<ArrayList<String>> transactions=newArrayList<ArrayList<String>>(); 
		File file=newFile(path);
		Scanner reader= null;int countLine=0;try{
			reader=newScanner(file);while(reader.hasNext()){
				String line= reader.nextLine().trim();				
				ArrayList<String> items=newArrayList<String>(Arrays.asList(line.split(splitChar))); 
				transactions.add(items);//需要记录事务总数
				countLine++;}}catch(FileNotFoundException e){
			e.printStackTrace();}
		reader.close();if(supportNum==0)//当给定的支持度阈值为比例值时，需要乘上事务总数求出支持度阈值绝对值
			supportNum= support* countLine;//如给定的支持度阈值为0.4，事务总数为20条，则支持度阈值绝对值为：0.4 * 20 = 8return transactions;}/*
	 * @param1	:输入文件路径
	 * @param2	:给定判断项是否存在的标识，如给出的例子中标识为“1”
	 * @return	:所有事务组成的二维列表
	 * @function:将文件内容按行读出，一行为一条事务，按指定的分隔符分隔成项，存入列表
	 * 			 文件内容要求为向量形式，如：牛奶，啤酒，咖啡
	 * 									 1 ， 1 ， 1
	 * 									 0 ， 1 ， 1
	 */public ArrayList<ArrayList<String>>readVectorFile(String path, String exist){
		ArrayList<ArrayList<String>> transactions=newArrayList<ArrayList<String>>(); 
		File file=newFile(path);
		Scanner reader= null;int countLine=0;try{
			reader=newScanner(file);//表头是项目名称，单独处理、保存
			String[] itemNames= reader.nextLine().trim().split(this.getSplitChar());while(reader.hasNext()){
				String line= reader.nextLine().trim();
				ArrayList<String> items=newArrayList<String>(Arrays.asList(line.split(this.getSplitChar()))); 
				ArrayList<String> appearedItems=newArrayList<String>();for(int i=0; i< items.size(); i++)if(items.get(i).equals(exist))//根据给定标识判断项是否出现
						appearedItems.add(itemNames[i]);
				transactions.add(appearedItems);
				countLine++;}}catch(FileNotFoundException e){
			e.printStackTrace();}
		reader.close();if(supportNum==0)
			supportNum= support* countLine;return transactions;}/*
	 * @param1	:所有事务，每条事务用一个ArrayList保存
	 * @return	:排序后的频繁一项集
	 * @function:从事务中获取频繁一项集，按支持度降序排序后，以列表形式返回
	 */public LinkedList<TreeNode>buildHeaderTable(ArrayList<ArrayList<String>> transactions){
		LinkedList<TreeNode> headerTable=newLinkedList<TreeNode>();
		HashMap<String, TreeNode> map=newHashMap<String, TreeNode>();//使用HashMap保存项名称及结点，使用结点的原因为方便计数和排序for(ArrayList<String> items: transactions){for(String item: items){if(map.containsKey(item))
					map.get(item).inc(1);调用了TreeNode的inc方法，使计数加1else{
					TreeNode node=newTreeNode(item,1);
					map.put(item, node);}}}for(String name: map.keySet()){
			TreeNode node= map.get(name);if(node.getCount()>= supportNum)只保存支持度大于支持度阈值（绝对值）的项
				headerTable.add(node);}
		
		Collections.sort(headerTable,newTreeNodeSort());//使用Collections的sort方法，需要对TreeNode类实现一个排序类TreeNodeSortreturn headerTable;}/*
	 * @param1	:所有事务
	 * @param2	:按支持度降序排列的频繁一项集
	 * @return	:排序后的所有事务
	 * @function:将所有事务按频繁项顺序排序后，返回排序后的事务
	 */public ArrayList<ArrayList<String>>sortByFreqItem(ArrayList<ArrayList<String>> transactions, 
			LinkedList<TreeNode> itemSortByFreq){//保存排序后的所有事务
		ArrayList<ArrayList<String>> sortedTransactions=newArrayList<ArrayList<String>>();for(ArrayList<String> transaction: transactions){
			ArrayList<String> sortedItem=newArrayList<String>();保存排序后的一条事务int itemNum= transaction.size();for(TreeNode node: itemSortByFreq){//对排序后的频繁一项集遍历if(transaction.contains(node.getName())){//若当前事务中存在该频繁一项集，则保存
					sortedItem.add(node.getName());
					itemNum--;}if(itemNum==0)//用以计数避免无用的循环break;}
			sortedTransactions.add(sortedItem);每次循环处理一条事务}return sortedTransactions;}/*
	 * @param1	:按频繁一项集顺序排好序的所有事务
	 * @param2	:项头表，此时表中仅包含频繁一项集及其出现次数，没有指向相同结点的链
	 * @return	:返回构建好的FP树的根结点
	 * @function:利用所有事务和项头表
	 */public TreeNodebuildFPTree(ArrayList<ArrayList<String>> transactions, LinkedList<TreeNode> headerTable){
		TreeNode root=newTreeNode("root",0, null);//树根结点，计数为0，无父结点for(ArrayList<String> items: transactions){
			TreeNode parent= root;每次循环处理一条事务，每条事务的第一个结点都是父结点for(String item: items){								
				TreeNode itemNode=exist(item, parent);//根据父结点判断当前结点是否存在，并获取正确的结点addSameNode(headerTable, itemNode);//得到结点后，放入项头表中
				parent= itemNode;//父结点动态变化}}return root;}/*
	 * @param1	:项头表
	 * @param2	:当前结点
	 * @return	:无
	 * @function:为项头表中对应的项添加指向当前结点的链式关系
	 */publicvoidaddSameNode(LinkedList<TreeNode> headerTable, TreeNode itemNode){for(TreeNode head: headerTable)if(head.getName().equals(itemNode.getName())){遍历项头表找到当前结点对应的项while(head.hasNextSameNode()){
					head= head.getNextSameNode();//寻找对应的项的链表，找到其指向的最后一个相同结点if(head== itemNode)//若遍历链表时发现当前结点已存在链式关系，即无需添加，直接返回return;}
				head.setNextSameNode(itemNode);若链表中不存在当前结点，则使最后一个结点指向当前结点，形成链式关系return;}}/*
	 * @param1	:项名
	 * @param2	:父结点
	 * @return	:项名对应的结点
	 * @function:根据父结点，判断当前项是否存在，若已存在则其次数加一并返回该结点，
	 * 			 若不存在则创建一个计数为1的结点并返回
	 */public TreeNodeexist(String item, TreeNode parent){for(TreeNode child: parent.getChilds())if(child.getName().equals(item)){
				child.inc(1);return child;}
		
		TreeNode node=newTreeNode(item,1, parent);//注意新创建结点后为当前结点添加父结点
		parent.addChild(node);//父结点的子结点列表中也要添加当前结点return node;}/*
	 * @param1	:FP树结点
	 * @return	:该结点对应的条件模式基，以列表的形式存储
	 * @function:根据当前结点，遍历其父结点直到找到根结点root，其路径上的所有结点项构成条件模式基
	 */public ArrayList<String>getCPB(TreeNode node){
		ArrayList<String> transaction=newArrayList<String>();
		TreeNode parent= node;while(parent.hasParent()){if(parent.getParent().getName().equals("root"))break;
			parent= parent.getParent();
			transaction.add(parent.getName());}return transaction;}/*
	 * @param1	:无序的、针对当前项的所有事务
	 * @param2	:当前项
	 * @return	:频繁项集
	 * @function:递归调用，根据所有事务和当前项，计算出包含当前项的频繁项集并返回
	 */public ArrayList<String>fp_growth(ArrayList<ArrayList<String>> transactions, String item){
		ArrayList<String> freqItems=newArrayList<String>();//保存频繁项集
		LinkedList<TreeNode> itemSortByFreq=buildHeaderTable(transactions);
		transactions=sortByFreqItem(transactions, itemSortByFreq);
		TreeNode root=buildFPTree(transactions, itemSortByFreq);//对于当前项和事务，生成其对应的FP树和项头表