zoukankan      html  css  js  c++  java
  • Flink PageRank详解(批量迭代的页面排名算法的基本实现)

    1、PageRank算法原理

     

    2、基本数据准备

    /**
     * Default test data for the PageRank example: 15 pages (ids 1..15) and the directed
     * links between them.
     *
     * <p>Each entry of {@link #EDGES} is a pair {@code {sourcePageId, targetPageId}} describing a
     * link that points from the first page id to the second.
     */
    public class PageRankData {

        /** Directed links of the default graph, as {@code {sourcePageId, targetPageId}} pairs. */
        public static final Object[][] EDGES = {
                {1L, 2L},
                {1L, 15L},
                {2L, 3L},
                {2L, 4L},
                {2L, 5L},
                {2L, 6L},
                {2L, 7L},
                {3L, 13L},
                {4L, 2L},
                {5L, 11L},
                {5L, 12L},
                {6L, 1L},
                {6L, 7L},
                {6L, 8L},
                {7L, 1L},
                {7L, 8L},
                {8L, 1L},
                {8L, 9L},
                {8L, 10L},
                {9L, 14L},
                {9L, 1L},
                {10L, 1L},
                {10L, 13L},
                {11L, 12L},
                {11L, 1L},
                {12L, 1L},
                {13L, 14L},
                {14L, 12L},
                {15L, 1L},
        };

        /**
         * Number of pages in the default graph. Single source of truth for both the generated
         * page-id sequence and {@link #getNumberOfPages()}, so the two cannot drift apart.
         */
        private static final int NUM_PAGES = 15;

        /**
         * Builds the default link data set from {@link #EDGES}.
         *
         * @param env execution environment used to create the data set
         * @return one {@code (sourcePageId, targetPageId)} tuple per entry of {@link #EDGES}
         */
        public static DataSet<Tuple2<Long, Long>> getDefaultEdgeDataSet(ExecutionEnvironment env) {
            List<Tuple2<Long, Long>> edges = new ArrayList<>(EDGES.length);
            for (Object[] e : EDGES) {
                edges.add(new Tuple2<>((Long) e[0], (Long) e[1]));
            }
            return env.fromCollection(edges);
        }

        /**
         * Builds the default page data set: the page ids 1 through {@link #getNumberOfPages()}.
         *
         * @param env execution environment used to create the data set
         * @return the sequence of default page ids
         */
        public static DataSet<Long> getDefaultPagesDataSet(ExecutionEnvironment env) {
            // Use the shared constant instead of a second hard-coded 15.
            return env.generateSequence(1, NUM_PAGES);
        }

        /**
         * @return the number of pages in the default graph
         */
        public static int getNumberOfPages() {
            return NUM_PAGES;
        }

    }

    3、算法实现

    /**
     * @Description: 使用批量迭代的页面排名算法的基本实现。
     * 此实现需要一组页面和一组有向链接作为输入,并按如下方式工作。
     * 在每次迭代中,每个页面的等级均匀分布到它指向的所有页面。每个页面收集指向它的所有页面的部分等级,对它们求和,并对总和应用阻尼因子。结果是页面的新排名。使用所有页面的新等级开始新的迭代。该实现在固定次数的迭代之后终止。
     * 这是页面排名算法的维基百科条目。
     *
     * 输入文件是纯文本文件,必须格式如下:
     *
     * 页面表示为由新行字符分隔的(长)ID。
     * 例如,"1
    2
    12
    42
    63"给出五个页面ID为1,2,12,42和63的页面。
     * 链接表示为页面ID对,由空格字符分隔。链接由换行符分隔。
     * 例如,"1 2
    2 12
    1 12
    42 63"给出四个(定向)链接(1) - >(2),(2) - >(12),(1) - >(12)和(42) - >(63)。
     * 对于这个简单的实现,要求每个页面至少有一个传入链接和一个传出链接(页面可以指向自身)。
     * 用法:PageRankBasic --pages <path> --links <path> --output <path> --numPages <n> --iterations <n>
     *  如果未提供参数,则使用{@link PageRankData}中的默认数据和10次迭代运行程序。
     *
     **/
    public class PageRank {
        //阻尼系数
        private static final double DAMPENING_FACTOR = 0.85;
        //收敛阈值.
        private static final double EPSILON = 0.0001;
    
        private static DataSet<Long> getPagesDataSet(ExecutionEnvironment env, ParameterTool params) {
            if (params.has("pages")) {
                return env.readCsvFile(params.get("params"))
                        .fieldDelimiter(" ")
                        .lineDelimiter("
    ")
                        .types(Long.class)
                        .map(new MapFunction<Tuple1<Long>, Long>() {
                            @Override
                            public Long map(Tuple1<Long> value) throws Exception {
                                return value.f0;
                            }
                        });
            } else {
                System.out.println("Executing PageRank example with default pages data set.");
                System.out.println("Use --pages to specify file input.");
                return PageRankData.getDefaultPagesDataSet(env);
            }
        }
    
        private static DataSet<Tuple2<Long, Long>> getLinksDataSet(ExecutionEnvironment env, ParameterTool params) {
            if (params.has("links")) {
                return env.readCsvFile(params.get("links"))
                        .fieldDelimiter(" ")
                        .lineDelimiter("
    ")
                        .types(Long.class, Long.class);
            } else {
                System.out.println("Executing PageRank example with default links data set.");
                System.out.println("Use --links to specify file input.");
                return PageRankData.getDefaultEdgeDataSet(env);
            }
        }
    
        public static void main(String[] args) throws Exception {
            ParameterTool params = ParameterTool.fromArgs(args);
            final int numPages = params.getInt("numPages", PageRankData.getNumberOfPages());
            final int maxIterations = params.getInt("iterations", 10);
    
            // set up execution environment
            final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    
            // make the parameters available to the web ui
            env.getConfig().setGlobalJobParameters(params);
    
            // get input data
            DataSet<Long> pagesInput = getPagesDataSet(env, params);
            DataSet<Tuple2<Long, Long>> linksInput = getLinksDataSet(env, params);
    
            // assign initial rank to pages
            DataSet<Tuple2<Long, Double>> pagesWithRanks = pagesInput
                    .map(new RankAssigner(1.0d / numPages));
    
            // build adjacency list from link input
            DataSet<Tuple2<Long, Long[]>> adjacencyListInput = linksInput
                    .groupBy(0)
                    .reduceGroup(new BuildOutgoingEdgeList());
    
            // set iterative data set
            IterativeDataSet<Tuple2<Long, Double>> iteration = pagesWithRanks.iterate(maxIterations);
    
            DataSet<Tuple2<Long, Double>> newRanks = iteration.join(adjacencyListInput)
                    .where(0).equalTo(0)
                    //迭代算子
                    .flatMap(new JoinVertexWithEdgesMatch())
                    // collect and sum ranks
                    .groupBy(0)
                    .aggregate(Aggregations.SUM, 1)
                    // apply dampening factor
                    .map(new Dampener(PageRank.DAMPENING_FACTOR, numPages));
    
            //如果没有达到收敛条件,循环10次后结束
            DataSet<Tuple2<Long, Double>> finalPageRanks = iteration.closeWith(
                    newRanks,
                    newRanks.join(iteration).where(0).equalTo(0)
                            // termination condition
                            .filter(new EpsilonFilter()));
    
            // emit result
            if (params.has("output")) {
                finalPageRanks.writeAsCsv(params.get("output"), "
    ", " ");
                // execute program
                env.execute("Basic Page Rank Example");
            } else {
                System.out.println("Printing result to stdout. Use --output to specify output path.");
                finalPageRanks.print();
            }
        }
    
        /**
         * A map function that assigns an initial rank to all pages.
         */
        public static final class RankAssigner implements MapFunction<Long, Tuple2<Long, Double>> {
            Tuple2<Long, Double> outPageWithRank;
    
            public RankAssigner(double rank) {
                this.outPageWithRank = new Tuple2<>(-1L, rank);
            }
    
            @Override
            public Tuple2<Long, Double> map(Long value) throws Exception {
                outPageWithRank.f0 = value;
                return outPageWithRank;
            }
        }
    
        /**
         * A reduce function that takes a sequence of edges and builds the adjacency list for the vertex where the edges
         * originate. Run as a pre-processing step.
         * 将分组后的links数据按Id放入Tuple2<Long, Long[]>
         * 与hadoop的mapreduce的reduce部分类似,groupby就是shuffle
         */
        @FunctionAnnotation.ForwardedFields("0")
        public static final class BuildOutgoingEdgeList implements GroupReduceFunction<Tuple2<Long, Long>, Tuple2<Long, Long[]>> {
    
            private final ArrayList<Long> neighbors = new ArrayList<Long>();
    
            @Override
            public void reduce(Iterable<Tuple2<Long, Long>> values, Collector<Tuple2<Long, Long[]>> out) {
                neighbors.clear();
                Long id = 0L;
    
                for (Tuple2<Long, Long> n : values) {
                    id = n.f0;
                    neighbors.add(n.f1);
                    System.out.println("id: " + id + " ,neighbors: " + n.f1);
                }
                out.collect(new Tuple2<Long, Long[]>(id, neighbors.toArray(new Long[neighbors.size()])));
            }
        }
    
        /**
         * Join function that distributes a fraction of a vertex's rank to all neighbors.
         * 按照页面id以及其连接页面的数量,重新计算相邻点的rank,迭代10次
         * rankToDistribute就是计算了顶点(id)指向边(neighbors)的贡献值,neighbors最终的rank值需要合计后才能算出
         */
        public static final class JoinVertexWithEdgesMatch implements FlatMapFunction<Tuple2<Tuple2<Long, Double>, Tuple2<Long, Long[]>>, Tuple2<Long, Double>> {
    
            @Override
            public void flatMap(Tuple2<Tuple2<Long, Double>, Tuple2<Long, Long[]>> value, Collector<Tuple2<Long, Double>> out) {
                System.out.println("before:" + value);
                Long[] neighbors = value.f1.f1;
                double rank = value.f0.f1;
                double rankToDistribute = rank / ((double) neighbors.length);
    
                for (Long neighbor : neighbors) {
    //                System.out.println("neighbor:" + neighbor + ",rankToDistribute:" + rankToDistribute);
                    out.collect(new Tuple2<Long, Double>(neighbor, rankToDistribute));
                }
            }
        }
    
        /**
         * The function that applies the page rank dampening formula.
         * 阻尼系数公式:PR(A)=(1-d)/N + d(PR(T1)/C(T1)+ ... +PR(Tn)/C(Tn))
         * PR(A) 是页面A的PR值
         * PR(Ti)是页面Ti的PR值,在这里,页面Ti是指向A的所有页面中的某个页面
         * C(Ti)是页面Ti的出度,也就是Ti指向其他页面的边的个数
         * d 为阻尼系数,其意义是,在任意时刻,用户到达某页面后并继续向后浏览的概率,
         * 该数值是根据上网者使用浏览器书签的平均频率估算而得,通常d=0.85
         */
        @FunctionAnnotation.ForwardedFields("0")
        public static final class Dampener implements MapFunction<Tuple2<Long, Double>, Tuple2<Long, Double>> {
    
            private final double dampening;
            private final double randomJump;
    
            public Dampener(double dampening, double numVertices) {
                this.dampening = dampening;
                this.randomJump = (1 - dampening) / numVertices;
            }
    
            @Override
            public Tuple2<Long, Double> map(Tuple2<Long, Double> value) {
                value.f1 = (value.f1 * dampening) + randomJump;
                return value;
            }
        }
    
        /**
         * Filter that filters vertices where the rank difference is below a threshold.
         * 如果迭代之间的参数之和低于此EPSILON,我们将会收敛
         * 每次迭代后都会调用Filter判断是否要退出迭代
         */
        public static final class EpsilonFilter implements FilterFunction<Tuple2<Tuple2<Long, Double>, Tuple2<Long, Double>>> {
    
            @Override
            public boolean filter(Tuple2<Tuple2<Long, Double>, Tuple2<Long, Double>> value) {
                System.out.println("value:"+value+",math:"+Math.abs(value.f0.f1 - value.f1.f1));
                return Math.abs(value.f0.f1 - value.f1.f1) > EPSILON;
            }
        }
    
    }
  • 相关阅读:
    Java中的匿名对象
    Java决策制定
    「干货总结」程序员必知必会的十大排序算法
    阿里开发手册之ArrayList正确操作方式
    Java中this关键字的使用
    Java封装的概念详解
    【C++】数组的最大子数组
    第二章:分治I
    【C++】归并排序
    第一章:算法绪论
  • 原文地址:https://www.cnblogs.com/asker009/p/11000024.html
Copyright © 2011-2022 走看看