zoukankan      html  css  js  c++  java
  • Apache mahout 源码阅读笔记--协同过滤, PearsonCorrelationSimilarity

    协同过滤源码路径:

    ~/project/javaproject/mahout-0.9/core/src $tree main/java/org/apache/mahout/cf/taste/ -L 2

    main/java/org/apache/mahout/cf/taste/

    ├── common

    │   ├── NoSuchItemException.java

    │   ├── NoSuchUserException.java

    │   ├── Refreshable.java

    │   ├── TasteException.java

    │   └── Weighting.java

    ├── eval

    │   ├── DataModelBuilder.java

    │   ├── IRStatistics.java

    │   ├── RecommenderBuilder.java

    │   ├── RecommenderEvaluator.java

    │   ├── RecommenderIRStatsEvaluator.java

    │   └── RelevantItemsDataSplitter.java

    ├── hadoop

    │   ├── EntityEntityWritable.java

    │   ├── EntityPrefWritable.java

    │   ├── MutableRecommendedItem.java

    │   ├── RecommendedItemsWritable.java

    │   ├── TasteHadoopUtils.java

    │   ├── ToEntityPrefsMapper.java

    │   ├── ToItemPrefsMapper.java

    │   ├── TopItemsQueue.java

    │   ├── als

    │   ├── item

    │   ├── preparation

    │   └── similarity

    ├── impl

    │   ├── common

    │   ├── eval

    │   ├── model

    │   ├── neighborhood

    │   ├── recommender

    │   └── similarity

    ├── model

    │   ├── DataModel.java

    │   ├── IDMigrator.java

    │   ├── JDBCDataModel.java

    │   ├── Preference.java

    │   ├── PreferenceArray.java

    │   └── UpdatableIDMigrator.java

    ├── neighborhood

    │   └── UserNeighborhood.java

    ├── recommender

    │   ├── CandidateItemsStrategy.java

    │   ├── IDRescorer.java

    │   ├── ItemBasedRecommender.java

    │   ├── MostSimilarItemsCandidateItemsStrategy.java

    │   ├── RecommendedItem.java

    │   ├── Recommender.java

    │   ├── Rescorer.java

    │   └── UserBasedRecommender.java

    └── similarity

        ├── ItemSimilarity.java

        ├── PreferenceInferrer.java

        ├── UserSimilarity.java

        └── precompute

     

    similarity  相似度的interface定义

    recommender 推荐算法的interface定义

    model  数据model类型的interface定义

     

    impl 目录 则是以上interface定义的实现

     

    PearsonCorrelationSimilarity的实现在

    ~/mahout-core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java

    /**
       * Creates a Pearson-correlation similarity over the given data model, asking the
       * superclass to center the data (third constructor argument {@code true}).
       *
       * @param dataModel data model whose preferences are compared; must report that it has preference values
       * @param weighting weighting option, handed unchanged to the superclass constructor
       * @throws IllegalArgumentException if {@link DataModel} does not have preference values
       * @throws TasteException declared here; presumably propagated from the superclass constructor (confirm)
       */
      public PearsonCorrelationSimilarity(DataModel dataModel, Weighting weighting) throws TasteException {
        // centerData is passed as true here (the third argument of the super call below).
        /* What Pearson really does is subtract each vector's mean from that vector and then compute the cosine.
         * The corresponding implementation in AbstractSimilarity looks like this:
         *  double result;
            if (centerData) {
              double meanX = sumX / count;
              double meanY = sumY / count;
              // double centeredSumXY = sumXY - meanY * sumX - meanX * sumY + n * meanX * meanY;
              double centeredSumXY = sumXY - meanY * sumX;
              // double centeredSumX2 = sumX2 - 2.0 * meanX * sumX + n * meanX * meanX;
              double centeredSumX2 = sumX2 - meanX * sumX;
              // double centeredSumY2 = sumY2 - 2.0 * meanY * sumY + n * meanY * meanY;
              double centeredSumY2 = sumY2 - meanY * sumY;
              result = computeResult(count, centeredSumXY, centeredSumX2, centeredSumY2, sumXYdiff2);
            } else {
              result = computeResult(count, sumXY, sumX2, sumY2, sumXYdiff2);
            }
         */
        super(dataModel, weighting, true);
        // Pearson correlation needs actual rating values, so reject models that report none.
        Preconditions.checkArgument(dataModel.hasPreferenceValues(), "DataModel doesn't have preference values");
      }
      
      /**
       * Converts the accumulated (already centered) sums into a correlation value:
       * {@code sumXY / (sqrt(sumX2) * sqrt(sumY2))}.
       *
       * @param n number of value pairs that were tallied by the caller
       * @param sumXY sum of the products x*y
       * @param sumX2 sum of the squares of x
       * @param sumY2 sum of the squares of y
       * @param sumXYdiff2 sum of squared differences; not used by this measure
       * @return the correlation, or {@link Double#NaN} when {@code n} is 0 or the denominator is 0
       */
      @Override
      double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) {
        if (n != 0) {
          // The plain sums of X and Y are deliberately absent: the data is assumed to be
          // centered, so both of them are taken to be 0.
          double norm = Math.sqrt(sumX2) * Math.sqrt(sumY2);
          if (norm != 0.0) {
            return sumXY / norm;
          }
          // A zero norm means at least one side gave identical ratings everywhere;
          // this measure cannot say anything useful about similarity in that case.
        }
        return Double.NaN;
      }

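    就是数学公式的实现(数据中心化之后,代码里的 sumXY 对应分子,sumX2、sumY2 对应分母中的两个平方和):

    $$ r = \frac{\sum_{i=1}^{n}(x_i-\bar{x})(y_i-\bar{y})}{\sqrt{\sum_{i=1}^{n}(x_i-\bar{x})^{2}}\;\sqrt{\sum_{i=1}^{n}(y_i-\bar{y})^{2}}} $$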

    具体的累加,在父类 AbstractSimilarity 的 userSimilarity 方法里已经做了:

    @Override
      public double userSimilarity(long userID1, long userID2) throws TasteException {
        DataModel dataModel = getDataModel();
      //获取用户偏好 PreferenceArray xPrefs
    = dataModel.getPreferencesFromUser(userID1); PreferenceArray yPrefs = dataModel.getPreferencesFromUser(userID2); int xLength = xPrefs.length(); int yLength = yPrefs.length(); if (xLength == 0 || yLength == 0) { return Double.NaN; } long xIndex = xPrefs.getItemID(0); long yIndex = yPrefs.getItemID(0); int xPrefIndex = 0; int yPrefIndex = 0; double sumX = 0.0; double sumX2 = 0.0; double sumY = 0.0; double sumY2 = 0.0; double sumXY = 0.0; double sumXYdiff2 = 0.0; int count = 0; boolean hasInferrer = inferrer != null; while (true) { int compare = xIndex < yIndex ? -1 : xIndex > yIndex ? 1 : 0; if (hasInferrer || compare == 0) { double x; double y; if (xIndex == yIndex) { // Both users expressed a preference for the item x = xPrefs.getValue(xPrefIndex); y = yPrefs.getValue(yPrefIndex); } else { //如果不存在对应的分数,则进行推断... // Only one user expressed a preference, but infer the other one's preference and tally // as if the other user expressed that preference if (compare < 0) { // X has a value; infer Y's x = xPrefs.getValue(xPrefIndex); y = inferrer.inferPreference(userID2, xIndex); } else { // compare > 0 // Y has a value; infer X's x = inferrer.inferPreference(userID1, yIndex); y = yPrefs.getValue(yPrefIndex); } } sumXY += x * y; sumX += x; sumX2 += x * x; sumY += y; sumY2 += y * y; double diff = x - y; sumXYdiff2 += diff * diff; count++; } if (compare <= 0) { if (++xPrefIndex >= xLength) { if (hasInferrer) { // Must count other Ys; pretend next X is far away if (yIndex == Long.MAX_VALUE) { // ... but stop if both are done! break; } xIndex = Long.MAX_VALUE; } else { break; } } else { xIndex = xPrefs.getItemID(xPrefIndex); } } if (compare >= 0) { if (++yPrefIndex >= yLength) { if (hasInferrer) { // Must count other Xs; pretend next Y is far away if (xIndex == Long.MAX_VALUE) { // ... but stop if both are done! break; } yIndex = Long.MAX_VALUE; } else { break; } } else { yIndex = yPrefs.getItemID(yPrefIndex); } } } // "Center" the data. 
If my math is correct, this'll do it. double result; if (centerData) { double meanX = sumX / count; double meanY = sumY / count; // double centeredSumXY = sumXY - meanY * sumX - meanX * sumY + n * meanX * meanY; double centeredSumXY = sumXY - meanY * sumX; // double centeredSumX2 = sumX2 - 2.0 * meanX * sumX + n * meanX * meanX; double centeredSumX2 = sumX2 - meanX * sumX; // double centeredSumY2 = sumY2 - 2.0 * meanY * sumY + n * meanY * meanY; double centeredSumY2 = sumY2 - meanY * sumY; result = computeResult(count, centeredSumXY, centeredSumX2, centeredSumY2, sumXYdiff2); } else { result = computeResult(count, sumXY, sumX2, sumY2, sumXYdiff2); } if (!Double.isNaN(result)) { result = normalizeWeightResult(result, count, cachedNumItems); } return result; }

    参考:

    http://blog.csdn.net/v_july_v/article/details/7184318

    http://blog.sina.com.cn/s/blog_73de143c010153vp.html

  • 相关阅读:
    算法----(1)冒泡排序
    淘宝爬虫
    爬虫_豆瓣电影top250 (正则表达式)
    爬虫_猫眼电影top100(正则表达式)
    Android 简单调用摄像头
    Android 简单天气预报
    思维模型
    This view is not constrained, it only has designtime positions, so it will jump to (0,0) unless you
    Android studio preview界面无法预览,报错render problem
    Android studio 3.1.2报错,no target device found
  • 原文地址:https://www.cnblogs.com/zhangqingping/p/4105401.html
Copyright © 2011-2022 走看看