zoukankan      html  css  js  c++  java
  • Apache mahout 源码阅读笔记--协同过滤, PearsonCorrelationSimilarity

    协同过滤源码路径:

    ~/project/javaproject/mahout-0.9/core/src $tree main/java/org/apache/mahout/cf/taste/ -L 2

    main/java/org/apache/mahout/cf/taste/

    ├── common

    │   ├── NoSuchItemException.java

    │   ├── NoSuchUserException.java

    │   ├── Refreshable.java

    │   ├── TasteException.java

    │   └── Weighting.java

    ├── eval

    │   ├── DataModelBuilder.java

    │   ├── IRStatistics.java

    │   ├── RecommenderBuilder.java

    │   ├── RecommenderEvaluator.java

    │   ├── RecommenderIRStatsEvaluator.java

    │   └── RelevantItemsDataSplitter.java

    ├── hadoop

    │   ├── EntityEntityWritable.java

    │   ├── EntityPrefWritable.java

    │   ├── MutableRecommendedItem.java

    │   ├── RecommendedItemsWritable.java

    │   ├── TasteHadoopUtils.java

    │   ├── ToEntityPrefsMapper.java

    │   ├── ToItemPrefsMapper.java

    │   ├── TopItemsQueue.java

    │   ├── als

    │   ├── item

    │   ├── preparation

    │   └── similarity

    ├── impl

    │   ├── common

    │   ├── eval

    │   ├── model

    │   ├── neighborhood

    │   ├── recommender

    │   └── similarity

    ├── model

    │   ├── DataModel.java

    │   ├── IDMigrator.java

    │   ├── JDBCDataModel.java

    │   ├── Preference.java

    │   ├── PreferenceArray.java

    │   └── UpdatableIDMigrator.java

    ├── neighborhood

    │   └── UserNeighborhood.java

    ├── recommender

    │   ├── CandidateItemsStrategy.java

    │   ├── IDRescorer.java

    │   ├── ItemBasedRecommender.java

    │   ├── MostSimilarItemsCandidateItemsStrategy.java

    │   ├── RecommendedItem.java

    │   ├── Recommender.java

    │   ├── Rescorer.java

    │   └── UserBasedRecommender.java

    └── similarity

        ├── ItemSimilarity.java

        ├── PreferenceInferrer.java

        ├── UserSimilarity.java

        └── precompute

     

    similarity  相似度的interface定义

    recommender 推荐算法的interface定义

    model  数据model类型的interface定义

     

    impl 目录 则是以上interface定义的实现

     

    PearsonCorrelationSimilarity的实现在

    ~/mahout-core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java

    /**
       * @throws IllegalArgumentException if {@link DataModel} does not have preference values
       */
      public PearsonCorrelationSimilarity(DataModel dataModel, Weighting weighting) throws TasteException {
        //这里CenterData传的时true
        /* pearson其实做的事情就是先把两个向量都减去他们的平均值,然后再计算cosine值。
         * 在 AbstractSimilarity里的实现代码如下:
         *  double result;
            if (centerData) {
              double meanX = sumX / count;
              double meanY = sumY / count;
              // double centeredSumXY = sumXY - meanY * sumX - meanX * sumY + n * meanX * meanY;
              double centeredSumXY = sumXY - meanY * sumX;
              // double centeredSumX2 = sumX2 - 2.0 * meanX * sumX + n * meanX * meanX;
              double centeredSumX2 = sumX2 - meanX * sumX;
              // double centeredSumY2 = sumY2 - 2.0 * meanY * sumY + n * meanY * meanY;
              double centeredSumY2 = sumY2 - meanY * sumY;
              result = computeResult(count, centeredSumXY, centeredSumX2, centeredSumY2, sumXYdiff2);
            } else {
              result = computeResult(count, sumXY, sumX2, sumY2, sumXYdiff2);
            }
         */
        super(dataModel, weighting, true);
        Preconditions.checkArgument(dataModel.hasPreferenceValues(), "DataModel doesn't have preference values");
      }
      
      /**
       * Turns the accumulated sums into a correlation value.
       *
       * <p>The inputs are expected to be already centered, so the plain sums of X and Y are
       * treated as zero and do not appear in the formula.
       *
       * @param n number of co-rated entries that contributed to the sums
       * @param sumXY sum of the products x * y
       * @param sumX2 sum of the squares of x
       * @param sumY2 sum of the squares of y
       * @param sumXYdiff2 sum of squared differences (x - y); not used by this measure
       * @return {@code sumXY / (sqrt(sumX2) * sqrt(sumY2))}, or {@link Double#NaN} when
       *         {@code n} is zero or the denominator is zero
       */
      @Override
      double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) {
        boolean nothingInCommon = n == 0;
        if (nothingInCommon) {
          return Double.NaN;
        }
        double normX = Math.sqrt(sumX2);
        double normY = Math.sqrt(sumY2);
        double normProduct = normX * normY;
        // A zero product means at least one side rated everything identically,
        // so this measure cannot express a similarity.
        return normProduct == 0.0 ? Double.NaN : sumXY / normProduct;
      }

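    就是数学公式的实现(x、y 已减去各自均值): $r = \dfrac{\sum_i (x_i-\bar{x})(y_i-\bar{y})}{\sqrt{\sum_i (x_i-\bar{x})^2}\,\sqrt{\sum_i (y_i-\bar{y})^2}}$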

    具体的累加,在父类 AbstractSimilarity 的 userSimilarity 里面已经做了:

    @Override
      public double userSimilarity(long userID1, long userID2) throws TasteException {
        DataModel dataModel = getDataModel();
      //获取用户偏好 PreferenceArray xPrefs
    = dataModel.getPreferencesFromUser(userID1); PreferenceArray yPrefs = dataModel.getPreferencesFromUser(userID2); int xLength = xPrefs.length(); int yLength = yPrefs.length(); if (xLength == 0 || yLength == 0) { return Double.NaN; } long xIndex = xPrefs.getItemID(0); long yIndex = yPrefs.getItemID(0); int xPrefIndex = 0; int yPrefIndex = 0; double sumX = 0.0; double sumX2 = 0.0; double sumY = 0.0; double sumY2 = 0.0; double sumXY = 0.0; double sumXYdiff2 = 0.0; int count = 0; boolean hasInferrer = inferrer != null; while (true) { int compare = xIndex < yIndex ? -1 : xIndex > yIndex ? 1 : 0; if (hasInferrer || compare == 0) { double x; double y; if (xIndex == yIndex) { // Both users expressed a preference for the item x = xPrefs.getValue(xPrefIndex); y = yPrefs.getValue(yPrefIndex); } else { //如果不存在对应的分数,则进行推断... // Only one user expressed a preference, but infer the other one's preference and tally // as if the other user expressed that preference if (compare < 0) { // X has a value; infer Y's x = xPrefs.getValue(xPrefIndex); y = inferrer.inferPreference(userID2, xIndex); } else { // compare > 0 // Y has a value; infer X's x = inferrer.inferPreference(userID1, yIndex); y = yPrefs.getValue(yPrefIndex); } } sumXY += x * y; sumX += x; sumX2 += x * x; sumY += y; sumY2 += y * y; double diff = x - y; sumXYdiff2 += diff * diff; count++; } if (compare <= 0) { if (++xPrefIndex >= xLength) { if (hasInferrer) { // Must count other Ys; pretend next X is far away if (yIndex == Long.MAX_VALUE) { // ... but stop if both are done! break; } xIndex = Long.MAX_VALUE; } else { break; } } else { xIndex = xPrefs.getItemID(xPrefIndex); } } if (compare >= 0) { if (++yPrefIndex >= yLength) { if (hasInferrer) { // Must count other Xs; pretend next Y is far away if (xIndex == Long.MAX_VALUE) { // ... but stop if both are done! break; } yIndex = Long.MAX_VALUE; } else { break; } } else { yIndex = yPrefs.getItemID(yPrefIndex); } } } // "Center" the data. 
If my math is correct, this'll do it. double result; if (centerData) { double meanX = sumX / count; double meanY = sumY / count; // double centeredSumXY = sumXY - meanY * sumX - meanX * sumY + n * meanX * meanY; double centeredSumXY = sumXY - meanY * sumX; // double centeredSumX2 = sumX2 - 2.0 * meanX * sumX + n * meanX * meanX; double centeredSumX2 = sumX2 - meanX * sumX; // double centeredSumY2 = sumY2 - 2.0 * meanY * sumY + n * meanY * meanY; double centeredSumY2 = sumY2 - meanY * sumY; result = computeResult(count, centeredSumXY, centeredSumX2, centeredSumY2, sumXYdiff2); } else { result = computeResult(count, sumXY, sumX2, sumY2, sumXYdiff2); } if (!Double.isNaN(result)) { result = normalizeWeightResult(result, count, cachedNumItems); } return result; }

    参考:

    http://blog.csdn.net/v_july_v/article/details/7184318

    http://blog.sina.com.cn/s/blog_73de143c010153vp.html

  • 相关阅读:
    然乌湖
    邦达 八宿
    芒康
    巴塘
    禾尼乡 所波大叔
    世界高城 理塘
    相克宗 藏民家
    骑行川藏--新都桥&塔公草原
    d 3
    D2
  • 原文地址:https://www.cnblogs.com/zhangqingping/p/4105401.html
Copyright © 2011-2022 走看看