zoukankan      html  css  js  c++  java
  • Spark高级数据分析· 2数据分析

    wget https://archive.ics.uci.edu/ml/machine-learning-databases/00210/donation.zip
    

    数据清洗

    cd /Users/erichan/garden/spark-1.6.0-bin-hadoop2.6/bin
    ./spark-shell --master local
    
    // Location of the linkage data set (a machine-specific local path).
    val data = "/Users/erichan/AliDrive/ml_spark/data/linkage"
    val rawblocks = sc.textFile(data)
    // rawblocks.count()
    // res0: Long = 6552407
    // val head = rawblocks.take(10)

    // Keep only the lines that do not mention the "id_1" column name, i.e. drop header rows.
    val noheader = rawblocks.filter(row => !row.contains("id_1"))

    // Sanity check: print any surviving line that still contains another header column name.
    noheader.filter(_.contains("cmp_fname_c1")).foreach(println)

    // noheader.count()
    // res1: Long = 6552396
    
    // One parsed input record: two integer ids, an array of numeric scores
    // (NaN marks a missing value), and the match flag.
    case class MatchData(
      id1: Int,               // first id of the pair
      id2: Int,               // second id of the pair
      scores: Array[Double],  // per-field scores; may contain Double.NaN
      matched: Boolean        // whether the record is flagged as a match
    )
    
    // Converts a raw field to a Double. The placeholder "?" becomes Double.NaN;
    // anything else is parsed with String.toDouble (which throws on non-numeric text).
    def toDouble(s: String) = s match {
      case "?"   => Double.NaN
      case value => value.toDouble
    }
    // Parses one comma-separated line into a MatchData:
    //   fields 0 and 1  -> the two integer ids
    //   fields 2 to 10  -> nine scores, converted with toDouble ("?" becomes NaN)
    //   field 11        -> the boolean match flag
    // Throws if a field is missing or cannot be converted.
    def parse(line: String) = {
      val fields = line.split(',')
      val scoreFields = fields.slice(2, 11)
      MatchData(
        id1 = fields(0).toInt,
        id2 = fields(1).toInt,
        scores = scoreFields.map(toDouble),
        matched = fields(11).toBoolean
      )
    }
    // Parse every non-header line into a MatchData record. The RDD is cached because
    // several actions traverse it (countByValue here, plus one stats() job per score
    // column further down); without caching each action would re-read and re-parse
    // the input text. cache() returns the same RDD, so downstream code is unaffected.
    val parsed = noheader.map(line => parse(line)).cache()

    // Number of records for each value of `matched`, collected as a local Map.
    val matchCounts = parsed.map(md => md.matched).countByValue()
    // The same counts as a sequence of (value, count) pairs.
    val matchCountsSeq = matchCounts.toSeq
    
    import java.lang.Double.isNaN
    // For each of the 9 score columns, compute summary statistics over the
    // values that are not NaN. Each column triggers its own pass over `parsed`.
    val stats = for (col <- 0 until 9) yield {
      val column = parsed.map(record => record.scores(col))
      column.filter(score => !isNaN(score)).stats()
    }
  • 相关阅读:
    IO模型
    Java NIO概述
    消息系统避免分布式事务
    JVM调优总结
    设计模式的六大原则
    Java 内存区域与内存溢出
    windows go安装
    ZooKeeper原理及使用
    再谈HashMap
    Html5 播放实时音频流
  • 原文地址:https://www.cnblogs.com/tychyg/p/5320948.html
Copyright © 2011-2022 走看看