zoukankan      html  css  js  c++  java
  • Spark高级数据分析· 2数据分析

    # Download the donation.zip archive from the UCI machine-learning-databases server.
    wget https://archive.ics.uci.edu/ml/machine-learning-databases/00210/donation.zip
    

    数据清洗

    # Move into the bin directory of the Spark 1.6.0 distribution.
    cd /Users/erichan/garden/spark-1.6.0-bin-hadoop2.6/bin
    # Launch the interactive Spark shell with the master set to "local".
    ./spark-shell --master local
    
    // Filesystem location of the linkage data that the session reads.
    val data = "/Users/erichan/AliDrive/ml_spark/data/linkage"
    // Load the text found at `data` through the shell's SparkContext `sc`.
    val rawblocks = sc.textFile(data)
    // Optional sanity checks, left disabled:
    //   rawblocks.count()               // the session reported res0: Long = 6552407
    //   val head = rawblocks.take(10)   // peek at the first ten elements

    // Drop every line that contains the header marker "id_1".
    val noheader = rawblocks.filter(!_.contains("id_1"))

    // Print any remaining line that still mentions the header column
    // "cmp_fname_c1"; no output means the header rows were all removed.
    noheader.filter(_.contains("cmp_fname_c1")).foreach(println)

    // Optional sanity check, left disabled:
    //   noheader.count()                // the session reported res1: Long = 6552396
    
    /**
     * One record of the input after parsing.
     *
     * @param id1     first integer identifier of the record
     * @param id2     second integer identifier of the record
     * @param scores  numeric score values; an entry may be NaN
     * @param matched boolean flag carried by the record
     */
    case class MatchData(
      id1: Int,
      id2: Int,
      scores: Array[Double],
      matched: Boolean
    )
    
    /**
     * Converts a single text field to a Double.
     *
     * The literal "?" is mapped to NaN; any other text is handed to
     * `String.toDouble`, which throws NumberFormatException when the text is
     * not a valid number.
     */
    def toDouble(s: String) = s match {
      case "?"   => Double.NaN
      case other => other.toDouble
    }
    /**
     * Turns one comma-separated line into a MatchData record.
     *
     * Fields 0 and 1 are read as the two integer ids, fields 2 through 10 are
     * converted with `toDouble` into the scores array, and field 11 is read as
     * the boolean flag.
     */
    def parse(line: String) = {
      val fields = line.split(',')
      MatchData(
        id1 = fields(0).toInt,
        id2 = fields(1).toInt,
        scores = fields.slice(2, 11).map(toDouble),
        matched = fields(11).toBoolean
      )
    }
    // Parse every non-header line into a MatchData record. The resulting RDD is
    // consumed by ten separate actions below (one countByValue plus nine stats
    // calls), so it is marked for caching; without that, each action would
    // re-read and re-parse the input from scratch.
    val parsed = noheader.map(line => parse(line)).cache()

    // Count how many records carry each value of the `matched` flag.
    val matchCounts = parsed.map(md => md.matched).countByValue()
    val matchCountsSeq = matchCounts.toSeq

    import java.lang.Double.isNaN
    // Summary statistics for each of the nine score positions, computed over the
    // non-NaN values only.
    val stats = (0 until 9).map(i => {
      parsed.map(md => md.scores(i)).filter(!isNaN(_)).stats()
    })
  • 相关阅读:
    怎么样从多列的DataTable里取需要的几列
    .net core 生成二维码
    sql server2012卸载
    github实用的搜索小技巧
    c# 中的索引
    IOC
    Python基础-while
    Python基础-判断闰年
    Python基础-while奇数和
    Python基础-奇偶判断调用函数
  • 原文地址:https://www.cnblogs.com/tychyg/p/5320948.html
Copyright © 2011-2022 走看看