zoukankan      html  css  js  c++  java
  • Spark高级数据分析 · 2 数据分析

    # Download the donation.zip archive from the UCI machine-learning repository.
    # NOTE(review): the Scala code below reads from a "linkage" directory, so the
    # archive presumably has to be extracted there first — confirm.
    wget https://archive.ics.uci.edu/ml/machine-learning-databases/00210/donation.zip
    

    数据清洗

    # Change into the bin directory of the Spark 1.6.0 (Hadoop 2.6 build) install
    # and start the interactive Spark shell with the master set to "local".
    cd /Users/erichan/garden/spark-1.6.0-bin-hadoop2.6/bin
    ./spark-shell --master local
    
    // Path of the linkage data set on the local filesystem.
    val data = "/Users/erichan/AliDrive/ml_spark/data/linkage"
    // Read the text at that path as an RDD with one String per line.
    val rawblocks = sc.textFile(data)
    // Interactive checks recorded from an earlier run:
    //   rawblocks.count()              => res0: Long = 6552407
    //   val head = rawblocks.take(10)

    // Drop header rows: any line that mentions the "id_1" column name.
    val noheader = rawblocks.filter(row => !row.contains("id_1"))

    // Sanity check: print every surviving line that still names a header column.
    noheader.filter(row => row.contains("cmp_fname_c1")).foreach(row => println(row))

    // Recorded result after header removal (11 fewer lines than the raw count):
    //   noheader.count()               => res1: Long = 6552396
    
    /**
      * One parsed record of the linkage data.
      *
      * @param id1     first integer identifier of the record
      * @param id2     second integer identifier of the record
      * @param scores  numeric score columns; an entry may be NaN
      * @param matched boolean flag carried by the record
      */
    case class MatchData(
      id1: Int,
      id2: Int,
      scores: Array[Double],
      matched: Boolean
    )
    
    /**
      * Converts a raw field to a Double, treating the literal "?" as a missing
      * value.
      *
      * @param s raw field text
      * @return Double.NaN when `s` is exactly "?", otherwise `s.toDouble`
      *         (which throws for text that is not a valid number)
      */
    def toDouble(s: String) = s match {
      case "?"   => Double.NaN
      case field => field.toDouble
    }
    /**
      * Parses one comma-separated line into a MatchData record.
      *
      * Field layout read by this method: columns 0 and 1 are the integer ids,
      * columns 2 through 10 are the nine scores (converted with `toDouble`),
      * and column 11 is the boolean match flag.
      *
      * @param line one non-header line of the input
      * @return the parsed MatchData
      */
    def parse(line: String) = {
      val fields = line.split(',')
      // Named arguments are evaluated in the order written, which is the same
      // order the fields are read from the line.
      MatchData(
        id1 = fields(0).toInt,
        id2 = fields(1).toInt,
        scores = fields.slice(2, 11).map(toDouble),
        matched = fields(11).toBoolean
      )
    }
    // Parse every non-header line into a MatchData record. This RDD feeds ten
    // separate jobs below (one countByValue plus nine stats() passes), so it is
    // cached; without caching each job would re-read and re-parse the input.
    val parsed = noheader.map(line => parse(line)).cache()

    // Count how many records carry each value of the `matched` flag.
    val matchCounts = parsed.map(md => md.matched).countByValue()
    val matchCountsSeq = matchCounts.toSeq

    import java.lang.Double.isNaN
    // For each of the nine score columns, compute stats() over the values that
    // are not NaN (NaN marks fields that were "?" in the input).
    val stats = (0 until 9).map(i => {
      parsed.map(md => md.scores(i)).filter(score => !isNaN(score)).stats()
    })
  • 相关阅读:
    Linux基础知识整理
    小白学习之路,基础四(函数的进阶)
    关于高通量数据格式
    数据库管理系统
    Linux 基本操作
    生信研究内容
    redis6 多线程特性
    Centos8配置NFS4
    关于Mybatis将查询结果中添加常量列并返回
    关于swagger文档的使用方法
  • 原文地址:https://www.cnblogs.com/tychyg/p/5320948.html
Copyright © 2011-2022 走看看