zoukankan      html  css  js  c++  java
  • spark--job和DAGScheduler源码

     一个job对应一个action操作,action执行会有先后顺序;

    每个job执行会先构建一个DAG路径,一个job会含有多个stage,主要逻辑在DAGScheduler。

    spark提交job的源码见(SparkContext.scala的runJob方法):

      def runJob[T, U: ClassTag](
          rdd: RDD[T],
          func: (TaskContext, Iterator[T]) => U,
          partitions: Seq[Int],
          resultHandler: (Int, U) => Unit): Unit = {
        if (stopped.get()) {
          throw new IllegalStateException("SparkContext has been shutdown")
        }
        val callSite = getCallSite
        val cleanedFunc = clean(func)
        logInfo("Starting job: " + callSite.shortForm)
        if (conf.getBoolean("spark.logLineage", false)) {
          logInfo("RDD's recursive dependencies:
    " + rdd.toDebugString)
        }
        dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)
        progressBar.foreach(_.finishAll())
        rdd.doCheckpoint()
      }
    

    DAGScheduler--job调度的核心入口:

    private[scheduler] def handleJobSubmitted(jobId: Int,
          finalRDD: RDD[_],
          func: (TaskContext, Iterator[_]) => _,
          partitions: Array[Int],
          callSite: CallSite,
          listener: JobListener,
          properties: Properties) {
    //创建finalStage
    var finalStage: ResultStage = null try { // New stage creation may throw an exception if, for example, jobs are run on a // HadoopRDD whose underlying HDFS files have been deleted.
    //创建一个stage对象,并且将stage加入到DAGScheduler内存缓存中
    finalStage = createResultStage(finalRDD, func, partitions, jobId, callSite) } catch { case e: Exception => logWarning("Creating new stage failed due to exception - job: " + jobId, e) listener.jobFailed(e) return } //创建job val job = new ActiveJob(jobId, finalStage, callSite, listener, properties) clearCacheLocs() logInfo("Got job %s (%s) with %d output partitions".format( job.jobId, callSite.shortForm, partitions.length)) logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")") logInfo("Parents of final stage: " + finalStage.parents) logInfo("Missing parents: " + getMissingParentStages(finalStage)) val jobSubmissionTime = clock.getTimeMillis()
    //将job加入到内存缓存中 jobIdToActiveJob(jobId)
    = job activeJobs += job finalStage.setActiveJob(job) val stageIds = jobIdToStageIds(jobId).toArray val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo)) listenerBus.post( SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties)) //使用submitStage() 方法提交finalStage
    submitStage(finalStage) }
  • 相关阅读:
    HDOJ 4747 Mex
    HDU 1203 I NEED A OFFER!
    HDU 2616 Kill the monster
    HDU 3496 Watch The Movie
    Codeforces 347A A. Difference Row
    Codeforces 347B B. Fixed Points
    Codeforces 372B B. Hungry Sequence
    HDU 1476 Sudoku Killer
    HDU 1987 How many ways
    HDU 2564 词组缩写
  • 原文地址:https://www.cnblogs.com/parent-absent-son/p/11747750.html
Copyright © 2011-2022 走看看