zoukankan      html  css  js  c++  java
  • spark的Task的序列化

    Task类型

    Spark一共有两种Task:一种是ResultTask,只有job的最后一个stage才会生成;其他stage生成的Task是ShuffleMapTask。

    Task生成

      // TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.
        // Broadcasted binary for the task, used to dispatch tasks to executors. Note that we broadcast
        // the serialized copy of the RDD and for each task we will deserialize it, which means each
        // task gets a different copy of the RDD. This provides stronger isolation between tasks that
        // might modify state of objects referenced in their closures. This is necessary in Hadoop
        // where the JobConf/Configuration object is not thread-safe.
        // In other words: every task works on its own independent copy, so tasks do not affect
        // each other.
        var taskBinary: Broadcast[Array[Byte]] = null
        try {
          // For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).
          // For ResultTask, serialize and broadcast (rdd, func).
          // NOTE(review): `.array()` returns the buffer's whole backing array; this assumes the
          // closure serializer returns a buffer that exactly wraps its bytes -- confirm.
          val taskBinaryBytes: Array[Byte] = stage match {
            case stage: ShuffleMapStage =>
              // Everything the task depends on is serialized here; closures are processed as well.
              closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef).array()
            case stage: ResultStage =>
              // Everything the task depends on is serialized here; closures are processed as well.
              closureSerializer.serialize((stage.rdd, stage.func): AnyRef).array()
          }

          // Broadcast the serialized bytes.
          taskBinary = sc.broadcast(taskBinaryBytes)
        } catch {
          // In the case of a failure during serialization, abort the stage.
          case e: NotSerializableException =>
            abortStage(stage, "Task not serializable: " + e.toString, Some(e))
            runningStages -= stage

            // Abort execution
            return
          case NonFatal(e) =>
            // The message is "<exception>\n<stack trace>"; the line break must be written as the
            // `\n` escape because a single-quoted interpolated string cannot span source lines.
            abortStage(stage, s"Task serialization failed: $e\n${e.getStackTraceString}", Some(e))
            runningStages -= stage
            return
        }
    
    	//生成Task,分两种类型
        val tasks: Seq[Task[_]] = try {
          stage match {
            case stage: ShuffleMapStage =>
              partitionsToCompute.map { id =>
                val locs = taskIdToLocations(id)
                val part = stage.rdd.partitions(id)
                new ShuffleMapTask(stage.id, stage.latestInfo.attemptId,
                  taskBinary, part, locs, stage.internalAccumulators)
              }
    
            case stage: ResultStage =>
              val job = stage.activeJob.get
              partitionsToCompute.map { id =>
                val p: Int = stage.partitions(id)
                val part = stage.rdd.partitions(p)
                val locs = taskIdToLocations(id)
                new ResultTask(stage.id, stage.latestInfo.attemptId,
                  taskBinary, part, locs, id, stage.internalAccumulators)
              }
          }
        } catch {
          case NonFatal(e) =>
            abortStage(stage, s"Task creation failed: $e
    ${e.getStackTraceString}", Some(e))
            runningStages -= stage
            return
        }
    
        if (tasks.isEmpty) {
          // SparkListenerStageSubmitted has already been posted, so a stage with nothing to run
          // still has to be reported as completed here.
          markStageAsFinished(stage, None)

          // Built eagerly (before the logDebug call), exactly one message per stage type.
          val debugString = stage match {
            case shuffleStage: ShuffleMapStage =>
              s"Stage $shuffleStage is actually done; " +
                s"(available: ${shuffleStage.isAvailable}," +
                s"available outputs: ${shuffleStage.numAvailableOutputs}," +
                s"partitions: ${shuffleStage.numPartitions})"
            case resultStage: ResultStage =>
              s"Stage $resultStage is actually done; (partitions: ${resultStage.numPartitions})"
          }
          logDebug(debugString)
        } else {
          logInfo(s"Submitting ${tasks.size} missing tasks from $stage (${stage.rdd})")
          // Remember which partitions are now in flight for this stage.
          stage.pendingPartitions ++= tasks.map(_.partitionId)
          logDebug(s"New pending partitions: ${stage.pendingPartitions}")
          val taskSet = new TaskSet(
            tasks.toArray, stage.id, stage.latestInfo.attemptId, jobId, properties)
          taskScheduler.submitTasks(taskSet)
          // The submission time is stamped only after the task set was handed to the scheduler.
          stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
        }
    

    Task序列化

      /**
       * Respond to an offer of a single executor by trying to dequeue one task for it, serializing
       * that task and returning a description of it.
       *
       * @param execId id of the executor the offer is for
       * @param host host of that executor
       * @param maxLocality the loosest locality level a task may be launched at for this offer
       * @return the description of the task to launch, or `None` if this manager is a zombie or
       *         no task could be dequeued for the offer
       * @throws TaskNotSerializableException if the dequeued task cannot be serialized; the task
       *         set is aborted before the exception is thrown
       */
      @throws[TaskNotSerializableException]
      def resourceOffer(
          execId: String,
          host: String,
          maxLocality: TaskLocality.TaskLocality)
        : Option[TaskDescription] =
      {
        if (isZombie) {
          None
        } else {
          val curTime = clock.getTimeMillis()

          // NO_PREF offers are used as-is; otherwise the level allowed at `curTime` applies,
          // capped at `maxLocality` so we never search for farther-away tasks than permitted.
          val allowedLocality =
            if (maxLocality == TaskLocality.NO_PREF) {
              maxLocality
            } else {
              val level = getAllowedLocalityLevel(curTime)
              if (level > maxLocality) maxLocality else level
            }

          dequeueTask(execId, host, allowedLocality) match {
            case Some((index, taskLocality, speculative)) =>
              // Found a task; do some bookkeeping and return a task description
              // (which includes serializing the task).
              val task = tasks(index)
              val taskId = sched.newTaskId()
              // Do various bookkeeping
              copiesRunning(index) += 1
              val attemptNum = taskAttempts(index).size
              val info = new TaskInfo(taskId, index, attemptNum, curTime,
                execId, host, taskLocality, speculative)
              taskInfos(taskId) = info
              taskAttempts(index) = info :: taskAttempts(index)
              // Update our locality level for delay scheduling
              // NO_PREF will not affect the variables related to delay scheduling
              if (maxLocality != TaskLocality.NO_PREF) {
                currentLocalityIndex = getLocalityIndex(taskLocality)
                lastLaunchTime = curTime
              }
              // Serialize the task together with the files and JARs added to the SparkContext.
              val serializedTask: ByteBuffer = try {
                Task.serializeWithDependencies(task, sched.sc.addedFiles, sched.sc.addedJars, ser)
              } catch {
                // If the task cannot be serialized, then there's no point to re-attempt the task,
                // as it will always fail. So just abort the whole task-set.
                case NonFatal(e) =>
                  val msg = s"Failed to serialize task $taskId, not attempting to retry it."
                  logError(msg, e)
                  abort(s"$msg Exception during serialization: $e")
                  throw new TaskNotSerializableException(e)
              }
              // The serialized task carries its dependencies, so check its size and warn (once
              // per manager) when it exceeds the recommended maximum.
              if (serializedTask.limit > TaskSetManager.TASK_SIZE_TO_WARN_KB * 1024 &&
                  !emittedTaskSizeWarning) {
                emittedTaskSizeWarning = true
                logWarning(s"Stage ${task.stageId} contains a task of very large size " +
                  s"(${serializedTask.limit / 1024} KB). The maximum recommended task size is " +
                  s"${TaskSetManager.TASK_SIZE_TO_WARN_KB} KB.")
              }
              addRunningTask(taskId)

              // Serialization time is deliberately not measured or logged: task size is already
              // a good proxy for it.
              val taskName = s"task ${info.id} in stage ${taskSet.id}"
              logInfo(s"Starting $taskName (TID $taskId, $host, partition ${task.partitionId}, " +
                s"$taskLocality, ${serializedTask.limit} bytes)")

              sched.dagScheduler.taskStarted(task, info)
              Some(new TaskDescription(taskId = taskId, attemptNumber = attemptNum, execId,
                taskName, index, serializedTask))

            case _ =>
              None
          }
        }
      }
    

    序列化代码

      /**
       * Serialize a task and the current app dependencies (files and JARs added to the SparkContext)
       *
       * The returned buffer contains, in order: the number of files followed by one
       * (name, timestamp) pair per file, the number of JARs followed by one (name, timestamp)
       * pair per JAR, and finally the bytes of the serialized task.
       *
       * @param task the task to serialize
       * @param currentFiles file name -> timestamp for the files to record
       * @param currentJars JAR name -> timestamp for the JARs to record
       * @param serializer serializer used for the task itself
       * @return a buffer wrapping the dependency header followed by the serialized task
       */
      def serializeWithDependencies(
          task: Task[_],
          currentFiles: HashMap[String, Long],
          currentJars: HashMap[String, Long],
          serializer: SerializerInstance)
        : ByteBuffer = {

        val out = new ByteArrayOutputStream(4096)
        val dataOut = new DataOutputStream(out)

        // Files and JARs share one wire format: entry count, then (UTF name, long timestamp) pairs.
        def writeDependencies(deps: HashMap[String, Long]): Unit = {
          dataOut.writeInt(deps.size)
          for ((name, timestamp) <- deps) {
            dataOut.writeUTF(name)
            dataOut.writeLong(timestamp)
          }
        }

        // Write currentFiles
        writeDependencies(currentFiles)

        // Write currentJars
        writeDependencies(currentJars)

        // Write the task itself and finish. The header is flushed first because the task bytes
        // go straight to the underlying stream, bypassing `dataOut`.
        dataOut.flush()
        val taskBuffer: ByteBuffer = serializer.serialize(task)
        // Copy exactly the readable bytes of the buffer. Calling `array()` alone would ignore the
        // buffer's position, array offset and limit, and fails for buffers without an accessible
        // backing array (e.g. direct or read-only buffers).
        if (taskBuffer.hasArray) {
          out.write(
            taskBuffer.array(),
            taskBuffer.arrayOffset() + taskBuffer.position(),
            taskBuffer.remaining())
        } else {
          val taskBytes = new Array[Byte](taskBuffer.remaining())
          // Read through a duplicate so the serializer's buffer position is left untouched.
          taskBuffer.duplicate().get(taskBytes)
          out.write(taskBytes)
        }
        ByteBuffer.wrap(out.toByteArray)
      }
    
    

    TaskDescription

    /**
     * Describes a single task that is handed to an executor for execution. Instances are usually
     * built by [[TaskSetManager.resourceOffer]].
     *
     * @param taskId id of the task; shown as `TID` in [[toString]]
     * @param attemptNumber attempt number of this task
     * @param executorId id of the executor the task is described for
     * @param name name of the task
     * @param index index within this task's TaskSet
     * @param _serializedTask the serialized task; exposed again through [[serializedTask]]
     */
    private[spark] class TaskDescription(
        val taskId: Long,
        val attemptNumber: Int,
        val executorId: String,
        val name: String,
        val index: Int,    // Index within this task's TaskSet
        _serializedTask: ByteBuffer)
      extends Serializable {

      // A ByteBuffer cannot be serialized directly, so the task bytes are kept inside a
      // SerializableBuffer instead.
      private val buffer = new SerializableBuffer(_serializedTask)

      /** The serialized task, read back from the wrapping SerializableBuffer. */
      def serializedTask: ByteBuffer = buffer.value

      /** Short description containing only the task id and its index. */
      override def toString: String = {
        val template = "TaskDescription(TID=%d, index=%d)"
        template.format(taskId, index)
      }
    }
    

    Task最终生成。

  • 相关阅读:
    mybatis批量更新策略
    tk.mybatis扩展通用接口
    IDEA入门——jdbc连接和工具类的使用
    tensorflow——3
    再战tensorflow
    tensorflow初学
    Anaconda和TensorFlow安装遇到的坑记录
    《企业应用架构模式》——阅读笔记3
    机器学习十讲——第十讲
    机器学习十讲——第九讲
  • 原文地址:https://www.cnblogs.com/luckuan/p/5385251.html
Copyright © 2011-2022 走看看