Structured Streaming 增加了用 SQL 处理流数据的支持,相关实现单独放在 sql 包中。其中 StreamExecution 是下面提到的两处流处理的基类:当数据源有新数据到达时,这个流查询会生成一个 QueryExecution 来执行,并将结果输出到指定的 Sink(处理后数据的存放地)中。
/**
 * Trigger executor driven by a processing-time interval.
 *
 * @param processingTime supplies the trigger interval in milliseconds (must be >= 0)
 * @param clock          time source used for measuring batch duration and for waiting
 */
case class ProcessingTimeExecutor(processingTime: ProcessingTime, clock: Clock = new SystemClock())
  extends TriggerExecutor with Logging {

  private val intervalMs = processingTime.intervalMs
  require(intervalMs >= 0)

  /**
   * Invokes `triggerHandler` repeatedly until it returns `false`.
   *
   * When `intervalMs` is positive, each round measures how long the handler took, reports via
   * `notifyBatchFallingBehind` if that exceeded the interval, and then waits on the clock until
   * the time computed by `nextBatchTime` before starting the next round. When `intervalMs` is
   * zero the handler is called back-to-back without waiting.
   */
  override def execute(triggerHandler: () => Boolean): Unit = {
    var keepRunning = true
    while (keepRunning) {
      val batchStartMs = clock.getTimeMillis
      val nextTriggerTimeMs = nextBatchTime(batchStartMs)
      keepRunning = triggerHandler()
      if (intervalMs > 0) {
        val elapsedMs = clock.getTimeMillis - batchStartMs
        if (elapsedMs > intervalMs) {
          notifyBatchFallingBehind(elapsedMs)
        }
        // Only wait for the next trigger when the handler asked to continue; a terminated
        // query leaves the loop immediately.
        if (keepRunning) {
          clock.waitTillTime(nextTriggerTimeMs)
        }
      }
    }
  }
  // NOTE: the class body continues beyond this excerpt; `nextBatchTime` and
  // `notifyBatchFallingBehind` are referenced above but not shown here.
/**
 * The analyzed plan with every streaming relation swapped for a [[StreamingExecutionRelation]].
 *
 * Must be initialized on the query execution thread (asserted below). As a side effect it also
 * populates `sources` and `uniqueSources` from the rewritten plan.
 */
override lazy val logicalPlan: LogicalPlan = {
  assert(queryExecutionThread eq Thread.currentThread,
    "logicalPlan must be initialized in QueryExecutionThread " +
      s"but the current thread was ${Thread.currentThread}")

  var nextSourceId = 0L
  val v1ExecutionRelations = MutableMap[StreamingRelation, StreamingExecutionRelation]()
  val v2ExecutionRelations = MutableMap[StreamingRelationV2, StreamingExecutionRelation]()

  // Metadata directory for the source about to be materialized; reads the current counter value.
  def metadataPathForNextSource: String = s"$resolvedCheckpointRoot/sources/$nextSourceId"

  // We transform each distinct streaming relation into a StreamingExecutionRelation, keeping a
  // map as we go to ensure each identical relation gets the same StreamingExecutionRelation
  // object. For each microbatch, the StreamingExecutionRelation will be replaced with a logical
  // plan for the data within that batch.
  // Note that we have to use the previous `output` as attributes in StreamingExecutionRelation,
  // since the existing logical plan has already used those attributes. The per-microbatch
  // transformation is responsible for replacing attributes with their final values.
  val rewrittenPlan = analyzedPlan.transform {
    case v1 @ StreamingRelation(dataSource, _, output) =>
      v1ExecutionRelations.getOrElseUpdate(v1, {
        // Materialize source to avoid creating it in every batch
        val metadataPath = metadataPathForNextSource
        val source = dataSource.createSource(metadataPath)
        nextSourceId += 1
        StreamingExecutionRelation(source, output)(sparkSession)
      })

    case v2 @ StreamingRelationV2(source: MicroBatchReadSupport, _, options, output, _) =>
      v2ExecutionRelations.getOrElseUpdate(v2, {
        // Materialize source to avoid creating it in every batch
        val metadataPath = metadataPathForNextSource
        val reader = source.createMicroBatchReader(
          Optional.empty(), // user specified schema
          metadataPath,
          new DataSourceOptions(options.asJava))
        nextSourceId += 1
        StreamingExecutionRelation(reader, output)(sparkSession)
      })

    case v2 @ StreamingRelationV2(_, sourceName, _, output, v1Relation) =>
      v2ExecutionRelations.getOrElseUpdate(v2, {
        // Materialize source to avoid creating it in every batch
        val metadataPath = metadataPathForNextSource
        // Without microbatch read support the only option is the V1 fallback relation.
        val fallbackRelation = v1Relation.getOrElse {
          throw new UnsupportedOperationException(
            s"Data source $sourceName does not support microbatch processing.")
        }
        val source = fallbackRelation.dataSource.createSource(metadataPath)
        nextSourceId += 1
        StreamingExecutionRelation(source, output)(sparkSession)
      })
  }

  sources = rewrittenPlan.collect { case relation: StreamingExecutionRelation => relation.source }
  uniqueSources = sources.distinct
  rewrittenPlan
}
newData = reportTimeTaken("getBatch") { availableOffsets.flatMap { case (source: Source, available) if committedOffsets.get(source).map(_ != available).getOrElse(true) => val current = committedOffsets.get(source) //这部分逻辑基于传入的起始offset范围(包含了每个partition的offset范围)形成一个kafka的DataFrame val batch = source.getBatch(current, available)
// Swap every StreamingExecutionRelation for this batch's data plan when one exists, otherwise
// for an empty streaming LocalRelation with the same output attributes.
val newBatchesPlan = logicalPlan.transform {
  case StreamingExecutionRelation(source, output) =>
    newData.get(source) match {
      case Some(batchPlan) =>
        assert(output.size == batchPlan.output.size,
          s"Invalid batch: ${Utils.truncatedString(output, ",")} != " +
            s"${Utils.truncatedString(batchPlan.output, ",")}")
        // Record the pairing of original attributes with the batch plan's attributes.
        replacements ++= output.zip(batchPlan.output)
        batchPlan
      case None =>
        LocalRelation(output, isStreaming = true)
    }
}
// Plan the current trigger's logical plan as an IncrementalExecution, timed as "queryPlanning".
reportTimeTaken("queryPlanning") {
  val plannedExecution = new IncrementalExecution(
    sparkSessionToRunBatch,
    triggerLogicalPlan,
    outputMode,
    checkpointFile("state"),
    runId,
    currentBatchId,
    offsetSeqMetadata)
  lastExecution = plannedExecution
  plannedExecution.executedPlan // Force the lazy generation of execution plan
}

// Wrap the planned execution as a Dataset of rows using the analyzed plan's schema.
val batchSchema = lastExecution.analyzed.schema
val nextBatch = new Dataset(sparkSessionToRunBatch, lastExecution, RowEncoder(batchSchema))
// Hand the batch to the sink under a fresh SQL execution id, timed as "addBatch".
reportTimeTaken("addBatch") {
  SQLExecution.withNewExecutionId(sparkSessionToRunBatch, lastExecution) {
    sink match {
      // The binding is deliberately typed as Sink (with a runtime check in the guard) so that
      // addBatch is resolved against Sink exactly as before.
      case extendedSink: Sink if extendedSink.isInstanceOf[MemorySinkExtend] =>
        extendedSink.addBatch(
          currentBatchId,
          nextBatch,
          batchIdOffsetMap.get(currentBatchId).getOrElse((None, None)))
      case plainSink: Sink =>
        plainSink.addBatch(currentBatchId, nextBatch, (None, None))
      case _: StreamWriteSupport =>
        // This doesn't accumulate any data - it just forces execution of the microbatch writer.
        nextBatch.collect()
    }
  }
}
if (hasNewData) { var batchWatermarkMs = offsetSeqMetadata.batchWatermarkMs // Update the eventTime watermarks if we find any in the plan. if (lastExecution != null) { lastExecution.executedPlan.collect { case e: EventTimeWatermarkExec => e }.zipWithIndex.foreach { case (e, index) if e.eventTimeStats.value.count > 0 => logDebug(s"Observed event time stats $index: ${e.eventTimeStats.value}") val newWatermarkMs = e.eventTimeStats.value.max - e.delayMs val prevWatermarkMs = watermarkMsMap.get(index) if (prevWatermarkMs.isEmpty || newWatermarkMs > prevWatermarkMs.get) { watermarkMsMap.put(index, newWatermarkMs) }
/**
 * Watermark expression built from the first child output attribute whose metadata carries
 * `EventTimeWatermark.delayKey`, combined with `eventTimeWatermark`.
 */
lazy val watermarkExpression: Option[Expression] = {
  val eventTimeAttribute = child.output.find { attr =>
    attr.metadata.contains(EventTimeWatermark.delayKey)
  }
  WatermarkSupport.watermarkExpression(eventTimeAttribute, eventTimeWatermark)
}

/** Predicate based on keys that matches data older than the watermark */
lazy val watermarkPredicateForKeys: Option[Predicate] =
  for {
    expression <- watermarkExpression
    // Only defined when at least one key expression carries the watermark delay metadata.
    if keyExpressions.exists(_.metadata.contains(EventTimeWatermark.delayKey))
  } yield newPredicate(expression, keyExpressions)

/** Predicate based on the child output that matches data older than the watermark. */
lazy val watermarkPredicateForData: Option[Predicate] =
  watermarkExpression.map { expression => newPredicate(expression, child.output) }