Spark integration with Kafka 0.10.0: new features (Part 2)

Continuing from "Spark integration with Kafka 0.10.0: new features (Part 1)", we start from the quickstart example of the 0-10 direct stream:

```scala
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe

val kafkaParams = Map[String, Object](
  "bootstrap.servers" -> "localhost:9092,anotherhost:9092",
  "key.deserializer" -> classOf[StringDeserializer],
  "value.deserializer" -> classOf[StringDeserializer],
  "group.id" -> "use_a_separate_group_id_for_each_stream",
  "auto.offset.reset" -> "latest",
  "enable.auto.commit" -> (false: java.lang.Boolean)
)

val topics = Array("topicA", "topicB")
val stream = KafkaUtils.createDirectStream[String, String](
  streamingContext,
  PreferConsistent,
  Subscribe[String, String](topics, kafkaParams)
)

stream.map(record => (record.key, record.value))
```
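Since enable.auto.commit is set to false above, the application is responsible for its own offsets. A common companion pattern, sketched here with the 0-10 integration's HasOffsetRanges and CanCommitOffsets APIs (the processing step is a placeholder):

```scala
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges}

stream.foreachRDD { rdd =>
  // Each RDD produced by the direct stream carries the Kafka offset ranges it covers.
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

  // Placeholder processing: extract serializable fields from ConsumerRecord first.
  rdd.map(record => (record.key, record.value)).count()

  // Once the batch's work is done, commit the offsets back to Kafka asynchronously.
  stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}
```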
Having analyzed the location strategies and consumer strategies, let's now look at the concrete implementation of org.apache.spark.streaming.kafka010.KafkaUtils#createDirectStream:

```scala
@Experimental
def createDirectStream[K, V](
    ssc: StreamingContext,
    locationStrategy: LocationStrategy,
    consumerStrategy: ConsumerStrategy[K, V]): InputDStream[ConsumerRecord[K, V]] = {
  val ppc = new DefaultPerPartitionConfig(ssc.sparkContext.getConf)
  createDirectStream[K, V](ssc, locationStrategy, consumerStrategy, ppc)
}
```

It returns an InputDStream[ConsumerRecord[K, V]]. Let's look at the ConsumerRecord type: a key/value pair received from Kafka, consisting of the topic name, the partition number the record came from, and the offset pointing to the record within that partition.

```java
/**
 * A key/value pair to be received from Kafka. This consists of a topic name and a
 * partition number, from which the record is being received, and an offset that
 * points to the record in a Kafka partition.
 */
public final class ConsumerRecord<K, V> {
    public static final long NO_TIMESTAMP = Record.NO_TIMESTAMP;
    public static final int NULL_SIZE = -1;
    public static final int NULL_CHECKSUM = -1;

    private final String topic;
    private final int partition;
    private final long offset;
    private final long timestamp;
    private final TimestampType timestampType;
    private final long checksum;
    private final int serializedKeySize;
    private final int serializedValueSize;
    private final K key;
    private final V value;
    // ... rest omitted
}
```

We skip the details of InputDStream here; from its class hierarchy, the concrete type returned by createDirectStream is DirectKafkaInputDStream. Inside createDirectStream, a DefaultPerPartitionConfig is created first. DefaultPerPartitionConfig simply holds the maximum rate at which messages are fetched from each partition, configured through the parameter spark.streaming.kafka.maxRatePerPartition.
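Before reading its source, here is how that knob is typically set, as a minimal sketch (the application name, rate, and batch interval are placeholders):

```scala
import org.apache.spark.SparkConf

// Cap reads at 1000 records/second per Kafka partition. With a 10-second batch
// interval, each batch then holds at most about 1000 * 10 = 10000 records per
// partition. The default of 0 means "no limit".
val conf = new SparkConf()
  .setAppName("kafka-010-direct-demo") // hypothetical app name
  .set("spark.streaming.kafka.maxRatePerPartition", "1000")
```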
The source is as follows. Note the scaladoc: this is an interface for user-supplied configuration that cannot be set through ordinary Spark properties, because it may need tweaking on a per-partition basis; the default rate is read from SparkConf.

```scala
package org.apache.spark.streaming.kafka010

import org.apache.kafka.common.TopicPartition
import org.apache.spark.SparkConf
import org.apache.spark.annotation.Experimental

/**
 * :: Experimental ::
 * Interface for user-supplied configurations that can't otherwise be set via Spark
 * properties, because they need tweaking on a per-partition basis.
 */
@Experimental
abstract class PerPartitionConfig extends Serializable {
  /**
   * Maximum rate (number of records per second) at which data will be read
   * from each Kafka partition.
   */
  def maxRatePerPartition(topicPartition: TopicPartition): Long
}

/**
 * Default per-partition configuration
 */
private class DefaultPerPartitionConfig(conf: SparkConf) extends PerPartitionConfig {
  val maxRate = conf.getLong("spark.streaming.kafka.maxRatePerPartition", 0)

  // maximum rate (records per second) to read from each Kafka partition
  def maxRatePerPartition(topicPartition: TopicPartition): Long = maxRate
}
```

Once the DefaultPerPartitionConfig has been created, createDirectStream delegates to its four-argument overload, shown after the sketch below.
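Because PerPartitionConfig is a public abstract class, a caller can also implement its own policy and hand it to that overload directly. A hypothetical sketch (class name, topic, and rates are invented, written against the version quoted here, which has only maxRatePerPartition):

```scala
import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.PerPartitionConfig

// Throttle a hypothetical high-volume "logs" topic harder than everything else.
class TopicAwareRateConfig extends PerPartitionConfig {
  override def maxRatePerPartition(tp: TopicPartition): Long =
    if (tp.topic == "logs") 500L else 5000L
}
```

Such a config would be passed as the fourth argument of the overload.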
```scala
def createDirectStream[K, V](
    ssc: StreamingContext,
    locationStrategy: LocationStrategy,
    consumerStrategy: ConsumerStrategy[K, V],
    perPartitionConfig: PerPartitionConfig): InputDStream[ConsumerRecord[K, V]] = {
  new DirectKafkaInputDStream[K, V](ssc, locationStrategy, consumerStrategy, perPartitionConfig)
}
```

Next, let's focus on the constructor of DirectKafkaInputDStream (note: in Scala, everything from the class signature to the closing brace is the primary constructor, so the field initializers below run when the stream is created).
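To make that note concrete, a tiny generic illustration (plain Scala, not Spark code):

```scala
// The parameter list after the class name is the primary constructor, and every
// top-level statement in the class body executes as part of it.
class Greeter(name: String) {
  val greeting = s"hello, $name" // evaluated at construction time
  println(greeting)              // also runs at construction time
}

new Greeter("kafka") // prints "hello, kafka"
```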
With that in mind, here is DirectKafkaInputDStream, from its imports and scaladoc through the state it initializes:

```scala
package org.apache.spark.streaming.kafka010

import java.{util => ju}
import java.util.concurrent.ConcurrentLinkedQueue
import java.util.concurrent.atomic.AtomicReference

import scala.annotation.tailrec
import scala.collection.JavaConverters._
import scala.collection.mutable

import org.apache.kafka.clients.consumer._
import org.apache.kafka.common.{PartitionInfo, TopicPartition}

import org.apache.spark.SparkException
import org.apache.spark.internal.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{StreamingContext, Time}
import org.apache.spark.streaming.dstream._
import org.apache.spark.streaming.scheduler.{RateController, StreamInputInfo}
import org.apache.spark.streaming.scheduler.rate.RateEstimator

/**
 * A DStream where each given Kafka topic/partition corresponds to an RDD partition.
 * The spark configuration spark.streaming.kafka.maxRatePerPartition gives the
 * maximum number of messages per second that each partition will accept.
 *
 * @param locationStrategy In most cases, pass in PreferConsistent,
 *   see LocationStrategy for more details.
 * @param executorKafkaParams Kafka configuration parameters.
 *   Requires "bootstrap.servers" to be set with Kafka broker(s),
 *   NOT zookeeper servers, specified in host1:port1,host2:port2 form.
 * @param consumerStrategy In most cases, pass in Subscribe,
 *   see ConsumerStrategy for more details
 * @tparam K type of Kafka message key
 * @tparam V type of Kafka message value
 */
private[spark] class DirectKafkaInputDStream[K, V](
    _ssc: StreamingContext,
    locationStrategy: LocationStrategy,
    consumerStrategy: ConsumerStrategy[K, V],
    ppc: PerPartitionConfig
  ) extends InputDStream[ConsumerRecord[K, V]](_ssc) with Logging with CanCommitOffsets {

  val executorKafkaParams = {
    val ekp = new ju.HashMap[String, Object](consumerStrategy.executorKafkaParams)
    // adjust the Kafka parameters for use on executors, to avoid problems there
    KafkaUtils.fixKafkaParams(ekp)
    ekp
  }

  // holds the current offset for each TopicPartition
  protected var currentOffsets = Map[TopicPartition, Long]()

  @transient private var kc: Consumer[K, V] = null
  // if the cached consumer is null, create it from the consumer strategy
  def consumer(): Consumer[K, V] = this.synchronized {
    if (null == kc) {
      kc = consumerStrategy.onStart(currentOffsets.mapValues(l => new java.lang.Long(l)).asJava)
    }
    kc
  }

  override def persist(newLevel: StorageLevel): DStream[ConsumerRecord[K, V]] = {
    logError("Kafka ConsumerRecord is not serializable. " +
      "Use .map to extract fields before calling .persist or .window")
    super.persist(newLevel)
  }
```
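That persist override is worth pausing on: ConsumerRecord is not serializable, so fields must be extracted before any operation that caches or shuffles the stream. A sketch (the window durations are arbitrary):

```scala
import org.apache.spark.streaming.Seconds

// Extract plain (key, value) pairs first; then it is safe to window or persist.
val pairs = stream.map(record => (record.key, record.value))
val windowed = pairs.window(Seconds(30), Seconds(10))
```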
Continuing the class body: getBrokers resolves, for every TopicPartition currently assigned to the consumer, the host of its leader, and getPreferredHosts maps each location strategy onto such a host map:

```scala
  protected def getBrokers = {
    val c = consumer
    val result = new ju.HashMap[TopicPartition, String]()
    val hosts = new ju.HashMap[TopicPartition, String]()
    // assignment() returns the Set of TopicPartitions assigned to this consumer
    val assignments = c.assignment().iterator()
    // two nested while loops perform the lookup
    while (assignments.hasNext()) {
      val tp: TopicPartition = assignments.next()
      // if we do not yet know this TopicPartition's host, fetch the partition
      // metadata from the Kafka cluster
      if (null == hosts.get(tp)) {
        // partitionsFor fetches metadata for the given topic; if it is not
        // cached locally, this triggers an RPC
        val infos = c.partitionsFor(tp.topic).iterator()
        while (infos.hasNext()) {
          val i = infos.next()
          hosts.put(new TopicPartition(i.topic(), i.partition()), i.leader.host())
        }
      }
      // TopicPartition overrides equals, so hosts.get(tp) finds the entry stored
      // above; at this point we have each partition and its leader's address
      result.put(tp, hosts.get(tp))
    }
    result
  }

  protected def getPreferredHosts: ju.Map[TopicPartition, String] = {
    locationStrategy match {
      case PreferBrokers => getBrokers
      case PreferConsistent => ju.Collections.emptyMap[TopicPartition, String]()
      case PreferFixed(hostMap) => hostMap
    }
  }
```
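Of the three strategies, only PreferFixed carries user data. A hypothetical pinning (topic and host invented; LocationStrategies.PreferFixed accepts a java.util.Map):

```scala
import java.{util => ju}
import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.LocationStrategies

// Pin partition 0 of "topicA" to executors on a specific host.
val hostMap = new ju.HashMap[TopicPartition, String]()
hostMap.put(new TopicPartition("topicA", 0), "10.0.0.5")
val fixed = LocationStrategies.PreferFixed(hostMap)
```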
The remaining members shown here wire the stream into Spark's naming, checkpointing, and backpressure machinery:

```scala
  // Keep this consistent with how other streams are named (e.g. "Flume polling stream [2]")
  private[streaming] override def name: String = s"Kafka 0.10 direct stream [$id]"

  protected[streaming] override val checkpointData =
    new DirectKafkaInputDStreamCheckpointData

  /**
   * Asynchronously maintains & sends new rate limits to the receiver through the
   * receiver tracker.
   */
  override protected[streaming] val rateController: Option[RateController] = {
    if (RateController.isBackPressureEnabled(ssc.conf)) {
      Some(new DirectKafkaRateController(id,
        RateEstimator.create(ssc.conf, context.graph.batchDuration)))
    } else {
      None
    }
  }

  protected[streaming] def maxMessagesPerPartition(
      offsets: Map[TopicPartition, Long]): Option[Map[TopicPartition, Long]] = {
    val estimatedRateLimit = rateController.map(_.getLatestRate())

    // calculate a per-partition rate limit based on current lag
    val effectiveRateLimitPerPartition = estimatedRateLimit.filter(_ > 0) match {
      case Some(rate) =>
        val lagPerPartition = offsets.map { case (tp, offset) =>
          tp -> Math.max(offset - currentOffsets(tp), 0)
        }
        val totalLag = lagPerPartition.values.sum

        lagPerPartition.map { case (tp, lag) =>
          val maxRateLimitPerPartition = ppc.maxRatePerPartition(tp)
          val backpressureRate = Math.round(lag / totalLag.toFloat * rate)
          tp -> (if (maxRateLimitPerPartition > 0) {
            Math.min(backpressureRate, maxRateLimitPerPartition)
          } else backpressureRate)
        }
      case None =>
        // no backpressure estimate yet: fall back to the configured per-partition cap
        offsets.map { case (tp, offset) => tp -> ppc.maxRatePerPartition(tp) }
    }
    // ... rest of the method omitted
  }
```
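A worked example of the lag-proportional split above (all numbers invented):

```scala
// Backpressure estimates 900 records/sec overall; two partitions lag by 600 and 300.
//   tp0: round(600 / 900f * 900) = 600, clamped by a 500 cap -> 500
//   tp1: round(300 / 900f * 900) = 300, under the cap        -> 300
val rate = 900L
val maxRatePerPartition = 500L
val lag = Map("tp0" -> 600L, "tp1" -> 300L)
val totalLag = lag.values.sum
val limits = lag.map { case (tp, l) =>
  tp -> math.min(math.round(l / totalLag.toFloat * rate), maxRatePerPartition)
}
// limits: Map(tp0 -> 500, tp1 -> 300)
```

The rest of the method then multiplies each per-second limit by the batch duration in seconds to obtain the message cap for each partition in the batch.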