apache · kfaraz · Dec 17, 2025 · Dec 4, 2025 · Dec 8, 2025 · Dec 8, 2025
diff --git a/...apache/druid/testing/embedded/indexing/autoscaler/CostBasedAutoScalerIntegrationTest.java b/...apache/druid/testing/embedded/indexing/autoscaler/CostBasedAutoScalerIntegrationTest.java
@@ -0,0 +1,267 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.druid.testing.embedded.indexing.autoscaler;
+
+import org.apache.druid.data.input.impl.TimestampSpec;
+import org.apache.druid.indexing.kafka.KafkaIndexTaskModule;
+import org.apache.druid.indexing.kafka.simulate.KafkaResource;
+import org.apache.druid.indexing.kafka.supervisor.KafkaSupervisorSpec;
+import org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScaler;
+import org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScalerConfig;
+import org.apache.druid.java.util.common.StringUtils;
+import org.apache.druid.query.DruidMetrics;
+import org.apache.druid.testing.embedded.EmbeddedBroker;
+import org.apache.druid.testing.embedded.EmbeddedClusterApis;
+import org.apache.druid.testing.embedded.EmbeddedCoordinator;
+import org.apache.druid.testing.embedded.EmbeddedDruidCluster;
+import org.apache.druid.testing.embedded.EmbeddedHistorical;
+import org.apache.druid.testing.embedded.EmbeddedIndexer;
+import org.apache.druid.testing.embedded.EmbeddedOverlord;
+import org.apache.druid.testing.embedded.EmbeddedRouter;
+import org.apache.druid.testing.embedded.indexing.MoreResources;
+import org.apache.druid.testing.embedded.junit5.EmbeddedClusterTestBase;
+import org.apache.kafka.clients.producer.ProducerRecord;
+import org.hamcrest.Matchers;
+import org.joda.time.DateTime;
+import org.joda.time.DateTimeZone;
+import org.joda.time.Seconds;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.Timeout;
+
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+import java.util.concurrent.Executors;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import static org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScaler.OPTIMAL_TASK_COUNT_METRIC;
+
+/**
+ * Integration test for {@link CostBasedAutoScaler}.
+ * <p>
+ * Tests the autoscaler's ability to compute optimal task counts based on partition count and cost metrics (lag and idle time).
+ */
+public class CostBasedAutoScalerIntegrationTest extends EmbeddedClusterTestBase
+{
+  private static final String TOPIC = EmbeddedClusterApis.createTestDatasourceName();
+  private static final String EVENT_TEMPLATE = "{\"timestamp\":\"%s\",\"dimension\":\"value%d\",\"metric\":%d}";
+  private static final int PARTITION_COUNT = 50;
+
+  private final EmbeddedBroker broker = new EmbeddedBroker();
+  private final EmbeddedIndexer indexer = new EmbeddedIndexer();
+  private final EmbeddedOverlord overlord = new EmbeddedOverlord();
+  private final EmbeddedHistorical historical = new EmbeddedHistorical();
+  private final EmbeddedCoordinator coordinator = new EmbeddedCoordinator();
+  private KafkaResource kafkaServer;
+
+  @Override
+  public EmbeddedDruidCluster createCluster()
+  {
+    final EmbeddedDruidCluster cluster = EmbeddedDruidCluster.withEmbeddedDerbyAndZookeeper();
+
+    kafkaServer = new KafkaResource()
+    {
+      @Override
+      public void start()
+      {
+        super.start();
+        createTopicWithPartitions(TOPIC, PARTITION_COUNT);
+        produceRecordsToKafka(500, 1);
+      }
+
+      @Override
+      public void stop()
+      {
+        deleteTopic(TOPIC);
+        super.stop();
+      }
+    };
+
+    // Increase worker capacity to handle more tasks
+    indexer.addProperty("druid.segment.handoff.pollDuration", "PT0.1s")
+           .addProperty("druid.worker.capacity", "60");
+
+    overlord.addProperty("druid.indexer.task.default.context", "{\"useConcurrentLocks\": true}")
+            .addProperty("druid.manager.segments.useIncrementalCache", "ifSynced")
+            .addProperty("druid.manager.segments.pollDuration", "PT0.1s");
+
+    coordinator.addProperty("druid.manager.segments.useIncrementalCache", "ifSynced");
+
+    cluster.useLatchableEmitter()
+           .useDefaultTimeoutForLatchableEmitter(120)
+           .addServer(coordinator)
+           .addServer(overlord)
+           .addServer(indexer)
+           .addServer(broker)
+           .addServer(historical)
+           .addExtension(KafkaIndexTaskModule.class)
+           .addCommonProperty("druid.monitoring.emissionPeriod", "PT0.5s")
+           .addResource(kafkaServer)
+           .addServer(new EmbeddedRouter());
+
+    return cluster;
+  }
+
+  @Test
+  @Timeout(45)
+  public void test_autoScaler_computesOptimalTaskCountAndProduceScaleDown()
+  {
+    final String superId = dataSource + "_super";
+    final int initialTaskCount = 10;
+
+    final CostBasedAutoScalerConfig autoScalerConfig = CostBasedAutoScalerConfig
+        .builder()
+        .enableTaskAutoScaler(true)
+        .taskCountMin(1)
+        .taskCountMax(100)
+        .taskCountStart(initialTaskCount)
+        .scaleActionPeriodMillis(1500)
+        .minTriggerScaleActionFrequencyMillis(3000)
+        // Weight configuration: strongly favor lag reduction over idle time
+        .lagWeight(0.9)
+        .idleWeight(0.1)
+        .build();
+
+    final KafkaSupervisorSpec spec = createKafkaSupervisorWithAutoScaler(superId, autoScalerConfig, initialTaskCount);
+
+    // Submit the supervisor
+    Assertions.assertEquals(superId, cluster.callApi().postSupervisor(spec));
+
+    // Wait for the supervisor to be healthy and running
+    overlord.latchableEmitter()
+            .waitForEvent(event -> event.hasMetricName("task/run/time")
+                                        .hasDimension(DruidMetrics.DATASOURCE, dataSource));
+
+    // Wait for autoscaler to emit optimalTaskCount metric indicating scale-down
+    // We expect the optimal task count to be 6
+    overlord.latchableEmitter().waitForEvent(
+        event -> event.hasMetricName(OPTIMAL_TASK_COUNT_METRIC)
+                      .hasValueMatching(Matchers.equalTo(6L))
+    );
+
+    // Suspend the supervisor
+    cluster.callApi().postSupervisor(spec.createSuspendedSpec());
+  }
+
+  @Test
+  @Timeout(125)
+  public void test_autoScaler_computesOptimalTaskCountAndProducesScaleUp()
+  {
+    final String superId = dataSource + "_super_scaleup";
+
+    // Start with a low task count (1 task for 50 partitions) and produce a large amount of data
+    // to create lag pressure and low idle ratio, which should trigger a scale-up decision.
+    // With the ideal idle range [0.2, 0.6], a single overloaded task will have idle < 0.2,
+    // triggering the cost function to recommend more tasks.
+    final int lowInitialTaskCount = 1;
+
+    // Produce additional records to create a backlog / lag
+    // This ensures tasks are busy processing (low idle ratio)
+    Executors.newSingleThreadExecutor().submit(() -> produceRecordsToKafka(500_000, 20));
+
+    // These values were carefully handpicked so that this test passes reliably.
+    final CostBasedAutoScalerConfig autoScalerConfig = CostBasedAutoScalerConfig
+        .builder()
+        .enableTaskAutoScaler(true)
+        .taskCountMin(1)
+        .taskCountMax(50)
+        .taskCountStart(lowInitialTaskCount)
+        .scaleActionPeriodMillis(500)
+        .minTriggerScaleActionFrequencyMillis(1000)
+        .lagWeight(0.2)
+        .idleWeight(0.8)
+        .build();
+
+    final KafkaSupervisorSpec kafkaSupervisorSpec = createKafkaSupervisorWithAutoScaler(
+        superId,
+        autoScalerConfig,
+        lowInitialTaskCount
+    );
+
+    // Submit the supervisor
+    Assertions.assertEquals(superId, cluster.callApi().postSupervisor(kafkaSupervisorSpec));
+
+    // Wait for the supervisor to be healthy and running
+    overlord.latchableEmitter()
+            .waitForEvent(event -> event.hasMetricName("task/run/time")
+                                        .hasDimension(DruidMetrics.DATASOURCE, dataSource));
+
+    // With 50 partitions and high lag creating a low idle ratio (< 0.2),
+    // the cost function must recommend scaling up to at least 2 tasks.
+    overlord.latchableEmitter().waitForEvent(
+        event -> event.hasMetricName(OPTIMAL_TASK_COUNT_METRIC)
+                      .hasValueMatching(Matchers.greaterThan(1L))
+    );
+
+    // Suspend the supervisor
+    cluster.callApi().postSupervisor(kafkaSupervisorSpec.createSuspendedSpec());
+  }
+
+  private void produceRecordsToKafka(int recordCount, int iterations)
+  {
+    int recordCountPerSlice = recordCount / iterations;
+    int counter = 0;
+    for (int i = 0; i < iterations; i++) {
+      DateTime timestamp = DateTime.now(DateTimeZone.UTC);
+      List<ProducerRecord<byte[], byte[]>> records = IntStream
+          .range(counter, counter + recordCountPerSlice)
+          .mapToObj(k -> new ProducerRecord<byte[], byte[]>(
+                        TOPIC,
+                        k % PARTITION_COUNT,
+                        null,
+                        StringUtils.format(EVENT_TEMPLATE, timestamp, k, k)
+                                   .getBytes(StandardCharsets.UTF_8)
+                    )
+          )
+          .collect(Collectors.toList());
+
+      kafkaServer.produceRecordsToTopic(records);
+      try {
+        Thread.sleep(100L);
+        counter += recordCountPerSlice;
+      }
+      catch (InterruptedException e) {
+        throw new RuntimeException(e);
+      }
+    }
+  }
+
+  private KafkaSupervisorSpec createKafkaSupervisorWithAutoScaler(
+      String supervisorId,
+      CostBasedAutoScalerConfig autoScalerConfig,
+      int taskCount
+  )
+  {
+    return MoreResources.Supervisor.KAFKA_JSON
+        .get()
+        .withDataSchema(schema -> schema.withTimestamp(new TimestampSpec("timestamp", "iso", null)))
+        .withTuningConfig(tuningConfig -> tuningConfig.withMaxRowsPerSegment(100))
+        .withIoConfig(
+            ioConfig -> ioConfig
+                .withConsumerProperties(kafkaServer.consumerProperties())
+                .withTaskCount(taskCount)
+                .withTaskDuration(Seconds.THREE.toPeriod())
+                .withAutoScalerConfig(autoScalerConfig)
+        )
+        .withId(supervisorId)
+        .build(dataSource, TOPIC);
+  }
+}
diff --git a/...-indexing-service/src/main/java/org/apache/druid/indexing/kafka/KafkaConsumerMonitor.java b/...-indexing-service/src/main/java/org/apache/druid/indexing/kafka/KafkaConsumerMonitor.java
@@ -19,6 +19,7 @@
 
 package org.apache.druid.indexing.kafka;
 
+import com.google.common.util.concurrent.AtomicDouble;
 import org.apache.druid.error.DruidException;
 import org.apache.druid.java.util.common.logger.Logger;
 import org.apache.druid.java.util.emitter.service.ServiceEmitter;
@@ -47,9 +48,11 @@ public class KafkaConsumerMonitor extends AbstractMonitor
   private static final String PARTITION_TAG = "partition";
   private static final String NODE_ID_TAG = "node-id";
 
+  private static final String POLL_IDLE_RATIO_METRIC_NAME = "poll-idle-ratio-avg";
+
   /**
    * Kafka metric name -> Kafka metric descriptor. Taken from
-   * https://kafka.apache.org/documentation/#consumer_fetch_monitoring.
+   * <a href="https://kafka.apache.org/documentation/#consumer_fetch_monitoring">Kafka documentation</a>.
    */
   private static final Map<String, KafkaConsumerMetric> METRICS =
       Stream.of(
@@ -129,6 +132,7 @@ public class KafkaConsumerMonitor extends AbstractMonitor
 
   private final KafkaConsumer<?, ?> consumer;
   private final Map<MetricName, AtomicLong> counters = new HashMap<>();
+  private final AtomicDouble pollIdleRatioAvg = new AtomicDouble(1.0d);
 
   public KafkaConsumerMonitor(final KafkaConsumer<?, ?> consumer)
   {
@@ -172,6 +176,13 @@ public boolean doMonitor(final ServiceEmitter emitter)
           emitter.emit(builder.setMetric(kafkaConsumerMetric.getDruidMetricName(), emitValue));
         }
       }
+
+      // Capture `poll-idle-ratio-avg` metric for autoscaler purposes.
+      if (POLL_IDLE_RATIO_METRIC_NAME.equals(metricName.name())) {
+        if (entry.getValue().metricValue() != null) {
+          pollIdleRatioAvg.set(((Number) entry.getValue().metricValue()).doubleValue());
+        }
+      }
     }
 
     return !stopAfterNext;
@@ -181,4 +192,14 @@ public void stopAfterNextEmit()
   {
     stopAfterNext = true;
   }
+
+  /**
+   * Average fraction of time the consumer is idle in poll(), as reported by Kafka's {@code poll-idle-ratio-avg} metric.
+   * A value of 0 represents that the consumer is never idle, i.e. always consuming.
+   * A value of 1 represents that the consumer is always idle, i.e. not receiving data.
+   */
+  public double getPollIdleRatioAvg()
+  {
+    return pollIdleRatioAvg.get();
+  }
 }
diff --git a/.../kafka-indexing-service/src/main/java/org/apache/druid/indexing/kafka/KafkaIndexTask.java b/.../kafka-indexing-service/src/main/java/org/apache/druid/indexing/kafka/KafkaIndexTask.java
@@ -49,7 +49,7 @@ public class KafkaIndexTask extends SeekableStreamIndexTask<KafkaTopicPartition,
 
   /**
    * Resources that a {@link KafkaIndexTask} is authorized to use. Includes
-   * performing a read action on external resource of type
+   * performing a read action on an external resource of type {@link KafkaIndexTaskModule#SCHEME}.
    */
   public static final Set<ResourceAction> INPUT_SOURCE_RESOURCES = Set.of(
       AuthorizationUtils.createExternalResourceReadAction(KafkaIndexTaskModule.SCHEME)

diff --git a/...a-indexing-service/src/main/java/org/apache/druid/indexing/kafka/KafkaRecordSupplier.java b/...a-indexing-service/src/main/java/org/apache/druid/indexing/kafka/KafkaRecordSupplier.java
@@ -234,6 +234,12 @@ public Map<KafkaTopicPartition, Long> getLatestSequenceNumbers(Set<StreamPartiti
     ));
   }
 
+  @Override
+  public double getPollIdleRatioMetric()
+  {
+    return monitor.getPollIdleRatioAvg();
+  }
+
   @Override
   public Set<KafkaTopicPartition> getPartitionIds(String stream)
   {

diff --git a/...ce/src/main/java/org/apache/druid/indexing/common/stats/DropwizardRowIngestionMeters.java b/...ce/src/main/java/org/apache/druid/indexing/common/stats/DropwizardRowIngestionMeters.java
@@ -29,9 +29,9 @@
 
 public class DropwizardRowIngestionMeters implements RowIngestionMeters
 {
-  private static final String ONE_MINUTE_NAME = "1m";
-  private static final String FIVE_MINUTE_NAME = "5m";
-  private static final String FIFTEEN_MINUTE_NAME = "15m";
+  public static final String ONE_MINUTE_NAME = "1m";
+  public static final String FIVE_MINUTE_NAME = "5m";
+  public static final String FIFTEEN_MINUTE_NAME = "15m";
 
   private final Meter processed;
   private final Meter processedBytes;

diff --git a/...rvice/src/main/java/org/apache/druid/indexing/seekablestream/SeekableStreamIndexTask.java b/...rvice/src/main/java/org/apache/druid/indexing/seekablestream/SeekableStreamIndexTask.java
@@ -74,7 +74,7 @@ public abstract class SeekableStreamIndexTask<PartitionIdType, SequenceOffsetTyp
   protected final TaskLockType lockTypeToUse;
   protected final String supervisorId;
 
-  // Lazily initialized, to avoid calling it on the overlord when tasks are instantiated.
+  // Lazily initialized to avoid calling it on the overlord when tasks are instantiated.
   // See https://github.com/apache/druid/issues/7724 for issues that can cause.
   // By the way, lazily init is synchronized because the runner may be needed in multiple threads.
   private final Supplier<SeekableStreamIndexTaskRunner<PartitionIdType, SequenceOffsetType, ?>> runnerSupplier;

diff --git a/...java/org/apache/druid/indexing/seekablestream/SeekableStreamIndexTaskClientAsyncImpl.java b/...java/org/apache/druid/indexing/seekablestream/SeekableStreamIndexTaskClientAsyncImpl.java
@@ -373,7 +373,7 @@ private Map<PartitionIdType, SequenceOffsetType> deserializeOffsetsMap(final byt
 
   /**
    * Helper for {@link #pauseAsync}.
-   *
+   * <p>
    * Calls {@link #getStatusAsync} in a loop until a task is paused, then calls {@link #getCurrentOffsetsAsync} to
    * get the post-pause offsets for the task.
    */
-Original file line number
+Diff line change
@@ Expand Up @@
       /**
        * Helper for {@link #pauseAsync}.
-       *
+       * <p>
        * Calls {@link #getStatusAsync} in a loop until a task is paused, then calls {@link #getCurrentOffsetsAsync} to
        * get the post-pause offsets for the task.
        */
@@ Expand Down @@