Skip to content

Commit 1e7bd56

Browse files
authored
Improved failure messages for MSQCompactionRunner. (#18787)
* Improved failure messages for MSQCompactionRunner. Include the first failure message in the task status itself, so it is not necessary to fetch task logs to see the error message. Also, don't log the entire task JSON for failed subtasks. It is logged once when a subtask is initially run, and that's enough.
* Improve messages.
1 parent 5247a25 commit 1e7bd56

File tree

1 file changed

+31
-14
lines changed

1 file changed

+31
-14
lines changed

multi-stage-query/src/main/java/org/apache/druid/msq/indexing/MSQCompactionRunner.java

Lines changed: 31 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -717,40 +717,57 @@ private static TaskStatus runSubtasks(
717717
final int totalNumSpecs = tasks.size();
718718
log.info("Generated [%d] MSQControllerTask specs", totalNumSpecs);
719719

720+
TaskStatus firstFailure = null;
720721
int failCnt = 0;
721722

722-
for (MSQControllerTask eachTask : tasks) {
723-
final String json = toolbox.getJsonMapper().writerWithDefaultPrettyPrinter().writeValueAsString(eachTask);
724-
if (!currentSubTaskHolder.setTask(eachTask)) {
723+
for (int taskCnt = 0; taskCnt < tasks.size(); taskCnt++) {
724+
final MSQControllerTask task = tasks.get(taskCnt);
725+
final String json = toolbox.getJsonMapper().writerWithDefaultPrettyPrinter().writeValueAsString(task);
726+
if (!currentSubTaskHolder.setTask(task)) {
725727
String errMsg = "Task was asked to stop. Finish as failed.";
726-
log.info(errMsg);
728+
log.info("%s", errMsg);
727729
return TaskStatus.failure(compactionTaskId, errMsg);
728730
}
729731
try {
730-
if (eachTask.isReady(toolbox.getTaskActionClient())) {
731-
log.info("Running MSQControllerTask: " + json);
732-
final TaskStatus eachResult = eachTask.run(toolbox);
733-
if (!eachResult.isSuccess()) {
732+
if (task.isReady(toolbox.getTaskActionClient())) {
733+
log.info("Running MSQControllerTask number[%d]: %s", taskCnt, json);
734+
final TaskStatus taskStatus = task.run(toolbox);
735+
if (!taskStatus.isSuccess()) {
734736
failCnt++;
735-
log.warn("Failed to run MSQControllerTask: [%s].\nTrying the next MSQControllerTask.", json);
737+
log.warn("Failed to run MSQControllerTask number[%d]: %s", taskCnt, taskStatus.getErrorMsg());
738+
if (firstFailure == null) {
739+
firstFailure = taskStatus;
740+
}
736741
}
737742
} else {
738743
failCnt++;
739-
log.warn("MSQControllerTask is not ready: [%s].\nTrying the next MSQControllerTask.", json);
744+
log.warn("MSQControllerTask number[%d] is not ready.", taskCnt);
740745
}
741746
}
742747
catch (Exception e) {
743748
failCnt++;
744-
log.warn(e, "Failed to run MSQControllerTask: [%s].\nTrying the next MSQControllerTask.", json);
749+
log.warn(e, "Failed to run MSQControllerTask number[%d].", taskCnt);
745750
}
746751
}
747-
String msg = StringUtils.format(
752+
753+
log.info(
748754
"Ran [%d] MSQControllerTasks, [%d] succeeded, [%d] failed",
749755
totalNumSpecs,
750756
totalNumSpecs - failCnt,
751757
failCnt
752758
);
753-
log.info(msg);
754-
return failCnt == 0 ? TaskStatus.success(compactionTaskId) : TaskStatus.failure(compactionTaskId, msg);
759+
760+
if (failCnt == 0) {
761+
return TaskStatus.success(compactionTaskId);
762+
} else if (firstFailure != null && failCnt == 1) {
763+
return TaskStatus.failure(compactionTaskId, firstFailure.getErrorMsg());
764+
} else {
765+
final StringBuilder msgBuilder =
766+
new StringBuilder().append(failCnt).append("/").append(totalNumSpecs).append(" jobs failed");
767+
if (firstFailure != null) {
768+
msgBuilder.append("; first failure was: ").append(firstFailure.getErrorMsg());
769+
}
770+
return TaskStatus.failure(compactionTaskId, msgBuilder.toString());
771+
}
755772
}
756773
}

0 commit comments

Comments
 (0)