
Commit e650025

Handle all truncation cases (#637)
* populate is_truncated if max_tokens or max_model_len is hit
* handle overlong prompt error gracefully (do not propagate as error)
* parse is_truncated from response
* show summary in vf-eval
* also set is_truncated on bad request error
* fix tests
* fix ty
* show type repr
* do not set prompt too long from req id
* also remove in docs
1 parent ba0f2ba commit e650025
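In short, a rollout now counts as truncated in two situations the commit message lists: the completion was cut off because max_tokens or max_model_len was hit, or the prompt itself was too long to run at all. A minimal Python sketch of that rule (the helper names here are hypothetical illustrations, not code from this commit):

def completion_was_cut_off(finish_reason: str | None) -> bool:
    # OpenAI-style responses report finish_reason == "length" when generation
    # stops early because max_tokens or the model's context limit was reached.
    return finish_reason == "length"


def mark_truncation(state: dict, prompt_too_long: bool, finish_reason: str | None) -> None:
    # Hypothetical helper: either condition flags the rollout as truncated
    # instead of being surfaced as an error.
    if prompt_too_long:
        state["prompt_too_long"] = True
    state["is_truncated"] = prompt_too_long or completion_was_cut_off(finish_reason)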

File tree

7 files changed: +58 -6 lines changed

notes/TRAJECTORIES.md

Lines changed: 0 additions & 3 deletions
@@ -350,9 +350,6 @@ async def add_model_response(
         response: ModelResponse,
     ):
         """Add a model response as a trajectory step."""
-        if response is not None and response.id == "overlong-prompt":
-            state["prompt_too_long"] = True
-            return
         completion_messages = await parse_response_messages(response, self.message_type)
         tokens = await parse_response_tokens(response, self.message_type)
         trajectory_step = TrajectoryStep(

tests/test_eval_utils.py

Lines changed: 6 additions & 0 deletions
@@ -64,6 +64,8 @@ def test_print_results_rollout_indexing(capsys):
         example_id=example_ids,
         reward=rewards,
         metrics={"test_metric": metric_values},
+        is_truncated=[False] * 6,
+        stop_conditions=[None] * 6,
         metadata=_make_metadata(num_examples, rollouts_per_example),
     )

@@ -102,6 +104,8 @@ def test_print_results_single_rollout(capsys):
         example_id=example_ids,
         reward=rewards,
         metrics={},
+        is_truncated=[False] * 3,
+        stop_conditions=[None] * 3,
         metadata=_make_metadata(num_examples, rollouts_per_example),
     )

@@ -134,6 +138,8 @@ def test_print_results_three_rollouts(capsys):
         example_id=example_ids,
         reward=rewards,
         metrics={},
+        is_truncated=[False] * 6,
+        stop_conditions=[None] * 6,
         metadata=_make_metadata(num_examples, rollouts_per_example),
     )

verifiers/envs/environment.py

Lines changed: 8 additions & 0 deletions
@@ -565,6 +565,7 @@ async def init_state(
         state["model"] = model
         state["sampling_args"] = sampling_args
         state["is_completed"] = False
+        state["is_truncated"] = False
         state["oai_tools"] = None
         if "info" in state and hasattr(state["info"], "oai_tools"):
             state["oai_tools"] = state["info"]["oai_tools"]

@@ -621,6 +622,9 @@ async def _teardown(self):
     async def _render_stop(self, state: State, condition) -> bool:
         if await condition(state):
             state["is_completed"] = True
+            state["is_truncated"] = state.get("is_truncated", False) or any(
+                step.get("is_truncated", False) for step in state.get("trajectory", [])
+            )
             state["stop_condition"] = condition.__name__
             if state.get("stop_condition") == "has_error":
                 self.logger.error(

@@ -724,6 +728,8 @@ def _prepare_rollout_results(
         infos = [state.get("info", {}) for state in all_states]
         example_ids = [state.get("example_id", 0) for state in all_states]
         rewards = [state.get("reward", 0.0) for state in all_states]
+        stop_conditions = [state.get("stop_condition", None) for state in all_states]
+        is_truncated = [state.get("is_truncated", False) for state in all_states]

         metrics: dict[str, list[float]] = {}
         for state in all_states:

@@ -767,6 +773,8 @@ def _prepare_rollout_results(
             example_id=example_ids,
             reward=rewards,
             metrics=metrics,
+            stop_conditions=stop_conditions,
+            is_truncated=is_truncated,
             metadata=metadata,
         )
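A tiny standalone illustration of the aggregation added to _render_stop above: a rollout is flagged as truncated when any of its trajectory steps was truncated (the state dict is hand-made example data, not real rollout output):

state = {
    "is_truncated": False,
    "trajectory": [
        {"is_truncated": False},
        {"is_truncated": True},  # one step hit the token limit
    ],
}

state["is_truncated"] = state.get("is_truncated", False) or any(
    step.get("is_truncated", False) for step in state.get("trajectory", [])
)
print(state["is_truncated"])  # True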

verifiers/envs/multiturn_env.py

Lines changed: 11 additions & 3 deletions
@@ -14,6 +14,7 @@
 )
 from verifiers.utils.message_utils import concat_messages
 from verifiers.utils.response_utils import (
+    parse_is_truncated,
     parse_response_messages,
     parse_response_tokens,
 )

@@ -68,19 +69,22 @@ async def add_model_response(
         prompt_messages: Messages,
         response: ModelResponse,
     ):
-        if response is not None and response.id == "overlong-prompt":
-            state["prompt_too_long"] = True
         completion_messages = await parse_response_messages(response, self.message_type)
+        response_is_truncated = await parse_is_truncated(response, self.message_type)
         tokens = await parse_response_tokens(
             response, self.message_type, self.max_seq_len
         )
+        is_truncated = response_is_truncated or (
+            tokens is not None and bool(tokens.get("is_truncated"))
+        )
         trajectory_step = TrajectoryStep(
             prompt=prompt_messages,
             completion=completion_messages,
             response=response,
             tokens=tokens,
             reward=None,
             advantage=None,
+            is_truncated=is_truncated,
             extras={},
         )
         trajectory_step["completion"] = completion_messages

@@ -107,5 +111,9 @@ async def rollout(
                 response = await self.get_model_response(state, prompt_messages)
                 await self.add_model_response(state, prompt_messages, response)
             except vf.Error as e:
-                state["error"] = e
+                if isinstance(e, vf.OverlongPromptError):
+                    state["prompt_too_long"] = True
+                    state["is_truncated"] = True
+                else:
+                    state["error"] = e
         return state
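A small standalone sketch of the error-handling pattern introduced in rollout() above: an overlong prompt is recorded on the state as truncation rather than propagated as an error (importing verifiers as vf and raising OverlongPromptError with a plain message are assumptions made for illustration):

import verifiers as vf

state: dict = {}
try:
    # Stand-in for the real get_model_response() call; assumes the exception
    # accepts a plain message argument.
    raise vf.OverlongPromptError("prompt exceeds the model's context window")
except vf.Error as e:
    if isinstance(e, vf.OverlongPromptError):
        state["prompt_too_long"] = True
        state["is_truncated"] = True
    else:
        state["error"] = e

print(state)  # {'prompt_too_long': True, 'is_truncated': True}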

verifiers/types.py

Lines changed: 4 additions & 0 deletions
@@ -67,6 +67,7 @@ class TrajectoryStep(TypedDict):
     tokens: TrajectoryStepTokens | None
     reward: float | None
     advantage: float | None
+    is_truncated: bool
     extras: dict[str, Any]


@@ -99,6 +100,7 @@ class State(dict):
     sampling_args: SamplingArgs | None
     # created during rollout
     is_completed: bool
+    is_truncated: bool
     stop_condition: str | None
     oai_tools: list[ChatCompletionToolParam]
     trajectory: list[TrajectoryStep]

@@ -167,6 +169,8 @@ class GenerateOutputs(TypedDict):
     example_id: list[int]
     reward: list[float]
     metrics: dict[str, list[float]]
+    stop_conditions: list[str | None]
+    is_truncated: list[bool]
     metadata: GenerateMetadata

verifiers/utils/eval_utils.py

Lines changed: 14 additions & 0 deletions
@@ -2,6 +2,7 @@
 import json
 import logging
 import time
+from collections import Counter
 from contextlib import contextmanager
 from pathlib import Path
 from typing import cast

@@ -97,6 +98,19 @@ def print_results(results: GenerateOutputs, num_samples: int = 1):
         out = f"r{i + 1}: {trials}"
         print(out)

+    print("Info:")
+    print(
+        f"is_truncated: avg - {np.mean(results['is_truncated']):.3f}, std - {np.std(results['is_truncated']):.3f}"
+    )
+    print(
+        f"stop_conditions: {', '.join([f'{k}={v}' for k, v in Counter(results['stop_conditions']).items()])}"
+    )
+    errors = [e for e in errors if e is not None]
+    if errors:
+        print(
+            f"errors: {', '.join([f'{k}: {v / len(errors):.3f}' for k, v in Counter([type(e).__name__ for e in errors]).items()])}"
+        )
+

 async def run_evaluation(config: EvalConfig) -> GenerateOutputs:
     # set up AsyncOpenAI client with high limits to prevent timeouts
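For reference, a self-contained sketch of the summary block that print_results now appends (the values and the is_completed stop-condition name are made up for the example; only has_error appears elsewhere in this diff):

from collections import Counter

import numpy as np

# Hand-made stand-ins for results["is_truncated"] and results["stop_conditions"].
is_truncated = [False, True, False, False]
stop_conditions = ["is_completed", "is_completed", "has_error", "is_completed"]

print(f"is_truncated: avg - {np.mean(is_truncated):.3f}, std - {np.std(is_truncated):.3f}")
print(f"stop_conditions: {', '.join(f'{k}={v}' for k, v in Counter(stop_conditions).items())}")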

verifiers/utils/response_utils.py

Lines changed: 15 additions & 0 deletions
@@ -125,3 +125,18 @@ async def parse_response_messages(
         response_text = response.choices[0].text or ""
         completion_messages = str(response_text)
     return completion_messages
+
+
+async def parse_is_truncated(
+    response: ModelResponse, message_type: MessageType
+) -> bool:
+    if message_type == "chat":
+        assert isinstance(response, ChatCompletion)
+        assert len(response.choices) == 1, "Response should always have one choice"
+        return response.choices[0].finish_reason == "length"
+    elif message_type == "completion":
+        assert isinstance(response, Completion)
+        assert len(response.choices) == 1, "Response should always have one choice"
+        return response.choices[0].finish_reason == "length"
+    else:
+        return False
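A quick usage sketch for the new parse_is_truncated helper (the hand-built ChatCompletion below is purely illustrative; the openai response classes are the ones the library already consumes):

import asyncio

from openai.types.chat import ChatCompletion, ChatCompletionMessage
from openai.types.chat.chat_completion import Choice

from verifiers.utils.response_utils import parse_is_truncated

# Minimal chat response whose generation stopped because the token limit was hit.
response = ChatCompletion(
    id="example",
    object="chat.completion",
    created=0,
    model="example-model",
    choices=[
        Choice(
            index=0,
            finish_reason="length",  # "length" means max_tokens / max_model_len was reached
            message=ChatCompletionMessage(role="assistant", content="cut-off output ..."),
        )
    ],
)

print(asyncio.run(parse_is_truncated(response, "chat")))  # True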
