16 changes: 16 additions & 0 deletions aider/coders/base_coder.py
@@ -324,6 +324,11 @@ def __init__(
auto_copy_context=False,
auto_accept_architect=True,
):
# Initialize token tracking attributes
self.total_prompt_tokens = 0
self.total_completion_tokens = 0
self.total_cache_hit_tokens = 0
self.total_cache_write_tokens = 0
# Fill in a dummy Analytics if needed, but it is never .enable()'d
self.analytics = analytics if analytics is not None else Analytics()

@@ -1895,6 +1900,12 @@ def calculate_and_show_tokens_and_cost(self, messages, completion=None):
)
cache_write_tokens = getattr(completion.usage, "cache_creation_input_tokens", 0)

# Update total token counts for benchmark tracking
self.total_prompt_tokens += prompt_tokens
self.total_completion_tokens += completion_tokens
self.total_cache_hit_tokens += cache_hit_tokens
self.total_cache_write_tokens += cache_write_tokens

if hasattr(completion.usage, "cache_read_input_tokens") or hasattr(
completion.usage, "cache_creation_input_tokens"
):
@@ -1906,6 +1917,11 @@ def calculate_and_show_tokens_and_cost(self, messages, completion=None):
else:
prompt_tokens = self.main_model.token_count(messages)
completion_tokens = self.main_model.token_count(self.partial_response_content)

# Update total token counts for benchmark tracking
self.total_prompt_tokens += prompt_tokens
self.total_completion_tokens += completion_tokens

self.message_tokens_sent += prompt_tokens

self.message_tokens_received += completion_tokens
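Together, these hunks expose cumulative usage as plain attributes on the coder. Here is a minimal sketch of how a benchmark harness might read them after a run; `summarize_token_totals` is a hypothetical helper (not part of aider's API) and `coder` is any constructed `Coder`:

```
def summarize_token_totals(coder):
    # Hypothetical helper: reads the four counters initialized in __init__
    # and accumulated in calculate_and_show_tokens_and_cost above.
    return {
        "prompt": coder.total_prompt_tokens,
        "completion": coder.total_completion_tokens,
        "cache_hit": coder.total_cache_hit_tokens,
        "cache_write": coder.total_cache_write_tokens,
    }
```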
33 changes: 33 additions & 0 deletions benchmark/README.md
@@ -83,6 +83,21 @@ You can run `./benchmark/benchmark.py --help` for a list of all the arguments, b
- `--num-tests` specifies how many of the tests to run before stopping. This is another way to start gently as you debug your benchmarking setup.
- `--keywords` runs only the tests whose names match the supplied argument (similar to `pytest -k xxxx`).
- `--read-model-settings=<filename.yml>` specifies model settings; see https://aider.chat/docs/config/adv-model-settings.html#model-settings
- `--resume` resumes a previously paused benchmark run from its checkpoint.
- `--edit-format architect` runs in architect mode, which uses two models: one to propose changes and another to implement them.
- `--editor-model` specifies the model to use for implementing changes in architect mode.
- `--reasoning-effort` sets the reasoning effort for models that support it (e.g., "high", "medium", "low"); several of these flags can be combined, as shown in the example below.
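
For instance, a hypothetical smoke-test invocation combining several of the flags above (the run name is a placeholder):

```
# Run only the first 3 tests whose names match "hello"
./benchmark/benchmark.py a-quick-smoke-test --model gpt-3.5-turbo --edit-format whole --keywords hello --num-tests 3 --threads 1
```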

### Pausing and Resuming Benchmarks

Benchmarks can take a long time to run. You can pause a running benchmark by pressing `Ctrl+C` once. The benchmark will complete the current test and then save a checkpoint before exiting. To resume the benchmark later, use the `--resume` flag:

```
# Resume a previously paused benchmark
./benchmark/benchmark.py YYYY-MM-DD-HH-MM-SS--a-helpful-name-for-this-run --resume --model gpt-3.5-turbo --edit-format whole --threads 10
```

When you resume a benchmark, it will pick up where it left off, using the list of pending tests from the checkpoint file. This allows you to run benchmarks over multiple sessions.

### Benchmark report

@@ -137,6 +152,24 @@ should be enough to reliably reproduce any benchmark run.
You can see examples of the benchmark report yaml in the
[aider leaderboard data files](https://github.com/Aider-AI/aider/blob/main/aider/website/_data/).

### Running benchmarks in architect mode

Architect mode uses two models: a main model that proposes changes and an editor model that implements them. This can be particularly useful for models that are good at reasoning but struggle with precise code edits.

Here's an example of running a benchmark in architect mode:

```
./benchmark/benchmark.py grok-mini-architect-deepseek-editor --model openrouter/x-ai/grok-3-mini-beta --editor-model openrouter/deepseek/deepseek-chat-v3-0324 --edit-format architect --threads 15 --exercises-dir polyglot-benchmark --reasoning-effort high
```

In this example:
- The main model is Grok-3-mini-beta (via OpenRouter)
- The editor model is DeepSeek Chat v3 (via OpenRouter)
- The edit format is set to "architect"
- Reasoning effort is set to "high"
- 15 threads are used for parallel processing

When running in architect mode, the benchmark report will include additional information about the editor model used.
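
For example, a report for the run above might carry entries like these (a sketch only: the field names are assumptions, so check the leaderboard data files linked earlier for the actual schema):

```
# Hypothetical excerpt of an architect-mode benchmark report
model: openrouter/x-ai/grok-3-mini-beta
edit_format: architect
editor_model: openrouter/deepseek/deepseek-chat-v3-0324
```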

## Limitations, notes
