Skip to content

Commit 9960735

Browse files
teknium1 authored and saxster committed
feat: thinking-only prefill continuation for structured reasoning responses (NousResearch#5931)
When the model produces structured reasoning (via API fields like .reasoning, .reasoning_content, .reasoning_details) but no visible text content, append the assistant message as prefill and continue the loop. The model sees its own reasoning context on the next turn and produces the text portion.

Inspired by clawdbot's 'incomplete-text' recovery pattern. Up to 2 prefill attempts before falling through to the existing '(empty)' terminal.

Key design decisions:
- Only triggers for structured reasoning (API fields), NOT inline <think> tags
- Prefill messages are popped on success to maintain strict role alternation
- _thinking_prefill marker stripped from all API message building paths
- Works across all providers: OpenAI (continuation), Anthropic (native prefill)

Verified with E2E tests: simulated thinking-only → real OpenRouter continuation produces correct content. Also confirmed Qwen models consistently produce structured-reasoning-only responses under token pressure.
1 parent c85f920 commit 9960735

File tree

2 files changed

+295
-358
lines changed

2 files changed

+295
-358
lines changed

run_agent.py

Lines changed: 58 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5699,6 +5699,7 @@ def flush_memories(self, messages: list = None, min_turns: int = None):
56995699
api_msg.pop("reasoning", None)
57005700
api_msg.pop("finish_reason", None)
57015701
api_msg.pop("_flush_sentinel", None)
5702+
api_msg.pop("_thinking_prefill", None)
57025703
if _is_strict_api:
57035704
self._sanitize_tool_calls_for_strict_api(api_msg)
57045705
api_messages.append(api_msg)
@@ -6775,7 +6776,7 @@ def _handle_max_iterations(self, messages: list, api_call_count: int) -> str:
67756776
api_messages = []
67766777
for msg in messages:
67776778
api_msg = msg.copy()
6778-
for internal_field in ("reasoning", "finish_reason"):
6779+
for internal_field in ("reasoning", "finish_reason", "_thinking_prefill"):
67796780
api_msg.pop(internal_field, None)
67806781
if _is_strict_api:
67816782
self._sanitize_tool_calls_for_strict_api(api_msg)
@@ -6986,6 +6987,7 @@ def run_conversation(
69866987
self._empty_content_retries = 0
69876988
self._incomplete_scratchpad_retries = 0
69886989
self._codex_incomplete_retries = 0
6990+
self._thinking_prefill_retries = 0
69896991
self._last_content_with_tools = None
69906992
self._mute_post_response = False
69916993
self._surrogate_sanitized = False
@@ -7321,6 +7323,8 @@ def run_conversation(
73217323
# Remove finish_reason - not accepted by strict APIs (e.g. Mistral)
73227324
if "finish_reason" in api_msg:
73237325
api_msg.pop("finish_reason")
7326+
# Strip internal thinking-prefill marker
7327+
api_msg.pop("_thinking_prefill", None)
73247328
# Strip Codex Responses API fields (call_id, response_item_id) for
73257329
# strict providers like Mistral that reject unknown fields with 422.
73267330
# Uses new dicts so the internal messages list retains the fields
@@ -8708,6 +8712,15 @@ def _stop_spinner():
87088712
if clean:
87098713
self._vprint(f" ┊ 💬 {clean}")
87108714

8715+
# Pop thinking-only prefill message(s) before appending
8716+
# (tool-call path — same rationale as the final-response path).
8717+
while (
8718+
messages
8719+
and isinstance(messages[-1], dict)
8720+
and messages[-1].get("_thinking_prefill")
8721+
):
8722+
messages.pop()
8723+
87118724
messages.append(assistant_msg)
87128725

87138726
# Close any open streaming display (response box, reasoning
@@ -8821,12 +8834,39 @@ def _stop_spinner():
88218834
self._response_was_previewed = True
88228835
break
88238836

8824-
# No fallback available — this is a genuine empty response.
8825-
# Retry in case the model just had a bad generation.
8837+
# ── Thinking-only prefill continuation ──────────
8838+
# The model produced structured reasoning (via API
8839+
# fields) but no visible text content. Rather than
8840+
# giving up, append the assistant message as-is and
8841+
# continue — the model will see its own reasoning
8842+
# on the next turn and produce the text portion.
8843+
_has_structured = bool(
8844+
getattr(assistant_message, "reasoning", None)
8845+
or getattr(assistant_message, "reasoning_content", None)
8846+
or getattr(assistant_message, "reasoning_details", None)
8847+
)
8848+
if _has_structured and self._thinking_prefill_retries < 2:
8849+
self._thinking_prefill_retries += 1
8850+
self._vprint(
8851+
f"{self.log_prefix}↻ Thinking-only response — "
8852+
f"prefilling to continue "
8853+
f"({self._thinking_prefill_retries}/2)"
8854+
)
8855+
interim_msg = self._build_assistant_message(
8856+
assistant_message, "incomplete"
8857+
)
8858+
interim_msg["_thinking_prefill"] = True
8859+
messages.append(interim_msg)
8860+
self._session_messages = messages
8861+
self._save_session_log(messages)
8862+
continue
8863+
8864+
# Exhausted prefill attempts or no structured
8865+
# reasoning — fall through to empty content retry.
88268866
if not hasattr(self, '_empty_content_retries'):
88278867
self._empty_content_retries = 0
88288868
self._empty_content_retries += 1
8829-
8869+
88308870
reasoning_text = self._extract_reasoning(assistant_message)
88318871
self._vprint(f"{self.log_prefix}⚠️ Response only contains think block with no content after it")
88328872
if reasoning_text:
@@ -8903,6 +8943,8 @@ def _stop_spinner():
89038943
# Reset retry counter on successful content
89048944
if hasattr(self, '_empty_content_retries'):
89058945
self._empty_content_retries = 0
8946+
self._last_empty_content_signature = None
8947+
self._thinking_prefill_retries = 0
89068948

89078949
if (
89088950
self.api_mode == "codex_responses"
@@ -8941,7 +8983,18 @@ def _stop_spinner():
89418983
final_response = self._strip_think_blocks(final_response).strip()
89428984

89438985
final_msg = self._build_assistant_message(assistant_message, finish_reason)
8944-
8986+
8987+
# Pop thinking-only prefill message(s) before appending
8988+
# the final response. This avoids consecutive assistant
8989+
# messages which break strict-alternation providers
8990+
# (Anthropic Messages API) and keeps history clean.
8991+
while (
8992+
messages
8993+
and isinstance(messages[-1], dict)
8994+
and messages[-1].get("_thinking_prefill")
8995+
):
8996+
messages.pop()
8997+
89458998
messages.append(final_msg)
89468999

89479000
if not self.quiet_mode:

0 commit comments

Comments
 (0)