saxster
diff --git a/‎run_agent.py‎
Lines changed: 58 additions & 5 deletions b/‎run_agent.py‎
Lines changed: 58 additions & 5 deletions
@@ -5699,6 +5699,7 @@ def flush_memories(self, messages: list = None, min_turns: int = None):
                 api_msg.pop("reasoning", None)
                 api_msg.pop("finish_reason", None)
                 api_msg.pop("_flush_sentinel", None)
+                api_msg.pop("_thinking_prefill", None)
                 if _is_strict_api:
                     self._sanitize_tool_calls_for_strict_api(api_msg)
                 api_messages.append(api_msg)
@@ -6775,7 +6776,7 @@ def _handle_max_iterations(self, messages: list, api_call_count: int) -> str:
             api_messages = []
             for msg in messages:
                 api_msg = msg.copy()
-                for internal_field in ("reasoning", "finish_reason"):
+                for internal_field in ("reasoning", "finish_reason", "_thinking_prefill"):
                     api_msg.pop(internal_field, None)
                 if _is_strict_api:
                     self._sanitize_tool_calls_for_strict_api(api_msg)
@@ -6986,6 +6987,7 @@ def run_conversation(
         self._empty_content_retries = 0
         self._incomplete_scratchpad_retries = 0
         self._codex_incomplete_retries = 0
+        self._thinking_prefill_retries = 0
         self._last_content_with_tools = None
         self._mute_post_response = False
         self._surrogate_sanitized = False
@@ -7321,6 +7323,8 @@ def run_conversation(
                 # Remove finish_reason - not accepted by strict APIs (e.g. Mistral)
                 if "finish_reason" in api_msg:
                     api_msg.pop("finish_reason")
+                # Strip internal thinking-prefill marker
+                api_msg.pop("_thinking_prefill", None)
                 # Strip Codex Responses API fields (call_id, response_item_id) for
                 # strict providers like Mistral that reject unknown fields with 422.
                 # Uses new dicts so the internal messages list retains the fields
@@ -8708,6 +8712,15 @@ def _stop_spinner():
                             if clean:
                                 self._vprint(f"  ┊ 💬 {clean}")
 
+                    # Pop thinking-only prefill message(s) before appending
+                    # (tool-call path — same rationale as the final-response path).
+                    while (
+                        messages
+                        and isinstance(messages[-1], dict)
+                        and messages[-1].get("_thinking_prefill")
+                    ):
+                        messages.pop()
+
                     messages.append(assistant_msg)
 
                     # Close any open streaming display (response box, reasoning
@@ -8821,12 +8834,39 @@ def _stop_spinner():
                             self._response_was_previewed = True
                             break
 
-                        # No fallback available — this is a genuine empty response.
-                        # Retry in case the model just had a bad generation.
+                        # ── Thinking-only prefill continuation ──────────
+                        # The model produced structured reasoning (via API
+                        # fields) but no visible text content.  Rather than
+                        # giving up, append the assistant message as-is and
+                        # continue — the model will see its own reasoning
+                        # on the next turn and produce the text portion.
+                        _has_structured = bool(
+                            getattr(assistant_message, "reasoning", None)
+                            or getattr(assistant_message, "reasoning_content", None)
+                            or getattr(assistant_message, "reasoning_details", None)
+                        )
+                        if _has_structured and self._thinking_prefill_retries < 2:
+                            self._thinking_prefill_retries += 1
+                            self._vprint(
+                                f"{self.log_prefix}↻ Thinking-only response — "
+                                f"prefilling to continue "
+                                f"({self._thinking_prefill_retries}/2)"
+                            )
+                            interim_msg = self._build_assistant_message(
+                                assistant_message, "incomplete"
+                            )
+                            interim_msg["_thinking_prefill"] = True
+                            messages.append(interim_msg)
+                            self._session_messages = messages
+                            self._save_session_log(messages)
+                            continue
+
+                        # Exhausted prefill attempts or no structured
+                        # reasoning — fall through to empty content retry.
                         if not hasattr(self, '_empty_content_retries'):
                             self._empty_content_retries = 0
                         self._empty_content_retries += 1
-                        
+
                         reasoning_text = self._extract_reasoning(assistant_message)
                         self._vprint(f"{self.log_prefix}⚠️  Response only contains think block with no content after it")
                         if reasoning_text:
@@ -8903,6 +8943,8 @@ def _stop_spinner():
                     # Reset retry counter on successful content
                     if hasattr(self, '_empty_content_retries'):
                         self._empty_content_retries = 0
+                    self._last_empty_content_signature = None
+                    self._thinking_prefill_retries = 0
 
                     if (
                         self.api_mode == "codex_responses"
@@ -8941,7 +8983,18 @@ def _stop_spinner():
                     final_response = self._strip_think_blocks(final_response).strip()
 
                     final_msg = self._build_assistant_message(assistant_message, finish_reason)
-                    
+
+                    # Pop thinking-only prefill message(s) before appending
+                    # the final response.  This avoids consecutive assistant
+                    # messages which break strict-alternation providers
+                    # (Anthropic Messages API) and keeps history clean.
+                    while (
+                        messages
+                        and isinstance(messages[-1], dict)
+                        and messages[-1].get("_thinking_prefill")
+                    ):
+                        messages.pop()
+
                     messages.append(final_msg)
 
                     if not self.quiet_mode: