From bfdd15d1fa2abcd40b56cf6bb895a4fb3c4211b2 Mon Sep 17 00:00:00 2001
From: Ashwin Bharambe
Date: Wed, 28 May 2025 13:17:48 -0700
Subject: [PATCH 1/3] fix(responses): use input, not original_input when
 storing the Response (#2300)

We must store the full (re-hydrated) input, not just the original input, in
the Response object. Of course, this is not very space efficient and we
should likely find a better storage scheme so that we can store only unique
entries in the database and re-hydrate them efficiently later. But that can
be done safely later.

Closes https://github.com/meta-llama/llama-stack/issues/2299

## Test Plan

Unit test
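As an illustration (a sketch, not code from this patch): when `previous_response_id` is supplied, the stored input is the previous turn's input, followed by the previous turn's output, followed by the new user message. Plain dicts stand in for the actual `OpenAIResponseMessage` objects, and `rehydrate_input` is a hypothetical helper.

```python
# Sketch only: plain dicts stand in for OpenAIResponseMessage objects, and
# rehydrate_input is a hypothetical helper, not a function from this patch.
def rehydrate_input(previous_response: dict, new_input: str) -> list[dict]:
    """Combine a stored response's input and output with the new user message."""
    combined = list(previous_response["input"])   # earlier turns, as stored
    combined.extend(previous_response["output"])  # the assistant's prior reply
    combined.append({"role": "user", "content": new_input})
    return combined


previous = {
    "input": [{"role": "user", "content": "What is 2+2?"}],
    "output": [{"role": "assistant", "content": "2+2 equals 4."}],
}
history = rehydrate_input(previous, "Now what is 3+3?")
assert [m["role"] for m in history] == ["user", "assistant", "user"]
```

This three-item shape is what the new unit test below asserts on.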
---
 .../agents/meta_reference/openai_responses.py | 21 +++---
 .../meta_reference/test_openai_responses.py   | 66 +++++++++++++++++++
 2 files changed, 76 insertions(+), 11 deletions(-)

diff --git a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
index 3a56d41ef..1fcb1c461 100644
--- a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
+++ b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py
@@ -292,12 +292,12 @@ class OpenAIResponsesImpl:
     async def _store_response(
         self,
         response: OpenAIResponseObject,
-        original_input: str | list[OpenAIResponseInput],
+        input: str | list[OpenAIResponseInput],
     ) -> None:
         new_input_id = f"msg_{uuid.uuid4()}"
-        if isinstance(original_input, str):
+        if isinstance(input, str):
             # synthesize a message from the input string
-            input_content = OpenAIResponseInputMessageContentText(text=original_input)
+            input_content = OpenAIResponseInputMessageContentText(text=input)
             input_content_item = OpenAIResponseMessage(
                 role="user",
                 content=[input_content],
@@ -307,7 +307,7 @@ class OpenAIResponsesImpl:
         else:
             # we already have a list of messages
             input_items_data = []
-            for input_item in original_input:
+            for input_item in input:
                 if isinstance(input_item, OpenAIResponseMessage):
                     # These may or may not already have an id, so dump to dict, check for id, and add if missing
                     input_item_dict = input_item.model_dump()
@@ -334,7 +334,6 @@ class OpenAIResponsesImpl:
         tools: list[OpenAIResponseInputTool] | None = None,
     ):
         stream = False if stream is None else stream
-        original_input = input  # Keep reference for storage

         output_messages: list[OpenAIResponseOutput] = []

@@ -372,7 +371,7 @@ class OpenAIResponsesImpl:
                 inference_result=inference_result,
                 ctx=ctx,
                 output_messages=output_messages,
-                original_input=original_input,
+                input=input,
                 model=model,
                 store=store,
                 tools=tools,
@@ -382,7 +381,7 @@ class OpenAIResponsesImpl:
                 inference_result=inference_result,
                 ctx=ctx,
                 output_messages=output_messages,
-                original_input=original_input,
+                input=input,
                 model=model,
                 store=store,
                 tools=tools,
@@ -393,7 +392,7 @@ class OpenAIResponsesImpl:
         inference_result: Any,
         ctx: ChatCompletionContext,
         output_messages: list[OpenAIResponseOutput],
-        original_input: str | list[OpenAIResponseInput],
+        input: str | list[OpenAIResponseInput],
         model: str,
         store: bool | None,
         tools: list[OpenAIResponseInputTool] | None,
@@ -423,7 +422,7 @@ class OpenAIResponsesImpl:
         if store:
             await self._store_response(
                 response=response,
-                original_input=original_input,
+                input=input,
             )

         return response
@@ -433,7 +432,7 @@ class OpenAIResponsesImpl:
         inference_result: Any,
         ctx: ChatCompletionContext,
         output_messages: list[OpenAIResponseOutput],
-        original_input: str | list[OpenAIResponseInput],
+        input: str | list[OpenAIResponseInput],
         model: str,
         store: bool | None,
         tools: list[OpenAIResponseInputTool] | None,
@@ -544,7 +543,7 @@ class OpenAIResponsesImpl:
         if store:
             await self._store_response(
                 response=final_response,
-                original_input=original_input,
+                input=input,
             )

         # Emit response.completed
diff --git a/tests/unit/providers/agents/meta_reference/test_openai_responses.py b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
index 9c491accb..5b6cee0ec 100644
--- a/tests/unit/providers/agents/meta_reference/test_openai_responses.py
+++ b/tests/unit/providers/agents/meta_reference/test_openai_responses.py
@@ -628,3 +628,69 @@ async def test_responses_store_list_input_items_logic():
     result = await responses_store.list_response_input_items("resp_123", limit=0, order=Order.asc)
     assert result.object == "list"
     assert len(result.data) == 0  # Should return no items
+
+
+@pytest.mark.asyncio
+async def test_store_response_uses_rehydrated_input_with_previous_response(
+    openai_responses_impl, mock_responses_store, mock_inference_api
+):
+    """Test that _store_response uses the full re-hydrated input (including previous responses)
+    rather than just the original input when previous_response_id is provided."""
+
+    # Setup - Create a previous response that should be included in the stored input
+    previous_response = OpenAIResponseObjectWithInput(
+        id="resp-previous-123",
+        object="response",
+        created_at=1234567890,
+        model="meta-llama/Llama-3.1-8B-Instruct",
+        status="completed",
+        input=[
+            OpenAIResponseMessage(
+                id="msg-prev-user", role="user", content=[OpenAIResponseInputMessageContentText(text="What is 2+2?")]
+            )
+        ],
+        output=[
+            OpenAIResponseMessage(
+                id="msg-prev-assistant",
+                role="assistant",
+                content=[OpenAIResponseOutputMessageContentOutputText(text="2+2 equals 4.")],
+            )
+        ],
+    )
+
+    mock_responses_store.get_response_object.return_value = previous_response
+
+    current_input = "Now what is 3+3?"
+    model = "meta-llama/Llama-3.1-8B-Instruct"
+    mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
+    mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
+
+    # Execute - Create response with previous_response_id
+    result = await openai_responses_impl.create_openai_response(
+        input=current_input,
+        model=model,
+        previous_response_id="resp-previous-123",
+        store=True,
+    )
+
+    store_call_args = mock_responses_store.store_response_object.call_args
+    stored_input = store_call_args.kwargs["input"]
+
+    # Verify that the stored input contains the full re-hydrated conversation:
+    # 1. Previous user message
+    # 2. Previous assistant response
+    # 3. Current user message
+    assert len(stored_input) == 3
+
+    assert stored_input[0].role == "user"
+    assert stored_input[0].content[0].text == "What is 2+2?"
+
+    assert stored_input[1].role == "assistant"
+    assert stored_input[1].content[0].text == "2+2 equals 4."
+
+    assert stored_input[2].role == "user"
+    assert stored_input[2].content == "Now what is 3+3?"
+
+    # Verify the response itself is correct
+    assert result.model == model
+    assert result.status == "completed"

From f0d8ceb2422247b1c68bfda9d92f9561012310df Mon Sep 17 00:00:00 2001
From: Mark Campbell
Date: Thu, 29 May 2025 17:53:45 +0100
Subject: [PATCH 2/3] chore: fix flaky distro_codegen script (#2305)

# What does this PR do?
Adds an import for all of the template modules before the executor to
prevent deadlock

Closes #2278

## Test Plan

```
# Run the pre-commit multiple times and verify the deadlock doesn't occur
for i in {1..10}; do pre-commit run --all-files; done
```
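For context, the workaround follows a common pattern: perform every import serially in the main thread before handing work to the executor. The sketch below is a self-contained illustration with placeholder module names, not the distro_codegen code itself; that the hang comes from workers racing on first-time imports is an assumption here, not something the patch states.

```python
# Minimal sketch of the pre-import pattern, with placeholder module names.
import concurrent.futures
import importlib

MODULE_NAMES = ["json", "csv", "sqlite3"]  # placeholders


def pre_import(module_names: list[str]) -> None:
    # Do every import serially, before any worker threads exist.
    for name in module_names:
        importlib.import_module(name)


def work(name: str) -> str:
    # Workers now find the module already in sys.modules instead of importing it.
    return importlib.import_module(name).__name__


if __name__ == "__main__":
    pre_import(MODULE_NAMES)
    with concurrent.futures.ThreadPoolExecutor() as executor:
        print(list(executor.map(work, MODULE_NAMES)))
```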
---
 scripts/distro_codegen.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/scripts/distro_codegen.py b/scripts/distro_codegen.py
index 8820caf55..d33c5de67 100755
--- a/scripts/distro_codegen.py
+++ b/scripts/distro_codegen.py
@@ -107,6 +107,13 @@ def collect_template_dependencies(template_dir: Path) -> tuple[str | None, list[
         return None, []


+def pre_import_templates(template_dirs: list[Path]) -> None:
+    # Pre-import all template modules to avoid deadlocks.
+    for template_dir in template_dirs:
+        module_name = f"llama_stack.templates.{template_dir.name}"
+        importlib.import_module(module_name)
+
+
 def main():
     templates_dir = REPO_ROOT / "llama_stack" / "templates"
     change_tracker = ChangedPathTracker()
@@ -118,6 +125,8 @@ def main():
         template_dirs = list(find_template_dirs(templates_dir))
         task = progress.add_task("Processing distribution templates...", total=len(template_dirs))

+        pre_import_templates(template_dirs)
+
         # Create a partial function with the progress bar
         process_func = partial(process_template, progress=progress, change_tracker=change_tracker)


From 168c7113dfd779825cf116727974e60204fda148 Mon Sep 17 00:00:00 2001
From: Jorge Piedrahita Ortiz
Date: Thu, 29 May 2025 11:54:23 -0500
Subject: [PATCH 3/3] fix(providers): update sambanova json schema mode (#2306)

# What does this PR do?
Updates sambanova inference to use strict as false in json_schema structured
output

## Test Plan

pytest -s -v tests/integration/inference/test_text_inference.py --stack-config=sambanova --text-model=sambanova/Meta-Llama-3.3-70B-Instruct
---
 llama_stack/providers/remote/inference/sambanova/sambanova.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama_stack/providers/remote/inference/sambanova/sambanova.py b/llama_stack/providers/remote/inference/sambanova/sambanova.py
index d182aa1dc..20f863665 100644
--- a/llama_stack/providers/remote/inference/sambanova/sambanova.py
+++ b/llama_stack/providers/remote/inference/sambanova/sambanova.py
@@ -218,7 +218,7 @@ class SambaNovaInferenceAdapter(LiteLLMOpenAIMixin):
                 "json_schema": {
                     "name": name,
                     "schema": fmt,
-                    "strict": True,
+                    "strict": False,
                 },
             }
         if request.tools:
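For reference, a hedged sketch of what the change above means from a caller's point of view: the structured-output request now carries `"strict": False` inside its `json_schema` response format. The endpoint URL, API key, model id, and schema below are placeholders, not values taken from the patch.

```python
# Hypothetical client-side view; adjust base_url, api_key, and model for your
# deployment. Only the shape of response_format mirrors the adapter change.
from openai import OpenAI

client = OpenAI(base_url="https://api.sambanova.ai/v1", api_key="...")  # placeholders

schema = {
    "type": "object",
    "properties": {"answer": {"type": "string"}},
    "required": ["answer"],
}

completion = client.chat.completions.create(
    model="Meta-Llama-3.3-70B-Instruct",  # placeholder model id
    messages=[{"role": "user", "content": "What is 2+2? Answer as JSON."}],
    response_format={
        "type": "json_schema",
        "json_schema": {"name": "answer", "schema": schema, "strict": False},
    },
)
print(completion.choices[0].message.content)
```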