Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-06-28 10:54:19 +00:00)

Commit b05a3db358: Merge branch 'main' into fix/divide-by-zero-exception-faiss-query-vector

19 changed files with 254 additions and 389 deletions
.github/actions/setup-ollama/action.yml (vendored), 23 lines changed
@@ -1,26 +1,9 @@
 name: Setup Ollama
-description: Start Ollama and cache model
-inputs:
-  models:
-    description: Comma-separated list of models to pull
-    default: "llama3.2:3b-instruct-fp16,all-minilm:latest"
+description: Start Ollama
 runs:
   using: "composite"
   steps:
-    - name: Install and start Ollama
+    - name: Start Ollama
       shell: bash
       run: |
-        # the ollama installer also starts the ollama service
-        curl -fsSL https://ollama.com/install.sh | sh
-
-        # Do NOT cache models - pulling the cache is actually slower than just pulling the model.
-        # It takes ~45 seconds to pull the models from the cache and unpack it, but only 30 seconds to
-        # pull them directly.
-        # Maybe this is because the cache is being pulled at the same time by all the matrix jobs?
-    - name: Pull requested models
-      if: inputs.models != ''
-      shell: bash
-      run: |
-        for model in $(echo "${{ inputs.models }}" | tr ',' ' '); do
-          ollama pull "$model"
-        done
+        docker run -d --name ollama -p 11434:11434 docker.io/leseb/ollama-with-models

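Since the models are baked into that image, the action no longer pulls anything at job time; the container just needs a moment to start serving. A follow-up step could confirm the API is up with a probe along these lines (a sketch, not part of the action; wait_for_ollama is a hypothetical helper, and it assumes the port mapping above plus Ollama's standard /api/tags endpoint):

import time
import urllib.request


def wait_for_ollama(url: str = "http://0.0.0.0:11434", timeout: float = 60.0) -> None:
    """Poll the Ollama HTTP API until it answers or the timeout expires."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(f"{url}/api/tags") as resp:  # lists local models
                if resp.status == 200:
                    return
        except OSError:
            pass
        time.sleep(1)
    raise TimeoutError(f"Ollama at {url} did not become ready within {timeout}s")
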
.github/actions/setup-runner/action.yml (vendored), 7 lines changed
@@ -1,12 +1,17 @@
 name: Setup runner
 description: Prepare a runner for the tests (install uv, python, project dependencies, etc.)
+inputs:
+  python-version:
+    description: The Python version to use
+    required: false
+    default: "3.10"
 runs:
   using: "composite"
   steps:
     - name: Install uv
       uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
       with:
-        python-version: "3.10"
+        python-version: ${{ inputs.python-version }}
         activate-environment: true
         version: 0.7.6

.github/workflows/integration-tests.yml (vendored), 12 lines changed
@@ -26,6 +26,7 @@ jobs:
         # TODO: generate matrix list from tests/integration when fixed
         test-type: [agents, inference, datasets, inspect, scoring, post_training, providers, tool_runtime]
         client-type: [library, http]
+        python-version: ["3.10", "3.11", "3.12"]
      fail-fast: false # we want to run all tests regardless of failure

    steps:
@@ -34,20 +35,22 @@ jobs:

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
+        with:
+          python-version: ${{ matrix.python-version }}

      - name: Setup ollama
        uses: ./.github/actions/setup-ollama

      - name: Build Llama Stack
        run: |
-          llama stack build --template ollama --image-type venv
+          uv run llama stack build --template ollama --image-type venv

      - name: Start Llama Stack server in background
        if: matrix.client-type == 'http'
        env:
          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
        run: |
-          LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv &
+          LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv --env OLLAMA_URL="http://0.0.0.0:11434" &

      - name: Wait for Llama Stack server to be ready
        if: matrix.client-type == 'http'
@@ -84,6 +87,7 @@ jobs:
      - name: Run Integration Tests
        env:
          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
+          OLLAMA_URL: "http://0.0.0.0:11434"
        run: |
          if [ "${{ matrix.client-type }}" == "library" ]; then
            stack_config="ollama"
@@ -104,13 +108,13 @@ jobs:
      - name: Write ollama logs to file
        if: ${{ always() }}
        run: |
-          sudo journalctl -u ollama.service > ollama.log
+          sudo docker logs ollama > ollama.log

      - name: Upload all logs to artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
-          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}
+          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}
          path: |
            *.log
          retention-days: 1

@@ -43,23 +43,12 @@ def get_provider_dependencies(
     config: BuildConfig | DistributionTemplate,
 ) -> tuple[list[str], list[str]]:
     """Get normal and special dependencies from provider configuration."""
-    # Extract providers based on config type
     if isinstance(config, DistributionTemplate):
-        providers = config.providers
+        config = config.build_config()
+
+    providers = config.distribution_spec.providers
+    additional_pip_packages = config.additional_pip_packages

-        # TODO: This is a hack to get the dependencies for internal APIs into build
-        # We should have a better way to do this by formalizing the concept of "internal" APIs
-        # and providers, with a way to specify dependencies for them.
-        run_configs = config.run_configs
-        additional_pip_packages: list[str] = []
-        if run_configs:
-            for run_config in run_configs.values():
-                run_config_ = run_config.run_config(name="", providers={}, container_image=None)
-                if run_config_.inference_store:
-                    additional_pip_packages.extend(run_config_.inference_store.pip_packages)
-    elif isinstance(config, BuildConfig):
-        providers = config.distribution_spec.providers
-        additional_pip_packages = config.additional_pip_packages
     deps = []
     registry = get_provider_registry(config)
     for api_str, provider_or_providers in providers.items():
@@ -87,8 +76,7 @@ def get_provider_dependencies(
         else:
             normal_deps.append(package)

-    if additional_pip_packages:
-        normal_deps.extend(additional_pip_packages)
+    normal_deps.extend(additional_pip_packages or [])

     return list(set(normal_deps)), list(set(special_deps))

@@ -149,12 +149,13 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
                 logger.info(f"Removed handler {handler.__class__.__name__} from root logger")

     def request(self, *args, **kwargs):
+        # NOTE: We are using AsyncLlamaStackClient under the hood
+        # A new event loop is needed to convert the AsyncStream
+        # from async client into SyncStream return type for streaming
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+
         if kwargs.get("stream"):
-            # NOTE: We are using AsyncLlamaStackClient under the hood
-            # A new event loop is needed to convert the AsyncStream
-            # from async client into SyncStream return type for streaming
-            loop = asyncio.new_event_loop()
-            asyncio.set_event_loop(loop)
-
             def sync_generator():
                 try:
@@ -172,7 +173,14 @@ class LlamaStackAsLibraryClient(LlamaStackClient):

             return sync_generator()
         else:
-            return asyncio.run(self.async_client.request(*args, **kwargs))
+            try:
+                result = loop.run_until_complete(self.async_client.request(*args, **kwargs))
+            finally:
+                pending = asyncio.all_tasks(loop)
+                if pending:
+                    loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
+                loop.close()
+            return result


 class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):

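The sync wrapper above now creates one event loop up front, reuses it for both the streaming and non-streaming branches, and drains whatever tasks the async client left behind before closing the loop. The same cleanup pattern in isolation (a minimal sketch with a made-up coroutine, not llama-stack code):

import asyncio


async def _work() -> str:
    # Stand-in for async_client.request(...); may leave background tasks scheduled.
    asyncio.get_running_loop().create_task(asyncio.sleep(0.1))
    return "done"


def run_and_drain() -> str:
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        result = loop.run_until_complete(_work())
    finally:
        # Let leftover tasks finish before tearing the loop down, which avoids
        # "Task was destroyed but it is pending!" warnings.
        pending = asyncio.all_tasks(loop)
        if pending:
            loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
        loop.close()
    return result


print(run_and_drain())  # prints "done"
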
@@ -8,7 +8,7 @@ import json
 import time
 import uuid
 from collections.abc import AsyncIterator
-from typing import Any, cast
+from typing import Any

 from openai.types.chat import ChatCompletionToolParam
 from pydantic import BaseModel
@@ -200,7 +200,6 @@ class ChatCompletionContext(BaseModel):
     messages: list[OpenAIMessageParam]
     tools: list[ChatCompletionToolParam] | None = None
     mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP]
-    stream: bool
     temperature: float | None
     response_format: OpenAIResponseFormatParam
@@ -281,49 +280,6 @@ class OpenAIResponsesImpl:
         """
         return await self.responses_store.list_response_input_items(response_id, after, before, include, limit, order)

-    def _is_function_tool_call(
-        self,
-        tool_call: OpenAIChatCompletionToolCall,
-        tools: list[OpenAIResponseInputTool],
-    ) -> bool:
-        if not tool_call.function:
-            return False
-        for t in tools:
-            if t.type == "function" and t.name == tool_call.function.name:
-                return True
-        return False
-
-    async def _process_response_choices(
-        self,
-        chat_response: OpenAIChatCompletion,
-        ctx: ChatCompletionContext,
-        tools: list[OpenAIResponseInputTool] | None,
-    ) -> list[OpenAIResponseOutput]:
-        """Handle tool execution and response message creation."""
-        output_messages: list[OpenAIResponseOutput] = []
-        # Execute tool calls if any
-        for choice in chat_response.choices:
-            if choice.message.tool_calls and tools:
-                # Assume if the first tool is a function, all tools are functions
-                if self._is_function_tool_call(choice.message.tool_calls[0], tools):
-                    for tool_call in choice.message.tool_calls:
-                        output_messages.append(
-                            OpenAIResponseOutputMessageFunctionToolCall(
-                                arguments=tool_call.function.arguments or "",
-                                call_id=tool_call.id,
-                                name=tool_call.function.name or "",
-                                id=f"fc_{uuid.uuid4()}",
-                                status="completed",
-                            )
-                        )
-                else:
-                    tool_messages = await self._execute_tool_and_return_final_output(choice, ctx)
-                    output_messages.extend(tool_messages)
-            else:
-                output_messages.append(await _convert_chat_choice_to_response_message(choice))
-
-        return output_messages
-
     async def _store_response(
         self,
         response: OpenAIResponseObject,
@@ -370,9 +326,48 @@ class OpenAIResponsesImpl:
         tools: list[OpenAIResponseInputTool] | None = None,
         max_infer_iters: int | None = 10,
     ):
-        stream = False if stream is None else stream
+        stream = bool(stream)
         text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text

+        stream_gen = self._create_streaming_response(
+            input=input,
+            model=model,
+            instructions=instructions,
+            previous_response_id=previous_response_id,
+            store=store,
+            temperature=temperature,
+            text=text,
+            tools=tools,
+            max_infer_iters=max_infer_iters,
+        )
+
+        if stream:
+            return stream_gen
+        else:
+            response = None
+            async for stream_chunk in stream_gen:
+                if stream_chunk.type == "response.completed":
+                    if response is not None:
+                        raise ValueError("The response stream completed multiple times! Earlier response: {response}")
+                    response = stream_chunk.response
+                # don't leave the generator half complete!
+
+            if response is None:
+                raise ValueError("The response stream never completed")
+            return response
+
+    async def _create_streaming_response(
+        self,
+        input: str | list[OpenAIResponseInput],
+        model: str,
+        instructions: str | None = None,
+        previous_response_id: str | None = None,
+        store: bool | None = True,
+        temperature: float | None = None,
+        text: OpenAIResponseText | None = None,
+        tools: list[OpenAIResponseInputTool] | None = None,
+        max_infer_iters: int | None = 10,
+    ) -> AsyncIterator[OpenAIResponseObjectStream]:
         output_messages: list[OpenAIResponseOutput] = []

         # Input preprocessing
@@ -383,7 +378,7 @@ class OpenAIResponsesImpl:
         # Structured outputs
         response_format = await _convert_response_text_to_chat_response_format(text)

-        # Tool setup
+        # Tool setup, TODO: refactor this slightly since this can also yield events
         chat_tools, mcp_tool_to_server, mcp_list_message = (
             await self._convert_response_tools_to_chat_tools(tools) if tools else (None, {}, None)
         )
@@ -395,136 +390,10 @@ class OpenAIResponsesImpl:
             messages=messages,
             tools=chat_tools,
             mcp_tool_to_server=mcp_tool_to_server,
-            stream=stream,
             temperature=temperature,
             response_format=response_format,
         )

-        # Fork to streaming vs non-streaming - let each handle ALL inference rounds
-        if stream:
-            return self._create_streaming_response(
-                ctx=ctx,
-                output_messages=output_messages,
-                input=input,
-                model=model,
-                store=store,
-                text=text,
-                tools=tools,
-                max_infer_iters=max_infer_iters,
-            )
-        else:
-            return await self._create_non_streaming_response(
-                ctx=ctx,
-                output_messages=output_messages,
-                input=input,
-                model=model,
-                store=store,
-                text=text,
-                tools=tools,
-                max_infer_iters=max_infer_iters,
-            )
-
-    async def _create_non_streaming_response(
-        self,
-        ctx: ChatCompletionContext,
-        output_messages: list[OpenAIResponseOutput],
-        input: str | list[OpenAIResponseInput],
-        model: str,
-        store: bool | None,
-        text: OpenAIResponseText,
-        tools: list[OpenAIResponseInputTool] | None,
-        max_infer_iters: int,
-    ) -> OpenAIResponseObject:
-        n_iter = 0
-        messages = ctx.messages.copy()
-
-        while True:
-            # Do inference (including the first one)
-            inference_result = await self.inference_api.openai_chat_completion(
-                model=ctx.model,
-                messages=messages,
-                tools=ctx.tools,
-                stream=False,
-                temperature=ctx.temperature,
-                response_format=ctx.response_format,
-            )
-            completion = OpenAIChatCompletion(**inference_result.model_dump())
-
-            # Separate function vs non-function tool calls
-            function_tool_calls = []
-            non_function_tool_calls = []
-
-            for choice in completion.choices:
-                if choice.message.tool_calls and tools:
-                    for tool_call in choice.message.tool_calls:
-                        if self._is_function_tool_call(tool_call, tools):
-                            function_tool_calls.append(tool_call)
-                        else:
-                            non_function_tool_calls.append(tool_call)
-
-            # Process response choices based on tool call types
-            if function_tool_calls:
-                # For function tool calls, use existing logic and return immediately
-                current_output_messages = await self._process_response_choices(
-                    chat_response=completion,
-                    ctx=ctx,
-                    tools=tools,
-                )
-                output_messages.extend(current_output_messages)
-                break
-            elif non_function_tool_calls:
-                # For non-function tool calls, execute them and continue loop
-                for choice in completion.choices:
-                    tool_outputs, tool_response_messages = await self._execute_tool_calls_only(choice, ctx)
-                    output_messages.extend(tool_outputs)
-
-                    # Add assistant message and tool responses to messages for next iteration
-                    messages.append(choice.message)
-                    messages.extend(tool_response_messages)
-
-                n_iter += 1
-                if n_iter >= max_infer_iters:
-                    break
-
-                # Continue with next iteration of the loop
-                continue
-            else:
-                # No tool calls - convert response to message and we're done
-                for choice in completion.choices:
-                    output_messages.append(await _convert_chat_choice_to_response_message(choice))
-                break
-
-        response = OpenAIResponseObject(
-            created_at=completion.created,
-            id=f"resp-{uuid.uuid4()}",
-            model=model,
-            object="response",
-            status="completed",
-            output=output_messages,
-            text=text,
-        )
-        logger.debug(f"OpenAI Responses response: {response}")
-
-        # Store response if requested
-        if store:
-            await self._store_response(
-                response=response,
-                input=input,
-            )
-
-        return response
-
-    async def _create_streaming_response(
-        self,
-        ctx: ChatCompletionContext,
-        output_messages: list[OpenAIResponseOutput],
-        input: str | list[OpenAIResponseInput],
-        model: str,
-        store: bool | None,
-        text: OpenAIResponseText,
-        tools: list[OpenAIResponseInputTool] | None,
-        max_infer_iters: int | None,
-    ) -> AsyncIterator[OpenAIResponseObjectStream]:
         # Create initial response and emit response.created immediately
         response_id = f"resp-{uuid.uuid4()}"
         created_at = int(time.time())
@@ -539,15 +408,13 @@ class OpenAIResponsesImpl:
             text=text,
         )

-        # Emit response.created immediately
         yield OpenAIResponseObjectStreamResponseCreated(response=initial_response)

-        # Implement tool execution loop for streaming - handle ALL inference rounds including the first
         n_iter = 0
         messages = ctx.messages.copy()

         while True:
-            current_inference_result = await self.inference_api.openai_chat_completion(
+            completion_result = await self.inference_api.openai_chat_completion(
                 model=ctx.model,
                 messages=messages,
                 tools=ctx.tools,
@@ -568,7 +435,7 @@ class OpenAIResponsesImpl:
             # Create a placeholder message item for delta events
             message_item_id = f"msg_{uuid.uuid4()}"

-            async for chunk in current_inference_result:
+            async for chunk in completion_result:
                 chat_response_id = chunk.id
                 chunk_created = chunk.created
                 chunk_model = chunk.model
@@ -628,50 +495,55 @@ class OpenAIResponsesImpl:
                 model=chunk_model,
             )

-            # Separate function vs non-function tool calls
             function_tool_calls = []
             non_function_tool_calls = []

+            next_turn_messages = messages.copy()
             for choice in current_response.choices:
+                next_turn_messages.append(choice.message)
+
                 if choice.message.tool_calls and tools:
                     for tool_call in choice.message.tool_calls:
-                        if self._is_function_tool_call(tool_call, tools):
+                        if _is_function_tool_call(tool_call, tools):
                             function_tool_calls.append(tool_call)
                         else:
                             non_function_tool_calls.append(tool_call)
+                else:
-            # Process response choices based on tool call types
-            if function_tool_calls:
-                # For function tool calls, use existing logic and break
-                current_output_messages = await self._process_response_choices(
-                    chat_response=current_response,
-                    ctx=ctx,
-                    tools=tools,
-                )
-                output_messages.extend(current_output_messages)
-                break
-            elif non_function_tool_calls:
-                # For non-function tool calls, execute them and continue loop
-                for choice in current_response.choices:
-                    tool_outputs, tool_response_messages = await self._execute_tool_calls_only(choice, ctx)
-                    output_messages.extend(tool_outputs)
-
-                    # Add assistant message and tool responses to messages for next iteration
-                    messages.append(choice.message)
-                    messages.extend(tool_response_messages)
-
-                n_iter += 1
-                if n_iter >= (max_infer_iters or 10):
-                    break
-
-                # Continue with next iteration of the loop
-                continue
-            else:
-                # No tool calls - convert response to message and we're done
-                for choice in current_response.choices:
                     output_messages.append(await _convert_chat_choice_to_response_message(choice))

+            # execute non-function tool calls
+            for tool_call in non_function_tool_calls:
+                tool_call_log, tool_response_message = await self._execute_tool_call(tool_call, ctx)
+                if tool_call_log:
+                    output_messages.append(tool_call_log)
+                if tool_response_message:
+                    next_turn_messages.append(tool_response_message)
+
+            for tool_call in function_tool_calls:
+                output_messages.append(
+                    OpenAIResponseOutputMessageFunctionToolCall(
+                        arguments=tool_call.function.arguments or "",
+                        call_id=tool_call.id,
+                        name=tool_call.function.name or "",
+                        id=f"fc_{uuid.uuid4()}",
+                        status="completed",
+                    )
+                )
+
+            if not function_tool_calls and not non_function_tool_calls:
                 break

+            if function_tool_calls:
+                logger.info("Exiting inference loop since there is a function (client-side) tool call")
+                break
+
+            n_iter += 1
+            if n_iter >= max_infer_iters:
+                logger.info(f"Exiting inference loop since iteration count({n_iter}) exceeds {max_infer_iters=}")
+                break
+
+            messages = next_turn_messages

         # Create final response
         final_response = OpenAIResponseObject(
             created_at=created_at,
@@ -683,15 +555,15 @@ class OpenAIResponsesImpl:
             output=output_messages,
         )

+        # Emit response.completed
+        yield OpenAIResponseObjectStreamResponseCompleted(response=final_response)
+
         if store:
             await self._store_response(
                 response=final_response,
                 input=input,
             )

-        # Emit response.completed
-        yield OpenAIResponseObjectStreamResponseCompleted(response=final_response)
-
     async def _convert_response_tools_to_chat_tools(
         self, tools: list[OpenAIResponseInputTool]
     ) -> tuple[
@@ -784,73 +656,6 @@ class OpenAIResponsesImpl:
                 raise ValueError(f"Llama Stack OpenAI Responses does not yet support tool type: {input_tool.type}")
         return chat_tools, mcp_tool_to_server, mcp_list_message

-    async def _execute_tool_calls_only(
-        self,
-        choice: OpenAIChoice,
-        ctx: ChatCompletionContext,
-    ) -> tuple[list[OpenAIResponseOutput], list[OpenAIMessageParam]]:
-        """Execute tool calls and return output messages and tool response messages for next inference."""
-        output_messages: list[OpenAIResponseOutput] = []
-        tool_response_messages: list[OpenAIMessageParam] = []
-
-        if not isinstance(choice.message, OpenAIAssistantMessageParam):
-            return output_messages, tool_response_messages
-
-        if not choice.message.tool_calls:
-            return output_messages, tool_response_messages
-
-        for tool_call in choice.message.tool_calls:
-            tool_call_log, further_input = await self._execute_tool_call(tool_call, ctx)
-            if tool_call_log:
-                output_messages.append(tool_call_log)
-            if further_input:
-                tool_response_messages.append(further_input)
-
-        return output_messages, tool_response_messages
-
-    async def _execute_tool_and_return_final_output(
-        self,
-        choice: OpenAIChoice,
-        ctx: ChatCompletionContext,
-    ) -> list[OpenAIResponseOutput]:
-        output_messages: list[OpenAIResponseOutput] = []
-
-        if not isinstance(choice.message, OpenAIAssistantMessageParam):
-            return output_messages
-
-        if not choice.message.tool_calls:
-            return output_messages
-
-        next_turn_messages = ctx.messages.copy()
-
-        # Add the assistant message with tool_calls response to the messages list
-        next_turn_messages.append(choice.message)
-
-        for tool_call in choice.message.tool_calls:
-            # TODO: telemetry spans for tool calls
-            tool_call_log, further_input = await self._execute_tool_call(tool_call, ctx)
-            if tool_call_log:
-                output_messages.append(tool_call_log)
-            if further_input:
-                next_turn_messages.append(further_input)
-
-        tool_results_chat_response = await self.inference_api.openai_chat_completion(
-            model=ctx.model,
-            messages=next_turn_messages,
-            stream=ctx.stream,
-            temperature=ctx.temperature,
-        )
-        # type cast to appease mypy: this is needed because we don't handle streaming properly :)
-        tool_results_chat_response = cast(OpenAIChatCompletion, tool_results_chat_response)
-
-        # Huge TODO: these are NOT the final outputs, we must keep the loop going
-        tool_final_outputs = [
-            await _convert_chat_choice_to_response_message(choice) for choice in tool_results_chat_response.choices
-        ]
-        # TODO: Wire in annotations with URLs, titles, etc to these output messages
-        output_messages.extend(tool_final_outputs)
-        return output_messages
-
     async def _execute_tool_call(
         self,
         tool_call: OpenAIChatCompletionToolCall,
@@ -939,3 +744,15 @@ class OpenAIResponsesImpl:
         input_message = OpenAIToolMessageParam(content=text, tool_call_id=tool_call_id)

         return message, input_message
+
+
+def _is_function_tool_call(
+    tool_call: OpenAIChatCompletionToolCall,
+    tools: list[OpenAIResponseInputTool],
+) -> bool:
+    if not tool_call.function:
+        return False
+    for t in tools:
+        if t.type == "function" and t.name == tool_call.function.name:
+            return True
+    return False

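The net effect of the changes above is a single code path: _create_streaming_response produces every event, and the non-streaming create_openai_response simply drains that generator and keeps the payload of the terminal response.completed event. A generic sketch of the pattern (hypothetical event type and names, not the llama-stack classes):

from collections.abc import AsyncIterator
from dataclasses import dataclass


@dataclass
class Event:
    type: str
    payload: str | None = None


async def stream_events() -> AsyncIterator[Event]:
    # Stand-in for the streaming generator; it always ends with a "completed" event.
    yield Event("created")
    yield Event("delta", "partial text")
    yield Event("completed", "final text")


async def non_streaming_result() -> str:
    final = None
    async for event in stream_events():  # consume fully; don't abandon the generator
        if event.type == "completed":
            final = event.payload
    if final is None:
        raise ValueError("The event stream never completed")
    return final
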
@@ -24,7 +24,7 @@ def available_providers() -> list[ProviderSpec]:
                 "pandas",
                 "scikit-learn",
             ]
-            + kvstore_dependencies(),
+            + kvstore_dependencies(),  # TODO make this dynamic based on the kvstore config
             module="llama_stack.providers.inline.agents.meta_reference",
             config_class="llama_stack.providers.inline.agents.meta_reference.MetaReferenceAgentsImplConfig",
             api_dependencies=[
@@ -345,21 +345,27 @@ class OllamaInferenceAdapter(
             model = await self.register_helper.register_model(model)
         except ValueError:
             pass  # Ignore statically unknown model, will check live listing

+        if model.provider_resource_id is None:
+            raise ValueError("Model provider_resource_id cannot be None")
+
         if model.model_type == ModelType.embedding:
             logger.info(f"Pulling embedding model `{model.provider_resource_id}` if necessary...")
-            await self.client.pull(model.provider_resource_id)
+            # TODO: you should pull here only if the model is not found in a list
+            response = await self.client.list()
+            if model.provider_resource_id not in [m.model for m in response.models]:
+                await self.client.pull(model.provider_resource_id)
+
         # we use list() here instead of ps() -
         # - ps() only lists running models, not available models
         # - models not currently running are run by the ollama server as needed
         response = await self.client.list()
-        available_models = [m["model"] for m in response["models"]]
+        available_models = [m.model for m in response.models]
-        if model.provider_resource_id is None:
-            raise ValueError("Model provider_resource_id cannot be None")
         provider_resource_id = self.register_helper.get_provider_model_id(model.provider_resource_id)
         if provider_resource_id is None:
             provider_resource_id = model.provider_resource_id
         if provider_resource_id not in available_models:
-            available_models_latest = [m["model"].split(":latest")[0] for m in response["models"]]
+            available_models_latest = [m.model.split(":latest")[0] for m in response.models]
             if provider_resource_id in available_models_latest:
                 logger.warning(
                     f"Imprecise provider resource id was used but 'latest' is available in Ollama - using '{model.provider_resource_id}:latest'"
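The embedding-model check above boils down to "list what is local, pull only if missing". Roughly the same logic in isolation, assuming an ollama Python AsyncClient whose list() returns typed objects with a .models attribute (as the new adapter code expects):

from ollama import AsyncClient


async def ensure_model_available(client: AsyncClient, model_id: str) -> None:
    # Pull only when the model is not already present locally.
    response = await client.list()
    if model_id not in [m.model for m in response.models]:
        await client.pull(model_id)
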
@@ -36,6 +36,10 @@ class RedisKVStoreConfig(CommonConfig):
     def url(self) -> str:
         return f"redis://{self.host}:{self.port}"

+    @property
+    def pip_packages(self) -> list[str]:
+        return ["redis"]
+
     @classmethod
     def sample_run_config(cls):
         return {
@@ -53,6 +57,10 @@ class SqliteKVStoreConfig(CommonConfig):
         description="File path for the sqlite database",
     )

+    @property
+    def pip_packages(self) -> list[str]:
+        return ["aiosqlite"]
+
     @classmethod
     def sample_run_config(cls, __distro_dir__: str, db_name: str = "kvstore.db"):
         return {
@@ -100,6 +108,10 @@ class PostgresKVStoreConfig(CommonConfig):
             raise ValueError("Table name must be less than 63 characters")
         return v

+    @property
+    def pip_packages(self) -> list[str]:
+        return ["psycopg2-binary"]
+

 class MongoDBKVStoreConfig(CommonConfig):
     type: Literal[KVStoreType.mongodb.value] = KVStoreType.mongodb.value
@@ -110,6 +122,10 @@ class MongoDBKVStoreConfig(CommonConfig):
     password: str | None = None
     collection_name: str = "llamastack_kvstore"

+    @property
+    def pip_packages(self) -> list[str]:
+        return ["pymongo"]
+
     @classmethod
     def sample_run_config(cls, collection_name: str = "llamastack_kvstore"):
         return {
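With a pip_packages property on each concrete config, callers that know which kvstore they are building for can depend on just that one client library; the union returned by kvstore_dependencies() remains the fallback for registry-time declarations (this is what the "TODO make this dynamic based on the kvstore config" in the agents registry points at). A sketch of how a build step might use it (hypothetical helper, assuming the properties above):

def kvstore_pip_packages(kvstore_config) -> list[str]:
    # Prefer the concrete config's own dependency; fall back to the full union
    # only when the kvstore type is not known yet.
    packages = getattr(kvstore_config, "pip_packages", None)
    if packages:
        return list(packages)
    return ["aiosqlite", "psycopg2-binary", "redis", "pymongo"]
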
@@ -10,6 +10,13 @@ from .config import KVStoreConfig, KVStoreType


 def kvstore_dependencies():
+    """
+    Returns all possible kvstore dependencies for registry/provider specifications.
+
+    NOTE: For specific kvstore implementations, use config.pip_packages instead.
+    This function returns the union of all dependencies for cases where the specific
+    kvstore type is not known at declaration time (e.g., provider registries).
+    """
     return ["aiosqlite", "psycopg2-binary", "redis", "pymongo"]
@@ -21,4 +21,5 @@ distribution_spec:
 image_type: conda
 additional_pip_packages:
 - asyncpg
+- psycopg2-binary
 - sqlalchemy[asyncio]
@@ -186,8 +186,14 @@ class DistributionTemplate(BaseModel):
         additional_pip_packages: list[str] = []
         for run_config in self.run_configs.values():
             run_config_ = run_config.run_config(self.name, self.providers, self.container_image)
+
+            # TODO: This is a hack to get the dependencies for internal APIs into build
+            # We should have a better way to do this by formalizing the concept of "internal" APIs
+            # and providers, with a way to specify dependencies for them.
             if run_config_.inference_store:
                 additional_pip_packages.extend(run_config_.inference_store.pip_packages)
+            if run_config_.metadata_store:
+                additional_pip_packages.extend(run_config_.metadata_store.pip_packages)

         if self.additional_pip_packages:
             additional_pip_packages.extend(self.additional_pip_packages)
@@ -19,7 +19,7 @@
     "@radix-ui/react-tooltip": "^1.2.6",
     "class-variance-authority": "^0.7.1",
     "clsx": "^2.1.1",
-    "llama-stack-client": "0.2.9",
+    "llama-stack-client": "0.2.10",
     "lucide-react": "^0.510.0",
     "next": "15.3.2",
     "next-themes": "^0.4.6",
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "llama_stack"
-version = "0.2.9"
+version = "0.2.10"
 authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }]
 description = "Llama Stack"
 readme = "README.md"
@@ -22,12 +22,13 @@ classifiers = [
 ]
 dependencies = [
     "aiohttp",
+    "fastapi>=0.115.0,<1.0",
     "fire",
     "httpx",
     "huggingface-hub",
     "jinja2>=3.1.6",
     "jsonschema",
-    "llama-stack-client>=0.2.9",
+    "llama-stack-client>=0.2.10",
     "openai>=1.66",
     "prompt-toolkit",
     "python-dotenv",
@@ -48,7 +49,7 @@ dependencies = [
 ui = [
     "streamlit",
     "pandas",
-    "llama-stack-client>=0.2.9",
+    "llama-stack-client>=0.2.10",
     "streamlit-option-menu",
 ]

@@ -67,7 +68,6 @@ dev = [
     "types-setuptools",
     "pre-commit",
     "uvicorn",
-    "fastapi",
     "ruamel.yaml", # needed for openapi generator
 ]
 # These are the dependencies required for running unit tests.
@@ -133,7 +133,8 @@ llama = "llama_stack.cli.llama:main"
 install-wheel-from-presigned = "llama_stack.cli.scripts.run:install_wheel_from_presigned"

 [tool.setuptools.packages.find]
-include = ["llama_stack"]
+where = ["."]
+include = ["llama_stack", "llama_stack.*"]

 [[tool.uv.index]]
 name = "pytorch-cpu"
@@ -42,6 +42,8 @@ ecdsa==0.19.1
     # via python-jose
 exceptiongroup==1.2.2 ; python_full_version < '3.11'
     # via anyio
+fastapi==0.115.8
+    # via llama-stack
 filelock==3.17.0
     # via huggingface-hub
 fire==0.7.0
@@ -79,7 +81,7 @@ jsonschema==4.23.0
     # via llama-stack
 jsonschema-specifications==2024.10.1
     # via jsonschema
-llama-stack-client==0.2.9
+llama-stack-client==0.2.10
     # via llama-stack
 markdown-it-py==3.0.0
     # via rich
@@ -117,6 +119,7 @@ pyasn1==0.4.8
     #   rsa
 pydantic==2.10.6
     # via
+    #   fastapi
     #   llama-stack
     #   llama-stack-client
     #   openai
@@ -171,7 +174,9 @@ sniffio==1.3.1
     #   llama-stack-client
     #   openai
 starlette==0.45.3
-    # via llama-stack
+    # via
+    #   fastapi
+    #   llama-stack
 termcolor==2.5.0
     # via
     #   fire
@@ -187,6 +192,7 @@ tqdm==4.67.1
 typing-extensions==4.12.2
     # via
     #   anyio
+    #   fastapi
     #   huggingface-hub
     #   llama-stack-client
     #   multidict
@@ -15,11 +15,6 @@ from pathlib import Path

 from rich.progress import Progress, SpinnerColumn, TextColumn

-from llama_stack.distribution.build import (
-    SERVER_DEPENDENCIES,
-    get_provider_dependencies,
-)
-
 REPO_ROOT = Path(__file__).parent.parent


@@ -90,23 +85,6 @@ def check_for_changes(change_tracker: ChangedPathTracker) -> bool:
     return has_changes


-def collect_template_dependencies(template_dir: Path) -> tuple[str | None, list[str]]:
-    try:
-        module_name = f"llama_stack.templates.{template_dir.name}"
-        module = importlib.import_module(module_name)
-
-        if template_func := getattr(module, "get_distribution_template", None):
-            template = template_func()
-            normal_deps, special_deps = get_provider_dependencies(template)
-            # Combine all dependencies in order: normal deps, special deps, server deps
-            all_deps = sorted(set(normal_deps + SERVER_DEPENDENCIES)) + sorted(set(special_deps))
-
-            return template.name, all_deps
-    except Exception:
-        return None, []
-    return None, []
-
-
 def pre_import_templates(template_dirs: list[Path]) -> None:
     # Pre-import all template modules to avoid deadlocks.
     for template_dir in template_dirs:

tests/Containerfile (new file), 13 lines
@@ -0,0 +1,13 @@
+# Containerfile used to build our all in one ollama image to run tests in CI
+# podman build --platform linux/amd64 -f Containerfile -t ollama-with-models .
+#
+FROM --platform=linux/amd64 ollama/ollama:latest
+
+# Start ollama and pull models in a single layer
+RUN ollama serve & \
+    sleep 5 && \
+    ollama pull llama3.2:3b-instruct-fp16 && \
+    ollama pull all-minilm:latest
+
+# Set the entrypoint to start ollama serve
+ENTRYPOINT ["ollama", "serve"]

@@ -6,9 +6,15 @@

 from io import BytesIO

+import pytest
+
-def test_openai_client_basic_operations(openai_client):
+from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
+
+
+def test_openai_client_basic_operations(openai_client, client_with_models):
     """Test basic file operations through OpenAI client."""
+    if isinstance(client_with_models, LlamaStackAsLibraryClient):
+        pytest.skip("OpenAI files are not supported when testing with library client yet.")
     client = openai_client

     test_content = b"files test content"

@@ -80,6 +80,37 @@ def openai_responses_impl(mock_inference_api, mock_tool_groups_api, mock_tool_ru
     )


+async def fake_stream(fixture: str = "simple_chat_completion.yaml"):
+    value = load_chat_completion_fixture(fixture)
+    yield ChatCompletionChunk(
+        id=value.id,
+        choices=[
+            Choice(
+                index=0,
+                delta=ChoiceDelta(
+                    content=c.message.content,
+                    role=c.message.role,
+                    tool_calls=[
+                        ChoiceDeltaToolCall(
+                            index=0,
+                            id=t.id,
+                            function=ChoiceDeltaToolCallFunction(
+                                name=t.function.name,
+                                arguments=t.function.arguments,
+                            ),
+                        )
+                        for t in (c.message.tool_calls or [])
+                    ],
+                ),
+            )
+            for c in value.choices
+        ],
+        created=1,
+        model=value.model,
+        object="chat.completion.chunk",
+    )
+
+
 @pytest.mark.asyncio
 async def test_create_openai_response_with_string_input(openai_responses_impl, mock_inference_api):
     """Test creating an OpenAI response with a simple string input."""
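fake_stream turns a fixture-based chat completion into a one-chunk async stream, which is what the now streaming-only implementation expects from the mocked inference API. The underlying mocking trick in isolation (a sketch with made-up names: awaiting an AsyncMock returns its return_value, so handing it an async generator object lets the code under test iterate it with async for):

import asyncio
from unittest.mock import AsyncMock


async def one_chunk_stream():
    yield {"delta": "Dublin"}  # stand-in for a ChatCompletionChunk


async def demo() -> None:
    mock_chat_completion = AsyncMock()
    mock_chat_completion.return_value = one_chunk_stream()
    chunks = [chunk async for chunk in await mock_chat_completion(model="m", stream=True)]
    assert chunks == [{"delta": "Dublin"}]


asyncio.run(demo())
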
@@ -88,8 +119,7 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m
     model = "meta-llama/Llama-3.1-8B-Instruct"

     # Load the chat completion fixture
-    mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
-    mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
+    mock_inference_api.openai_chat_completion.return_value = fake_stream()

     # Execute
     result = await openai_responses_impl.create_openai_response(
@@ -104,7 +134,7 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m
         messages=[OpenAIUserMessageParam(role="user", content="What is the capital of Ireland?", name=None)],
         response_format=OpenAIResponseFormatText(),
         tools=None,
-        stream=False,
+        stream=True,
         temperature=0.1,
     )
     openai_responses_impl.responses_store.store_response_object.assert_called_once()
@@ -121,20 +151,15 @@ async def test_create_openai_response_with_string_input_with_tools(openai_respon
     input_text = "What is the capital of Ireland?"
     model = "meta-llama/Llama-3.1-8B-Instruct"

-    # Load the chat completion fixtures
-    tool_call_completion = load_chat_completion_fixture("tool_call_completion.yaml")
-    tool_response_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
-
     mock_inference_api.openai_chat_completion.side_effect = [
-        tool_call_completion,
-        tool_response_completion,
+        fake_stream("tool_call_completion.yaml"),
+        fake_stream(),
     ]

     openai_responses_impl.tool_groups_api.get_tool.return_value = Tool(
         identifier="web_search",
         provider_id="client",
         toolgroup_id="web_search",
-        tool_host="client",
         description="Search the web for information",
         parameters=[
             ToolParameter(name="query", parameter_type="string", description="The query to search for", required=True)
@@ -189,7 +214,7 @@ async def test_create_openai_response_with_tool_call_type_none(openai_responses_
     input_text = "How hot it is in San Francisco today?"
     model = "meta-llama/Llama-3.1-8B-Instruct"

-    async def fake_stream():
+    async def fake_stream_toolcall():
         yield ChatCompletionChunk(
             id="123",
             choices=[
@@ -212,7 +237,7 @@ async def test_create_openai_response_with_tool_call_type_none(openai_responses_
             object="chat.completion.chunk",
         )

-    mock_inference_api.openai_chat_completion.return_value = fake_stream()
+    mock_inference_api.openai_chat_completion.return_value = fake_stream_toolcall()

     # Execute
     result = await openai_responses_impl.create_openai_response(
@@ -271,7 +296,7 @@ async def test_create_openai_response_with_multiple_messages(openai_responses_im
     ]
     model = "meta-llama/Llama-3.1-8B-Instruct"

-    mock_inference_api.openai_chat_completion.return_value = load_chat_completion_fixture("simple_chat_completion.yaml")
+    mock_inference_api.openai_chat_completion.return_value = fake_stream()

     # Execute
     await openai_responses_impl.create_openai_response(
@@ -399,9 +424,7 @@ async def test_create_openai_response_with_instructions(openai_responses_impl, m
     model = "meta-llama/Llama-3.1-8B-Instruct"
     instructions = "You are a geography expert. Provide concise answers."

-    # Load the chat completion fixture
-    mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
-    mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
+    mock_inference_api.openai_chat_completion.return_value = fake_stream()

     # Execute
     await openai_responses_impl.create_openai_response(
@@ -440,8 +463,7 @@ async def test_create_openai_response_with_instructions_and_multiple_messages(
     model = "meta-llama/Llama-3.1-8B-Instruct"
     instructions = "You are a geography expert. Provide concise answers."

-    mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
-    mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
+    mock_inference_api.openai_chat_completion.return_value = fake_stream()

     # Execute
     await openai_responses_impl.create_openai_response(
@@ -499,8 +521,8 @@ async def test_create_openai_response_with_instructions_and_previous_response(

     model = "meta-llama/Llama-3.1-8B-Instruct"
     instructions = "You are a geography expert. Provide concise answers."
-    mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
-    mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
+    mock_inference_api.openai_chat_completion.return_value = fake_stream()

     # Execute
     await openai_responses_impl.create_openai_response(
@@ -674,8 +696,8 @@ async def test_store_response_uses_rehydrated_input_with_previous_response(

     current_input = "Now what is 3+3?"
     model = "meta-llama/Llama-3.1-8B-Instruct"
-    mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
-    mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
+    mock_inference_api.openai_chat_completion.return_value = fake_stream()

     # Execute - Create response with previous_response_id
     result = await openai_responses_impl.create_openai_response(
@@ -732,9 +754,7 @@ async def test_create_openai_response_with_text_format(
     input_text = "How hot it is in San Francisco today?"
     model = "meta-llama/Llama-3.1-8B-Instruct"

-    # Load the chat completion fixture
-    mock_chat_completion = load_chat_completion_fixture("simple_chat_completion.yaml")
-    mock_inference_api.openai_chat_completion.return_value = mock_chat_completion
+    mock_inference_api.openai_chat_completion.return_value = fake_stream()

     # Execute
     _result = await openai_responses_impl.create_openai_response(