Merge remote-tracking branch 'origin/main' into storage_fix

Ashwin Bharambe 2025-11-12 10:17:56 -08:00
commit 08024d44f2
89 changed files with 4786 additions and 3941 deletions

View file

@ -87,6 +87,7 @@ class Agents(Protocol):
"List of guardrails to apply during response generation. Guardrails provide safety and content moderation."
),
] = None,
max_tool_calls: int | None = None,
) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
"""Create a model response.
@ -97,6 +98,7 @@ class Agents(Protocol):
:param conversation: (Optional) The ID of a conversation to add the response to. Must begin with 'conv_'. Input and output messages will be automatically added to the conversation.
:param include: (Optional) Additional fields to include in the response.
:param guardrails: (Optional) List of guardrails to apply during response generation. Can be guardrail IDs (strings) or guardrail specifications.
:param max_tool_calls: (Optional) Max number of total calls to built-in tools that can be processed in a response.
:returns: An OpenAIResponseObject.
"""
...
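A minimal sketch of exercising the new `max_tool_calls` parameter through an OpenAI-compatible client pointed at a Llama Stack server; the base URL, API key, model id, and the `web_search` tool spec below are illustrative assumptions rather than values taken from this change.

```python
# Sketch only: assumes a Llama Stack server exposing the OpenAI-compatible
# Responses API at http://localhost:8321/v1; model id and tool spec are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

response = client.responses.create(
    model="meta-llama/Llama-3.3-70B-Instruct",  # placeholder model id
    input="Search the web and summarize today's top AI headlines.",
    tools=[{"type": "web_search"}],
    # Cap the total number of built-in tool invocations for this response.
    max_tool_calls=2,
)
print(response.output_text)
```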

View file

@ -594,6 +594,7 @@ class OpenAIResponseObject(BaseModel):
:param truncation: (Optional) Truncation strategy applied to the response
:param usage: (Optional) Token usage information for the response
:param instructions: (Optional) System message inserted into the model's context
:param max_tool_calls: (Optional) Max number of total calls to built-in tools that can be processed in a response
"""
created_at: int
@ -615,6 +616,7 @@ class OpenAIResponseObject(BaseModel):
truncation: str | None = None
usage: OpenAIResponseUsage | None = None
instructions: str | None = None
max_tool_calls: int | None = None
@json_schema_type

View file

@ -74,7 +74,7 @@ class Benchmarks(Protocol):
"""
...
@webmethod(route="/eval/benchmarks", method="POST", level=LLAMA_STACK_API_V1ALPHA)
@webmethod(route="/eval/benchmarks", method="POST", level=LLAMA_STACK_API_V1ALPHA, deprecated=True)
async def register_benchmark(
self,
benchmark_id: str,
@ -95,7 +95,7 @@ class Benchmarks(Protocol):
"""
...
@webmethod(route="/eval/benchmarks/{benchmark_id}", method="DELETE", level=LLAMA_STACK_API_V1ALPHA)
@webmethod(route="/eval/benchmarks/{benchmark_id}", method="DELETE", level=LLAMA_STACK_API_V1ALPHA, deprecated=True)
async def unregister_benchmark(self, benchmark_id: str) -> None:
"""Unregister a benchmark.

View file

@ -146,7 +146,7 @@ class ListDatasetsResponse(BaseModel):
class Datasets(Protocol):
@webmethod(route="/datasets", method="POST", level=LLAMA_STACK_API_V1BETA)
@webmethod(route="/datasets", method="POST", level=LLAMA_STACK_API_V1BETA, deprecated=True)
async def register_dataset(
self,
purpose: DatasetPurpose,
@ -235,7 +235,7 @@ class Datasets(Protocol):
"""
...
@webmethod(route="/datasets/{dataset_id:path}", method="DELETE", level=LLAMA_STACK_API_V1BETA)
@webmethod(route="/datasets/{dataset_id:path}", method="DELETE", level=LLAMA_STACK_API_V1BETA, deprecated=True)
async def unregister_dataset(
self,
dataset_id: str,

View file

@ -1,43 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from termcolor import cprint
from llama_stack.apis.inference import (
ChatCompletionResponseEventType,
ChatCompletionResponseStreamChunk,
)
class LogEvent:
def __init__(
self,
content: str = "",
end: str = "\n",
color="white",
):
self.content = content
self.color = color
self.end = "\n" if end is None else end
def print(self, flush=True):
cprint(f"{self.content}", color=self.color, end=self.end, flush=flush)
class EventLogger:
async def log(self, event_generator):
async for chunk in event_generator:
if isinstance(chunk, ChatCompletionResponseStreamChunk):
event = chunk.event
if event.event_type == ChatCompletionResponseEventType.start:
yield LogEvent("Assistant> ", color="cyan", end="")
elif event.event_type == ChatCompletionResponseEventType.progress:
yield LogEvent(event.delta, color="yellow", end="")
elif event.event_type == ChatCompletionResponseEventType.complete:
yield LogEvent("")
else:
yield LogEvent("Assistant> ", color="cyan", end="")
yield LogEvent(chunk.completion_message.content, color="yellow")

View file

@ -5,7 +5,7 @@
# the root directory of this source tree.
from collections.abc import AsyncIterator
from enum import Enum
from enum import Enum, StrEnum
from typing import (
Annotated,
Any,
@ -15,28 +15,18 @@ from typing import (
)
from fastapi import Body
from pydantic import BaseModel, Field, field_validator
from pydantic import BaseModel, Field
from typing_extensions import TypedDict
from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent
from llama_stack.apis.common.responses import MetricResponseMixin, Order
from llama_stack.apis.common.content_types import InterleavedContent
from llama_stack.apis.common.responses import (
Order,
)
from llama_stack.apis.common.tracing import telemetry_traceable
from llama_stack.apis.models import Model
from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
from llama_stack.models.llama.datatypes import (
BuiltinTool,
StopReason,
ToolCall,
ToolDefinition,
ToolPromptFormat,
)
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
register_schema(ToolCall)
register_schema(ToolDefinition)
from enum import StrEnum
@json_schema_type
class GreedySamplingStrategy(BaseModel):
@ -201,58 +191,6 @@ class ToolResponseMessage(BaseModel):
content: InterleavedContent
@json_schema_type
class CompletionMessage(BaseModel):
"""A message containing the model's (assistant) response in a chat conversation.
:param role: Must be "assistant" to identify this as the model's response
:param content: The content of the model's response
:param stop_reason: Reason why the model stopped generating. Options are:
- `StopReason.end_of_turn`: The model finished generating the entire response.
- `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response.
- `StopReason.out_of_tokens`: The model ran out of token budget.
:param tool_calls: List of tool calls. Each tool call is a ToolCall object.
"""
role: Literal["assistant"] = "assistant"
content: InterleavedContent
stop_reason: StopReason
tool_calls: list[ToolCall] | None = Field(default_factory=lambda: [])
Message = Annotated[
UserMessage | SystemMessage | ToolResponseMessage | CompletionMessage,
Field(discriminator="role"),
]
register_schema(Message, name="Message")
@json_schema_type
class ToolResponse(BaseModel):
"""Response from a tool invocation.
:param call_id: Unique identifier for the tool call this response is for
:param tool_name: Name of the tool that was invoked
:param content: The response content from the tool
:param metadata: (Optional) Additional metadata about the tool response
"""
call_id: str
tool_name: BuiltinTool | str
content: InterleavedContent
metadata: dict[str, Any] | None = None
@field_validator("tool_name", mode="before")
@classmethod
def validate_field(cls, v):
if isinstance(v, str):
try:
return BuiltinTool(v)
except ValueError:
return v
return v
class ToolChoice(Enum):
"""Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model.
@ -289,22 +227,6 @@ class ChatCompletionResponseEventType(Enum):
progress = "progress"
@json_schema_type
class ChatCompletionResponseEvent(BaseModel):
"""An event during chat completion generation.
:param event_type: Type of the event
:param delta: Content generated since last event. This can be one or more tokens, or a tool call.
:param logprobs: Optional log probabilities for generated tokens
:param stop_reason: Optional reason why generation stopped, if complete
"""
event_type: ChatCompletionResponseEventType
delta: ContentDelta
logprobs: list[TokenLogProbs] | None = None
stop_reason: StopReason | None = None
class ResponseFormatType(StrEnum):
"""Types of formats for structured (guided) decoding.
@ -357,34 +279,6 @@ class CompletionRequest(BaseModel):
logprobs: LogProbConfig | None = None
@json_schema_type
class CompletionResponse(MetricResponseMixin):
"""Response from a completion request.
:param content: The generated completion text
:param stop_reason: Reason why generation stopped
:param logprobs: Optional log probabilities for generated tokens
"""
content: str
stop_reason: StopReason
logprobs: list[TokenLogProbs] | None = None
@json_schema_type
class CompletionResponseStreamChunk(MetricResponseMixin):
"""A chunk of a streamed completion response.
:param delta: New content generated since last chunk. This can be one or more tokens.
:param stop_reason: Optional reason why generation stopped, if complete
:param logprobs: Optional log probabilities for generated tokens
"""
delta: str
stop_reason: StopReason | None = None
logprobs: list[TokenLogProbs] | None = None
class SystemMessageBehavior(Enum):
"""Config for how to override the default system prompt.
@ -398,70 +292,6 @@ class SystemMessageBehavior(Enum):
replace = "replace"
@json_schema_type
class ToolConfig(BaseModel):
"""Configuration for tool use.
:param tool_choice: (Optional) Whether tool use is automatic, required, or none. Can also specify a tool name to use a specific tool. Defaults to ToolChoice.auto.
:param tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model.
- `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag.
- `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls.
:param system_message_behavior: (Optional) Config for how to override the default system prompt.
- `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt.
- `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string
'{{function_definitions}}' to indicate where the function definitions should be inserted.
"""
tool_choice: ToolChoice | str | None = Field(default=ToolChoice.auto)
tool_prompt_format: ToolPromptFormat | None = Field(default=None)
system_message_behavior: SystemMessageBehavior | None = Field(default=SystemMessageBehavior.append)
def model_post_init(self, __context: Any) -> None:
if isinstance(self.tool_choice, str):
try:
self.tool_choice = ToolChoice[self.tool_choice]
except KeyError:
pass
# This is an internally used class
@json_schema_type
class ChatCompletionRequest(BaseModel):
model: str
messages: list[Message]
sampling_params: SamplingParams | None = Field(default_factory=SamplingParams)
tools: list[ToolDefinition] | None = Field(default_factory=lambda: [])
tool_config: ToolConfig | None = Field(default_factory=ToolConfig)
response_format: ResponseFormat | None = None
stream: bool | None = False
logprobs: LogProbConfig | None = None
@json_schema_type
class ChatCompletionResponseStreamChunk(MetricResponseMixin):
"""A chunk of a streamed chat completion response.
:param event: The event containing the new content
"""
event: ChatCompletionResponseEvent
@json_schema_type
class ChatCompletionResponse(MetricResponseMixin):
"""Response from a chat completion request.
:param completion_message: The complete response message
:param logprobs: Optional log probabilities for generated tokens
"""
completion_message: CompletionMessage
logprobs: list[TokenLogProbs] | None = None
@json_schema_type
class EmbeddingsResponse(BaseModel):
"""Response containing generated embeddings.

View file

@ -76,7 +76,7 @@ class Inspect(Protocol):
List all available API routes with their methods and implementing providers.
:param api_filter: Optional filter to control which routes are returned. Can be an API level ('v1', 'v1alpha', 'v1beta') to show non-deprecated routes at that level, or 'deprecated' to show deprecated routes across all levels. If not specified, returns only non-deprecated v1 routes.
:param api_filter: Optional filter to control which routes are returned. Can be an API level ('v1', 'v1alpha', 'v1beta') to show non-deprecated routes at that level, or 'deprecated' to show deprecated routes across all levels. If not specified, returns all non-deprecated routes.
:returns: Response containing information about all available routes.
"""
...
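A hedged sketch of what the changed default means for callers: the `/v1/inspect/routes` path and the shape of the JSON response are assumptions, while the `api_filter` values come from the docstring above.

```python
# Sketch only: assumes the Inspect API is mounted at /v1/inspect/routes on a
# local Llama Stack server and returns a JSON object with a "data" list.
import httpx

BASE_URL = "http://localhost:8321"

# New default: non-deprecated routes across all API levels (v1, v1alpha, v1beta).
default_routes = httpx.get(f"{BASE_URL}/v1/inspect/routes").json()

# Explicit filter: deprecated routes only, regardless of level.
deprecated_routes = httpx.get(
    f"{BASE_URL}/v1/inspect/routes", params={"api_filter": "deprecated"}
).json()

print(len(default_routes.get("data", [])), len(deprecated_routes.get("data", [])))
```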

View file

@ -136,7 +136,7 @@ class Models(Protocol):
"""
...
@webmethod(route="/models", method="POST", level=LLAMA_STACK_API_V1)
@webmethod(route="/models", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
async def register_model(
self,
model_id: str,
@ -158,7 +158,7 @@ class Models(Protocol):
"""
...
@webmethod(route="/models/{model_id:path}", method="DELETE", level=LLAMA_STACK_API_V1)
@webmethod(route="/models/{model_id:path}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True)
async def unregister_model(
self,
model_id: str,

View file

@ -178,7 +178,7 @@ class ScoringFunctions(Protocol):
"""
...
@webmethod(route="/scoring-functions", method="POST", level=LLAMA_STACK_API_V1)
@webmethod(route="/scoring-functions", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
async def register_scoring_function(
self,
scoring_fn_id: str,
@ -199,7 +199,9 @@ class ScoringFunctions(Protocol):
"""
...
@webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="DELETE", level=LLAMA_STACK_API_V1)
@webmethod(
route="/scoring-functions/{scoring_fn_id:path}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True
)
async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
"""Unregister a scoring function.

View file

@ -67,7 +67,7 @@ class Shields(Protocol):
"""
...
@webmethod(route="/shields", method="POST", level=LLAMA_STACK_API_V1)
@webmethod(route="/shields", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
async def register_shield(
self,
shield_id: str,
@ -85,7 +85,7 @@ class Shields(Protocol):
"""
...
@webmethod(route="/shields/{identifier:path}", method="DELETE", level=LLAMA_STACK_API_V1)
@webmethod(route="/shields/{identifier:path}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True)
async def unregister_shield(self, identifier: str) -> None:
"""Unregister a shield.

View file

@ -109,7 +109,7 @@ class ListToolDefsResponse(BaseModel):
@runtime_checkable
@telemetry_traceable
class ToolGroups(Protocol):
@webmethod(route="/toolgroups", method="POST", level=LLAMA_STACK_API_V1)
@webmethod(route="/toolgroups", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
async def register_tool_group(
self,
toolgroup_id: str,
@ -167,7 +167,7 @@ class ToolGroups(Protocol):
"""
...
@webmethod(route="/toolgroups/{toolgroup_id:path}", method="DELETE", level=LLAMA_STACK_API_V1)
@webmethod(route="/toolgroups/{toolgroup_id:path}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True)
async def unregister_toolgroup(
self,
toolgroup_id: str,

View file

@ -10,7 +10,7 @@
# the root directory of this source tree.
from typing import Annotated, Any, Literal, Protocol, runtime_checkable
from fastapi import Body
from fastapi import Body, Query
from pydantic import BaseModel, Field
from llama_stack.apis.common.tracing import telemetry_traceable
@ -224,10 +224,16 @@ class VectorStoreContent(BaseModel):
:param type: Content type, currently only "text" is supported
:param text: The actual text content
:param embedding: Optional embedding vector for this content chunk
:param chunk_metadata: Optional chunk metadata
:param metadata: Optional user-defined metadata
"""
type: Literal["text"]
text: str
embedding: list[float] | None = None
chunk_metadata: ChunkMetadata | None = None
metadata: dict[str, Any] | None = None
@json_schema_type
@ -280,6 +286,22 @@ class VectorStoreDeleteResponse(BaseModel):
deleted: bool = True
@json_schema_type
class VectorStoreFileContentResponse(BaseModel):
"""Represents the parsed content of a vector store file.
:param object: The object type, which is always `vector_store.file_content.page`
:param data: Parsed content of the file
:param has_more: Indicates if there are more content pages to fetch
:param next_page: The token for the next page, if any
"""
object: Literal["vector_store.file_content.page"] = "vector_store.file_content.page"
data: list[VectorStoreContent]
has_more: bool = False
next_page: str | None = None
@json_schema_type
class VectorStoreChunkingStrategyAuto(BaseModel):
"""Automatic chunking strategy for vector store files.
@ -395,22 +417,6 @@ class VectorStoreListFilesResponse(BaseModel):
has_more: bool = False
@json_schema_type
class VectorStoreFileContentsResponse(BaseModel):
"""Response from retrieving the contents of a vector store file.
:param file_id: Unique identifier for the file
:param filename: Name of the file
:param attributes: Key-value attributes associated with the file
:param content: List of content items from the file
"""
file_id: str
filename: str
attributes: dict[str, Any]
content: list[VectorStoreContent]
@json_schema_type
class VectorStoreFileDeleteResponse(BaseModel):
"""Response from deleting a vector store file.
@ -732,12 +738,16 @@ class VectorIO(Protocol):
self,
vector_store_id: str,
file_id: str,
) -> VectorStoreFileContentsResponse:
include_embeddings: Annotated[bool | None, Query(default=False)] = False,
include_metadata: Annotated[bool | None, Query(default=False)] = False,
) -> VectorStoreFileContentResponse:
"""Retrieves the contents of a vector store file.
:param vector_store_id: The ID of the vector store containing the file to retrieve.
:param file_id: The ID of the file to retrieve.
:returns: A list of InterleavedContent representing the file contents.
:param include_embeddings: Whether to include embedding vectors in the response.
:param include_metadata: Whether to include chunk metadata in the response.
:returns: File contents, optionally with embeddings and metadata based on query parameters.
"""
...
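A minimal sketch of a caller using the new query flags, assuming `vector_io` stands in for a resolved `VectorIO` router or provider and the ids are placeholders; it only reads the fields shown on `VectorStoreFileContentResponse` above.

```python
# Sketch only: "vector_io" is a resolved VectorIO router or provider;
# the vector store and file ids are placeholders.
async def dump_file_contents(vector_io, vector_store_id: str, file_id: str) -> None:
    page = await vector_io.openai_retrieve_vector_store_file_contents(
        vector_store_id=vector_store_id,
        file_id=file_id,
        include_embeddings=True,  # attach embedding vectors to each content chunk
        include_metadata=True,    # attach chunk_metadata to each content chunk
    )
    for item in page.data:
        dims = None if item.embedding is None else len(item.embedding)
        print(item.text[:80], item.chunk_metadata, dims)
    if page.has_more:
        print(f"More pages available; next page token: {page.next_page}")
```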

View file

@ -4,7 +4,6 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import importlib.resources
import sys
from pydantic import BaseModel
@ -12,9 +11,6 @@ from termcolor import cprint
from llama_stack.core.datatypes import BuildConfig
from llama_stack.core.distribution import get_provider_registry
from llama_stack.core.external import load_external_apis
from llama_stack.core.utils.exec import run_command
from llama_stack.core.utils.image_types import LlamaStackImageType
from llama_stack.distributions.template import DistributionTemplate
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import Api
@ -101,64 +97,3 @@ def print_pip_install_help(config: BuildConfig):
for special_dep in special_deps:
cprint(f"uv pip install {special_dep}", color="yellow", file=sys.stderr)
print()
def build_image(
build_config: BuildConfig,
image_name: str,
distro_or_config: str,
run_config: str | None = None,
):
container_base = build_config.distribution_spec.container_image or "python:3.12-slim"
normal_deps, special_deps, external_provider_deps = get_provider_dependencies(build_config)
normal_deps += SERVER_DEPENDENCIES
if build_config.external_apis_dir:
external_apis = load_external_apis(build_config)
if external_apis:
for _, api_spec in external_apis.items():
normal_deps.extend(api_spec.pip_packages)
if build_config.image_type == LlamaStackImageType.CONTAINER.value:
script = str(importlib.resources.files("llama_stack") / "core/build_container.sh")
args = [
script,
"--distro-or-config",
distro_or_config,
"--image-name",
image_name,
"--container-base",
container_base,
"--normal-deps",
" ".join(normal_deps),
]
# When building from a config file (not a template), include the run config path in the
# build arguments
if run_config is not None:
args.extend(["--run-config", run_config])
else:
script = str(importlib.resources.files("llama_stack") / "core/build_venv.sh")
args = [
script,
"--env-name",
str(image_name),
"--normal-deps",
" ".join(normal_deps),
]
# Always pass both arguments, even if empty, to maintain consistent positional arguments
if special_deps:
args.extend(["--optional-deps", "#".join(special_deps)])
if external_provider_deps:
args.extend(
["--external-provider-deps", "#".join(external_provider_deps)]
) # the script will install external provider module, get its deps, and install those too.
return_code = run_command(args)
if return_code != 0:
log.error(
f"Failed to build target {image_name} with return code {return_code}",
)
return return_code

View file

@ -15,7 +15,6 @@ from llama_stack.apis.inspect import (
RouteInfo,
VersionInfo,
)
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.core.datatypes import StackRunConfig
from llama_stack.core.external import load_external_apis
from llama_stack.core.server.routes import get_all_api_routes
@ -46,8 +45,8 @@ class DistributionInspectImpl(Inspect):
# Helper function to determine if a route should be included based on api_filter
def should_include_route(webmethod) -> bool:
if api_filter is None:
# Default: only non-deprecated v1 APIs
return not webmethod.deprecated and webmethod.level == LLAMA_STACK_API_V1
# Default: only non-deprecated APIs
return not webmethod.deprecated
elif api_filter == "deprecated":
# Special filter: show deprecated routes regardless of their actual level
return bool(webmethod.deprecated)

View file

@ -389,6 +389,12 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
matched_func, path_params, route_path, webmethod = find_matching_route(options.method, path, self.route_impls)
body |= path_params
# Pass through params that aren't already handled as path params
if options.params:
extra_query_params = {k: v for k, v in options.params.items() if k not in path_params}
if extra_query_params:
body["extra_query"] = extra_query_params
body, field_names = self._handle_file_uploads(options, body)
body = self._convert_body(matched_func, body, exclude_params=set(field_names))

View file

@ -6,7 +6,7 @@
from typing import Any
from llama_stack.apis.inference import Message
from llama_stack.apis.inference import OpenAIMessageParam
from llama_stack.apis.safety import RunShieldResponse, Safety
from llama_stack.apis.safety.safety import ModerationObject
from llama_stack.apis.shields import Shield
@ -52,7 +52,7 @@ class SafetyRouter(Safety):
async def run_shield(
self,
shield_id: str,
messages: list[Message],
messages: list[OpenAIMessageParam],
params: dict[str, Any] = None,
) -> RunShieldResponse:
logger.debug(f"SafetyRouter.run_shield: {shield_id}")

View file

@ -24,7 +24,7 @@ from llama_stack.apis.vector_io import (
VectorStoreChunkingStrategyStaticConfig,
VectorStoreDeleteResponse,
VectorStoreFileBatchObject,
VectorStoreFileContentsResponse,
VectorStoreFileContentResponse,
VectorStoreFileDeleteResponse,
VectorStoreFileObject,
VectorStoreFilesListInBatchResponse,
@ -247,6 +247,13 @@ class VectorIORouter(VectorIO):
metadata: dict[str, Any] | None = None,
) -> VectorStoreObject:
logger.debug(f"VectorIORouter.openai_update_vector_store: {vector_store_id}")
# Check if provider_id is being changed (not supported)
if metadata and "provider_id" in metadata:
current_store = await self.routing_table.get_object_by_identifier("vector_store", vector_store_id)
if current_store and current_store.provider_id != metadata["provider_id"]:
raise ValueError("provider_id cannot be changed after vector store creation")
provider = await self.routing_table.get_provider_impl(vector_store_id)
return await provider.openai_update_vector_store(
vector_store_id=vector_store_id,
@ -338,12 +345,19 @@ class VectorIORouter(VectorIO):
self,
vector_store_id: str,
file_id: str,
) -> VectorStoreFileContentsResponse:
logger.debug(f"VectorIORouter.openai_retrieve_vector_store_file_contents: {vector_store_id}, {file_id}")
provider = await self.routing_table.get_provider_impl(vector_store_id)
return await provider.openai_retrieve_vector_store_file_contents(
include_embeddings: bool | None = False,
include_metadata: bool | None = False,
) -> VectorStoreFileContentResponse:
logger.debug(
f"VectorIORouter.openai_retrieve_vector_store_file_contents: {vector_store_id}, {file_id}, "
f"include_embeddings={include_embeddings}, include_metadata={include_metadata}"
)
return await self.routing_table.openai_retrieve_vector_store_file_contents(
vector_store_id=vector_store_id,
file_id=file_id,
include_embeddings=include_embeddings,
include_metadata=include_metadata,
)
async def openai_update_vector_store_file(

View file

@ -15,7 +15,7 @@ from llama_stack.apis.vector_io.vector_io import (
SearchRankingOptions,
VectorStoreChunkingStrategy,
VectorStoreDeleteResponse,
VectorStoreFileContentsResponse,
VectorStoreFileContentResponse,
VectorStoreFileDeleteResponse,
VectorStoreFileObject,
VectorStoreFileStatus,
@ -195,12 +195,17 @@ class VectorStoresRoutingTable(CommonRoutingTableImpl):
self,
vector_store_id: str,
file_id: str,
) -> VectorStoreFileContentsResponse:
include_embeddings: bool | None = False,
include_metadata: bool | None = False,
) -> VectorStoreFileContentResponse:
await self.assert_action_allowed("read", "vector_store", vector_store_id)
provider = await self.get_provider_impl(vector_store_id)
return await provider.openai_retrieve_vector_store_file_contents(
vector_store_id=vector_store_id,
file_id=file_id,
include_embeddings=include_embeddings,
include_metadata=include_metadata,
)
async def openai_update_vector_store_file(

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .oci import get_distribution_template # noqa: F401

View file

@ -0,0 +1,35 @@
version: 2
distribution_spec:
description: Use Oracle Cloud Infrastructure (OCI) Generative AI for running LLM
inference with scalable cloud services
providers:
inference:
- provider_type: remote::oci
vector_io:
- provider_type: inline::faiss
- provider_type: remote::chromadb
- provider_type: remote::pgvector
safety:
- provider_type: inline::llama-guard
agents:
- provider_type: inline::meta-reference
eval:
- provider_type: inline::meta-reference
datasetio:
- provider_type: remote::huggingface
- provider_type: inline::localfs
scoring:
- provider_type: inline::basic
- provider_type: inline::llm-as-judge
- provider_type: inline::braintrust
tool_runtime:
- provider_type: remote::brave-search
- provider_type: remote::tavily-search
- provider_type: inline::rag-runtime
- provider_type: remote::model-context-protocol
files:
- provider_type: inline::localfs
image_type: venv
additional_pip_packages:
- aiosqlite
- sqlalchemy[asyncio]

View file

@ -0,0 +1,140 @@
---
orphan: true
---
# OCI Distribution
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
{{ providers_table }}
{% if run_config_env_vars %}
### Environment Variables
The following environment variables can be configured:
{% for var, (default_value, description) in run_config_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
{% endfor %}
{% endif %}
{% if default_models %}
### Models
The following models are available by default:
{% for model in default_models %}
- `{{ model.model_id }} {{ model.doc_string }}`
{% endfor %}
{% endif %}
## Prerequisites
### Oracle Cloud Infrastructure Setup
Before using the OCI Generative AI distribution, ensure you have:
1. **Oracle Cloud Infrastructure Account**: Sign up at [Oracle Cloud Infrastructure](https://cloud.oracle.com/)
2. **Generative AI Service Access**: Enable the Generative AI service in your OCI tenancy
3. **Compartment**: Create or identify a compartment where you'll deploy Generative AI models
4. **Authentication**: Configure authentication using either:
- **Instance Principal** (recommended for cloud-hosted deployments)
- **API Key** (for on-premises or development environments)
### Authentication Methods
#### Instance Principal Authentication (Recommended)
Instance Principal authentication allows OCI resources to authenticate using the identity of the compute instance they're running on. This is the most secure method for production deployments.
Requirements:
- Instance must be running in an Oracle Cloud Infrastructure compartment
- Instance must have appropriate IAM policies to access Generative AI services
#### API Key Authentication
For development or on-premises deployments, follow [this doc](https://docs.oracle.com/en-us/iaas/Content/API/Concepts/apisigningkey.htm) to learn how to create your API signing key for your config file.
### Required IAM Policies
Ensure your OCI user or instance has the following policy statements:
```
Allow group <group_name> to use generative-ai-inference-endpoints in compartment <compartment_name>
Allow group <group_name> to manage generative-ai-inference-endpoints in compartment <compartment_name>
```
## Supported Services
### Inference: OCI Generative AI
Oracle Cloud Infrastructure Generative AI provides access to high-performance AI models through OCI's Platform-as-a-Service offering. The service supports:
- **Chat Completions**: Conversational AI with context awareness
- **Text Generation**: Complete prompts and generate text content
#### Available Models
OCI Generative AI provides access to models from Meta, Cohere, OpenAI, Grok, and other providers.
### Safety: Llama Guard
For content safety and moderation, this distribution uses Meta's LlamaGuard model through the OCI Generative AI service to provide:
- Content filtering and moderation
- Policy compliance checking
- Harmful content detection
### Vector Storage: Multiple Options
The distribution supports several vector storage providers:
- **FAISS**: Local in-memory vector search
- **ChromaDB**: Distributed vector database
- **PGVector**: PostgreSQL with vector extensions
### Additional Services
- **Dataset I/O**: Local filesystem and Hugging Face integration
- **Tool Runtime**: Web search (Brave, Tavily) and RAG capabilities
- **Evaluation**: Meta reference evaluation framework
## Running Llama Stack with OCI
You can run the OCI distribution via Docker or a local virtual environment.
### Via venv
If you've set up your local development environment, you can run the stack directly from your local virtual environment.
```bash
OCI_AUTH=$OCI_AUTH_TYPE OCI_REGION=$OCI_REGION OCI_COMPARTMENT_OCID=$OCI_COMPARTMENT_OCID llama stack run --port 8321 oci
```
### Configuration Examples
#### Using Instance Principal (Recommended for Production)
```bash
export OCI_AUTH_TYPE=instance_principal
export OCI_REGION=us-chicago-1
export OCI_COMPARTMENT_OCID=ocid1.compartment.oc1..<your-compartment-id>
```
#### Using API Key Authentication (Development)
```bash
export OCI_AUTH_TYPE=config_file
export OCI_CONFIG_FILE_PATH=~/.oci/config
export OCI_CLI_PROFILE=DEFAULT
export OCI_REGION=us-chicago-1
export OCI_COMPARTMENT_OCID=ocid1.compartment.oc1..your-compartment-id
```
## Regional Endpoints
OCI Generative AI is available in multiple regions. The service automatically routes to the appropriate regional endpoint based on your configuration. For the full list of regions and the models available in each, visit:
https://docs.oracle.com/en-us/iaas/Content/generative-ai/overview.htm#regions
## Troubleshooting
### Common Issues
1. **Authentication Errors**: Verify your OCI credentials and IAM policies
2. **Model Not Found**: Ensure the model OCID is correct and the model is available in your region
3. **Permission Denied**: Check compartment permissions and Generative AI service access
4. **Region Unavailable**: Verify the specified region supports Generative AI services
### Getting Help
For additional support:
- [OCI Generative AI Documentation](https://docs.oracle.com/en-us/iaas/Content/generative-ai/home.htm)
- [Llama Stack Issues](https://github.com/meta-llama/llama-stack/issues)

View file

@ -0,0 +1,108 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pathlib import Path
from llama_stack.core.datatypes import BuildProvider, Provider, ToolGroupInput
from llama_stack.distributions.template import DistributionTemplate, RunConfigSettings
from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig
from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
from llama_stack.providers.remote.inference.oci.config import OCIConfig
def get_distribution_template(name: str = "oci") -> DistributionTemplate:
providers = {
"inference": [BuildProvider(provider_type="remote::oci")],
"vector_io": [
BuildProvider(provider_type="inline::faiss"),
BuildProvider(provider_type="remote::chromadb"),
BuildProvider(provider_type="remote::pgvector"),
],
"safety": [BuildProvider(provider_type="inline::llama-guard")],
"agents": [BuildProvider(provider_type="inline::meta-reference")],
"eval": [BuildProvider(provider_type="inline::meta-reference")],
"datasetio": [
BuildProvider(provider_type="remote::huggingface"),
BuildProvider(provider_type="inline::localfs"),
],
"scoring": [
BuildProvider(provider_type="inline::basic"),
BuildProvider(provider_type="inline::llm-as-judge"),
BuildProvider(provider_type="inline::braintrust"),
],
"tool_runtime": [
BuildProvider(provider_type="remote::brave-search"),
BuildProvider(provider_type="remote::tavily-search"),
BuildProvider(provider_type="inline::rag-runtime"),
BuildProvider(provider_type="remote::model-context-protocol"),
],
"files": [BuildProvider(provider_type="inline::localfs")],
}
inference_provider = Provider(
provider_id="oci",
provider_type="remote::oci",
config=OCIConfig.sample_run_config(),
)
vector_io_provider = Provider(
provider_id="faiss",
provider_type="inline::faiss",
config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
)
files_provider = Provider(
provider_id="meta-reference-files",
provider_type="inline::localfs",
config=LocalfsFilesImplConfig.sample_run_config(f"~/.llama/distributions/{name}"),
)
default_tool_groups = [
ToolGroupInput(
toolgroup_id="builtin::websearch",
provider_id="tavily-search",
),
]
return DistributionTemplate(
name=name,
distro_type="remote_hosted",
description="Use Oracle Cloud Infrastructure (OCI) Generative AI for running LLM inference with scalable cloud services",
container_image=None,
template_path=Path(__file__).parent / "doc_template.md",
providers=providers,
run_configs={
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": [inference_provider],
"vector_io": [vector_io_provider],
"files": [files_provider],
},
default_tool_groups=default_tool_groups,
),
},
run_config_env_vars={
"OCI_AUTH_TYPE": (
"instance_principal",
"OCI authentication type (instance_principal or config_file)",
),
"OCI_REGION": (
"",
"OCI region (e.g., us-ashburn-1, us-chicago-1, us-phoenix-1, eu-frankfurt-1)",
),
"OCI_COMPARTMENT_OCID": (
"",
"OCI compartment ID for the Generative AI service",
),
"OCI_CONFIG_FILE_PATH": (
"~/.oci/config",
"OCI config file path (required if OCI_AUTH_TYPE is config_file)",
),
"OCI_CLI_PROFILE": (
"DEFAULT",
"OCI CLI profile name to use from config file",
),
},
)

View file

@ -0,0 +1,136 @@
version: 2
image_name: oci
apis:
- agents
- datasetio
- eval
- files
- inference
- safety
- scoring
- tool_runtime
- vector_io
providers:
inference:
- provider_id: oci
provider_type: remote::oci
config:
oci_auth_type: ${env.OCI_AUTH_TYPE:=instance_principal}
oci_config_file_path: ${env.OCI_CONFIG_FILE_PATH:=~/.oci/config}
oci_config_profile: ${env.OCI_CLI_PROFILE:=DEFAULT}
oci_region: ${env.OCI_REGION:=us-ashburn-1}
oci_compartment_id: ${env.OCI_COMPARTMENT_OCID:=}
vector_io:
- provider_id: faiss
provider_type: inline::faiss
config:
persistence:
namespace: vector_io::faiss
backend: kv_default
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config:
excluded_categories: []
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence:
agent_state:
namespace: agents
backend: kv_default
responses:
table_name: responses
backend: sql_default
max_write_queue_size: 10000
num_writers: 4
eval:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
kvstore:
namespace: eval
backend: kv_default
datasetio:
- provider_id: huggingface
provider_type: remote::huggingface
config:
kvstore:
namespace: datasetio::huggingface
backend: kv_default
- provider_id: localfs
provider_type: inline::localfs
config:
kvstore:
namespace: datasetio::localfs
backend: kv_default
scoring:
- provider_id: basic
provider_type: inline::basic
- provider_id: llm-as-judge
provider_type: inline::llm-as-judge
- provider_id: braintrust
provider_type: inline::braintrust
config:
openai_api_key: ${env.OPENAI_API_KEY:=}
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search
config:
api_key: ${env.BRAVE_SEARCH_API_KEY:=}
max_results: 3
- provider_id: tavily-search
provider_type: remote::tavily-search
config:
api_key: ${env.TAVILY_SEARCH_API_KEY:=}
max_results: 3
- provider_id: rag-runtime
provider_type: inline::rag-runtime
- provider_id: model-context-protocol
provider_type: remote::model-context-protocol
files:
- provider_id: meta-reference-files
provider_type: inline::localfs
config:
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/oci/files}
metadata_store:
table_name: files_metadata
backend: sql_default
storage:
backends:
kv_default:
type: kv_sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/oci}/kvstore.db
sql_default:
type: sql_sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/oci}/sql_store.db
stores:
metadata:
namespace: registry
backend: kv_default
inference:
table_name: inference_store
backend: sql_default
max_write_queue_size: 10000
num_writers: 4
conversations:
table_name: openai_conversations
backend: sql_default
prompts:
namespace: prompts
backend: kv_default
registered_resources:
models: []
shields: []
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
server:
port: 8321
telemetry:
enabled: true

View file

@ -26,8 +26,10 @@ from fairscale.nn.model_parallel.initialize import (
)
from termcolor import cprint
from llama_stack.models.llama.datatypes import ToolPromptFormat
from ..checkpoint import maybe_reshard_state_dict
from ..datatypes import GenerationResult, QuantizationMode, RawContent, RawMessage, ToolPromptFormat
from ..datatypes import GenerationResult, QuantizationMode, RawContent, RawMessage
from .args import ModelArgs
from .chat_format import ChatFormat, LLMInput
from .model import Transformer

View file

@ -15,13 +15,10 @@ from pathlib import Path
from termcolor import colored
from llama_stack.models.llama.datatypes import BuiltinTool, StopReason, ToolCall, ToolDefinition, ToolPromptFormat
from ..datatypes import (
BuiltinTool,
RawMessage,
StopReason,
ToolCall,
ToolDefinition,
ToolPromptFormat,
)
from . import template_data
from .chat_format import ChatFormat

View file

@ -15,7 +15,7 @@ import textwrap
from datetime import datetime
from typing import Any
from llama_stack.apis.inference import (
from llama_stack.models.llama.datatypes import (
BuiltinTool,
ToolDefinition,
)

View file

@ -8,8 +8,9 @@ import json
import re
from llama_stack.log import get_logger
from llama_stack.models.llama.datatypes import BuiltinTool, ToolCall, ToolPromptFormat
from ..datatypes import BuiltinTool, RecursiveType, ToolCall, ToolPromptFormat
from ..datatypes import RecursiveType
logger = get_logger(name=__name__, category="models::llama")

View file

@ -13,7 +13,7 @@
import textwrap
from llama_stack.apis.inference import ToolDefinition
from llama_stack.models.llama.datatypes import ToolDefinition
from llama_stack.models.llama.llama3.prompt_templates.base import (
PromptTemplate,
PromptTemplateGeneratorBase,

View file

@ -102,6 +102,7 @@ class MetaReferenceAgentsImpl(Agents):
include: list[str] | None = None,
max_infer_iters: int | None = 10,
guardrails: list[ResponseGuardrail] | None = None,
max_tool_calls: int | None = None,
) -> OpenAIResponseObject:
assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
result = await self.openai_responses_impl.create_openai_response(
@ -119,6 +120,7 @@ class MetaReferenceAgentsImpl(Agents):
include,
max_infer_iters,
guardrails,
max_tool_calls,
)
return result # type: ignore[no-any-return]

View file

@ -255,6 +255,7 @@ class OpenAIResponsesImpl:
include: list[str] | None = None,
max_infer_iters: int | None = 10,
guardrails: list[str | ResponseGuardrailSpec] | None = None,
max_tool_calls: int | None = None,
):
stream = bool(stream)
text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text
@ -270,6 +271,9 @@ class OpenAIResponsesImpl:
if not conversation.startswith("conv_"):
raise InvalidConversationIdError(conversation)
if max_tool_calls is not None and max_tool_calls < 1:
raise ValueError(f"Invalid {max_tool_calls=}; should be >= 1")
stream_gen = self._create_streaming_response(
input=input,
conversation=conversation,
@ -282,6 +286,7 @@ class OpenAIResponsesImpl:
tools=tools,
max_infer_iters=max_infer_iters,
guardrail_ids=guardrail_ids,
max_tool_calls=max_tool_calls,
)
if stream:
@ -331,6 +336,7 @@ class OpenAIResponsesImpl:
tools: list[OpenAIResponseInputTool] | None = None,
max_infer_iters: int | None = 10,
guardrail_ids: list[str] | None = None,
max_tool_calls: int | None = None,
) -> AsyncIterator[OpenAIResponseObjectStream]:
# These should never be None when called from create_openai_response (which sets defaults)
# but we assert here to help mypy understand the types
@ -373,6 +379,7 @@ class OpenAIResponsesImpl:
safety_api=self.safety_api,
guardrail_ids=guardrail_ids,
instructions=instructions,
max_tool_calls=max_tool_calls,
)
# Stream the response

View file

@ -115,6 +115,7 @@ class StreamingResponseOrchestrator:
safety_api,
guardrail_ids: list[str] | None = None,
prompt: OpenAIResponsePrompt | None = None,
max_tool_calls: int | None = None,
):
self.inference_api = inference_api
self.ctx = ctx
@ -126,6 +127,10 @@ class StreamingResponseOrchestrator:
self.safety_api = safety_api
self.guardrail_ids = guardrail_ids or []
self.prompt = prompt
# System message that is inserted into the model's context
self.instructions = instructions
# Max number of total calls to built-in tools that can be processed in a response
self.max_tool_calls = max_tool_calls
self.sequence_number = 0
# Store MCP tool mapping that gets built during tool processing
self.mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP] = (
@ -139,8 +144,8 @@ class StreamingResponseOrchestrator:
self.accumulated_usage: OpenAIResponseUsage | None = None
# Track if we've sent a refusal response
self.violation_detected = False
# system message that is inserted into the model's context
self.instructions = instructions
# Track total calls made to built-in tools
self.accumulated_builtin_tool_calls = 0
async def _create_refusal_response(self, violation_message: str) -> OpenAIResponseObjectStream:
"""Create a refusal response to replace streaming content."""
@ -186,6 +191,7 @@ class StreamingResponseOrchestrator:
usage=self.accumulated_usage,
instructions=self.instructions,
prompt=self.prompt,
max_tool_calls=self.max_tool_calls,
)
async def create_response(self) -> AsyncIterator[OpenAIResponseObjectStream]:
@ -894,6 +900,11 @@ class StreamingResponseOrchestrator:
"""Coordinate execution of both function and non-function tool calls."""
# Execute non-function tool calls
for tool_call in non_function_tool_calls:
# Stop executing built-in and mcp tool calls once the total reaches max_tool_calls
if self.max_tool_calls is not None and self.accumulated_builtin_tool_calls >= self.max_tool_calls:
logger.info(f"Ignoring built-in and mcp tool call since the limit of {self.max_tool_calls=} has been reached.")
break
# Find the item_id for this tool call
matching_item_id = None
for index, item_id in completion_result_data.tool_call_item_ids.items():
@ -974,6 +985,9 @@ class StreamingResponseOrchestrator:
if tool_response_message:
next_turn_messages.append(tool_response_message)
# Track number of calls made to built-in and mcp tools
self.accumulated_builtin_tool_calls += 1
# Execute function tool calls (client-side)
for tool_call in function_tool_calls:
# Find the item_id for this tool call from our tracking dictionary

View file

@ -5,7 +5,6 @@
# the root directory of this source tree.
import math
from collections.abc import Generator
from typing import Optional
import torch
@ -14,21 +13,19 @@ from lmformatenforcer import JsonSchemaParser, TokenEnforcer, TokenEnforcerToken
from llama_stack.apis.inference import (
GreedySamplingStrategy,
JsonSchemaResponseFormat,
OpenAIChatCompletionRequestWithExtraBody,
OpenAIResponseFormatJSONSchema,
ResponseFormat,
ResponseFormatType,
SamplingParams,
TopPSamplingStrategy,
)
from llama_stack.models.llama.datatypes import QuantizationMode
from llama_stack.models.llama.datatypes import QuantizationMode, ToolPromptFormat
from llama_stack.models.llama.llama3.generation import Llama3
from llama_stack.models.llama.llama3.tokenizer import Tokenizer as Llama3Tokenizer
from llama_stack.models.llama.llama4.generation import Llama4
from llama_stack.models.llama.llama4.tokenizer import Tokenizer as Llama4Tokenizer
from llama_stack.models.llama.sku_types import Model, ModelFamily
from llama_stack.providers.utils.inference.prompt_adapter import (
ChatCompletionRequestWithRawContent,
CompletionRequestWithRawContent,
get_default_tool_prompt_format,
)
from .common import model_checkpoint_dir
from .config import MetaReferenceInferenceConfig
@ -106,14 +103,6 @@ def _infer_sampling_params(sampling_params: SamplingParams):
return temperature, top_p
def _infer_tool_prompt_format(request: ChatCompletionRequestWithRawContent):
tool_config = request.tool_config
if tool_config is not None and tool_config.tool_prompt_format is not None:
return tool_config.tool_prompt_format
else:
return get_default_tool_prompt_format(request.model)
class LlamaGenerator:
def __init__(
self,
@ -157,55 +146,56 @@ class LlamaGenerator:
self.args = self.inner_generator.args
self.formatter = self.inner_generator.formatter
def completion(
self,
request_batch: list[CompletionRequestWithRawContent],
) -> Generator:
first_request = request_batch[0]
sampling_params = first_request.sampling_params or SamplingParams()
max_gen_len = sampling_params.max_tokens
if max_gen_len is None or max_gen_len == 0 or max_gen_len >= self.args.max_seq_len:
max_gen_len = self.args.max_seq_len - 1
temperature, top_p = _infer_sampling_params(sampling_params)
yield from self.inner_generator.generate(
llm_inputs=[self.formatter.encode_content(request.content) for request in request_batch],
max_gen_len=max_gen_len,
temperature=temperature,
top_p=top_p,
logprobs=bool(first_request.logprobs),
echo=False,
logits_processor=get_logits_processor(
self.tokenizer,
self.args.vocab_size,
first_request.response_format,
),
)
def chat_completion(
self,
request_batch: list[ChatCompletionRequestWithRawContent],
) -> Generator:
first_request = request_batch[0]
sampling_params = first_request.sampling_params or SamplingParams()
request: OpenAIChatCompletionRequestWithExtraBody,
raw_messages: list,
):
"""Generate chat completion using OpenAI request format.
Args:
request: OpenAI chat completion request
raw_messages: Pre-converted list of RawMessage objects
"""
# Determine tool prompt format (OpenAI-style tool definitions are always rendered as JSON)
tool_prompt_format = ToolPromptFormat.json
# Prepare sampling params
sampling_params = SamplingParams()
if request.temperature is not None or request.top_p is not None:
sampling_params.strategy = TopPSamplingStrategy(
temperature=request.temperature if request.temperature is not None else 1.0,
top_p=request.top_p if request.top_p is not None else 1.0,
)
if request.max_tokens:
sampling_params.max_tokens = request.max_tokens
max_gen_len = sampling_params.max_tokens
if max_gen_len is None or max_gen_len == 0 or max_gen_len >= self.args.max_seq_len:
max_gen_len = self.args.max_seq_len - 1
temperature, top_p = _infer_sampling_params(sampling_params)
# Get logits processor for response format
logits_processor = None
if request.response_format:
if isinstance(request.response_format, OpenAIResponseFormatJSONSchema):
# Extract the actual schema from OpenAIJSONSchema TypedDict
schema_dict = request.response_format.json_schema.get("schema") or {}
json_schema_format = JsonSchemaResponseFormat(
type=ResponseFormatType.json_schema,
json_schema=schema_dict,
)
logits_processor = get_logits_processor(self.tokenizer, self.args.vocab_size, json_schema_format)
# Generate
yield from self.inner_generator.generate(
llm_inputs=[
self.formatter.encode_dialog_prompt(request.messages, _infer_tool_prompt_format(request))
for request in request_batch
],
llm_inputs=[self.formatter.encode_dialog_prompt(raw_messages, tool_prompt_format)],
max_gen_len=max_gen_len,
temperature=temperature,
top_p=top_p,
logprobs=bool(first_request.logprobs),
logprobs=False,
echo=False,
logits_processor=get_logits_processor(
self.tokenizer,
self.args.vocab_size,
first_request.response_format,
),
logits_processor=logits_processor,
)

View file

@ -5,12 +5,19 @@
# the root directory of this source tree.
import asyncio
import time
import uuid
from collections.abc import AsyncIterator
from llama_stack.apis.inference import (
InferenceProvider,
OpenAIAssistantMessageParam,
OpenAIChatCompletionRequestWithExtraBody,
OpenAIChatCompletionUsage,
OpenAIChoice,
OpenAICompletionRequestWithExtraBody,
OpenAIUserMessageParam,
ToolChoice,
)
from llama_stack.apis.inference.inference import (
OpenAIChatCompletion,
@ -19,12 +26,20 @@ from llama_stack.apis.inference.inference import (
)
from llama_stack.apis.models import Model, ModelType
from llama_stack.log import get_logger
from llama_stack.models.llama.datatypes import RawMessage, RawTextItem, ToolDefinition
from llama_stack.models.llama.llama3.chat_format import ChatFormat as Llama3ChatFormat
from llama_stack.models.llama.llama3.prompt_templates import (
JsonCustomToolGenerator,
SystemDefaultGenerator,
)
from llama_stack.models.llama.llama3.tokenizer import Tokenizer as Llama3Tokenizer
from llama_stack.models.llama.llama4.chat_format import ChatFormat as Llama4ChatFormat
from llama_stack.models.llama.llama4.prompt_templates.system_prompts import (
PythonListCustomToolGenerator as PythonListCustomToolGeneratorLlama4,
)
from llama_stack.models.llama.llama4.tokenizer import Tokenizer as Llama4Tokenizer
from llama_stack.models.llama.sku_list import resolve_model
from llama_stack.models.llama.sku_types import ModelFamily
from llama_stack.models.llama.sku_types import ModelFamily, is_multimodal
from llama_stack.providers.datatypes import ModelsProtocolPrivate
from llama_stack.providers.utils.inference.embedding_mixin import (
SentenceTransformerEmbeddingMixin,
@ -44,6 +59,170 @@ log = get_logger(__name__, category="inference")
SEMAPHORE = asyncio.Semaphore(1)
def _convert_openai_tool_to_tool_definition(tool) -> ToolDefinition:
"""Convert OpenAI tool format to ToolDefinition format."""
# OpenAI tools have function.name and function.parameters
return ToolDefinition(
tool_name=tool.function.name,
description=tool.function.description or "",
parameters=tool.function.parameters or {},
)
def _get_tool_choice_prompt(tool_choice, tools) -> str:
"""Generate prompt text for tool_choice behavior."""
if not tool_choice or tool_choice == ToolChoice.auto or tool_choice == "auto":
return ""
elif tool_choice == ToolChoice.required or tool_choice == "required":
return "You MUST use one of the provided functions/tools to answer the user query."
elif tool_choice == ToolChoice.none or tool_choice == "none":
return ""
else:
# Specific tool specified
return f"You MUST use the tool `{tool_choice}` to answer the user query."
def _raw_content_as_str(content) -> str:
"""Convert RawContent to string for system messages."""
if isinstance(content, str):
return content
elif isinstance(content, RawTextItem):
return content.text
elif isinstance(content, list):
return "\n".join(_raw_content_as_str(c) for c in content)
else:
return "<media>"
def _augment_raw_messages_for_tools_llama_3_1(
raw_messages: list[RawMessage],
tools: list,
tool_choice,
) -> list[RawMessage]:
"""Augment raw messages with tool definitions for Llama 3.1 style models."""
messages = raw_messages.copy()
existing_system_message = None
if messages and messages[0].role == "system":
existing_system_message = messages.pop(0)
sys_content = ""
# Add tool definitions first (if present)
if tools:
# Convert OpenAI tools to ToolDefinitions
tool_definitions = [_convert_openai_tool_to_tool_definition(t) for t in tools]
# For OpenAI format, all tools are custom (have string names)
tool_gen = JsonCustomToolGenerator()
tool_template = tool_gen.gen(tool_definitions)
sys_content += tool_template.render()
sys_content += "\n"
# Add default system prompt
default_gen = SystemDefaultGenerator()
default_template = default_gen.gen()
sys_content += default_template.render()
# Add existing system message if present
if existing_system_message:
sys_content += "\n" + _raw_content_as_str(existing_system_message.content)
# Add tool choice prompt if needed
if tool_choice_prompt := _get_tool_choice_prompt(tool_choice, tools):
sys_content += "\n" + tool_choice_prompt
# Create new system message
new_system_message = RawMessage(
role="system",
content=[RawTextItem(text=sys_content.strip())],
)
return [new_system_message] + messages
def _augment_raw_messages_for_tools_llama_4(
raw_messages: list[RawMessage],
tools: list,
tool_choice,
) -> list[RawMessage]:
"""Augment raw messages with tool definitions for Llama 4/3.2/3.3 style models."""
messages = raw_messages.copy()
existing_system_message = None
if messages and messages[0].role == "system":
existing_system_message = messages.pop(0)
sys_content = ""
# Add tool definitions if present
if tools:
# Convert OpenAI tools to ToolDefinitions
tool_definitions = [_convert_openai_tool_to_tool_definition(t) for t in tools]
# Use python_list format for Llama 4
tool_gen = PythonListCustomToolGeneratorLlama4()
system_prompt = None
if existing_system_message:
system_prompt = _raw_content_as_str(existing_system_message.content)
tool_template = tool_gen.gen(tool_definitions, system_prompt)
sys_content = tool_template.render()
elif existing_system_message:
# No tools, just use existing system message
sys_content = _raw_content_as_str(existing_system_message.content)
# Add tool choice prompt if needed
if tool_choice_prompt := _get_tool_choice_prompt(tool_choice, tools):
sys_content += "\n" + tool_choice_prompt
if sys_content:
new_system_message = RawMessage(
role="system",
content=[RawTextItem(text=sys_content.strip())],
)
return [new_system_message] + messages
return messages
def augment_raw_messages_for_tools(
raw_messages: list[RawMessage],
params: OpenAIChatCompletionRequestWithExtraBody,
llama_model,
) -> list[RawMessage]:
"""Augment raw messages with tool definitions based on model family."""
if not params.tools:
return raw_messages
# Determine augmentation strategy based on model family
if llama_model.model_family == ModelFamily.llama3_1 or (
llama_model.model_family == ModelFamily.llama3_2 and is_multimodal(llama_model.core_model_id)
):
# Llama 3.1 and Llama 3.2 multimodal use JSON format
return _augment_raw_messages_for_tools_llama_3_1(
raw_messages,
params.tools,
params.tool_choice,
)
elif llama_model.model_family in (
ModelFamily.llama3_2,
ModelFamily.llama3_3,
ModelFamily.llama4,
):
# Llama 3.2/3.3/4 use python_list format
return _augment_raw_messages_for_tools_llama_4(
raw_messages,
params.tools,
params.tool_choice,
)
else:
# Default to Llama 3.1 style
return _augment_raw_messages_for_tools_llama_3_1(
raw_messages,
params.tools,
params.tool_choice,
)
def llama_builder_fn(config: MetaReferenceInferenceConfig, model_id: str, llama_model: Model) -> LlamaGenerator:
return LlamaGenerator(config, model_id, llama_model)
@ -136,10 +315,13 @@ class MetaReferenceInferenceImpl(
self.llama_model = llama_model
log.info("Warming up...")
await self.openai_chat_completion(
model=model_id,
messages=[{"role": "user", "content": "Hi how are you?"}],
max_tokens=20,
params=OpenAIChatCompletionRequestWithExtraBody(
model=model_id,
messages=[OpenAIUserMessageParam(role="user", content="Hi how are you?")],
max_tokens=20,
)
)
log.info("Warmed up!")
@ -155,4 +337,207 @@ class MetaReferenceInferenceImpl(
self,
params: OpenAIChatCompletionRequestWithExtraBody,
) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
raise NotImplementedError("OpenAI chat completion not supported by meta-reference inference provider")
self.check_model(params)
# Convert OpenAI messages to RawMessages
from llama_stack.models.llama.datatypes import StopReason
from llama_stack.providers.utils.inference.prompt_adapter import (
convert_openai_message_to_raw_message,
decode_assistant_message,
)
raw_messages = [await convert_openai_message_to_raw_message(msg) for msg in params.messages]
# Augment messages with tool definitions if tools are present
raw_messages = augment_raw_messages_for_tools(raw_messages, params, self.llama_model)
# Call generator's chat_completion method (works for both single-GPU and model-parallel)
if isinstance(self.generator, LlamaGenerator):
generator = self.generator.chat_completion(params, raw_messages)
else:
# Model parallel: submit task to process group
generator = self.generator.group.run_inference(("chat_completion", [params, raw_messages]))
# Check if streaming is requested
if params.stream:
return self._stream_chat_completion(generator, params)
# Non-streaming: collect all generated text
generated_text = ""
for result_batch in generator:
for result in result_batch:
if not result.ignore_token and result.source == "output":
generated_text += result.text
# Decode assistant message to extract tool calls and determine stop_reason
# Default to end_of_turn if generation completed normally
decoded_message = decode_assistant_message(generated_text, StopReason.end_of_turn)
# Convert tool calls to OpenAI format
openai_tool_calls = None
if decoded_message.tool_calls:
from llama_stack.apis.inference import (
OpenAIChatCompletionToolCall,
OpenAIChatCompletionToolCallFunction,
)
openai_tool_calls = [
OpenAIChatCompletionToolCall(
# generate a uuid for the call id. This is the only inline provider that does this, so need to get creative.
id=f"call_{uuid.uuid4().hex[:24]}",
type="function",
function=OpenAIChatCompletionToolCallFunction(
name=str(tc.tool_name),
arguments=tc.arguments,
),
)
for tc in decoded_message.tool_calls
]
# Determine finish_reason based on whether tool calls are present
finish_reason = "tool_calls" if openai_tool_calls else "stop"
# Extract content from decoded message
content = ""
if isinstance(decoded_message.content, str):
content = decoded_message.content
elif isinstance(decoded_message.content, list):
for item in decoded_message.content:
if isinstance(item, RawTextItem):
content += item.text
# Create OpenAI response
# generate a uuid for the response id. This is the only inline provider that does this, so need to get creative.
response_id = f"chatcmpl-{uuid.uuid4().hex[:24]}"
created = int(time.time())
return OpenAIChatCompletion(
id=response_id,
object="chat.completion",
created=created,
model=params.model,
choices=[
OpenAIChoice(
index=0,
message=OpenAIAssistantMessageParam(
role="assistant",
content=content,
tool_calls=openai_tool_calls,
),
finish_reason=finish_reason,
logprobs=None,
)
],
usage=OpenAIChatCompletionUsage(
prompt_tokens=0, # TODO: calculate properly
completion_tokens=0, # TODO: calculate properly
total_tokens=0, # TODO: calculate properly
),
)
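A hedged usage sketch of the non-streaming path above; `impl` is assumed to be an initialized MetaReferenceInferenceImpl and the model id is a placeholder.

# Sketch only: calling the non-streaming path and reading the OpenAI-compatible result.
async def _non_streaming_example(impl):
    response = await impl.openai_chat_completion(
        OpenAIChatCompletionRequestWithExtraBody(
            model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model id
            messages=[OpenAIUserMessageParam(role="user", content="Say hello in one sentence.")],
            max_tokens=32,
        )
    )
    choice = response.choices[0]
    # finish_reason is "tool_calls" when the decoded message contained tool calls, otherwise "stop"
    return choice.message.content, choice.finish_reason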
async def _stream_chat_completion(
self,
generator,
params: OpenAIChatCompletionRequestWithExtraBody,
) -> AsyncIterator[OpenAIChatCompletionChunk]:
"""Stream chat completion chunks as they're generated."""
from llama_stack.apis.inference import (
OpenAIChatCompletionChunk,
OpenAIChatCompletionToolCall,
OpenAIChatCompletionToolCallFunction,
OpenAIChoiceDelta,
OpenAIChunkChoice,
)
from llama_stack.models.llama.datatypes import StopReason
from llama_stack.providers.utils.inference.prompt_adapter import decode_assistant_message
response_id = f"chatcmpl-{uuid.uuid4().hex[:24]}"
created = int(time.time())
generated_text = ""
# Yield chunks as tokens are generated
for result_batch in generator:
for result in result_batch:
if result.ignore_token or result.source != "output":
continue
generated_text += result.text
# Yield delta chunk with the new text
chunk = OpenAIChatCompletionChunk(
id=response_id,
object="chat.completion.chunk",
created=created,
model=params.model,
choices=[
OpenAIChunkChoice(
index=0,
delta=OpenAIChoiceDelta(
role="assistant",
content=result.text,
),
finish_reason="",
logprobs=None,
)
],
)
yield chunk
# After generation completes, decode the full message to extract tool calls
decoded_message = decode_assistant_message(generated_text, StopReason.end_of_turn)
# If tool calls are present, yield a final chunk with tool_calls
if decoded_message.tool_calls:
openai_tool_calls = [
OpenAIChatCompletionToolCall(
# generate a uuid for the call id. This is the only inline provider that does this, so need to get creative.
id=f"call_{uuid.uuid4().hex[:24]}",
type="function",
function=OpenAIChatCompletionToolCallFunction(
name=str(tc.tool_name),
arguments=tc.arguments,
),
)
for tc in decoded_message.tool_calls
]
# Yield chunk with tool_calls
chunk = OpenAIChatCompletionChunk(
id=response_id,
object="chat.completion.chunk",
created=created,
model=params.model,
choices=[
OpenAIChunkChoice(
index=0,
delta=OpenAIChoiceDelta(
role="assistant",
tool_calls=openai_tool_calls,
),
finish_reason="",
logprobs=None,
)
],
)
yield chunk
finish_reason = "tool_calls"
else:
finish_reason = "stop"
# Yield final chunk with finish_reason
final_chunk = OpenAIChatCompletionChunk(
id=response_id,
object="chat.completion.chunk",
created=created,
model=params.model,
choices=[
OpenAIChunkChoice(
index=0,
delta=OpenAIChoiceDelta(),
finish_reason=finish_reason,
logprobs=None,
)
],
)
yield final_chunk
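And a matching hedged sketch for the streaming path: with stream=True the call returns the async iterator produced above, so a caller accumulates the per-token deltas (again, `impl` and the model id are assumptions).

# Sketch only: consuming the chat.completion.chunk objects yielded by _stream_chat_completion.
async def _streaming_example(impl):
    params = OpenAIChatCompletionRequestWithExtraBody(
        model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model id
        messages=[OpenAIUserMessageParam(role="user", content="Say hello in one sentence.")],
        max_tokens=32,
        stream=True,
    )
    text = ""
    async for chunk in await impl.openai_chat_completion(params):
        delta = chunk.choices[0].delta
        if delta.content:
            text += delta.content
    return text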
View file
@@ -4,17 +4,12 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from collections.abc import Callable, Generator
from copy import deepcopy
from collections.abc import Callable
from functools import partial
from typing import Any
from llama_stack.models.llama.llama3.chat_format import ChatFormat as Llama3ChatFormat
from llama_stack.models.llama.llama4.chat_format import ChatFormat as Llama4ChatFormat
from llama_stack.providers.utils.inference.prompt_adapter import (
ChatCompletionRequestWithRawContent,
CompletionRequestWithRawContent,
)
from .parallel_utils import ModelParallelProcessGroup
@@ -23,12 +18,14 @@ class ModelRunner:
def __init__(self, llama):
self.llama = llama
# the `task` object is the same that is sent to `ModelParallelProcessGroup.run_inference()`
def __call__(self, task: Any):
if task[0] == "chat_completion":
return self.llama.chat_completion(task[1])
task_type = task[0]
if task_type == "chat_completion":
# task[1] is [params, raw_messages]
params, raw_messages = task[1]
return self.llama.chat_completion(params, raw_messages)
else:
raise ValueError(f"Unexpected task type {task[0]}")
raise ValueError(f"Unexpected task type {task_type}")
def init_model_cb(
@@ -78,19 +75,3 @@ class LlamaModelParallelGenerator:
def __exit__(self, exc_type, exc_value, exc_traceback):
self.group.stop()
def completion(
self,
request_batch: list[CompletionRequestWithRawContent],
) -> Generator:
req_obj = deepcopy(request_batch)
gen = self.group.run_inference(("completion", req_obj))
yield from gen
def chat_completion(
self,
request_batch: list[ChatCompletionRequestWithRawContent],
) -> Generator:
req_obj = deepcopy(request_batch)
gen = self.group.run_inference(("chat_completion", req_obj))
yield from gen
View file
@@ -33,10 +33,6 @@ from torch.distributed.launcher.api import LaunchConfig, elastic_launch
from llama_stack.log import get_logger
from llama_stack.models.llama.datatypes import GenerationResult
from llama_stack.providers.utils.inference.prompt_adapter import (
ChatCompletionRequestWithRawContent,
CompletionRequestWithRawContent,
)
log = get_logger(name=__name__, category="inference")
@@ -69,10 +65,7 @@ class CancelSentinel(BaseModel):
class TaskRequest(BaseModel):
type: Literal[ProcessingMessageName.task_request] = ProcessingMessageName.task_request
task: tuple[
str,
list[CompletionRequestWithRawContent] | list[ChatCompletionRequestWithRawContent],
]
task: tuple[str, list]
class TaskResponse(BaseModel):
@@ -328,10 +321,7 @@ class ModelParallelProcessGroup:
def run_inference(
self,
req: tuple[
str,
list[CompletionRequestWithRawContent] | list[ChatCompletionRequestWithRawContent],
],
req: tuple[str, list],
) -> Generator:
assert not self.running, "inference already running"
View file
@@ -22,9 +22,6 @@ from llama_stack.providers.datatypes import Model, ModelsProtocolPrivate
from llama_stack.providers.utils.inference.embedding_mixin import (
SentenceTransformerEmbeddingMixin,
)
from llama_stack.providers.utils.inference.openai_compat import (
OpenAIChatCompletionToLlamaStackMixin,
)
from .config import SentenceTransformersInferenceConfig
@@ -32,7 +29,6 @@ log = get_logger(name=__name__, category="inference")
class SentenceTransformersInferenceImpl(
OpenAIChatCompletionToLlamaStackMixin,
SentenceTransformerEmbeddingMixin,
InferenceProvider,
ModelsProtocolPrivate,
View file
@@ -297,6 +297,20 @@ Available Models:
Azure OpenAI inference provider for accessing GPT models and other Azure services.
Provider documentation
https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
""",
),
RemoteProviderSpec(
api=Api.inference,
provider_type="remote::oci",
adapter_type="oci",
pip_packages=["oci"],
module="llama_stack.providers.remote.inference.oci",
config_class="llama_stack.providers.remote.inference.oci.config.OCIConfig",
provider_data_validator="llama_stack.providers.remote.inference.oci.config.OCIProviderDataValidator",
description="""
Oracle Cloud Infrastructure (OCI) Generative AI inference provider for accessing OCI's Generative AI Platform-as-a-Service models.
Provider documentation
https://docs.oracle.com/en-us/iaas/Content/generative-ai/home.htm
""",
),
]
View file
@@ -0,0 +1,17 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.inference import InferenceProvider
from .config import OCIConfig
async def get_adapter_impl(config: OCIConfig, _deps) -> InferenceProvider:
from .oci import OCIInferenceAdapter
adapter = OCIInferenceAdapter(config=config)
await adapter.initialize()
return adapter
View file
@@ -0,0 +1,79 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from collections.abc import Generator, Mapping
from typing import Any, override
import httpx
import oci
import requests
from oci.config import DEFAULT_LOCATION, DEFAULT_PROFILE
OciAuthSigner = type[oci.signer.AbstractBaseSigner]
class HttpxOciAuth(httpx.Auth):
"""
Custom HTTPX authentication class that implements OCI request signing.
This class handles the authentication flow for HTTPX requests by signing them
using the OCI Signer, which adds the necessary authentication headers for
OCI API calls.
Attributes:
signer (oci.signer.Signer): The OCI signer instance used for request signing
"""
def __init__(self, signer: OciAuthSigner):
self.signer = signer
@override
def auth_flow(self, request: httpx.Request) -> Generator[httpx.Request, httpx.Response, None]:
# Read the request content to handle streaming requests properly
try:
content = request.content
except httpx.RequestNotRead:
# For streaming requests, we need to read the content first
content = request.read()
req = requests.Request(
method=request.method,
url=str(request.url),
headers=dict(request.headers),
data=content,
)
prepared_request = req.prepare()
# Sign the request using the OCI Signer
self.signer.do_request_sign(prepared_request) # type: ignore
# Update the original HTTPX request with the signed headers
request.headers.update(prepared_request.headers)
yield request
class OciInstancePrincipalAuth(HttpxOciAuth):
def __init__(self, **kwargs: Mapping[str, Any]):
self.signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner(**kwargs)
class OciUserPrincipalAuth(HttpxOciAuth):
def __init__(self, config_file: str = DEFAULT_LOCATION, profile_name: str = DEFAULT_PROFILE):
config = oci.config.from_file(config_file, profile_name)
oci.config.validate_config(config) # type: ignore
key_content = ""
with open(config["key_file"]) as f:
key_content = f.read()
self.signer = oci.signer.Signer(
tenancy=config["tenancy"],
user=config["user"],
fingerprint=config["fingerprint"],
private_key_file_location=config.get("key_file"),
pass_phrase="none", # type: ignore
private_key_content=key_content,
)
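A hedged usage sketch of the auth classes above: the signer-backed auth plugs directly into an httpx client; the compartment OCID shown is a placeholder.

# Sketch only: wiring OciUserPrincipalAuth into an httpx.AsyncClient.
import httpx

auth = OciUserPrincipalAuth(config_file="~/.oci/config", profile_name="DEFAULT")
client = httpx.AsyncClient(
    auth=auth,
    headers={"CompartmentId": "ocid1.compartment.oc1..exampleuniqueID"},  # placeholder OCID
)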
View file
@@ -0,0 +1,75 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
from typing import Any
from pydantic import BaseModel, Field
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack.schema_utils import json_schema_type
class OCIProviderDataValidator(BaseModel):
oci_auth_type: str = Field(
description="OCI authentication type (must be one of: instance_principal, config_file)",
)
oci_region: str = Field(
description="OCI region (e.g., us-ashburn-1)",
)
oci_compartment_id: str = Field(
description="OCI compartment ID for the Generative AI service",
)
oci_config_file_path: str | None = Field(
default="~/.oci/config",
description="OCI config file path (required if oci_auth_type is config_file)",
)
oci_config_profile: str | None = Field(
default="DEFAULT",
description="OCI config profile (required if oci_auth_type is config_file)",
)
@json_schema_type
class OCIConfig(RemoteInferenceProviderConfig):
oci_auth_type: str = Field(
description="OCI authentication type (must be one of: instance_principal, config_file)",
default_factory=lambda: os.getenv("OCI_AUTH_TYPE", "instance_principal"),
)
oci_region: str = Field(
default_factory=lambda: os.getenv("OCI_REGION", "us-ashburn-1"),
description="OCI region (e.g., us-ashburn-1)",
)
oci_compartment_id: str = Field(
default_factory=lambda: os.getenv("OCI_COMPARTMENT_OCID", ""),
description="OCI compartment ID for the Generative AI service",
)
oci_config_file_path: str = Field(
default_factory=lambda: os.getenv("OCI_CONFIG_FILE_PATH", "~/.oci/config"),
description="OCI config file path (required if oci_auth_type is config_file)",
)
oci_config_profile: str = Field(
default_factory=lambda: os.getenv("OCI_CLI_PROFILE", "DEFAULT"),
description="OCI config profile (required if oci_auth_type is config_file)",
)
@classmethod
def sample_run_config(
cls,
oci_auth_type: str = "${env.OCI_AUTH_TYPE:=instance_principal}",
oci_config_file_path: str = "${env.OCI_CONFIG_FILE_PATH:=~/.oci/config}",
oci_config_profile: str = "${env.OCI_CLI_PROFILE:=DEFAULT}",
oci_region: str = "${env.OCI_REGION:=us-ashburn-1}",
oci_compartment_id: str = "${env.OCI_COMPARTMENT_OCID:=}",
**kwargs,
) -> dict[str, Any]:
return {
"oci_auth_type": oci_auth_type,
"oci_config_file_path": oci_config_file_path,
"oci_config_profile": oci_config_profile,
"oci_region": oci_region,
"oci_compartment_id": oci_compartment_id,
}
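As a quick illustration, calling the helper with defaults returns the env-templated dict that lands in a distribution's run config; the ${env.X:=default} placeholders are resolved at stack startup.

# Illustrative call of the sample_run_config helper above.
cfg = OCIConfig.sample_run_config()
assert cfg["oci_region"] == "${env.OCI_REGION:=us-ashburn-1}"
assert cfg["oci_auth_type"] == "${env.OCI_AUTH_TYPE:=instance_principal}"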
View file
@@ -0,0 +1,140 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from collections.abc import Iterable
from typing import Any
import httpx
import oci
from oci.generative_ai.generative_ai_client import GenerativeAiClient
from oci.generative_ai.models import ModelCollection
from openai._base_client import DefaultAsyncHttpxClient
from llama_stack.apis.inference.inference import (
OpenAIEmbeddingsRequestWithExtraBody,
OpenAIEmbeddingsResponse,
)
from llama_stack.apis.models import ModelType
from llama_stack.log import get_logger
from llama_stack.providers.remote.inference.oci.auth import OciInstancePrincipalAuth, OciUserPrincipalAuth
from llama_stack.providers.remote.inference.oci.config import OCIConfig
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
logger = get_logger(name=__name__, category="inference::oci")
OCI_AUTH_TYPE_INSTANCE_PRINCIPAL = "instance_principal"
OCI_AUTH_TYPE_CONFIG_FILE = "config_file"
VALID_OCI_AUTH_TYPES = [OCI_AUTH_TYPE_INSTANCE_PRINCIPAL, OCI_AUTH_TYPE_CONFIG_FILE]
DEFAULT_OCI_REGION = "us-ashburn-1"
MODEL_CAPABILITIES = ["TEXT_GENERATION", "TEXT_SUMMARIZATION", "TEXT_EMBEDDINGS", "CHAT"]
class OCIInferenceAdapter(OpenAIMixin):
config: OCIConfig
async def initialize(self) -> None:
"""Initialize and validate OCI configuration."""
if self.config.oci_auth_type not in VALID_OCI_AUTH_TYPES:
raise ValueError(
f"Invalid OCI authentication type: {self.config.oci_auth_type}."
f"Valid types are one of: {VALID_OCI_AUTH_TYPES}"
)
if not self.config.oci_compartment_id:
raise ValueError("OCI_COMPARTMENT_OCID is a required parameter. Either set in env variable or config.")
def get_base_url(self) -> str:
region = self.config.oci_region or DEFAULT_OCI_REGION
return f"https://inference.generativeai.{region}.oci.oraclecloud.com/20231130/actions/v1"
def get_api_key(self) -> str | None:
# OCI doesn't use API keys, it uses request signing
return "<NOTUSED>"
def get_extra_client_params(self) -> dict[str, Any]:
"""
Get extra parameters for the AsyncOpenAI client, including OCI-specific auth and headers.
"""
auth = self._get_auth()
compartment_id = self.config.oci_compartment_id or ""
return {
"http_client": DefaultAsyncHttpxClient(
auth=auth,
headers={
"CompartmentId": compartment_id,
},
),
}
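A hedged sketch of what these extras are for (an assumption about how OpenAIMixin builds its client): the signed httpx client is handed to the OpenAI SDK so every request carries OCI signature headers.

# Sketch only: combining the adapter hooks above into an AsyncOpenAI client.
from openai import AsyncOpenAI

def _build_client(adapter) -> AsyncOpenAI:
    return AsyncOpenAI(
        base_url=adapter.get_base_url(),
        api_key=adapter.get_api_key(),  # placeholder value; real auth happens via request signing
        **adapter.get_extra_client_params(),  # supplies the signed http_client and CompartmentId header
    )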
def _get_oci_signer(self) -> oci.signer.AbstractBaseSigner | None:
if self.config.oci_auth_type == OCI_AUTH_TYPE_INSTANCE_PRINCIPAL:
return oci.auth.signers.InstancePrincipalsSecurityTokenSigner()
return None
def _get_oci_config(self) -> dict:
if self.config.oci_auth_type == OCI_AUTH_TYPE_INSTANCE_PRINCIPAL:
config = {"region": self.config.oci_region}
elif self.config.oci_auth_type == OCI_AUTH_TYPE_CONFIG_FILE:
config = oci.config.from_file(self.config.oci_config_file_path, self.config.oci_config_profile)
if not config.get("region"):
raise ValueError(
"Region not specified in config. Please specify in config or with OCI_REGION env variable."
)
return config
def _get_auth(self) -> httpx.Auth:
if self.config.oci_auth_type == OCI_AUTH_TYPE_INSTANCE_PRINCIPAL:
return OciInstancePrincipalAuth()
elif self.config.oci_auth_type == OCI_AUTH_TYPE_CONFIG_FILE:
return OciUserPrincipalAuth(
config_file=self.config.oci_config_file_path, profile_name=self.config.oci_config_profile
)
else:
raise ValueError(f"Invalid OCI authentication type: {self.config.oci_auth_type}")
async def list_provider_model_ids(self) -> Iterable[str]:
"""
List available models from OCI Generative AI service.
"""
oci_config = self._get_oci_config()
oci_signer = self._get_oci_signer()
compartment_id = self.config.oci_compartment_id or ""
if oci_signer is None:
client = GenerativeAiClient(config=oci_config)
else:
client = GenerativeAiClient(config=oci_config, signer=oci_signer)
models: ModelCollection = client.list_models(
compartment_id=compartment_id, capability=MODEL_CAPABILITIES, lifecycle_state="ACTIVE"
).data
seen_models = set()
model_ids = []
for model in models.items:
if model.time_deprecated or model.time_on_demand_retired:
continue
if "CHAT" not in model.capabilities or "FINE_TUNE" in model.capabilities:
continue
# Use display_name + model_type as the key to avoid conflicts
model_key = (model.display_name, ModelType.llm)
if model_key in seen_models:
continue
seen_models.add(model_key)
model_ids.append(model.display_name)
return model_ids
async def openai_embeddings(self, params: OpenAIEmbeddingsRequestWithExtraBody) -> OpenAIEmbeddingsResponse:
# The base URL constructed above targets OCI's "chat" action endpoint, which does not support embeddings.
raise NotImplementedError("OCI Provider does not (currently) support embeddings")
View file
@@ -11,9 +11,7 @@ from collections.abc import AsyncIterator
import litellm
from llama_stack.apis.inference import (
ChatCompletionRequest,
InferenceProvider,
JsonSchemaResponseFormat,
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAIChatCompletionRequestWithExtraBody,
@@ -23,15 +21,11 @@ from llama_stack.apis.inference import (
OpenAIEmbeddingsRequestWithExtraBody,
OpenAIEmbeddingsResponse,
OpenAIEmbeddingUsage,
ToolChoice,
)
from llama_stack.core.request_headers import NeedsRequestProviderData
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper, ProviderModelEntry
from llama_stack.providers.utils.inference.openai_compat import (
convert_message_to_openai_dict_new,
convert_tooldef_to_openai_tool,
get_sampling_options,
prepare_openai_completion_params,
)
@@ -127,51 +121,6 @@ class LiteLLMOpenAIMixin(
return schema
async def _get_params(self, request: ChatCompletionRequest) -> dict:
from typing import Any
input_dict: dict[str, Any] = {}
input_dict["messages"] = [
await convert_message_to_openai_dict_new(m, download_images=self.download_images) for m in request.messages
]
if fmt := request.response_format:
if not isinstance(fmt, JsonSchemaResponseFormat):
raise ValueError(
f"Unsupported response format: {type(fmt)}. Only JsonSchemaResponseFormat is supported."
)
# Convert to dict for manipulation
fmt_dict = dict(fmt.json_schema)
name = fmt_dict["title"]
del fmt_dict["title"]
fmt_dict["additionalProperties"] = False
# Apply additionalProperties: False recursively to all objects
fmt_dict = self._add_additional_properties_recursive(fmt_dict)
input_dict["response_format"] = {
"type": "json_schema",
"json_schema": {
"name": name,
"schema": fmt_dict,
"strict": self.json_schema_strict,
},
}
if request.tools:
input_dict["tools"] = [convert_tooldef_to_openai_tool(tool) for tool in request.tools]
if request.tool_config and (tool_choice := request.tool_config.tool_choice):
input_dict["tool_choice"] = tool_choice.value if isinstance(tool_choice, ToolChoice) else tool_choice
return {
"model": request.model,
"api_key": self.get_api_key(),
"api_base": self.api_base,
**input_dict,
"stream": request.stream,
**get_sampling_options(request.sampling_params),
}
def get_api_key(self) -> str:
provider_data = self.get_request_provider_data()
key_field = self.provider_data_api_key_field
File diff suppressed because it is too large.
View file
@@ -21,19 +21,18 @@ from llama_stack.apis.common.content_types import (
TextContentItem,
)
from llama_stack.apis.inference import (
ChatCompletionRequest,
CompletionRequest,
Message,
OpenAIAssistantMessageParam,
OpenAIChatCompletionContentPartImageParam,
OpenAIChatCompletionContentPartTextParam,
OpenAIFile,
OpenAIMessageParam,
OpenAISystemMessageParam,
OpenAIToolMessageParam,
OpenAIUserMessageParam,
ResponseFormat,
ResponseFormatType,
SystemMessage,
SystemMessageBehavior,
ToolChoice,
ToolDefinition,
UserMessage,
)
from llama_stack.log import get_logger
from llama_stack.models.llama.datatypes import (
@@ -42,33 +41,19 @@ from llama_stack.models.llama.datatypes import (
RawMediaItem,
RawMessage,
RawTextItem,
Role,
StopReason,
ToolCall,
ToolDefinition,
ToolPromptFormat,
)
from llama_stack.models.llama.llama3.chat_format import ChatFormat
from llama_stack.models.llama.llama3.prompt_templates import (
BuiltinToolGenerator,
FunctionTagCustomToolGenerator,
JsonCustomToolGenerator,
PythonListCustomToolGenerator,
SystemDefaultGenerator,
)
from llama_stack.models.llama.llama3.tokenizer import Tokenizer
from llama_stack.models.llama.llama4.prompt_templates.system_prompts import (
PythonListCustomToolGenerator as PythonListCustomToolGeneratorLlama4,
)
from llama_stack.models.llama.sku_list import resolve_model
from llama_stack.models.llama.sku_types import ModelFamily, is_multimodal
from llama_stack.providers.utils.inference import supported_inference_models
log = get_logger(name=__name__, category="providers::utils")
class ChatCompletionRequestWithRawContent(ChatCompletionRequest):
messages: list[RawMessage]
class CompletionRequestWithRawContent(CompletionRequest):
content: RawContent
@@ -103,28 +88,6 @@ def interleaved_content_as_str(
return _process(content)
async def convert_request_to_raw(
request: ChatCompletionRequest | CompletionRequest,
) -> ChatCompletionRequestWithRawContent | CompletionRequestWithRawContent:
if isinstance(request, ChatCompletionRequest):
messages = []
for m in request.messages:
content = await interleaved_content_convert_to_raw(m.content)
d = m.model_dump()
d["content"] = content
messages.append(RawMessage(**d))
d = request.model_dump()
d["messages"] = messages
request = ChatCompletionRequestWithRawContent(**d)
else:
d = request.model_dump()
d["content"] = await interleaved_content_convert_to_raw(request.content)
request = CompletionRequestWithRawContent(**d)
return request
async def interleaved_content_convert_to_raw(
content: InterleavedContent,
) -> RawContent:
@@ -171,6 +134,36 @@ async def interleaved_content_convert_to_raw(
return await _localize_single(content)
async def convert_openai_message_to_raw_message(message: OpenAIMessageParam) -> RawMessage:
"""Convert OpenAI message format to RawMessage format used by Llama formatters."""
if isinstance(message, OpenAIUserMessageParam):
content = await interleaved_content_convert_to_raw(message.content) # type: ignore[arg-type]
return RawMessage(role="user", content=content)
elif isinstance(message, OpenAISystemMessageParam):
content = await interleaved_content_convert_to_raw(message.content) # type: ignore[arg-type]
return RawMessage(role="system", content=content)
elif isinstance(message, OpenAIAssistantMessageParam):
content = await interleaved_content_convert_to_raw(message.content or "") # type: ignore[arg-type]
tool_calls = []
if message.tool_calls:
for tc in message.tool_calls:
if tc.function:
tool_calls.append(
ToolCall(
call_id=tc.id or "",
tool_name=tc.function.name or "",
arguments=tc.function.arguments or "{}",
)
)
return RawMessage(role="assistant", content=content, tool_calls=tool_calls)
elif isinstance(message, OpenAIToolMessageParam):
content = await interleaved_content_convert_to_raw(message.content) # type: ignore[arg-type]
return RawMessage(role="tool", content=content)
else:
# Handle OpenAIDeveloperMessageParam if needed
raise ValueError(f"Unsupported message type: {type(message)}")
def content_has_media(content: InterleavedContent):
def _has_media_content(c):
return isinstance(c, ImageContentItem)
@@ -181,17 +174,6 @@ def content_has_media(content: InterleavedContent):
return _has_media_content(content)
def messages_have_media(messages: list[Message]):
return any(content_has_media(m.content) for m in messages)
def request_has_media(request: ChatCompletionRequest | CompletionRequest):
if isinstance(request, ChatCompletionRequest):
return messages_have_media(request.messages)
else:
return content_has_media(request.content)
async def localize_image_content(uri: str) -> tuple[bytes, str] | None:
if uri.startswith("http"):
async with httpx.AsyncClient() as client:
@@ -253,79 +235,6 @@ def augment_content_with_response_format_prompt(response_format, content):
return content
async def chat_completion_request_to_prompt(request: ChatCompletionRequest, llama_model: str) -> str:
messages = chat_completion_request_to_messages(request, llama_model)
request.messages = messages
request = await convert_request_to_raw(request)
formatter = ChatFormat(tokenizer=Tokenizer.get_instance())
model_input = formatter.encode_dialog_prompt(
request.messages,
tool_prompt_format=request.tool_config.tool_prompt_format or get_default_tool_prompt_format(llama_model),
)
return formatter.tokenizer.decode(model_input.tokens)
async def chat_completion_request_to_model_input_info(
request: ChatCompletionRequest, llama_model: str
) -> tuple[str, int]:
messages = chat_completion_request_to_messages(request, llama_model)
request.messages = messages
request = await convert_request_to_raw(request)
formatter = ChatFormat(tokenizer=Tokenizer.get_instance())
model_input = formatter.encode_dialog_prompt(
request.messages,
tool_prompt_format=request.tool_config.tool_prompt_format or get_default_tool_prompt_format(llama_model),
)
return (
formatter.tokenizer.decode(model_input.tokens),
len(model_input.tokens),
)
def chat_completion_request_to_messages(
request: ChatCompletionRequest,
llama_model: str,
) -> list[Message]:
"""Reads chat completion request and augments the messages to handle tools.
For example, for llama_3_1, add a system message with the appropriate tools, or
add a user message for custom tools, etc.
"""
assert llama_model is not None, "llama_model is required"
model = resolve_model(llama_model)
if model is None:
log.error(f"Could not resolve model {llama_model}")
return request.messages
allowed_models = supported_inference_models()
descriptors = [m.descriptor() for m in allowed_models]
if model.descriptor() not in descriptors:
log.error(f"Unsupported inference model? {model.descriptor()}")
return request.messages
if model.model_family == ModelFamily.llama3_1 or (
model.model_family == ModelFamily.llama3_2 and is_multimodal(model.core_model_id)
):
# llama3.1 and llama3.2 multimodal models follow the same tool prompt format
messages = augment_messages_for_tools_llama_3_1(request)
elif model.model_family in (
ModelFamily.llama3_2,
ModelFamily.llama3_3,
):
# llama3.2, llama3.3 follow the same tool prompt format
messages = augment_messages_for_tools_llama(request, PythonListCustomToolGenerator)
elif model.model_family == ModelFamily.llama4:
messages = augment_messages_for_tools_llama(request, PythonListCustomToolGeneratorLlama4)
else:
messages = request.messages
if fmt_prompt := response_format_prompt(request.response_format):
messages.append(UserMessage(content=fmt_prompt))
return messages
def response_format_prompt(fmt: ResponseFormat | None):
if not fmt:
return None
@@ -338,128 +247,6 @@ def response_format_prompt(fmt: ResponseFormat | None):
raise ValueError(f"Unknown response format {fmt.type}")
def augment_messages_for_tools_llama_3_1(
request: ChatCompletionRequest,
) -> list[Message]:
existing_messages = request.messages
existing_system_message = None
if existing_messages[0].role == Role.system.value:
existing_system_message = existing_messages.pop(0)
assert existing_messages[0].role != Role.system.value, "Should only have 1 system message"
messages = []
default_gen = SystemDefaultGenerator()
default_template = default_gen.gen()
sys_content = ""
tool_template = None
if request.tools:
tool_gen = BuiltinToolGenerator()
tool_template = tool_gen.gen(request.tools)
sys_content += tool_template.render()
sys_content += "\n"
sys_content += default_template.render()
if existing_system_message:
# TODO: this fn is needed in many places
def _process(c):
if isinstance(c, str):
return c
else:
return "<media>"
sys_content += "\n"
if isinstance(existing_system_message.content, str):
sys_content += _process(existing_system_message.content)
elif isinstance(existing_system_message.content, list):
sys_content += "\n".join([_process(c) for c in existing_system_message.content])
tool_choice_prompt = _get_tool_choice_prompt(request.tool_config.tool_choice, request.tools)
if tool_choice_prompt:
sys_content += "\n" + tool_choice_prompt
messages.append(SystemMessage(content=sys_content))
has_custom_tools = request.tools is not None and any(isinstance(dfn.tool_name, str) for dfn in request.tools)
if has_custom_tools:
fmt = request.tool_config.tool_prompt_format or ToolPromptFormat.json
if fmt == ToolPromptFormat.json:
tool_gen = JsonCustomToolGenerator()
elif fmt == ToolPromptFormat.function_tag:
tool_gen = FunctionTagCustomToolGenerator()
else:
raise ValueError(f"Non supported ToolPromptFormat {fmt}")
custom_tools = [t for t in request.tools if isinstance(t.tool_name, str)]
custom_template = tool_gen.gen(custom_tools)
messages.append(UserMessage(content=custom_template.render()))
# Add back existing messages from the request
messages += existing_messages
return messages
def augment_messages_for_tools_llama(
request: ChatCompletionRequest,
custom_tool_prompt_generator,
) -> list[Message]:
existing_messages = request.messages
existing_system_message = None
if existing_messages[0].role == Role.system.value:
existing_system_message = existing_messages.pop(0)
assert existing_messages[0].role != Role.system.value, "Should only have 1 system message"
sys_content = ""
custom_tools, builtin_tools = [], []
for t in request.tools:
if isinstance(t.tool_name, str):
custom_tools.append(t)
else:
builtin_tools.append(t)
if builtin_tools:
tool_gen = BuiltinToolGenerator()
tool_template = tool_gen.gen(builtin_tools)
sys_content += tool_template.render()
sys_content += "\n"
custom_tools = [dfn for dfn in request.tools if isinstance(dfn.tool_name, str)]
if custom_tools:
fmt = request.tool_config.tool_prompt_format or ToolPromptFormat.python_list
if fmt != ToolPromptFormat.python_list:
raise ValueError(f"Non supported ToolPromptFormat {request.tool_config.tool_prompt_format}")
system_prompt = None
if existing_system_message and request.tool_config.system_message_behavior == SystemMessageBehavior.replace:
system_prompt = existing_system_message.content
tool_template = custom_tool_prompt_generator().gen(custom_tools, system_prompt)
sys_content += tool_template.render()
sys_content += "\n"
if existing_system_message and (
request.tool_config.system_message_behavior == SystemMessageBehavior.append or not custom_tools
):
sys_content += interleaved_content_as_str(existing_system_message.content, sep="\n")
tool_choice_prompt = _get_tool_choice_prompt(request.tool_config.tool_choice, request.tools)
if tool_choice_prompt:
sys_content += "\n" + tool_choice_prompt
messages = [SystemMessage(content=sys_content.strip("\n")), *existing_messages]
return messages
def _get_tool_choice_prompt(tool_choice: ToolChoice | str, tools: list[ToolDefinition]) -> str:
if tool_choice == ToolChoice.auto:
return ""
View file
@@ -30,7 +30,7 @@ from llama_stack.apis.vector_io import (
VectorStoreContent,
VectorStoreDeleteResponse,
VectorStoreFileBatchObject,
VectorStoreFileContentsResponse,
VectorStoreFileContentResponse,
VectorStoreFileCounts,
VectorStoreFileDeleteResponse,
VectorStoreFileLastError,
@@ -704,34 +704,35 @@ class OpenAIVectorStoreMixin(ABC):
# Unknown filter type, default to no match
raise ValueError(f"Unsupported filter type: {filter_type}")
def _chunk_to_vector_store_content(self, chunk: Chunk) -> list[VectorStoreContent]:
# content is InterleavedContent
def _chunk_to_vector_store_content(
self, chunk: Chunk, include_embeddings: bool = False, include_metadata: bool = False
) -> list[VectorStoreContent]:
def extract_fields() -> dict:
"""Extract embedding and metadata fields from chunk based on include flags."""
return {
"embedding": chunk.embedding if include_embeddings else None,
"chunk_metadata": chunk.chunk_metadata if include_metadata else None,
"metadata": chunk.metadata if include_metadata else None,
}
fields = extract_fields()
if isinstance(chunk.content, str):
content = [
VectorStoreContent(
type="text",
text=chunk.content,
)
]
content_item = VectorStoreContent(type="text", text=chunk.content, **fields)
content = [content_item]
elif isinstance(chunk.content, list):
# TODO: Add support for other types of content
content = [
VectorStoreContent(
type="text",
text=item.text,
)
for item in chunk.content
if item.type == "text"
]
content = []
for item in chunk.content:
if item.type == "text":
content_item = VectorStoreContent(type="text", text=item.text, **fields)
content.append(content_item)
else:
if chunk.content.type != "text":
raise ValueError(f"Unsupported content type: {chunk.content.type}")
content = [
VectorStoreContent(
type="text",
text=chunk.content.text,
)
]
content_item = VectorStoreContent(type="text", text=chunk.content.text, **fields)
content = [content_item]
return content
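A small sketch of the new flags in use; `vector_store_impl` stands in for an OpenAIVectorStoreMixin implementation and the chunk is a made-up plain-text chunk.

# Illustrative only: metadata/embeddings are copied onto the returned items only when requested.
chunk = Chunk(content="the quick brown fox", metadata={"document_id": "doc-1"})
items = vector_store_impl._chunk_to_vector_store_content(
    chunk, include_embeddings=False, include_metadata=True
)
# items[0].text == "the quick brown fox"; items[0].metadata == {"document_id": "doc-1"}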
async def openai_attach_file_to_vector_store(
@@ -820,13 +821,12 @@ class OpenAIVectorStoreMixin(ABC):
message=str(e),
)
# Create OpenAI vector store file metadata
# Save vector store file to persistent storage AFTER insert_chunks
# so that chunks include the embeddings that were generated
file_info = vector_store_file_object.model_dump(exclude={"last_error"})
file_info["filename"] = file_response.filename if file_response else ""
# Save vector store file to persistent storage (provider-specific)
dict_chunks = [c.model_dump() for c in chunks]
# This should be updated to include chunk_id
await self._save_openai_vector_store_file(vector_store_id, file_id, file_info, dict_chunks)
# Update file_ids and file_counts in vector store metadata
@@ -921,22 +921,27 @@ class OpenAIVectorStoreMixin(ABC):
self,
vector_store_id: str,
file_id: str,
) -> VectorStoreFileContentsResponse:
include_embeddings: bool | None = False,
include_metadata: bool | None = False,
) -> VectorStoreFileContentResponse:
"""Retrieves the contents of a vector store file."""
if vector_store_id not in self.openai_vector_stores:
raise VectorStoreNotFoundError(vector_store_id)
file_info = await self._load_openai_vector_store_file(vector_store_id, file_id)
# include_embeddings and include_metadata are passed directly as function parameters
dict_chunks = await self._load_openai_vector_store_file_contents(vector_store_id, file_id)
chunks = [Chunk.model_validate(c) for c in dict_chunks]
content = []
for chunk in chunks:
content.extend(self._chunk_to_vector_store_content(chunk))
return VectorStoreFileContentsResponse(
file_id=file_id,
filename=file_info.get("filename", ""),
attributes=file_info.get("attributes", {}),
content=content,
content.extend(
self._chunk_to_vector_store_content(
chunk, include_embeddings=include_embeddings or False, include_metadata=include_metadata or False
)
)
return VectorStoreFileContentResponse(
data=content,
)
async def openai_update_vector_store_file(