chore: move all Llama Stack types from llama-models to llama-stack (#1098)

llama-models should have extremely minimal cruft. Its sole purpose
should be didactic -- show the simplest implementation of the llama
models and document the prompt formats, etc.

This PR is the complement to
https://github.com/meta-llama/llama-models/pull/279
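For downstream code, the practical effect is an import-path change; the hunks below mostly rewrite imports along these lines. This is an illustrative sketch only, using module paths exactly as they appear in this diff:

```python
# Before: schema helpers and Llama SKU types came from the llama-models package
# from llama_models.schema_utils import json_schema_type, register_schema, webmethod
# from llama_models.llama3.api.datatypes import ToolCall
# from llama_models.sku_list import resolve_model

# After: the same symbols live in llama-stack itself
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
from llama_stack.models.llama.datatypes import ToolCall
from llama_stack.models.llama.sku_list import resolve_model
```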

## Test Plan

Ensure all `llama` CLI `model` sub-commands work:

```bash
llama model list
llama model download --model-id ...
llama model prompt-format -m ...
```

Ran tests:
```bash
cd tests/client-sdk
LLAMA_STACK_CONFIG=fireworks pytest -s -v inference/
LLAMA_STACK_CONFIG=fireworks pytest -s -v vector_io/
LLAMA_STACK_CONFIG=fireworks pytest -s -v agents/
```

Created a fresh venv with `uv venv && source .venv/bin/activate`, then ran
`llama stack build --template fireworks --image-type venv` followed by
`llama stack run together --image-type venv` and confirmed the server starts.

Also verified that the OpenAPI generator runs and that the generated files are unchanged as a result.

```bash
cd docs/openapi_generator
sh run_openapi_generator.sh
```
Ashwin Bharambe authored on 2025-02-14 09:10:59 -08:00; committed by GitHub.
Commit 314ee09ae3 (parent c0ee512980).
138 changed files with 8491 additions and 465 deletions.

@@ -30,6 +30,7 @@ repos:
     rev: v0.9.4
     hooks:
       - id: ruff
+        exclude: ^llama_stack/strong_typing/.*$
       - id: ruff-format
   - repo: https://github.com/adamchainz/blacken-docs
@@ -43,7 +44,13 @@ repos:
     rev: 0.5.26
     hooks:
       - id: uv-export
-        args: ["--frozen", "--no-hashes", "--no-emit-project"]
+        args: [
+          "--frozen",
+          "--no-hashes",
+          "--no-emit-project",
+          "--output-file=requirements.txt"
+        ]
+        files: ^pyproject\.toml$
       - id: uv-sync
   # - repo: https://github.com/pre-commit/mirrors-mypy

@@ -16,18 +16,6 @@ from pathlib import Path
 import fire
 import ruamel.yaml as yaml
-from llama_models import schema_utils
-
-# We do some monkey-patching to ensure our definitions only use the minimal
-# (json_schema_type, webmethod) definitions from the llama_models package. For
-# generation though, we need the full definitions and implementations from the
-# (json-strong-typing) package.
-from .strong_typing.schema import json_schema_type, register_schema
-
-schema_utils.json_schema_type = json_schema_type
-schema_utils.register_schema = register_schema
-
 from llama_stack.apis.version import LLAMA_STACK_API_VERSION  # noqa: E402
 from llama_stack.distribution.stack import LlamaStack  # noqa: E402

@@ -10,9 +10,9 @@ import typing
 from dataclasses import make_dataclass
 from typing import Any, Dict, Set, Union
-from ..strong_typing.core import JsonType
-from ..strong_typing.docstring import Docstring, parse_type
-from ..strong_typing.inspection import (
+from llama_stack.strong_typing.core import JsonType
+from llama_stack.strong_typing.docstring import Docstring, parse_type
+from llama_stack.strong_typing.inspection import (
     is_generic_list,
     is_type_optional,
     is_type_union,
@@ -20,15 +20,15 @@ from ..strong_typing.inspection import (
     unwrap_optional_type,
     unwrap_union_types,
 )
-from ..strong_typing.name import python_type_to_name
-from ..strong_typing.schema import (
+from llama_stack.strong_typing.name import python_type_to_name
+from llama_stack.strong_typing.schema import (
     get_schema_identifier,
     JsonSchemaGenerator,
     register_schema,
     Schema,
     SchemaOptions,
 )
-from ..strong_typing.serialization import json_dump_string, object_to_json
+from llama_stack.strong_typing.serialization import json_dump_string, object_to_json
 from .operations import (
     EndpointOperation,

@@ -15,7 +15,7 @@ from llama_stack.apis.version import LLAMA_STACK_API_VERSION
 from termcolor import colored
-from ..strong_typing.inspection import get_signature
+from llama_stack.strong_typing.inspection import get_signature
 def split_prefix(

@@ -9,7 +9,7 @@ import enum
 from dataclasses import dataclass
 from typing import Any, ClassVar, Dict, List, Optional, Union
-from ..strong_typing.schema import JsonType, Schema, StrictJsonType
+from llama_stack.strong_typing.schema import JsonType, Schema, StrictJsonType
 URL = str

@@ -9,7 +9,7 @@ import typing
 from pathlib import Path
 from typing import TextIO
-from ..strong_typing.schema import object_to_json, StrictJsonType
+from llama_stack.strong_typing.schema import object_to_json, StrictJsonType
 from .generator import Generator
 from .options import Options

@@ -19,7 +19,6 @@ from typing import (
     runtime_checkable,
 )
-from llama_models.schema_utils import json_schema_type, register_schema, webmethod
 from pydantic import BaseModel, ConfigDict, Field
 from llama_stack.apis.common.content_types import URL, ContentDelta, InterleavedContent
@@ -38,6 +37,7 @@ from llama_stack.apis.inference import (
 from llama_stack.apis.safety import SafetyViolation
 from llama_stack.apis.tools import ToolDef
 from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
+from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
 class Attachment(BaseModel):

@@ -1,206 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Optional
from llama_models.llama3.api.datatypes import ToolPromptFormat
from llama_models.llama3.api.tool_utils import ToolUtils
from termcolor import cprint
from llama_stack.apis.agents import AgentTurnResponseEventType, StepType
from llama_stack.apis.common.content_types import ToolCallParseStatus
from llama_stack.apis.inference import ToolResponseMessage
from llama_stack.providers.utils.inference.prompt_adapter import (
interleaved_content_as_str,
)
class LogEvent:
def __init__(
self,
role: Optional[str] = None,
content: str = "",
end: str = "\n",
color="white",
):
self.role = role
self.content = content
self.color = color
self.end = "\n" if end is None else end
def __str__(self):
if self.role is not None:
return f"{self.role}> {self.content}"
else:
return f"{self.content}"
def print(self, flush=True):
cprint(f"{str(self)}", color=self.color, end=self.end, flush=flush)
EventType = AgentTurnResponseEventType
class EventLogger:
async def log(
self,
event_generator,
stream=True,
tool_prompt_format: ToolPromptFormat = ToolPromptFormat.json,
):
previous_event_type = None
previous_step_type = None
async for chunk in event_generator:
if not hasattr(chunk, "event"):
# Need to check for custom tool first
# since it does not produce event but instead
# a Message
if isinstance(chunk, ToolResponseMessage):
yield (
chunk,
LogEvent(role="CustomTool", content=chunk.content, color="grey"),
)
continue
event = chunk.event
event_type = event.payload.event_type
if event_type in {
EventType.turn_start.value,
EventType.turn_complete.value,
}:
# Currently not logging any turn realted info
yield event, None
continue
step_type = event.payload.step_type
# handle safety
if step_type == StepType.shield_call and event_type == EventType.step_complete.value:
violation = event.payload.step_details.violation
if not violation:
yield (
event,
LogEvent(role=step_type, content="No Violation", color="magenta"),
)
else:
yield (
event,
LogEvent(
role=step_type,
content=f"{violation.metadata} {violation.user_message}",
color="red",
),
)
# handle inference
if step_type == StepType.inference:
if stream:
if event_type == EventType.step_start.value:
# TODO: Currently this event is never received
yield (
event,
LogEvent(role=step_type, content="", end="", color="yellow"),
)
elif event_type == EventType.step_progress.value:
# HACK: if previous was not step/event was not inference's step_progress
# this is the first time we are getting model inference response
# aka equivalent to step_start for inference. Hence,
# start with "Model>".
if (
previous_event_type != EventType.step_progress.value
and previous_step_type != StepType.inference
):
yield (
event,
LogEvent(role=step_type, content="", end="", color="yellow"),
)
delta = event.payload.delta
if delta.type == "tool_call":
if delta.parse_status == ToolCallParseStatus.succeeded:
yield (
event,
LogEvent(
role=None,
content=delta.tool_call,
end="",
color="cyan",
),
)
else:
yield (
event,
LogEvent(
role=None,
content=delta.text,
end="",
color="yellow",
),
)
else:
# step_complete
yield event, LogEvent(role=None, content="")
else:
# Not streaming
if event_type == EventType.step_complete.value:
response = event.payload.step_details.model_response
if response.tool_calls:
content = ToolUtils.encode_tool_call(response.tool_calls[0], tool_prompt_format)
else:
content = response.content
yield (
event,
LogEvent(
role=step_type,
content=content,
color="yellow",
),
)
# handle tool_execution
if (
step_type == StepType.tool_execution
and
# Only print tool calls and responses at the step_complete event
event_type == EventType.step_complete.value
):
details = event.payload.step_details
for t in details.tool_calls:
yield (
event,
LogEvent(
role=step_type,
content=f"Tool:{t.tool_name} Args:{t.arguments}",
color="green",
),
)
for r in details.tool_responses:
yield (
event,
LogEvent(
role=step_type,
content=f"Tool:{r.tool_name} Response:{r.content}",
color="green",
),
)
if step_type == StepType.memory_retrieval and event_type == EventType.step_complete.value:
details = event.payload.step_details
inserted_context = interleaved_content_as_str(details.inserted_context)
content = f"fetched {len(inserted_context)} bytes from {details.vector_db_ids}"
yield (
event,
LogEvent(
role=step_type,
content=content,
color="cyan",
),
)
previous_event_type = event_type
previous_step_type = step_type

@@ -6,7 +6,6 @@
 from typing import List, Optional, Protocol, runtime_checkable
-from llama_models.schema_utils import json_schema_type, webmethod
 from pydantic import BaseModel
 from llama_stack.apis.inference import (
@@ -21,6 +20,7 @@ from llama_stack.apis.inference import (
     ToolDefinition,
     ToolPromptFormat,
 )
+from llama_stack.schema_utils import json_schema_type, webmethod
 @json_schema_type

@@ -5,10 +5,10 @@
 # the root directory of this source tree.
 from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
-from llama_models.schema_utils import json_schema_type, webmethod
 from pydantic import BaseModel, Field
 from llama_stack.apis.resource import Resource, ResourceType
+from llama_stack.schema_utils import json_schema_type, webmethod
 class CommonBenchmarkFields(BaseModel):

@@ -7,10 +7,11 @@
 from enum import Enum
 from typing import Annotated, List, Literal, Optional, Union
-from llama_models.llama3.api.datatypes import ToolCall
-from llama_models.schema_utils import json_schema_type, register_schema
 from pydantic import BaseModel, Field, model_validator
+from llama_stack.models.llama.datatypes import ToolCall
+from llama_stack.schema_utils import json_schema_type, register_schema
 @json_schema_type
 class URL(BaseModel):

@@ -7,10 +7,10 @@
 from enum import Enum
 from typing import Any, Dict, Optional
-from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel
 from llama_stack.apis.common.content_types import URL
+from llama_stack.schema_utils import json_schema_type
 @json_schema_type

@@ -5,9 +5,10 @@
 # the root directory of this source tree.
 from enum import Enum
-from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel
+from llama_stack.schema_utils import json_schema_type
 @json_schema_type
 class Job(BaseModel):

@@ -7,9 +7,10 @@
 from datetime import datetime
 from typing import Optional
-from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel
+from llama_stack.schema_utils import json_schema_type
 @json_schema_type
 class PostTrainingMetric(BaseModel):

@@ -6,10 +6,11 @@
 from typing import Literal, Union
-from llama_models.schema_utils import json_schema_type, register_schema
 from pydantic import BaseModel, Field
 from typing_extensions import Annotated
+from llama_stack.schema_utils import json_schema_type, register_schema
 @json_schema_type
 class StringType(BaseModel):

@@ -6,10 +6,10 @@
 from typing import Any, Dict, List, Optional, Protocol, runtime_checkable
-from llama_models.schema_utils import json_schema_type, webmethod
 from pydantic import BaseModel
 from llama_stack.apis.datasets import Dataset
+from llama_stack.schema_utils import json_schema_type, webmethod
 @json_schema_type

@@ -6,12 +6,12 @@
 from typing import Any, Dict, List, Literal, Optional, Protocol
-from llama_models.schema_utils import json_schema_type, webmethod
 from pydantic import BaseModel, Field
 from llama_stack.apis.common.content_types import URL
 from llama_stack.apis.common.type_system import ParamType
 from llama_stack.apis.resource import Resource, ResourceType
+from llama_stack.schema_utils import json_schema_type, webmethod
 class CommonDatasetFields(BaseModel):

@@ -6,7 +6,7 @@
 from enum import Enum
-from llama_models.schema_utils import json_schema_type
+from llama_stack.schema_utils import json_schema_type
 @json_schema_type

@@ -6,7 +6,6 @@
 from typing import Any, Dict, List, Literal, Optional, Protocol, Union
-from llama_models.schema_utils import json_schema_type, register_schema, webmethod
 from pydantic import BaseModel, Field
 from typing_extensions import Annotated
@@ -15,6 +14,7 @@ from llama_stack.apis.common.job_types import Job, JobStatus
 from llama_stack.apis.inference import SamplingParams, SystemMessage
 from llama_stack.apis.scoring import ScoringResult
 from llama_stack.apis.scoring_functions import ScoringFnParams
+from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
 @json_schema_type

@@ -17,7 +17,13 @@ from typing import (
     runtime_checkable,
 )
-from llama_models.llama3.api.datatypes import (
+from pydantic import BaseModel, Field, field_validator
+from typing_extensions import Annotated
+from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent
+from llama_stack.apis.models import Model
+from llama_stack.apis.telemetry.telemetry import MetricResponseMixin
+from llama_stack.models.llama.datatypes import (
     BuiltinTool,
     SamplingParams,
     StopReason,
@@ -25,14 +31,8 @@ from llama_models.llama3.api.datatypes import (
     ToolDefinition,
     ToolPromptFormat,
 )
-from llama_models.schema_utils import json_schema_type, register_schema, webmethod
-from pydantic import BaseModel, Field, field_validator
-from typing_extensions import Annotated
-from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent
-from llama_stack.apis.models import Model
-from llama_stack.apis.telemetry.telemetry import MetricResponseMixin
 from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
+from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
 class LogProbConfig(BaseModel):

@@ -6,9 +6,10 @@
 from typing import List, Protocol, runtime_checkable
-from llama_models.schema_utils import json_schema_type, webmethod
 from pydantic import BaseModel
+from llama_stack.schema_utils import json_schema_type, webmethod
 @json_schema_type
 class ProviderInfo(BaseModel):

@@ -7,11 +7,11 @@
 from enum import Enum
 from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
-from llama_models.schema_utils import json_schema_type, webmethod
 from pydantic import BaseModel, ConfigDict, Field
 from llama_stack.apis.resource import Resource, ResourceType
 from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
+from llama_stack.schema_utils import json_schema_type, webmethod
 class CommonModelFields(BaseModel):

@@ -8,13 +8,13 @@ from datetime import datetime
 from enum import Enum
 from typing import Any, Dict, List, Literal, Optional, Protocol, Union
-from llama_models.schema_utils import json_schema_type, register_schema, webmethod
 from pydantic import BaseModel, Field
 from typing_extensions import Annotated
 from llama_stack.apis.common.content_types import URL
 from llama_stack.apis.common.job_types import JobStatus
 from llama_stack.apis.common.training_types import Checkpoint
+from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
 @json_schema_type

@@ -7,12 +7,12 @@
 from enum import Enum
 from typing import Any, Dict, List, Optional, Protocol, runtime_checkable
-from llama_models.schema_utils import json_schema_type, webmethod
 from pydantic import BaseModel, Field
 from llama_stack.apis.inference import Message
 from llama_stack.apis.shields import Shield
 from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
+from llama_stack.schema_utils import json_schema_type, webmethod
 @json_schema_type

@@ -6,10 +6,10 @@
 from typing import Any, Dict, List, Optional, Protocol, runtime_checkable
-from llama_models.schema_utils import json_schema_type, webmethod
 from pydantic import BaseModel
 from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnParams
+from llama_stack.schema_utils import json_schema_type, webmethod
 # mapping of metric to value
 ScoringResultRow = Dict[str, Any]

@@ -16,12 +16,12 @@ from typing import (
     runtime_checkable,
 )
-from llama_models.schema_utils import json_schema_type, register_schema, webmethod
 from pydantic import BaseModel, Field
 from typing_extensions import Annotated
 from llama_stack.apis.common.type_system import ParamType
 from llama_stack.apis.resource import Resource, ResourceType
+from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
 # Perhaps more structure can be imposed on these functions. Maybe they could be associated

@@ -6,11 +6,11 @@
 from typing import Any, Dict, List, Literal, Optional, Protocol, runtime_checkable
-from llama_models.schema_utils import json_schema_type, webmethod
 from pydantic import BaseModel
 from llama_stack.apis.resource import Resource, ResourceType
 from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
+from llama_stack.schema_utils import json_schema_type, webmethod
 class CommonShieldFields(BaseModel):

@@ -7,10 +7,10 @@
 from enum import Enum
 from typing import Any, Dict, List, Optional, Protocol, Union
-from llama_models.schema_utils import json_schema_type, webmethod
 from pydantic import BaseModel
 from llama_stack.apis.inference import Message
+from llama_stack.schema_utils import json_schema_type, webmethod
 class FilteringFunction(Enum):

@@ -17,11 +17,12 @@ from typing import (
     runtime_checkable,
 )
-from llama_models.llama3.api.datatypes import Primitive
-from llama_models.schema_utils import json_schema_type, register_schema, webmethod
 from pydantic import BaseModel, Field
 from typing_extensions import Annotated
+from llama_stack.models.llama.datatypes import Primitive
+from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
 # Add this constant near the top of the file, after the imports
 DEFAULT_TTL_DAYS = 7

@@ -7,12 +7,12 @@
 from enum import Enum
 from typing import Any, Dict, List, Literal, Optional, Union
-from llama_models.schema_utils import json_schema_type, register_schema, webmethod
 from pydantic import BaseModel, Field
 from typing_extensions import Annotated, Protocol, runtime_checkable
 from llama_stack.apis.common.content_types import URL, InterleavedContent
 from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
+from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
 @json_schema_type

@@ -7,13 +7,13 @@
 from enum import Enum
 from typing import Any, Dict, List, Literal, Optional
-from llama_models.schema_utils import json_schema_type, webmethod
 from pydantic import BaseModel, Field
 from typing_extensions import Protocol, runtime_checkable
 from llama_stack.apis.common.content_types import URL, InterleavedContent
 from llama_stack.apis.resource import Resource, ResourceType
 from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
+from llama_stack.schema_utils import json_schema_type, webmethod
 from .rag_tool import RAGToolRuntime

@@ -6,11 +6,11 @@
 from typing import List, Literal, Optional, Protocol, runtime_checkable
-from llama_models.schema_utils import json_schema_type, webmethod
 from pydantic import BaseModel
 from llama_stack.apis.resource import Resource, ResourceType
 from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
+from llama_stack.schema_utils import json_schema_type, webmethod
 @json_schema_type

@@ -10,12 +10,12 @@
 # the root directory of this source tree.
 from typing import Any, Dict, List, Optional, Protocol, runtime_checkable
-from llama_models.schema_utils import json_schema_type, webmethod
 from pydantic import BaseModel, Field
 from llama_stack.apis.inference import InterleavedContent
 from llama_stack.apis.vector_dbs import VectorDB
 from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
+from llama_stack.schema_utils import json_schema_type, webmethod
 class Chunk(BaseModel):

@@ -16,8 +16,6 @@ from pathlib import Path
 from typing import Dict, List, Optional
 import httpx
-from llama_models.datatypes import Model
-from llama_models.sku_list import LlamaDownloadInfo
 from pydantic import BaseModel, ConfigDict
 from rich.console import Console
 from rich.progress import (
@@ -31,6 +29,8 @@ from rich.progress import (
 from termcolor import cprint
 from llama_stack.cli.subcommand import Subcommand
+from llama_stack.models.llama.datatypes import Model
+from llama_stack.models.llama.sku_list import LlamaDownloadInfo
 class Download(Subcommand):
@@ -454,7 +454,7 @@ def run_download_cmd(args: argparse.Namespace, parser: argparse.ArgumentParser):
     # Handle comma-separated model IDs
     model_ids = [model_id.strip() for model_id in args.model_id.split(",")]
-    from llama_models.sku_list import llama_meta_net_info, resolve_model
+    from llama_stack.models.llama.sku_list import llama_meta_net_info, resolve_model
     from .model.safety_models import (
         prompt_guard_download_info,

@@ -7,11 +7,11 @@
 import argparse
 import json
-from llama_models.sku_list import resolve_model
 from termcolor import colored
 from llama_stack.cli.subcommand import Subcommand
 from llama_stack.cli.table import print_table
+from llama_stack.models.llama.sku_list import resolve_model
 class ModelDescribe(Subcommand):

@@ -6,10 +6,9 @@
 import argparse
-from llama_models.sku_list import all_registered_models
 from llama_stack.cli.subcommand import Subcommand
 from llama_stack.cli.table import print_table
+from llama_stack.models.llama.sku_list import all_registered_models
 class ModelList(Subcommand):

@@ -8,9 +8,8 @@ import argparse
 import textwrap
 from io import StringIO
-from llama_models.datatypes import CoreModelId, ModelFamily, is_multimodal, model_family
 from llama_stack.cli.subcommand import Subcommand
+from llama_stack.models.llama.datatypes import CoreModelId, ModelFamily, is_multimodal, model_family
 class ModelPromptFormat(Subcommand):

@@ -6,11 +6,11 @@
 from typing import Any, Dict, Optional
-from llama_models.datatypes import CheckpointQuantizationFormat
-from llama_models.llama3.api.datatypes import SamplingParams
-from llama_models.sku_list import LlamaDownloadInfo
 from pydantic import BaseModel, ConfigDict, Field
+from llama_stack.models.llama.datatypes import CheckpointQuantizationFormat, SamplingParams
+from llama_stack.models.llama.sku_list import LlamaDownloadInfo
 class PromptGuardModel(BaseModel):
     """Make a 'fake' Model-like object for Prompt Guard. Eventually this will be removed."""

@@ -186,33 +186,3 @@ def extract_async_iterator_type(type_hint):
             inner_args = get_args(arg)
             return inner_args[0]
     return None
-
-
-async def example(model: str = None):
-    from llama_stack.apis.inference import Inference, UserMessage  # noqa: F403
-    from llama_stack.apis.inference.event_logger import EventLogger
-
-    client_class = create_api_client_class(Inference)
-    client = client_class("http://localhost:5003")
-
-    if not model:
-        model = "Llama3.2-3B-Instruct"
-
-    message = UserMessage(content="hello world, write me a 2 sentence poem about the moon")
-    cprint(f"User>{message.content}", "green")
-
-    stream = True
-    iterator = await client.chat_completion(
-        model=model,
-        messages=[message],
-        stream=stream,
-    )
-
-    async for log in EventLogger().log(iterator):
-        log.print()
-
-
-if __name__ == "__main__":
-    import asyncio
-
-    asyncio.run(example())

@@ -0,0 +1,277 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# top-level folder for each specific model found within the models/ directory at
# the top-level of this source tree.
from enum import Enum
from typing import Any, Dict, Literal, Optional, Union
# import all for backwards compatibility
from llama_models.datatypes import * # noqa: F403
from pydantic import BaseModel, ConfigDict, Field, field_validator
from typing_extensions import Annotated
from llama_stack.schema_utils import json_schema_type, register_schema
register_schema(ToolCall)
@json_schema_type
class ToolParamDefinition(BaseModel):
param_type: str
description: Optional[str] = None
required: Optional[bool] = True
default: Optional[Any] = None
@json_schema_type
class ToolDefinition(BaseModel):
tool_name: Union[BuiltinTool, str]
description: Optional[str] = None
parameters: Optional[Dict[str, ToolParamDefinition]] = None
@field_validator("tool_name", mode="before")
@classmethod
def validate_field(cls, v):
if isinstance(v, str):
try:
return BuiltinTool(v)
except ValueError:
return v
return v
@json_schema_type
class GreedySamplingStrategy(BaseModel):
type: Literal["greedy"] = "greedy"
@json_schema_type
class TopPSamplingStrategy(BaseModel):
type: Literal["top_p"] = "top_p"
temperature: Optional[float] = Field(..., gt=0.0)
top_p: Optional[float] = 0.95
@json_schema_type
class TopKSamplingStrategy(BaseModel):
type: Literal["top_k"] = "top_k"
top_k: int = Field(..., ge=1)
SamplingStrategy = register_schema(
Annotated[
Union[GreedySamplingStrategy, TopPSamplingStrategy, TopKSamplingStrategy],
Field(discriminator="type"),
],
name="SamplingStrategy",
)
@json_schema_type
class SamplingParams(BaseModel):
strategy: SamplingStrategy = Field(default_factory=GreedySamplingStrategy)
max_tokens: Optional[int] = 0
repetition_penalty: Optional[float] = 1.0
class CheckpointQuantizationFormat(Enum):
# default format
bf16 = "bf16"
# used for enabling fp8_rowwise inference, some weights are bf16
fp8_mixed = "fp8-mixed"
int8 = "int8"
int4 = "int4"
class ModelFamily(Enum):
llama2 = "llama2"
llama3 = "llama3"
llama3_1 = "llama3_1"
llama3_2 = "llama3_2"
llama3_3 = "llama3_3"
safety = "safety"
class CoreModelId(Enum):
"""Each of these models is a unique "SKU". These root models can be served in various garbs (especially by quantizing them)"""
# Llama 2 family
llama2_7b = "Llama-2-7b"
llama2_13b = "Llama-2-13b"
llama2_70b = "Llama-2-70b"
llama2_7b_chat = "Llama-2-7b-chat"
llama2_13b_chat = "Llama-2-13b-chat"
llama2_70b_chat = "Llama-2-70b-chat"
# Llama 3 family
llama3_8b = "Llama-3-8B"
llama3_70b = "Llama-3-70B"
llama3_8b_instruct = "Llama-3-8B-Instruct"
llama3_70b_instruct = "Llama-3-70B-Instruct"
# Llama 3.1 family
llama3_1_8b = "Llama3.1-8B"
llama3_1_70b = "Llama3.1-70B"
llama3_1_405b = "Llama3.1-405B"
llama3_1_8b_instruct = "Llama3.1-8B-Instruct"
llama3_1_70b_instruct = "Llama3.1-70B-Instruct"
llama3_1_405b_instruct = "Llama3.1-405B-Instruct"
# Llama 3.2 family
llama3_2_1b = "Llama3.2-1B"
llama3_2_3b = "Llama3.2-3B"
llama3_2_1b_instruct = "Llama3.2-1B-Instruct"
llama3_2_3b_instruct = "Llama3.2-3B-Instruct"
llama3_2_11b_vision = "Llama3.2-11B-Vision"
llama3_2_90b_vision = "Llama3.2-90B-Vision"
llama3_2_11b_vision_instruct = "Llama3.2-11B-Vision-Instruct"
llama3_2_90b_vision_instruct = "Llama3.2-90B-Vision-Instruct"
# Llama 3.3 family
llama3_3_70b_instruct = "Llama3.3-70B-Instruct"
# Safety models
llama_guard_3_8b = "Llama-Guard-3-8B"
llama_guard_2_8b = "Llama-Guard-2-8B"
llama_guard_3_11b_vision = "Llama-Guard-3-11B-Vision"
llama_guard_3_1b = "Llama-Guard-3-1B"
def is_multimodal(model_id) -> bool:
if model_id in [
CoreModelId.llama3_2_11b_vision,
CoreModelId.llama3_2_90b_vision,
CoreModelId.llama3_2_11b_vision_instruct,
CoreModelId.llama3_2_90b_vision_instruct,
]:
return True
else:
return False
def model_family(model_id) -> ModelFamily:
if model_id in [
CoreModelId.llama2_7b,
CoreModelId.llama2_13b,
CoreModelId.llama2_70b,
CoreModelId.llama2_7b_chat,
CoreModelId.llama2_13b_chat,
CoreModelId.llama2_70b_chat,
]:
return ModelFamily.llama2
elif model_id in [
CoreModelId.llama3_8b,
CoreModelId.llama3_70b,
CoreModelId.llama3_8b_instruct,
CoreModelId.llama3_70b_instruct,
]:
return ModelFamily.llama3
elif model_id in [
CoreModelId.llama3_1_8b,
CoreModelId.llama3_1_70b,
CoreModelId.llama3_1_405b,
CoreModelId.llama3_1_8b_instruct,
CoreModelId.llama3_1_70b_instruct,
CoreModelId.llama3_1_405b_instruct,
]:
return ModelFamily.llama3_1
elif model_id in [
CoreModelId.llama3_2_1b,
CoreModelId.llama3_2_3b,
CoreModelId.llama3_2_1b_instruct,
CoreModelId.llama3_2_3b_instruct,
CoreModelId.llama3_2_11b_vision,
CoreModelId.llama3_2_90b_vision,
CoreModelId.llama3_2_11b_vision_instruct,
CoreModelId.llama3_2_90b_vision_instruct,
]:
return ModelFamily.llama3_2
elif model_id in [
CoreModelId.llama3_3_70b_instruct,
]:
return ModelFamily.llama3_3
elif model_id in [
CoreModelId.llama_guard_3_8b,
CoreModelId.llama_guard_2_8b,
CoreModelId.llama_guard_3_11b_vision,
CoreModelId.llama_guard_3_1b,
]:
return ModelFamily.safety
else:
raise ValueError(f"Unknown model family for {model_id}")
class Model(BaseModel):
core_model_id: CoreModelId
description: str
huggingface_repo: Optional[str] = None
recommended_sampling_params: Optional[SamplingParams] = None
arch_args: Dict[str, Any]
variant: str = ""
quantization_format: CheckpointQuantizationFormat = CheckpointQuantizationFormat.bf16
pth_file_count: int
metadata: Optional[Dict[str, Any]] = Field(default_factory=dict)
# silence pydantic until we remove the `model_` fields
model_config = ConfigDict(protected_namespaces=())
@property
def model_family(self) -> ModelFamily:
return model_family(self.core_model_id)
# The SKU is uniquely identified by (model_id, variant) combo
def descriptor(self, shorten_default_variant: bool = True) -> str:
if not self.variant:
return self.core_model_id.value
return f"{self.core_model_id.value}:{self.variant}"
@property
def is_instruct_model(self) -> bool:
return "instruct" in self.id.name
# Featured models are shown in the non-exhaustive model list
@property
def is_featured(self) -> bool:
return self.model_family in [
ModelFamily.llama3_1,
ModelFamily.llama3_2,
ModelFamily.llama3_3,
ModelFamily.safety,
]
@property
def max_seq_length(self) -> int:
if self.model_family == ModelFamily.llama2:
return 4096
elif self.core_model_id == CoreModelId.llama_guard_2_8b:
return 4096
elif self.model_family == ModelFamily.llama3:
return 8192
elif self.model_family in [ModelFamily.llama3_1, ModelFamily.llama3_3]:
return 131072
elif self.model_family == ModelFamily.llama3_2:
if self.quantization_format == CheckpointQuantizationFormat.int4:
return 8192
return 131072
elif self.core_model_id in [
CoreModelId.llama_guard_3_8b,
CoreModelId.llama_guard_3_11b_vision,
CoreModelId.llama_guard_3_1b,
]:
return 131072
else:
raise ValueError(f"Unknown max_seq_len for {self.core_model_id}")

(new binary image file added; 39 KiB, not shown)

@@ -0,0 +1,257 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# top-level folder for each specific model found within the models/ directory at
# the top-level of this source tree.
from pathlib import Path
from typing import List, Optional
from llama_models.datatypes import (
BuiltinTool,
RawMessage,
StopReason,
ToolCall,
ToolPromptFormat,
)
from llama_models.llama3.api.chat_format import ChatFormat
from llama_models.llama3.api.tokenizer import Tokenizer
from termcolor import colored
from llama_stack.models.llama.datatypes import ToolDefinition
from . import template_data
from .prompt_templates import (
BuiltinToolGenerator,
FunctionTagCustomToolGenerator,
JsonCustomToolGenerator,
SystemDefaultGenerator,
ToolResponseGenerator,
)
THIS_DIR = Path(__file__).parent
class Template:
def __init__(
self,
role,
template_name,
data_provider=None,
notes=None,
):
self.role = role
self.template_name = template_name
self.data_provider = data_provider or ""
self._notes = notes or ""
@property
def notes(self):
default = "↵ represents newline"
notes = default
if self._notes:
notes += "\n"
notes += self._notes
return notes
TEMPLATES = [
Template(
"user",
"user-default",
"user_default",
),
Template(
"user",
"user-images",
"user_images",
),
Template("user", "user-interleaved-images", "user_interleaved_images"),
Template(
"assistant",
"assistant-builtin-tool-call",
"assistant_builtin_tool_call",
"Notice <|python_tag|>",
),
Template(
"assistant",
"assistant-custom-tool-call",
"assistant_custom_tool_call",
"Notice <function=...> format",
),
Template(
"assistant",
"assistant-default",
"assistant_default",
),
Template(
"system",
"system-builtin-and-custom-tools",
"system_message_builtin_and_custom_tools",
),
Template(
"system",
"system-builtin-tools-only",
"system_message_builtin_tools_only",
),
Template(
"system",
"system-custom-tools-only",
"system_message_custom_tools_only",
),
Template(
"system",
"system-default",
"system_default",
),
Template(
"tool",
"tool-success",
"tool_success",
"Note ipython header and [stdout]",
),
Template(
"tool",
"tool-failure",
"tool_failure",
"Note ipython header and [stderr]",
),
]
class LLama31Interface:
def __init__(self, tool_prompt_format: ToolPromptFormat = ToolPromptFormat.json):
self.tokenizer = Tokenizer.get_instance()
self.formatter = ChatFormat(self.tokenizer)
self.tool_prompt_format = tool_prompt_format
def get_tokens(self, messages: List[RawMessage]) -> List[int]:
model_input = self.formatter.encode_dialog_prompt(
messages,
self.tool_prompt_format,
)
return model_input.tokens
def tool_response_messages(self, *args, **kwargs):
template = ToolResponseGenerator().gen(*args, **kwargs)
return [
RawMessage(
role="tool",
content=template.render(),
)
]
def system_messages(
self,
builtin_tools: List[BuiltinTool],
custom_tools: List[ToolDefinition],
instruction: Optional[str] = None,
) -> List[RawMessage]:
messages = []
default_gen = SystemDefaultGenerator()
default_template = default_gen.gen()
sys_content = ""
tool_template = None
if builtin_tools or custom_tools:
tool_gen = BuiltinToolGenerator()
tool_template = tool_gen.gen(builtin_tools + custom_tools)
sys_content += tool_template.render()
sys_content += "\n"
sys_content += default_template.render()
if instruction:
sys_content += "\n\n"
sys_content += instruction
sys_content += "\n"
messages.append(RawMessage(role="system", content=sys_content))
if custom_tools:
if self.tool_prompt_format == ToolPromptFormat.json:
tool_gen = JsonCustomToolGenerator()
elif self.tool_prompt_format == ToolPromptFormat.function_tag:
tool_gen = FunctionTagCustomToolGenerator()
else:
raise ValueError(f"Non supported ToolPromptFormat {self.tool_prompt_format}")
custom_template = tool_gen.gen(custom_tools)
messages.append(RawMessage(role="user", content=custom_template.render()))
return messages
def assistant_response_messages(
self,
content: str,
stop_reason: StopReason,
tool_call: Optional[ToolCall] = None,
) -> List[RawMessage]:
tool_calls = []
if tool_call:
tool_calls.append(tool_call)
return [
RawMessage(
role="assistant",
content=content,
tool_calls=tool_calls,
stop_reason=stop_reason,
)
]
def user_message(self, content: str) -> List[RawMessage]:
return [RawMessage(role="user", content=content)]
def display_message_as_tokens(self, message: RawMessage) -> None:
"""Util to print tokenized string to shell"""
tokens = self.formatter.encode_message(message, self.tool_prompt_format)
on_colors = [
"on_red",
"on_green",
"on_yellow",
"on_blue",
"on_magenta",
"on_cyan",
]
for i, t in enumerate(tokens):
on_col = on_colors[i % len(on_colors)]
print(colored(self.tokenizer.decode([t]), "white", on_col), end="")
print("\n", end="")
def list_jinja_templates() -> List[Template]:
return TEMPLATES
def render_jinja_template(name: str, tool_prompt_format: ToolPromptFormat):
by_name = {t.template_name: t for t in TEMPLATES}
if name not in by_name:
raise ValueError(f"No template found for `{name}`")
template = by_name[name]
interface = LLama31Interface(tool_prompt_format)
data_func = getattr(template_data, template.data_provider)
if template.role == "system":
messages = interface.system_messages(**data_func())
elif template.role == "tool":
messages = interface.tool_response_messages(**data_func())
elif template.role == "assistant":
messages = interface.assistant_response_messages(**data_func())
elif template.role == "user":
messages = interface.user_message(**data_func())
tokens = interface.get_tokens(messages)
special_tokens = list(interface.tokenizer.special_tokens.values())
tokens = [(interface.tokenizer.decode([t]), t in special_tokens) for t in tokens]
return template, tokens

(new binary image file added; 438 KiB, not shown)

@@ -0,0 +1,22 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# top-level folder for each specific model found within the models/ directory at
# the top-level of this source tree.
from .base import PromptTemplate, PromptTemplateGeneratorBase # noqa: F401
from .system_prompts import ( # noqa: F401
BuiltinToolGenerator,
FunctionTagCustomToolGenerator,
JsonCustomToolGenerator,
PythonListCustomToolGenerator,
SystemDefaultGenerator,
)
from .tool_response import ToolResponseGenerator # noqa: F401

@@ -0,0 +1,39 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# top-level folder for each specific model found within the models/ directory at
# the top-level of this source tree.
from dataclasses import dataclass
from typing import Any, Dict, List
from jinja2 import Template
@dataclass
class PromptTemplate:
template: str
data: Dict[str, Any]
def render(self):
template = Template(self.template)
return template.render(self.data)
class PromptTemplateGeneratorBase:
"""
Base class for prompt template generators.
"""
def gen(self, *args, **kwargs) -> PromptTemplate:
raise NotImplementedError()
def data_examples(self) -> List[Any]:
raise NotImplementedError()

@@ -0,0 +1,311 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# top-level folder for each specific model found within the models/ directory at
# the top-level of this source tree.
import textwrap
from datetime import datetime
from typing import Any, List, Optional
from llama_models.datatypes import (
BuiltinTool,
)
from llama_stack.models.llama.datatypes import (
ToolDefinition,
ToolParamDefinition,
)
from .base import PromptTemplate, PromptTemplateGeneratorBase
class SystemDefaultGenerator(PromptTemplateGeneratorBase):
def gen(self, *args, **kwargs) -> PromptTemplate:
template_str = textwrap.dedent(
"""
Cutting Knowledge Date: December 2023
Today Date: {{ today }}
"""
)
return PromptTemplate(
template_str.lstrip("\n"),
{"today": datetime.now().strftime("%d %B %Y")},
)
def data_examples(self) -> List[Any]:
return [None]
class BuiltinToolGenerator(PromptTemplateGeneratorBase):
def _tool_breakdown(self, tools: List[ToolDefinition]):
builtin_tools, custom_tools = [], []
for dfn in tools:
if isinstance(dfn.tool_name, BuiltinTool):
builtin_tools.append(dfn)
else:
custom_tools.append(dfn)
return builtin_tools, custom_tools
def gen(self, tools: List[ToolDefinition]) -> PromptTemplate:
builtin_tools, custom_tools = self._tool_breakdown(tools)
template_str = textwrap.dedent(
"""
{% if builtin_tools or custom_tools -%}
Environment: ipython
{% endif -%}
{% set builtin_tools = builtin_tools | reject('equalto', 'code_interpreter') | list -%}
{% if builtin_tools -%}
Tools: {{ builtin_tools | join(", ") | trim -}}
{% endif %}
"""
)
return PromptTemplate(
template_str.lstrip("\n"),
{
"builtin_tools": [t.tool_name.value for t in builtin_tools],
"custom_tools": custom_tools,
},
)
def data_examples(self) -> List[List[ToolDefinition]]:
return [
# builtin tools
[
ToolDefinition(tool_name=BuiltinTool.code_interpreter),
ToolDefinition(tool_name=BuiltinTool.brave_search),
ToolDefinition(tool_name=BuiltinTool.wolfram_alpha),
],
# only code interpretor
[
ToolDefinition(tool_name=BuiltinTool.code_interpreter),
],
]
class JsonCustomToolGenerator(PromptTemplateGeneratorBase):
def gen(self, custom_tools: List[ToolDefinition]) -> PromptTemplate:
template_str = textwrap.dedent(
"""
Answer the user's question by making use of the following functions if needed.
If none of the function can be used, please say so.
Here is a list of functions in JSON format:
{% for t in custom_tools -%}
{# manually setting up JSON because jinja sorts keys in unexpected ways -#}
{%- set tname = t.tool_name -%}
{%- set tdesc = t.description -%}
{%- set tparams = t.parameters -%}
{%- set required_params = [] -%}
{%- for name, param in tparams.items() if param.required == true -%}
{%- set _ = required_params.append(name) -%}
{%- endfor -%}
{
"type": "function",
"function": {
"name": "{{tname}}",
"description": "{{tdesc}}",
"parameters": {
"type": "object",
"properties": [
{%- for name, param in tparams.items() %}
{
"{{name}}": {
"type": "object",
"description": "{{param.description}}"
}
}{% if not loop.last %},{% endif %}
{%- endfor %}
],
"required": {{ required_params | tojson }}
}
}
}
{% endfor %}
Return function calls in JSON format.
"""
)
return PromptTemplate(
template_str.lstrip("\n"),
{"custom_tools": [t.model_dump() for t in custom_tools]},
)
def data_examples(self) -> List[List[ToolDefinition]]:
return [
[
ToolDefinition(
tool_name="trending_songs",
description="Returns the trending songs on a Music site",
parameters={
"n": ToolParamDefinition(
param_type="int",
description="The number of songs to return",
required=True,
),
"genre": ToolParamDefinition(
param_type="str",
description="The genre of the songs to return",
required=False,
),
},
),
]
]
class FunctionTagCustomToolGenerator(PromptTemplateGeneratorBase):
def gen(self, custom_tools: List[ToolDefinition]) -> PromptTemplate:
template_str = textwrap.dedent(
"""
You have access to the following functions:
{% for t in custom_tools %}
{#- manually setting up JSON because jinja sorts keys in unexpected ways -#}
{%- set tname = t.tool_name -%}
{%- set tdesc = t.description -%}
{%- set modified_params = t.parameters.copy() -%}
{%- for key, value in modified_params.items() -%}
{%- if 'default' in value -%}
{%- set _ = value.pop('default', None) -%}
{%- endif -%}
{%- endfor -%}
{%- set tparams = modified_params | tojson -%}
Use the function '{{ tname }}' to '{{ tdesc }}':
{"name": "{{tname}}", "description": "{{tdesc}}", "parameters": {{tparams}}}
{% endfor -%}
Think very carefully before calling functions.
If you choose to call a function ONLY reply in the following format with no prefix or suffix:
<function=example_function_name>{"example_name": "example_value"}</function>
Reminder:
- If looking for real time information use relevant functions before falling back to brave_search
- Function calls MUST follow the specified format, start with <function= and end with </function>
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line
"""
)
return PromptTemplate(
template_str.lstrip("\n"),
{"custom_tools": [t.model_dump() for t in custom_tools]},
)
def data_examples(self) -> List[List[ToolDefinition]]:
return [
[
ToolDefinition(
tool_name="trending_songs",
description="Returns the trending songs on a Music site",
parameters={
"n": ToolParamDefinition(
param_type="int",
description="The number of songs to return",
required=True,
),
"genre": ToolParamDefinition(
param_type="str",
description="The genre of the songs to return",
required=False,
),
},
),
]
]
class PythonListCustomToolGenerator(PromptTemplateGeneratorBase): # noqa: N801
DEFAULT_PROMPT = textwrap.dedent(
"""
You are an expert in composing functions. You are given a question and a set of possible functions.
Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
If none of the function can be used, point it out. If the given question lacks the parameters required by the function,
also point it out. You should only return the function call in tools call sections.
{{ function_description }}
""".strip("\n")
)
def gen(self, custom_tools: List[ToolDefinition], system_prompt: Optional[str] = None) -> PromptTemplate:
system_prompt = system_prompt or self.DEFAULT_PROMPT
return PromptTemplate(
system_prompt,
{"function_description": self._gen_function_description(custom_tools)},
)
def _gen_function_description(self, custom_tools: List[ToolDefinition]) -> PromptTemplate:
template_str = textwrap.dedent(
"""
If you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]
You SHOULD NOT include any other text in the response.
Here is a list of functions in JSON format that you can invoke.
[
{% for t in tools -%}
{# manually setting up JSON because jinja sorts keys in unexpected ways -#}
{%- set tname = t.tool_name -%}
{%- set tdesc = t.description -%}
{%- set tparams = t.parameters -%}
{%- set required_params = [] -%}
{%- for name, param in tparams.items() if param.required == true -%}
{%- set _ = required_params.append(name) -%}
{%- endfor -%}
{
"name": "{{tname}}",
"description": "{{tdesc}}",
"parameters": {
"type": "dict",
"required": {{ required_params | tojson }},
"properties": {
{%- for name, param in tparams.items() %}
"{{name}}": {
"type": "{{param.param_type}}",
"description": "{{param.description}}"{% if param.default %},
"default": "{{param.default}}"{% endif %}
}{% if not loop.last %},{% endif %}
{%- endfor %}
}
}
}{% if not loop.last %},
{% endif -%}
{%- endfor %}
]
"""
)
return PromptTemplate(
template_str.strip("\n"),
{"tools": [t.model_dump() for t in custom_tools]},
).render()
def data_examples(self) -> List[List[ToolDefinition]]:
return [
[
ToolDefinition(
tool_name="get_weather",
description="Get weather info for places",
parameters={
"city": ToolParamDefinition(
param_type="string",
description="The name of the city to get the weather for",
required=True,
),
"metric": ToolParamDefinition(
param_type="string",
description="The metric for weather. Options are: celsius, fahrenheit",
required=False,
default="celsius",
),
},
),
]
]

@@ -0,0 +1,63 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# top-level folder for each specific model found within the models/ directory at
# the top-level of this source tree.
import textwrap
from typing import Optional
from .base import PromptTemplate, PromptTemplateGeneratorBase
class ToolResponseGenerator(PromptTemplateGeneratorBase):
def gen(
self,
status: str,
stdout: Optional[str] = None,
stderr: Optional[str] = None,
):
assert status in [
"success",
"failure",
], f"status must be 'success' or 'failure'; Got: {status}"
template_str = textwrap.dedent(
"""
{% if status == "success" %}completed{% else %}failed{% endif %}
{%- if stdout %}
[stdout]{{ stdout }}[/stdout]
{%- endif -%}
{%- if stderr %}
[stderr]{{ stderr }}[/stderr]
{%- endif -%}
"""
)
return PromptTemplate(
template_str.lstrip("\n"),
{
"status": status,
"stdout": stdout,
"stderr": stderr,
},
)
def data_examples(self):
return [
# success
{
"status": "success",
"stdout": '{"results":["something something"]}',
},
# failure
{
"status": "failure",
"stderr": "brave_search encounter an error: could not communicate with api.brave.com",
},
]
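A minimal sketch (not part of the diff) of how the generator above formats tool results before they are passed back to the model:

```python
# Minimal sketch: render success and failure tool responses with the
# ToolResponseGenerator defined above.
gen = ToolResponseGenerator()

ok = gen.gen(status="success", stdout='{"results": ["song1", "song2"]}')
print(ok.render())   # roughly: completed\n[stdout]{"results": ["song1", "song2"]}[/stdout]

err = gen.gen(status="failure", stderr="could not communicate with api.brave.com")
print(err.render())  # roughly: failed\n[stderr]could not communicate with api.brave.com[/stderr]
```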

View file

@ -0,0 +1,120 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# top-level folder for each specific model found within the models/ directory at
# the top-level of this source tree.
from llama_models.datatypes import (
BuiltinTool,
StopReason,
ToolCall,
)
from .prompt_templates import (
BuiltinToolGenerator,
JsonCustomToolGenerator,
ToolResponseGenerator,
)
INSTRUCTION = "You are a helpful assistant."
def system_message_builtin_tools_only():
return {
"builtin_tools": BuiltinToolGenerator().data_examples()[0],
"custom_tools": [],
"instruction": INSTRUCTION,
}
def system_message_builtin_code_only():
return {
"builtin_tools": BuiltinToolGenerator().data_examples()[1],
"custom_tools": [],
"instruction": "",
}
def system_message_custom_tools_only():
return {
"builtin_tools": [],
"custom_tools": JsonCustomToolGenerator().data_examples()[0],
"instruction": INSTRUCTION,
}
def system_message_builtin_and_custom_tools():
return {
"builtin_tools": BuiltinToolGenerator().data_examples()[0],
"custom_tools": JsonCustomToolGenerator().data_examples()[0],
"instruction": INSTRUCTION,
}
def system_default():
return {
"builtin_tools": [],
"custom_tools": [],
"instruction": INSTRUCTION,
}
def tool_success():
return ToolResponseGenerator().data_examples()[0]
def tool_failure():
return ToolResponseGenerator().data_examples()[1]
def assistant_builtin_tool_call():
return {
"content": "",
"tool_call": ToolCall(
call_id="uuid",
tool_name=BuiltinTool.brave_search,
arguments={
"query": "Who won NBA in 2024?",
},
),
"stop_reason": StopReason.end_of_message,
}
def assistant_custom_tool_call():
return {
"content": "",
"tool_call": ToolCall(
call_id="uuid",
tool_name="trending_songs",
arguments={"country": "US", "n": 10},
),
"stop_reason": StopReason.end_of_turn,
}
def assistant_default():
return {
"content": "Hi, I am a helpful assistant. What can I help you with today?",
"tool_call": None,
"stop_reason": StopReason.end_of_turn,
}
def user_default():
return {"content": "Please tell me how to plan a trip to New York"}
def user_images():
return {"content": "<|image|><|image|>What do these images depict?"}
def user_interleaved_images():
return {"content": "<|image|>Describe the image in one sentence.<|image|>Write a haiku about these images"}

View file

@ -0,0 +1,199 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# top-level folder for each specific model found within the models/ directory at
# the top-level of this source tree.
import textwrap
import unittest
from datetime import datetime
from .prompt_templates import (
BuiltinToolGenerator,
FunctionTagCustomToolGenerator,
JsonCustomToolGenerator,
PythonListCustomToolGenerator,
SystemDefaultGenerator,
)
class PromptTemplateTests(unittest.TestCase):
def check_generator_output(self, generator, expected_text):
example = generator.data_examples()[0]
pt = generator.gen(example)
text = pt.render()
# print(text) # debugging
assert text == expected_text, f"Expected:\n{expected_text}\nActual:\n{text}"
def test_system_default(self):
generator = SystemDefaultGenerator()
today = datetime.now().strftime("%d %B %Y")
expected_text = f"Cutting Knowledge Date: December 2023\nToday Date: {today}"
self.check_generator_output(generator, expected_text)
def test_system_builtin_only(self):
generator = BuiltinToolGenerator()
expected_text = textwrap.dedent(
"""
Environment: ipython
Tools: brave_search, wolfram_alpha
"""
)
self.check_generator_output(generator, expected_text.strip("\n"))
def test_system_custom_only(self):
self.maxDiff = None
generator = JsonCustomToolGenerator()
expected_text = textwrap.dedent(
"""
Answer the user's question by making use of the following functions if needed.
If none of the function can be used, please say so.
Here is a list of functions in JSON format:
{
"type": "function",
"function": {
"name": "trending_songs",
"description": "Returns the trending songs on a Music site",
"parameters": {
"type": "object",
"properties": [
{
"n": {
"type": "object",
"description": "The number of songs to return"
}
},
{
"genre": {
"type": "object",
"description": "The genre of the songs to return"
}
}
],
"required": ["n"]
}
}
}
Return function calls in JSON format.
"""
)
self.check_generator_output(generator, expected_text.strip("\n"))
def test_system_custom_function_tag(self):
self.maxDiff = None
generator = FunctionTagCustomToolGenerator()
expected_text = textwrap.dedent(
"""
You have access to the following functions:
Use the function 'trending_songs' to 'Returns the trending songs on a Music site':
{"name": "trending_songs", "description": "Returns the trending songs on a Music site", "parameters": {"genre": {"description": "The genre of the songs to return", "param_type": "str", "required": false}, "n": {"description": "The number of songs to return", "param_type": "int", "required": true}}}
Think very carefully before calling functions.
If you choose to call a function ONLY reply in the following format with no prefix or suffix:
<function=example_function_name>{"example_name": "example_value"}</function>
Reminder:
- If looking for real time information use relevant functions before falling back to brave_search
- Function calls MUST follow the specified format, start with <function= and end with </function>
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line
"""
)
self.check_generator_output(generator, expected_text.strip("\n"))
def test_llama_3_2_system_zero_shot(self):
generator = PythonListCustomToolGenerator()
expected_text = textwrap.dedent(
"""
You are an expert in composing functions. You are given a question and a set of possible functions.
Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
If none of the function can be used, point it out. If the given question lacks the parameters required by the function,
also point it out. You should only return the function call in tools call sections.
If you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]
You SHOULD NOT include any other text in the response.
Here is a list of functions in JSON format that you can invoke.
[
{
"name": "get_weather",
"description": "Get weather info for places",
"parameters": {
"type": "dict",
"required": ["city"],
"properties": {
"city": {
"type": "string",
"description": "The name of the city to get the weather for"
},
"metric": {
"type": "string",
"description": "The metric for weather. Options are: celsius, fahrenheit",
"default": "celsius"
}
}
}
}
]
"""
)
self.check_generator_output(generator, expected_text.strip("\n"))
def test_llama_3_2_provided_system_prompt(self):
generator = PythonListCustomToolGenerator()
expected_text = textwrap.dedent(
"""
Overriding message.
If you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]
You SHOULD NOT include any other text in the response.
Here is a list of functions in JSON format that you can invoke.
[
{
"name": "get_weather",
"description": "Get weather info for places",
"parameters": {
"type": "dict",
"required": ["city"],
"properties": {
"city": {
"type": "string",
"description": "The name of the city to get the weather for"
},
"metric": {
"type": "string",
"description": "The metric for weather. Options are: celsius, fahrenheit",
"default": "celsius"
}
}
}
}
]"""
)
user_system_prompt = textwrap.dedent(
"""
Overriding message.
{{ function_description }}
"""
)
example = generator.data_examples()[0]
pt = generator.gen(example, user_system_prompt)
text = pt.render()
assert text == expected_text, f"Expected:\n{expected_text}\nActual:\n{text}"

View file

@ -0,0 +1,12 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# top-level folder for each specific model found within the models/ directory at
# the top-level of this source tree.

View file

@ -0,0 +1,259 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# top-level folder for each specific model found within the models/ directory at
# the top-level of this source tree.
import textwrap
from typing import List
from llama_models.datatypes import (
BuiltinTool,
RawMessage,
StopReason,
ToolCall,
ToolPromptFormat,
)
from ..prompt_format import (
# llama3_1_e2e_tool_call_dialog,
TextCompletionContent,
UseCase,
llama3_1_builtin_tool_call_dialog,
llama3_1_custom_tool_call_dialog,
)
def wolfram_alpha_response():
return textwrap.dedent(
"""
{
"queryresult": {
"success": true,
"inputstring": "100th decimal of pi",
"pods": [
{
"title": "Input interpretation",
"subpods": [
{
"title": "",
"plaintext": "100th digit | \u03c0"
}
]
},
{
"title": "Nearby digits",
"subpods": [
{
"title": "",
"plaintext": "...86208998628034825342117067982148086513282306647093..."
}
]
},
{
"title": "Result",
"primary": true,
"subpods": [
{
"title": "",
"plaintext": "7"
}
]
}
]
}
}
"""
)
def usecases() -> List[UseCase | str]:
return [
textwrap.dedent(
"""
# Llama 3.1 - Prompt Formats
## Tokens
Here is a list of special tokens that are supported by Llama 3.1:
- `<|begin_of_text|>`: Specifies the start of the prompt
- `<|end_of_text|>`: Model will cease to generate more tokens. This token is generated only by the base models.
- `<|finetune_right_pad_id|>`: This token is used for padding text sequences to the same length in a batch.
- `<|start_header_id|>` and `<|end_header_id|>`: These tokens enclose the role for a particular message. The possible roles are: [system, user, assistant and tool]
- `<|eom_id|>`: End of message. A message represents a possible stopping point for execution where the model can inform the executor that a tool call needs to be made. This is used for multi-step interactions between the model and any available tools. This token is emitted by the model when the Environment: ipython instruction is used in the system prompt, or if the model calls for a built-in tool.
- `<|eot_id|>`: End of turn. Represents when the model has determined that it has finished interacting with the user message that initiated its response. This is used in two scenarios:
- at the end of a direct interaction between the model and the user
- at the end of multiple interactions between the model and any available tools
This token signals to the executor that the model has finished generating a response.
- `<|python_tag|>`: Is a special tag used in the model's response to signify a tool call.
"""
),
textwrap.dedent(
"""
There are 4 different roles that are supported by Llama 3.1
- `system`: Sets the context in which to interact with the AI model. It typically includes rules, guidelines, or necessary information that helps the model respond effectively.
- `user`: Represents the human interacting with the model. It includes the inputs, commands, and questions to the model.
- `tool`: A new role introduced in Llama 3.1. This role is used to mark messages with the output of a tool call when sent back to the model from the executor. (The actual token used by the model for this role is "ipython".)
- `assistant`: Represents the response generated by the AI model based on the context provided in the `system`, `tool` and `user` prompts.
"""
),
UseCase(
title="Llama 3.1 Base Model",
description="Text completion for Llama 3.1 base model uses this format.",
dialogs=[TextCompletionContent(content="Color of sky is blue but sometimes can also be")],
notes="Note start special tag",
),
"## Llama 3.1 Instruct Model",
UseCase(
title="User and assistant conversation",
description="Here is a regular multi-turn user assistant conversation and how its formatted.",
dialogs=[
[
RawMessage(role="system", content="You are a helpful assistant"),
RawMessage(
role="user",
content="Answer who are you in the form of jeopardy?",
),
]
],
notes="",
),
"## Tool Calling Formats",
textwrap.dedent(
"""
The three built-in tools (brave_search, wolfram_alpha, and code interpreter) can be turned on using the system prompt:
- Brave Search: Tool call to perform web searches.
- Wolfram Alpha: Tool call to perform complex mathematical calculations.
- Code Interpreter: Enables the model to output python code.
"""
),
UseCase(
title="Builtin Tool Calling",
description=textwrap.dedent(
"""
Here is an example of a conversation using brave search
"""
),
dialogs=[llama3_1_builtin_tool_call_dialog()],
notes=textwrap.dedent(
"""
- Just including Environment: ipython turns on code interpreter; therefore, you don't need to specify code interpretation on the Tools: line. The model can generate python code which is interpreted by the executor, with the result provided back to the model.
- The message body of the assistant response starts with a special tag <|python_tag|>
- As alluded to above, in such an environment, the model can generate <|eom_id|> instead of just the standard <|eot_id|> . The latter indicates the turn is finished, while the former indicates continued multi-step reasoning. That is, the model is expecting a continuation message with the output of the tool call.
- The model tool call response is of the form `tool.call(query="...")` where tool is `brave_search` or `wolfram_alpha`
"""
),
),
UseCase(
title="Builtin Code Interpreter",
description="Here is an actual example of model responding with code",
dialogs=[
[
RawMessage(role="system", content="Environment: ipython"),
RawMessage(
role="user",
content="Write code to check if number is prime, use that to see if the number 7 is prime",
),
],
],
notes=textwrap.dedent(
"""
- Model starts with <|python_tag|> and continues writing python code that needs to be executed
- No explicit mention of code_interpreter in system prompt. `Environment: ipython` implicitly enables it.
"""
),
),
UseCase(
title="Built-in tools full interaction",
description="Here is a full interaction with the built-in tools including the tool response and the final assistant response.",
dialogs=[
[
RawMessage(
role="system",
content="Environment: ipython\nTools: brave_search, wolfram_alpha\n",
),
RawMessage(role="user", content="What is the 100th decimal of pi?"),
RawMessage(
role="assistant",
content="",
stop_reason=StopReason.end_of_message,
tool_calls=[
ToolCall(
call_id="tool_call_id",
tool_name=BuiltinTool.wolfram_alpha,
arguments={"query": "100th decimal of pi"},
)
],
),
RawMessage(
role="tool",
content=wolfram_alpha_response(),
),
],
],
notes=textwrap.dedent(
"""
- Note the `<|python_tag|>` in the assistant response.
- Role is `tool` for the wolfram alpha response that is passed back to the model.
- Final message from assistant has <|eot_id|> tag.
"""
),
),
"## Zero shot tool calling",
UseCase(
title="JSON based tool calling",
description=textwrap.dedent(
"""
Llama models can now output custom tool calls from a single message to allow easier tool calling.
The following prompts provide an example of how custom tools can be called from the output of the model.
It's important to note that the model itself does not execute the calls; it provides structured output to facilitate calling by an executor.
"""
),
dialogs=[llama3_1_custom_tool_call_dialog()],
notes=textwrap.dedent(
"""
- JSON format for providing tools needs name, description and parameters
- Model responds with `<|python_tag|>` and `<|eom_id|>` as `Environment: ipython` was in the system prompt
- Instructions for tools added as a user message
- Only single tool calls are supported as of now
"""
),
),
# FIXME: This is not working yet as expected
# UseCase(
# title="E2E tool call example",
# description=textwrap.dedent(
# """
# Here is an example showing the whole multi-step turn by taking custom tool outputs and passing back to the model.
# """
# ),
# dialogs=[
# llama3_1_e2e_tool_call_dialog(
# tool_prompt_format=ToolPromptFormat.function_tag
# )
# ],
# notes="",
# ),
"## Example of a user defined tool calling",
UseCase(
title="`<function>` based tool calling",
description=textwrap.dedent(
"""
Here is an example of how you could also write custom instructions for model to do zero shot tool calling.
In this example, we define a custom tool calling format using the `<function>` tag.
"""
),
dialogs=[llama3_1_custom_tool_call_dialog(ToolPromptFormat.function_tag)],
notes=textwrap.dedent(
"""
- In this case, model does NOT respond with `<|python_tag|>` and ends with `<|eot_id|>`
- Instructions for tools added as a user message
"""
),
),
]
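A minimal sketch (not part of the diff) of how an executor might parse the `<function=name>{...}</function>` format shown in the user-defined tool calling use case above; `parse_function_tag_call` is a hypothetical helper, not an API introduced by this PR:

```python
import json
import re


def parse_function_tag_call(text: str) -> tuple[str, dict]:
    # Extract the function name and its JSON arguments from a
    # <function=name>{...}</function> style response.
    m = re.search(r"<function=(\w+)>(\{.*?\})</function>", text, re.DOTALL)
    if not m:
        raise ValueError("no <function=...> call found")
    return m.group(1), json.loads(m.group(2))


print(parse_function_tag_call('<function=trending_songs>{"n": 10, "genre": "pop"}</function>'))
# ('trending_songs', {'n': 10, 'genre': 'pop'})
```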

View file

@ -0,0 +1,12 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# top-level folder for each specific model found within the models/ directory at
# the top-level of this source tree.

View file

@ -0,0 +1,235 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# top-level folder for each specific model found within the models/ directory at
# the top-level of this source tree.
import json
import textwrap
from llama_models.datatypes import (
RawMessage,
StopReason,
ToolCall,
ToolPromptFormat,
)
from ..prompt_format import (
TextCompletionContent,
UseCase,
llama3_1_builtin_code_interpreter_dialog,
)
def user_tool_call():
content = textwrap.dedent(
"""
Questions: Can you retrieve the details for the user with the ID 7890, who has black as their special request?
Here is a list of functions in JSON format that you can invoke:
[
{
"name": "get_user_info",
"description": "Retrieve details for a specific user by their unique identifier. Note that the provided function is in Python 3 syntax.",
"parameters": {
"type": "dict",
"required": [
"user_id"
],
"properties": {
"user_id": {
"type": "integer",
"description": "The unique identifier of the user. It is used to fetch the specific user details from the database."
},
"special": {
"type": "string",
"description": "Any special information or parameters that need to be considered while fetching user details.",
"default": "none"
}
}
}
}
]
Should you decide to return the function call(s),Put it in the format of [func1(params_name=params_value, params_name2=params_value2...), func2(params)]
NO other text MUST be included.
"""
)
return content.strip()
def system_tool_call():
content = textwrap.dedent(
"""
You are an expert in composing functions. You are given a question and a set of possible functions.
Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
If none of the function can be used, point it out. If the given question lacks the parameters required by the function,
also point it out. You should only return the function call in tools call sections.
If you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]
You SHOULD NOT include any other text in the response.
Here is a list of functions in JSON format that you can invoke.
[
{
"name": "get_weather",
"description": "Get weather info for places",
"parameters": {
"type": "dict",
"required": [
"city"
],
"properties": {
"city": {
"type": "string",
"description": "The name of the city to get the weather for"
},
"metric": {
"type": "string",
"description": "The metric for weather. Options are: celsius, fahrenheit",
"default": "celsius"
}
}
}
}
]
"""
)
return content.strip()
def usecases():
return [
UseCase(
title="User and assistant conversation",
description="Here is a regular multi-turn user assistant conversation and how its formatted.",
dialogs=[
[
RawMessage(role="system", content="You are a helpful assistant"),
RawMessage(role="user", content="Who are you?"),
]
],
notes="This format is unchanged from Llama3.1",
),
UseCase(
title="Zero shot function calling",
description=textwrap.dedent(
"""
For Llama3.2 1B and 3B instruct models, we are introducing a new format for zero shot function calling.
This new format is designed to be more flexible and powerful than the previous format.
All available functions can be provided in the system message. A key difference is in the format of how the assistant responds with function calls.
It is pythonic in the form of `[func1(params_name=params_value, params_name2=params_value2...), func2(params)]` instead of the `json` or `<function>` tags that were defined in Llama3.1.
Here is an example for the same,
"""
),
dialogs=[
# Zero shot tool calls as system message
[
RawMessage(role="system", content=system_tool_call()),
RawMessage(role="user", content="What is the weather in SF and Seattle?"),
],
],
notes=textwrap.dedent(
"""
- The output supports multiple tool calls natively
- JSON format for defining the functions in the system prompt is similar to Llama3.1
"""
),
),
UseCase(
title="Zero shot function calling with user message",
description=textwrap.dedent(
"""
While the default is to provide all function calls in a system message, in Llama3.2 text models you can also provide information for all the available tools in a user message.
"""
),
dialogs=[
# Zero shot tool call as user message
[
RawMessage(role="user", content=user_tool_call()),
],
],
notes=textwrap.dedent(
"""
- The tool call format for the model is the same whether your function calls are provided in the system or user message.
- While builtin tool calls end with a <|eom_id|>, notice the <|eot_id|> for zero shot tool calls.
"""
),
),
UseCase(
title="Code Interpreter",
description=textwrap.dedent(
"""
Code Interpreter continues to work in 3.2 text models similar to Llama 3.1 model family.
Here is an example,
"""
),
dialogs=[llama3_1_builtin_code_interpreter_dialog()],
notes=textwrap.dedent(
"""
- Note `Environment: ipython` in the system prompt.
- Note that the response starts with `<|python_tag|>` and ends with `<|eom_id|>`
"""
),
),
UseCase(
title="Zero shot function calling E2E format",
description=textwrap.dedent(
"""
Here is an example of the e2e cycle of tool calls with the model in a multi-step way.
"""
),
dialogs=[
[
RawMessage(role="system", content=system_tool_call()),
RawMessage(role="user", content="What is the weather in SF?"),
RawMessage(
role="assistant",
content="",
stop_reason=StopReason.end_of_turn,
tool_calls=[
ToolCall(
call_id="cc",
tool_name="get_weather",
arguments={
"city": "San Francisco",
"metric": "celsius",
},
)
],
),
RawMessage(
role="tool",
content=json.dumps("25 C"),
),
],
],
notes=textwrap.dedent(
"""
- The output of the function call is provided back to the model as a tool response (in JSON format).
- Notice `<|start_header_id|>ipython<|end_header_id|>` as the header message preceding the tool response.
- The model finally summarizes the information from the tool response and returns the result to the user.
"""
),
tool_prompt_format=ToolPromptFormat.python_list,
),
UseCase(
title="Prompt format for base models",
description=textwrap.dedent(
"""
For base models (Llama3.2-1B and Llama3.2-3B), the prompt format for a simple completion is as follows
"""
),
dialogs=[
TextCompletionContent(content="The color of the sky is blue but sometimes it can also be"),
],
notes="Same as Llama3.1",
),
]
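A minimal sketch (not part of the diff) of how an executor might parse the pythonic `[func1(arg=value, ...), func2(...)]` output format described in the zero-shot function calling use cases above; `parse_python_list_tool_calls` is a hypothetical helper, not an API introduced by this PR:

```python
import ast


def parse_python_list_tool_calls(text: str) -> list[tuple[str, dict]]:
    # Parse a response such as [get_weather(city="SF", metric="celsius")]
    # into (tool_name, kwargs) pairs using Python's own expression grammar.
    tree = ast.parse(text.strip(), mode="eval")
    if not isinstance(tree.body, ast.List):
        raise ValueError("expected a [...] list of calls")
    calls = []
    for node in tree.body.elts:
        if not isinstance(node, ast.Call) or not isinstance(node.func, ast.Name):
            raise ValueError("expected simple func_name(...) calls")
        kwargs = {kw.arg: ast.literal_eval(kw.value) for kw in node.keywords}
        calls.append((node.func.id, kwargs))
    return calls


print(parse_python_list_tool_calls('[get_weather(city="SF", metric="celsius"), get_weather(city="Seattle")]'))
# [('get_weather', {'city': 'SF', 'metric': 'celsius'}), ('get_weather', {'city': 'Seattle'})]
```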

View file

@ -0,0 +1,133 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# top-level folder for each specific model found within the models/ directory at
# the top-level of this source tree.
import textwrap
from pathlib import Path
from llama_models.datatypes import (
RawMediaItem,
RawMessage,
RawTextItem,
)
from ..prompt_format import (
TextCompletionContent,
UseCase,
llama3_1_builtin_tool_call_dialog,
# llama3_1_builtin_tool_call_with_image_dialog,
llama3_2_user_assistant_conversation,
)
def usecases():
this_dir = Path(__file__).parent.parent.resolve()
with open(this_dir / "scripts/resources/dog.jpg", "rb") as f:
img = f.read()
return [
llama3_2_user_assistant_conversation(),
UseCase(
title="User and assistant conversation with Images",
description="This example shows how to pass and image to the model as part of the messages.",
dialogs=[
[
RawMessage(
role="user",
content=[
RawMediaItem(data=img),
RawTextItem(text="Describe this image in two sentences"),
],
)
],
],
notes=textwrap.dedent(
"""
- The `<|image|>` tag is used to indicate presence of the image
- The model isn't an early fusion model so doesn't actually translate an image into several tokens. Instead the cross-attention layers take input "on the side" from a vision encoder
![Image](mm-model.png)
- It's important to position the <|image|> tag appropriately in the prompt. Image will only attend to the subsequent text tokens
- The <|image|> tag is part of the user message body, implying that it should only come after the header `<|start_header_id|>{role}<|end_header_id|>` in the message body
- We recommend using a single image in one prompt
"""
),
),
UseCase(
title="Builtin and Zero Shot Tool Calling",
description=textwrap.dedent(
"""
Llama3.2 vision models follow the same tool calling format as Llama3.1 models when inputs are text only.
Use `Environment: ipython` to enable tools.
Add `Tools: {{tool_name1}},{{tool_name2}}` for each of the builtin tools.
The same builtin tools as Llama3.1 are available,
- code_interpreter (for executing python code)
- brave_search (to search the web)
- wolfram_alpha (for querying wolfram alpha for mathematical questions)
""",
),
dialogs=[llama3_1_builtin_tool_call_dialog()],
notes=textwrap.dedent(
"""
- Note the `<|python_tag|>` before `brave_search` function call.
- The `<|eom_id|>` tag is used to indicate the end of the message.
- Similar to Llama3.1, code_interpreter is not explicitly mentioned but is enabled via `Environment: ipython`.
- Tool Calling does NOT work with images in the prompt as of now.
"""
),
),
# UseCase(
# title="Tool Calling for vision models",
# description=textwrap.dedent(
# """
# While Llama3.2 vision models follow the same tool calling format as Llama3.1 models when inputs are text only,
# they are not able to do tool calling when prompt contains image inputs (along with text).
# The recommended way would be to separate out the image understanding from the tool calling in successive prompts.
# Here is an example of how that could be done,
# """,
# ),
# dialogs=[llama3_1_builtin_tool_call_with_image_dialog()],
# notes=textwrap.dedent(
# """
# - Instead of a single prompt (image understanding + tool call), we split into two prompts to achieve the same result.
# """
# ),
# ),
UseCase(
title="Prompt format for base models",
description=textwrap.dedent(
"""
For base models (Llama3.2-11B-Vision and Llama3.2-90B-Vision), the prompt format for a simple completion is as follows
"""
),
dialogs=[
TextCompletionContent(content="The color of the sky is blue but sometimes it can also be"),
],
notes="- Same as Llama3.1",
),
UseCase(
title="Prompt format for base models with Image",
description=textwrap.dedent(
"""
For base models (Llama3.2-11B-Vision and Llama3.2-90B-Vision), here is an example of how the text completion format looks with an image,
"""
),
dialogs=[
TextCompletionContent(
content=[
RawMediaItem(data=img),
RawTextItem(text="If I had to write a haiku for this one"),
]
),
],
notes="- Note the placement of the special tags <|begin_of_text|> and <|image|>",
),
]

View file

@ -0,0 +1,258 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# top-level folder for each specific model found within the models/ directory at
# the top-level of this source tree.
import textwrap
from typing import List
from llama_models.datatypes import (
BuiltinTool,
RawMessage,
StopReason,
ToolCall,
ToolPromptFormat,
)
from ..prompt_format import (
# llama3_1_e2e_tool_call_dialog,
TextCompletionContent,
UseCase,
llama3_1_builtin_tool_call_dialog,
llama3_1_custom_tool_call_dialog,
)
def wolfram_alpha_response():
return textwrap.dedent(
"""
{
"queryresult": {
"success": true,
"inputstring": "100th decimal of pi",
"pods": [
{
"title": "Input interpretation",
"subpods": [
{
"title": "",
"plaintext": "100th digit | \u03c0"
}
]
},
{
"title": "Nearby digits",
"subpods": [
{
"title": "",
"plaintext": "...86208998628034825342117067982148086513282306647093..."
}
]
},
{
"title": "Result",
"primary": true,
"subpods": [
{
"title": "",
"plaintext": "7"
}
]
}
]
}
}
"""
)
def usecases() -> List[UseCase | str]:
return [
textwrap.dedent(
"""
# Llama 3.1 - Prompt Formats
## Tokens
Here is a list of special tokens that are supported by Llama 3.1:
- `<|begin_of_text|>`: Specifies the start of the prompt
- `<|end_of_text|>`: Model will cease to generate more tokens. This token is generated only by the base models.
- `<|finetune_right_pad_id|>`: This token is used for padding text sequences to the same length in a batch.
- `<|start_header_id|>` and `<|end_header_id|>`: These tokens enclose the role for a particular message. The possible roles are: [system, user, assistant and tool]
- `<|eom_id|>`: End of message. A message represents a possible stopping point for execution where the model can inform the executor that a tool call needs to be made. This is used for multi-step interactions between the model and any available tools. This token is emitted by the model when the Environment: ipython instruction is used in the system prompt, or if the model calls for a built-in tool.
- `<|eot_id|>`: End of turn. Represents when the model has determined that it has finished interacting with the user message that initiated its response. This is used in two scenarios:
- at the end of a direct interaction between the model and the user
- at the end of multiple interactions between the model and any available tools
This token signals to the executor that the model has finished generating a response.
- `<|python_tag|>`: Is a special tag used in the model's response to signify a tool call.
"""
),
textwrap.dedent(
"""
There are 4 different roles that are supported by Llama 3.1
- `system`: Sets the context in which to interact with the AI model. It typically includes rules, guidelines, or necessary information that helps the model respond effectively.
- `user`: Represents the human interacting with the model. It includes the inputs, commands, and questions to the model.
- `tool`: A new role introduced in Llama 3.1. This role is used to mark messages with the output of a tool call when sent back to the model from the executor. (The actual token used by the model for this role is "ipython".)
- `assistant`: Represents the response generated by the AI model based on the context provided in the `system`, `tool` and `user` prompts.
"""
),
UseCase(
title="Llama 3.1 Base Model",
description="Text completion for Llama 3.1 base model uses this format.",
dialogs=[TextCompletionContent(content="Color of sky is blue but sometimes can also be")],
notes="Note start special tag",
),
"## Llama 3.1 Instruct Model",
UseCase(
title="User and assistant conversation",
description="Here is a regular multi-turn user assistant conversation and how its formatted.",
dialogs=[
[
RawMessage(role="system", content="You are a helpful assistant"),
RawMessage(
role="user",
content="Answer who are you in the form of jeopardy?",
),
]
],
notes="",
),
"## Tool Calling Formats",
textwrap.dedent(
"""
The three built-in tools (brave_search, wolfram_alpha, and code interpreter) can be turned on using the system prompt:
- Brave Search: Tool call to perform web searches.
- Wolfram Alpha: Tool call to perform complex mathematical calculations.
- Code Interpreter: Enables the model to output python code.
"""
),
UseCase(
title="Builtin Tool Calling",
description=textwrap.dedent(
"""
Here is an example of a conversation using brave search
"""
),
dialogs=[llama3_1_builtin_tool_call_dialog()],
notes=textwrap.dedent(
"""
- Just including Environment: ipython turns on code interpreter; therefore, you don't need to specify code interpretation on the Tools: line. The model can generate python code which is interpreted by the executor, with the result provided back to the model.
- The message body of the assistant response starts with a special tag <|python_tag|>
- As alluded to above, in such an environment, the model can generate <|eom_id|> instead of just the standard <|eot_id|> . The latter indicates the turn is finished, while the former indicates continued multi-step reasoning. That is, the model is expecting a continuation message with the output of the tool call.
- The model tool call response is of the form `tool.call(query="...")` where tool is `brave_search` or `wolfram_alpha`
"""
),
),
UseCase(
title="Builtin Code Interpreter",
description="Here is an actual example of model responding with code",
dialogs=[
[
RawMessage(role="system", content="Environment: ipython"),
RawMessage(
role="user",
content="Write code to check if number is prime, use that to see if the number 7 is prime",
),
],
],
notes=textwrap.dedent(
"""
- Model starts with <|python_tag|> and continues writing python code that needs to be executed
- No explicit mention of code_interpreter in system prompt. `Environment: ipython` implicitly enables it.
"""
),
),
UseCase(
title="Built-in tools full interaction",
description="Here is a full interaction with the built-in tools including the tool response and the final assistant response.",
dialogs=[
[
RawMessage(
role="system",
content="Environment: ipython\nTools: brave_search, wolfram_alpha\n",
),
RawMessage(role="user", content="What is the 100th decimal of pi?"),
RawMessage(
content="",
stop_reason=StopReason.end_of_message,
tool_calls=[
ToolCall(
call_id="tool_call_id",
tool_name=BuiltinTool.wolfram_alpha,
arguments={"query": "100th decimal of pi"},
)
],
),
RawMessage(
role="tool",
content=wolfram_alpha_response(),
),
],
],
notes=textwrap.dedent(
"""
- Note the `<|python_tag|>` in the assistant response.
- Role is `tool` for the wolfram alpha response that is passed back to the model.
- Final message from assistant has <|eot_id|> tag.
"""
),
),
"## Zero shot tool calling",
UseCase(
title="JSON based tool calling",
description=textwrap.dedent(
"""
Llama models can now output custom tool calls from a single message to allow easier tool calling.
The following prompts provide an example of how custom tools can be called from the output of the model.
It's important to note that the model itself does not execute the calls; it provides structured output to facilitate calling by an executor.
"""
),
dialogs=[llama3_1_custom_tool_call_dialog()],
notes=textwrap.dedent(
"""
- JSON format for providing tools needs name, description and parameters
- Model responds with `<|python_tag|>` and `<|eom_id|>` as `Environment: ipython` was in the system prompt
- Instructions for tools added as a user message
- Only single tool calls are supported as of now
"""
),
),
# FIXME: This is not working yet as expected
# UseCase(
# title="E2E tool call example",
# description=textwrap.dedent(
# """
# Here is an example showing the whole multi-step turn by taking custom tool outputs and passing back to the model.
# """
# ),
# dialogs=[
# llama3_1_e2e_tool_call_dialog(
# tool_prompt_format=ToolPromptFormat.function_tag
# )
# ],
# notes="",
# ),
"## Example of a user defined tool calling",
UseCase(
title="`<function>` based tool calling",
description=textwrap.dedent(
"""
Here is an example of how you could also write custom instructions for model to do zero shot tool calling.
In this example, we define a custom tool calling format using the `<function>` tag.
"""
),
dialogs=[llama3_1_custom_tool_call_dialog(ToolPromptFormat.function_tag)],
notes=textwrap.dedent(
"""
- In this case, model does NOT respond with `<|python_tag|>` and ends with `<|eot_id|>`
- Instructions for tools added as a user message
"""
),
),
]

View file

@ -0,0 +1,204 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# top-level folder for each specific model found within the models/ directory at
# the top-level of this source tree.
import json
import textwrap
from pathlib import Path
from typing import List
from llama_models.datatypes import (
RawContent,
RawMediaItem,
RawMessage,
RawTextItem,
StopReason,
ToolCall,
ToolPromptFormat,
)
from pydantic import BaseModel, Field
from .llama3.interface import LLama31Interface
from .llama3.template_data import (
system_message_builtin_code_only,
system_message_builtin_tools_only,
system_message_custom_tools_only,
)
class TextCompletionContent(BaseModel):
content: RawContent = ""
class UseCase(BaseModel):
title: str = ""
description: str = ""
dialogs: List[List[RawMessage] | TextCompletionContent | str] = Field(default_factory=list)
notes: str = ""
tool_prompt_format: ToolPromptFormat = ToolPromptFormat.json
def md_format(self):
section = textwrap.dedent(
"""
## {title}
{description}
{dialogs_text}
{notes}
"""
)
return section.lstrip()
def dialogs_to_text(self, generator) -> str:
def _code_block(text):
return f"```\n{text}\n```"
text = ""
for dialog in self.dialogs:
if isinstance(dialog, str):
text += dialog
text += "\n\n"
continue
elif isinstance(dialog, TextCompletionContent):
input_tokens, output_tokens = generator.text_completion_raw(
dialog.content,
max_gen_len=64,
temperature=0.1,
top_p=0.95,
)
else:
input_tokens, output_tokens = generator.chat_completion_raw(
dialog,
max_gen_len=512,
temperature=0.0,
top_p=0.95,
tool_prompt_format=self.tool_prompt_format,
)
text += "##### Input Prompt Format\n"
# FIXME: This is added to undo the hack in chat_formatter where
# vision tokens are replaced with 128256.
input_tokens = [generator.formatter.vision_token if t == 128256 else t for t in input_tokens]
text += _code_block(generator.tokenizer.decode(input_tokens))
# TODO: Figure out if "↵" needs to be added for newlines or end or some indication
text += "\n\n"
text += "##### Model Response Format\n"
text += _code_block(generator.tokenizer.decode(output_tokens))
text += "\n\n"
return text
def to_text(self, generator):
section = self.md_format()
dialogs_text = self.dialogs_to_text(generator)
notes = f"##### Notes\n{self.notes}" if self.notes else ""
section = section.format(
title=self.title,
description=self.description,
dialogs_text=dialogs_text,
notes=notes,
)
return section
def llama3_1_builtin_tool_call_dialog(tool_prompt_format=ToolPromptFormat.json):
interface = LLama31Interface(tool_prompt_format)
messages = interface.system_messages(**system_message_builtin_tools_only())
messages += interface.user_message(content="Search the web for the latest price of 1oz gold?")
return messages
def llama3_1_builtin_code_interpreter_dialog(tool_prompt_format=ToolPromptFormat.json):
interface = LLama31Interface(tool_prompt_format)
messages = interface.system_messages(**system_message_builtin_code_only())
messages += interface.user_message(
content="Write code to check if number is prime. Use it to verify if number 7 is prime"
)
return messages
def llama3_1_builtin_tool_call_with_image_dialog(
tool_prompt_format=ToolPromptFormat.json,
):
this_dir = Path(__file__).parent
with open(this_dir / "llama3/dog.jpg", "rb") as f:
img = f.read()
interface = LLama31Interface(tool_prompt_format)
messages = interface.system_messages(**system_message_builtin_tools_only())
messages += interface.user_message(content=[RawMediaItem(data=img), RawTextItem(text="What is this dog breed?")])
messages += interface.assistant_response_messages(
"Based on the description of the dog in the image, it appears to be a small breed dog, possibly a terrier mix",
StopReason.end_of_turn,
)
messages += interface.user_message("Search the web for some food recommendations for the indentified breed")
return messages
def llama3_1_custom_tool_call_dialog(tool_prompt_format=ToolPromptFormat.json):
interface = LLama31Interface(tool_prompt_format)
messages = interface.system_messages(**system_message_custom_tools_only())
messages += interface.user_message(content="Use tools to get latest trending songs")
return messages
def llama3_1_e2e_tool_call_dialog(tool_prompt_format=ToolPromptFormat.json):
tool_response = json.dumps(["great song1", "awesome song2", "cool song3"])
interface = LLama31Interface(tool_prompt_format)
messages = interface.system_messages(**system_message_custom_tools_only())
messages += interface.user_message(content="Use tools to get latest trending songs")
messages.append(
RawMessage(
role="assistant",
content="",
stop_reason=StopReason.end_of_message,
tool_calls=[
ToolCall(
call_id="call_id",
tool_name="trending_songs",
arguments={"n": "10", "genre": "latest"},
)
],
),
)
messages.append(
RawMessage(
role="assistant",
content=tool_response,
)
)
return messages
def llama3_2_user_assistant_conversation():
return UseCase(
title="User and assistant conversation",
description="Here is a regular multi-turn user assistant conversation and how its formatted.",
dialogs=[
[
RawMessage(role="system", content="You are a helpful assistant"),
RawMessage(role="user", content="Who are you?"),
]
],
notes="This format is unchanged from Llama3.1",
)
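A minimal sketch (not part of the diff) of how the `UseCase` objects above can be assembled into a prompt-format guide; `generator` is assumed to expose the interface `dialogs_to_text()` relies on (`chat_completion_raw`, `text_completion_raw`, `.tokenizer`, `.formatter`) and is not defined in the hunks shown here:

```python
def render_guide(use_cases, generator) -> str:
    # Plain strings in the use-case list are markdown passed through verbatim;
    # UseCase objects are rendered via to_text(), which runs the dialogs
    # through the supplied generator.
    sections = []
    for uc in use_cases:
        sections.append(uc if isinstance(uc, str) else uc.to_text(generator))
    return "\n".join(sections)
```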

File diff suppressed because it is too large

View file

@ -7,7 +7,6 @@
from typing import Any, List, Optional, Protocol
from urllib.parse import urlparse
-from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel, Field
from llama_stack.apis.benchmarks import Benchmark
@ -18,6 +17,7 @@ from llama_stack.apis.scoring_functions import ScoringFn
from llama_stack.apis.shields import Shield
from llama_stack.apis.tools import Tool
from llama_stack.apis.vector_dbs import VectorDB
+from llama_stack.schema_utils import json_schema_type
class ModelsProtocolPrivate(Protocol):

View file

@ -17,7 +17,6 @@ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
from urllib.parse import urlparse
import httpx
-from llama_models.llama3.api.datatypes import BuiltinTool, ToolCall, ToolParamDefinition
from pydantic import TypeAdapter
from llama_stack.apis.agents import (
@ -63,6 +62,7 @@ from llama_stack.apis.inference import (
from llama_stack.apis.safety import Safety
from llama_stack.apis.tools import RAGDocument, RAGQueryConfig, ToolGroups, ToolRuntime
from llama_stack.apis.vector_io import VectorIO
+from llama_stack.models.llama.datatypes import BuiltinTool, ToolCall, ToolParamDefinition
from llama_stack.providers.utils.kvstore import KVStore
from llama_stack.providers.utils.memory.vector_store import concat_interleaved_content
from llama_stack.providers.utils.telemetry import tracing

View file

@ -8,7 +8,6 @@ import tempfile
from typing import AsyncIterator, List, Optional, Union
import pytest
-from llama_models.llama3.api.datatypes import BuiltinTool
from llama_stack.apis.agents import (
AgentConfig,
@ -41,6 +40,7 @@ from llama_stack.apis.tools import (
ToolInvocationResult,
)
from llama_stack.apis.vector_io import QueryChunksResponse
+from llama_stack.models.llama.datatypes import BuiltinTool
from llama_stack.providers.inline.agents.meta_reference.agent_instance import (
MEMORY_QUERY_TOOL,
)

View file

@ -23,20 +23,13 @@ from fairscale.nn.model_parallel.initialize import (
initialize_model_parallel,
model_parallel_is_initialized,
)
-from llama_models.datatypes import (
-GreedySamplingStrategy,
-SamplingParams,
-TopPSamplingStrategy,
-)
from llama_models.llama3.api.args import ModelArgs
from llama_models.llama3.api.chat_format import ChatFormat, LLMInput
-from llama_models.llama3.api.datatypes import Model
from llama_models.llama3.api.tokenizer import Tokenizer
from llama_models.llama3.reference_impl.model import Transformer
from llama_models.llama3.reference_impl.multimodal.model import (
CrossAttentionTransformer,
)
-from llama_models.sku_list import resolve_model
from lmformatenforcer import JsonSchemaParser, TokenEnforcer, TokenEnforcerTokenizerData
from pydantic import BaseModel
@ -47,6 +40,13 @@ from llama_stack.apis.inference import (
ResponseFormatType,
)
from llama_stack.distribution.utils.model_utils import model_local_dir
+from llama_stack.models.llama.datatypes import (
+GreedySamplingStrategy,
+Model,
+SamplingParams,
+TopPSamplingStrategy,
+)
+from llama_stack.models.llama.sku_list import resolve_model
from llama_stack.providers.utils.inference.prompt_adapter import (
ChatCompletionRequestWithRawContent,
CompletionRequestWithRawContent,

View file

@ -8,14 +8,6 @@ import asyncio
import logging
from typing import AsyncGenerator, List, Optional, Union
-from llama_models.llama3.api.datatypes import (
-SamplingParams,
-StopReason,
-ToolDefinition,
-ToolPromptFormat,
-)
-from llama_models.sku_list import resolve_model
from llama_stack.apis.common.content_types import (
TextDelta,
ToolCallDelta,
@ -41,6 +33,13 @@ from llama_stack.apis.inference import (
ToolConfig,
)
from llama_stack.apis.models import Model, ModelType
+from llama_stack.models.llama.datatypes import (
+SamplingParams,
+StopReason,
+ToolDefinition,
+ToolPromptFormat,
+)
+from llama_stack.models.llama.sku_list import resolve_model
from llama_stack.providers.datatypes import ModelsProtocolPrivate
from llama_stack.providers.utils.inference.embedding_mixin import (
SentenceTransformerEmbeddingMixin,

View file

@ -10,10 +10,10 @@ from functools import partial
from typing import Any, Generator
from llama_models.llama3.api.chat_format import ChatFormat
-from llama_models.llama3.api.datatypes import Model
from llama_models.llama3.api.tokenizer import Tokenizer
-from llama_models.sku_list import resolve_model
+from llama_stack.models.llama.datatypes import Model
+from llama_stack.models.llama.sku_list import resolve_model
from llama_stack.providers.utils.inference.prompt_adapter import (
ChatCompletionRequestWithRawContent,
CompletionRequestWithRawContent,

View file

@ -14,14 +14,14 @@ from typing import Any, Dict, List, Optional
import torch
from fairscale.nn.model_parallel.layers import ColumnParallelLinear, RowParallelLinear
from fairscale.nn.model_parallel.mappings import reduce_from_model_parallel_region
-from llama_models.datatypes import CheckpointQuantizationFormat
from llama_models.llama3.api.args import ModelArgs
from llama_models.llama3.reference_impl.model import Transformer, TransformerBlock
-from llama_models.sku_list import resolve_model
from torch import Tensor, nn
from torchao.quantization.GPTQ import Int8DynActInt4WeightLinear
from llama_stack.apis.inference import QuantizationType
+from llama_stack.models.llama.datatypes import CheckpointQuantizationFormat
+from llama_stack.models.llama.sku_list import resolve_model
from ..config import MetaReferenceQuantizedInferenceConfig

View file

@ -4,10 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
-from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel, Field, field_validator
from llama_stack.providers.utils.inference import supported_inference_models
+from llama_stack.schema_utils import json_schema_type
@json_schema_type

View file

@ -11,7 +11,6 @@ from typing import AsyncGenerator, List, Optional
from llama_models.llama3.api.chat_format import ChatFormat
from llama_models.llama3.api.tokenizer import Tokenizer
-from llama_models.sku_list import resolve_model
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.sampling_params import SamplingParams as VLLMSamplingParams
@ -35,6 +34,7 @@ from llama_stack.apis.inference import (
ToolPromptFormat,
)
from llama_stack.apis.models import Model
+from llama_stack.models.llama.sku_list import resolve_model
from llama_stack.providers.datatypes import ModelsProtocolPrivate
from llama_stack.providers.utils.inference.openai_compat import (
OpenAICompatCompletionChoice,

View file

@ -13,8 +13,6 @@
from typing import Any, Callable, Dict
import torch
-from llama_models.datatypes import Model
-from llama_models.sku_list import resolve_model
from pydantic import BaseModel
from torchtune.data._messages import InputOutputToMessages, ShareGPTToMessages
from torchtune.models.llama3 import llama3_tokenizer
@ -24,6 +22,8 @@ from torchtune.models.llama3_2 import lora_llama3_2_3b
from torchtune.modules.transforms import Transform
from llama_stack.apis.post_training import DatasetFormat
+from llama_stack.models.llama.datatypes import Model
+from llama_stack.models.llama.sku_list import resolve_model
class ModelConfig(BaseModel):

View file

@ -6,8 +6,6 @@
from datetime import datetime
from typing import Any, Dict, Optional
-from llama_models.schema_utils import webmethod
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.post_training import (
@ -27,6 +25,7 @@ from llama_stack.providers.inline.post_training.torchtune.config import (
from llama_stack.providers.inline.post_training.torchtune.recipes.lora_finetuning_single_device import (
LoraFinetuningSingleDevice,
)
+from llama_stack.schema_utils import webmethod
class TorchtunePostTrainingImpl:


@@ -14,7 +14,6 @@ from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
 import torch
-from llama_models.sku_list import resolve_model
 from torch import nn
 from torch.optim import Optimizer
 from torch.utils.data import DataLoader, DistributedSampler
@@ -46,6 +45,7 @@ from llama_stack.apis.post_training import (
 )
 from llama_stack.distribution.utils.config_dirs import DEFAULT_CHECKPOINT_DIR
 from llama_stack.distribution.utils.model_utils import model_local_dir
+from llama_stack.models.llama.sku_list import resolve_model
 from llama_stack.providers.inline.post_training.common.validator import (
     validate_input_dataset_schema,
 )


@@ -8,9 +8,6 @@ import re
 from string import Template
 from typing import Any, Dict, List, Optional
-from llama_models.datatypes import CoreModelId
-from llama_models.llama3.api.datatypes import Role
 from llama_stack.apis.common.content_types import ImageContentItem, TextContentItem
 from llama_stack.apis.inference import (
     ChatCompletionResponseEventType,
@@ -26,6 +23,7 @@ from llama_stack.apis.safety import (
 )
 from llama_stack.apis.shields import Shield
 from llama_stack.distribution.datatypes import Api
+from llama_stack.models.llama.datatypes import CoreModelId, Role
 from llama_stack.providers.datatypes import ShieldsProtocolPrivate
 from llama_stack.providers.utils.inference.prompt_adapter import (
     interleaved_content_as_str,


@@ -6,13 +6,13 @@
 from typing import Any, Dict
-from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel
 from llama_stack.providers.utils.kvstore.config import (
     KVStoreConfig,
     SqliteKVStoreConfig,
 )
+from llama_stack.schema_utils import json_schema_type
 @json_schema_type


@@ -8,7 +8,6 @@ import json
 from typing import AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
 from botocore.client import BaseClient
-from llama_models.datatypes import CoreModelId
 from llama_models.llama3.api.chat_format import ChatFormat
 from llama_models.llama3.api.tokenizer import Tokenizer
@@ -28,6 +27,7 @@ from llama_stack.apis.inference import (
     ToolDefinition,
     ToolPromptFormat,
 )
+from llama_stack.models.llama.datatypes import CoreModelId
 from llama_stack.providers.remote.inference.bedrock.config import BedrockConfig
 from llama_stack.providers.utils.bedrock.client import create_bedrock_client
 from llama_stack.providers.utils.inference.model_registry import (


@@ -7,9 +7,7 @@
 from typing import AsyncGenerator, List, Optional, Union
 from cerebras.cloud.sdk import AsyncCerebras
-from llama_models.datatypes import CoreModelId
 from llama_models.llama3.api.chat_format import ChatFormat
-from llama_models.llama3.api.datatypes import TopKSamplingStrategy
 from llama_models.llama3.api.tokenizer import Tokenizer
 from llama_stack.apis.common.content_types import InterleavedContent
@@ -28,6 +26,7 @@ from llama_stack.apis.inference import (
     ToolDefinition,
     ToolPromptFormat,
 )
+from llama_stack.models.llama.datatypes import CoreModelId, TopKSamplingStrategy
 from llama_stack.providers.utils.inference.model_registry import (
     ModelRegistryHelper,
     build_model_alias,


@@ -7,9 +7,10 @@
 import os
 from typing import Any, Dict, Optional
-from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field, SecretStr
+from llama_stack.schema_utils import json_schema_type
 DEFAULT_BASE_URL = "https://api.cerebras.ai"


@@ -5,9 +5,10 @@
 # the root directory of this source tree.
-from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field
+from llama_stack.schema_utils import json_schema_type
 @json_schema_type
 class DatabricksImplConfig(BaseModel):


@@ -6,7 +6,6 @@
 from typing import AsyncGenerator, List, Optional
-from llama_models.datatypes import CoreModelId
 from llama_models.llama3.api.chat_format import ChatFormat
 from llama_models.llama3.api.tokenizer import Tokenizer
 from openai import OpenAI
@@ -25,6 +24,7 @@ from llama_stack.apis.inference import (
     ToolDefinition,
     ToolPromptFormat,
 )
+from llama_stack.models.llama.datatypes import CoreModelId
 from llama_stack.providers.utils.inference.model_registry import (
     ModelRegistryHelper,
     build_model_alias,


@@ -6,9 +6,10 @@
 from typing import Any, Dict, Optional
-from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field, SecretStr
+from llama_stack.schema_utils import json_schema_type
 @json_schema_type
 class FireworksImplConfig(BaseModel):


@@ -7,7 +7,6 @@
 from typing import AsyncGenerator, List, Optional, Union
 from fireworks.client import Fireworks
-from llama_models.datatypes import CoreModelId
 from llama_models.llama3.api.chat_format import ChatFormat
 from llama_models.llama3.api.tokenizer import Tokenizer
@@ -30,6 +29,7 @@ from llama_stack.apis.inference import (
     ToolPromptFormat,
 )
 from llama_stack.distribution.request_headers import NeedsRequestProviderData
+from llama_stack.models.llama.datatypes import CoreModelId
 from llama_stack.providers.utils.inference.model_registry import (
     ModelRegistryHelper,
     build_model_alias,


@@ -6,9 +6,10 @@
 from typing import Optional
-from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field
+from llama_stack.schema_utils import json_schema_type
 @json_schema_type
 class GroqConfig(BaseModel):


@@ -9,9 +9,6 @@ from typing import AsyncIterator, List, Optional, Union
 import groq
 from groq import Groq
-from llama_models.datatypes import SamplingParams
-from llama_models.llama3.api.datatypes import ToolDefinition, ToolPromptFormat
-from llama_models.sku_list import CoreModelId
 from llama_stack.apis.inference import (
     ChatCompletionRequest,
@@ -29,6 +26,8 @@ from llama_stack.apis.inference import (
     ToolConfig,
 )
 from llama_stack.distribution.request_headers import NeedsRequestProviderData
+from llama_stack.models.llama.datatypes import SamplingParams, ToolDefinition, ToolPromptFormat
+from llama_stack.models.llama.sku_list import CoreModelId
 from llama_stack.providers.remote.inference.groq.config import GroqConfig
 from llama_stack.providers.utils.inference.model_registry import (
     ModelRegistryHelper,


@@ -24,7 +24,6 @@ from groq.types.chat.chat_completion_user_message_param import (
 )
 from groq.types.chat.completion_create_params import CompletionCreateParams
 from groq.types.shared.function_definition import FunctionDefinition
-from llama_models.llama3.api.datatypes import ToolParamDefinition
 from llama_stack.apis.common.content_types import (
     TextDelta,
@@ -44,6 +43,7 @@ from llama_stack.apis.inference import (
     ToolDefinition,
     ToolPromptFormat,
 )
+from llama_stack.models.llama.datatypes import ToolParamDefinition
 from llama_stack.providers.utils.inference.openai_compat import (
     UnparseableToolCall,
     convert_tool_call,


@@ -7,9 +7,10 @@
 import os
 from typing import Any, Dict, Optional
-from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field, SecretStr
+from llama_stack.schema_utils import json_schema_type
 @json_schema_type
 class NVIDIAConfig(BaseModel):


@@ -7,9 +7,6 @@
 import warnings
 from typing import AsyncIterator, List, Optional, Union
-from llama_models.datatypes import SamplingParams
-from llama_models.llama3.api.datatypes import ToolDefinition, ToolPromptFormat
-from llama_models.sku_list import CoreModelId
 from openai import APIConnectionError, AsyncOpenAI
 from llama_stack.apis.inference import (
@@ -28,6 +25,7 @@ from llama_stack.apis.inference import (
     ToolChoice,
     ToolConfig,
 )
+from llama_stack.models.llama.datatypes import CoreModelId, SamplingParams, ToolDefinition, ToolPromptFormat
 from llama_stack.providers.utils.inference.model_registry import (
     ModelRegistryHelper,
     build_model_alias,


@@ -8,17 +8,6 @@ import json
 import warnings
 from typing import Any, AsyncGenerator, Dict, Generator, Iterable, List, Optional, Union
-from llama_models.datatypes import (
-    GreedySamplingStrategy,
-    TopKSamplingStrategy,
-    TopPSamplingStrategy,
-)
-from llama_models.llama3.api.datatypes import (
-    BuiltinTool,
-    StopReason,
-    ToolCall,
-    ToolDefinition,
-)
 from openai import AsyncStream
 from openai.types.chat import (
     ChatCompletionAssistantMessageParam as OpenAIChatCompletionAssistantMessage,
@@ -87,6 +76,15 @@ from llama_stack.apis.inference import (
     ToolResponseMessage,
     UserMessage,
 )
+from llama_stack.models.llama.datatypes import (
+    BuiltinTool,
+    GreedySamplingStrategy,
+    StopReason,
+    ToolCall,
+    ToolDefinition,
+    TopKSamplingStrategy,
+    TopPSamplingStrategy,
+)
 from llama_stack.providers.utils.inference.prompt_adapter import (
     convert_image_content_to_url,
 )


@@ -8,7 +8,6 @@ import logging
 from typing import AsyncGenerator, List, Optional, Union
 import httpx
-from llama_models.datatypes import CoreModelId
 from llama_models.llama3.api.chat_format import ChatFormat
 from llama_models.llama3.api.tokenizer import Tokenizer
 from ollama import AsyncClient
@@ -34,6 +33,7 @@ from llama_stack.apis.inference import (
     ToolPromptFormat,
 )
 from llama_stack.apis.models import Model, ModelType
+from llama_stack.models.llama.datatypes import CoreModelId
 from llama_stack.providers.datatypes import ModelsProtocolPrivate
 from llama_stack.providers.utils.inference.model_registry import (
     ModelRegistryHelper,


@@ -6,9 +6,10 @@
 from typing import Optional
-from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field
+from llama_stack.schema_utils import json_schema_type
 @json_schema_type
 class RunpodImplConfig(BaseModel):


@@ -6,11 +6,11 @@
 from typing import AsyncGenerator
 from llama_models.llama3.api.chat_format import ChatFormat
-from llama_models.llama3.api.datatypes import Message
 from llama_models.llama3.api.tokenizer import Tokenizer
 from openai import OpenAI
 from llama_stack.apis.inference import *  # noqa: F403
+from llama_stack.models.llama.datatypes import Message
 # from llama_stack.providers.datatypes import ModelsProtocolPrivate
 from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper


@@ -6,9 +6,10 @@
 from typing import Any, Dict, Optional
-from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field
+from llama_stack.schema_utils import json_schema_type
 @json_schema_type
 class SambaNovaImplConfig(BaseModel):


@@ -7,12 +7,6 @@
 import json
 from typing import AsyncGenerator
-from llama_models.datatypes import (
-    CoreModelId,
-    GreedySamplingStrategy,
-    TopKSamplingStrategy,
-    TopPSamplingStrategy,
-)
 from llama_models.llama3.api.chat_format import ChatFormat
 from llama_models.llama3.api.tokenizer import Tokenizer
 from openai import OpenAI
@@ -23,6 +17,12 @@ from llama_stack.apis.common.content_types import (
     TextContentItem,
 )
 from llama_stack.apis.inference import *  # noqa: F403
+from llama_stack.models.llama.datatypes import (
+    CoreModelId,
+    GreedySamplingStrategy,
+    TopKSamplingStrategy,
+    TopPSamplingStrategy,
+)
 from llama_stack.providers.utils.inference.model_registry import (
     ModelRegistryHelper,
     build_model_alias,


@@ -6,9 +6,10 @@
 from typing import Optional
-from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field, SecretStr
+from llama_stack.schema_utils import json_schema_type
 @json_schema_type
 class TGIImplConfig(BaseModel):


@@ -11,7 +11,6 @@ from typing import AsyncGenerator, List, Optional
 from huggingface_hub import AsyncInferenceClient, HfApi
 from llama_models.llama3.api.chat_format import ChatFormat
 from llama_models.llama3.api.tokenizer import Tokenizer
-from llama_models.sku_list import all_registered_models
 from llama_stack.apis.common.content_types import InterleavedContent
 from llama_stack.apis.inference import (
@@ -31,6 +30,7 @@ from llama_stack.apis.inference import (
     ToolPromptFormat,
 )
 from llama_stack.apis.models import Model
+from llama_stack.models.llama.sku_list import all_registered_models
 from llama_stack.providers.datatypes import ModelsProtocolPrivate
 from llama_stack.providers.utils.inference.model_registry import (
     ModelRegistryHelper,


@@ -6,9 +6,10 @@
 from typing import Any, Dict, Optional
-from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field, SecretStr
+from llama_stack.schema_utils import json_schema_type
 @json_schema_type
 class TogetherImplConfig(BaseModel):


@@ -6,7 +6,6 @@
 from typing import AsyncGenerator, List, Optional, Union
-from llama_models.datatypes import CoreModelId
 from llama_models.llama3.api.chat_format import ChatFormat
 from llama_models.llama3.api.tokenizer import Tokenizer
 from together import Together
@@ -29,6 +28,7 @@ from llama_stack.apis.inference import (
     ToolPromptFormat,
 )
 from llama_stack.distribution.request_headers import NeedsRequestProviderData
+from llama_stack.models.llama.datatypes import CoreModelId
 from llama_stack.providers.utils.inference.model_registry import (
     ModelRegistryHelper,
     build_model_alias,


@@ -6,9 +6,10 @@
 from typing import Optional
-from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field
+from llama_stack.schema_utils import json_schema_type
 @json_schema_type
 class VLLMInferenceAdapterConfig(BaseModel):


@@ -7,10 +7,9 @@ import json
 import logging
 from typing import AsyncGenerator, List, Optional, Union
-from llama_models.llama3.api import StopReason, ToolCall
+from llama_models.datatypes import StopReason, ToolCall
 from llama_models.llama3.api.chat_format import ChatFormat
 from llama_models.llama3.api.tokenizer import Tokenizer
-from llama_models.sku_list import all_registered_models
 from openai import OpenAI
 from llama_stack.apis.common.content_types import InterleavedContent, TextDelta, ToolCallDelta, ToolCallParseStatus
@@ -37,6 +36,7 @@ from llama_stack.apis.inference import (
     ToolPromptFormat,
 )
 from llama_stack.apis.models import Model, ModelType
+from llama_stack.models.llama.sku_list import all_registered_models
 from llama_stack.providers.datatypes import ModelsProtocolPrivate
 from llama_stack.providers.utils.inference.model_registry import (
     ModelRegistryHelper,


@@ -5,9 +5,8 @@
 # the root directory of this source tree.
-from llama_models.schema_utils import json_schema_type
 from llama_stack.providers.utils.bedrock.config import BedrockBaseConfig
+from llama_stack.schema_utils import json_schema_type
 @json_schema_type


@@ -7,7 +7,6 @@
 from typing import Any, Dict, List, Optional
 import requests
-from llama_models.llama3.api.datatypes import BuiltinTool
 from llama_stack.apis.common.content_types import URL
 from llama_stack.apis.tools import (
@@ -18,6 +17,7 @@ from llama_stack.apis.tools import (
     ToolRuntime,
 )
 from llama_stack.distribution.request_headers import NeedsRequestProviderData
+from llama_stack.models.llama.datatypes import BuiltinTool
 from llama_stack.providers.datatypes import ToolsProtocolPrivate
 from .config import BraveSearchToolConfig


@@ -4,9 +4,10 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field
+from llama_stack.schema_utils import json_schema_type
 @json_schema_type
 class PGVectorVectorIOConfig(BaseModel):


@@ -6,9 +6,10 @@
 from typing import Optional
-from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel
+from llama_stack.schema_utils import json_schema_type
 @json_schema_type
 class QdrantVectorIOConfig(BaseModel):


@@ -7,8 +7,6 @@
 import os
 import pytest
-from llama_models.datatypes import SamplingParams, TopPSamplingStrategy
-from llama_models.llama3.api.datatypes import BuiltinTool
 from llama_stack.apis.agents import (
     AgentConfig,
@@ -25,6 +23,7 @@ from llama_stack.apis.agents import (
 )
 from llama_stack.apis.inference import CompletionMessage, UserMessage
 from llama_stack.apis.safety import ViolationLevel
+from llama_stack.models.llama.datatypes import BuiltinTool, SamplingParams, TopPSamplingStrategy
 from llama_stack.providers.datatypes import Api
 # How to run this test:

Some files were not shown because too many files have changed in this diff.
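All of the hunks above apply the same rename: schema helpers and Llama datatypes that were previously imported from the `llama_models` package now come from `llama_stack.schema_utils`, `llama_stack.models.llama.datatypes`, and `llama_stack.models.llama.sku_list`. A minimal sketch of the post-move import paths, assuming a checkout that includes this change; the config class and model identifier below are illustrative only, not part of this diff:

```python
from pydantic import BaseModel

# Previously: from llama_models.schema_utils import json_schema_type
from llama_stack.schema_utils import json_schema_type

# Previously: from llama_models.datatypes / llama_models.sku_list
from llama_stack.models.llama.datatypes import CoreModelId
from llama_stack.models.llama.sku_list import resolve_model


@json_schema_type
class ExampleProviderConfig(BaseModel):
    # Hypothetical provider config, shown only to exercise the decorator.
    url: str = "http://localhost:8000"


# Look up a model's metadata by its core identifier (identifier chosen for illustration).
model = resolve_model(CoreModelId.llama3_1_8b_instruct.value)
print(model if model else "model not found")
```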