Merge branch 'main' into vectordb_name

2025-12-23 23:53:55 +00:00 · 2025-07-09 20:53:46 -04:00 · 2025-07-09 20:53:46 -04:00 · 36ca9543a5
commit 36ca9543a5
parent 42ce6b96fc 7915551eee
35 changed files with 2282 additions and 1644 deletions
--- a/llama_stack/providers/remote/inference/anthropic/models.py
+++ b/llama_stack/providers/remote/inference/anthropic/models.py
@@ -15,21 +15,26 @@ LLM_MODEL_IDS = [
    "anthropic/claude-3-5-haiku-latest",
 ]

+SAFETY_MODELS_ENTRIES = []

-MODEL_ENTRIES = [ProviderModelEntry(provider_model_id=m) for m in LLM_MODEL_IDS] + [
-    ProviderModelEntry(
-        provider_model_id="anthropic/voyage-3",
-        model_type=ModelType.embedding,
-        metadata={"embedding_dimension": 1024, "context_length": 32000},
-    ),
-    ProviderModelEntry(
-        provider_model_id="anthropic/voyage-3-lite",
-        model_type=ModelType.embedding,
-        metadata={"embedding_dimension": 512, "context_length": 32000},
-    ),
-    ProviderModelEntry(
-        provider_model_id="anthropic/voyage-code-3",
-        model_type=ModelType.embedding,
-        metadata={"embedding_dimension": 1024, "context_length": 32000},
-    ),
-]
+MODEL_ENTRIES = (
+    [ProviderModelEntry(provider_model_id=m) for m in LLM_MODEL_IDS]
+    + [
+        ProviderModelEntry(
+            provider_model_id="anthropic/voyage-3",
+            model_type=ModelType.embedding,
+            metadata={"embedding_dimension": 1024, "context_length": 32000},
+        ),
+        ProviderModelEntry(
+            provider_model_id="anthropic/voyage-3-lite",
+            model_type=ModelType.embedding,
+            metadata={"embedding_dimension": 512, "context_length": 32000},
+        ),
+        ProviderModelEntry(
+            provider_model_id="anthropic/voyage-code-3",
+            model_type=ModelType.embedding,
+            metadata={"embedding_dimension": 1024, "context_length": 32000},
+        ),
+    ]
+    + SAFETY_MODELS_ENTRIES
+)
--- a/llama_stack/providers/remote/inference/bedrock/models.py
+++ b/llama_stack/providers/remote/inference/bedrock/models.py
@@ -9,6 +9,10 @@ from llama_stack.providers.utils.inference.model_registry import (
    build_hf_repo_model_entry,
 )

+SAFETY_MODELS_ENTRIES = []
+
+
+# https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html
 MODEL_ENTRIES = [
    build_hf_repo_model_entry(
        "meta.llama3-1-8b-instruct-v1:0",
@@ -22,4 +26,4 @@ MODEL_ENTRIES = [
        "meta.llama3-1-405b-instruct-v1:0",
        CoreModelId.llama3_1_405b_instruct.value,
    ),
-]
+] + SAFETY_MODELS_ENTRIES
--- a/llama_stack/providers/remote/inference/cerebras/models.py
+++ b/llama_stack/providers/remote/inference/cerebras/models.py
@@ -9,6 +9,9 @@ from llama_stack.providers.utils.inference.model_registry import (
    build_hf_repo_model_entry,
 )

+SAFETY_MODELS_ENTRIES = []
+
+# https://inference-docs.cerebras.ai/models
 MODEL_ENTRIES = [
    build_hf_repo_model_entry(
        "llama3.1-8b",
@@ -18,4 +21,8 @@ MODEL_ENTRIES = [
        "llama-3.3-70b",
        CoreModelId.llama3_3_70b_instruct.value,
    ),
-]
+    build_hf_repo_model_entry(
+        "llama-4-scout-17b-16e-instruct",
+        CoreModelId.llama4_scout_17b_16e_instruct.value,
+    ),
+] + SAFETY_MODELS_ENTRIES
--- a/llama_stack/providers/remote/inference/databricks/databricks.py
+++ b/llama_stack/providers/remote/inference/databricks/databricks.py
@@ -47,7 +47,10 @@ from llama_stack.providers.utils.inference.prompt_adapter import (

 from .config import DatabricksImplConfig

-model_entries = [
+SAFETY_MODELS_ENTRIES = []
+
+# https://docs.databricks.com/aws/en/machine-learning/model-serving/foundation-model-overview
+MODEL_ENTRIES = [
    build_hf_repo_model_entry(
        "databricks-meta-llama-3-1-70b-instruct",
        CoreModelId.llama3_1_70b_instruct.value,
@@ -56,7 +59,7 @@ model_entries = [
        "databricks-meta-llama-3-1-405b-instruct",
        CoreModelId.llama3_1_405b_instruct.value,
    ),
-]
+] + SAFETY_MODELS_ENTRIES


 class DatabricksInferenceAdapter(
@@ -66,7 +69,7 @@ class DatabricksInferenceAdapter(
    OpenAICompletionToLlamaStackMixin,
 ):
    def __init__(self, config: DatabricksImplConfig) -> None:
-        ModelRegistryHelper.__init__(self, model_entries=model_entries)
+        ModelRegistryHelper.__init__(self, model_entries=MODEL_ENTRIES)
        self.config = config

    async def initialize(self) -> None:
--- a/llama_stack/providers/remote/inference/fireworks/models.py
+++ b/llama_stack/providers/remote/inference/fireworks/models.py
@@ -11,6 +11,17 @@ from llama_stack.providers.utils.inference.model_registry import (
    build_hf_repo_model_entry,
 )

+SAFETY_MODELS_ENTRIES = [
+    build_hf_repo_model_entry(
+        "accounts/fireworks/models/llama-guard-3-8b",
+        CoreModelId.llama_guard_3_8b.value,
+    ),
+    build_hf_repo_model_entry(
+        "accounts/fireworks/models/llama-guard-3-11b-vision",
+        CoreModelId.llama_guard_3_11b_vision.value,
+    ),
+]
+
 MODEL_ENTRIES = [
    build_hf_repo_model_entry(
        "accounts/fireworks/models/llama-v3p1-8b-instruct",
@@ -40,14 +51,6 @@ MODEL_ENTRIES = [
        "accounts/fireworks/models/llama-v3p3-70b-instruct",
        CoreModelId.llama3_3_70b_instruct.value,
    ),
-    build_hf_repo_model_entry(
-        "accounts/fireworks/models/llama-guard-3-8b",
-        CoreModelId.llama_guard_3_8b.value,
-    ),
-    build_hf_repo_model_entry(
-        "accounts/fireworks/models/llama-guard-3-11b-vision",
-        CoreModelId.llama_guard_3_11b_vision.value,
-    ),
    build_hf_repo_model_entry(
        "accounts/fireworks/models/llama4-scout-instruct-basic",
        CoreModelId.llama4_scout_17b_16e_instruct.value,
@@ -64,4 +67,4 @@ MODEL_ENTRIES = [
            "context_length": 8192,
        },
    ),
-]
+] + SAFETY_MODELS_ENTRIES
--- a/llama_stack/providers/remote/inference/gemini/models.py
+++ b/llama_stack/providers/remote/inference/gemini/models.py
@@ -17,11 +17,16 @@ LLM_MODEL_IDS = [
    "gemini/gemini-2.5-pro",
 ]

+SAFETY_MODELS_ENTRIES = []

-MODEL_ENTRIES = [ProviderModelEntry(provider_model_id=m) for m in LLM_MODEL_IDS] + [
-    ProviderModelEntry(
-        provider_model_id="gemini/text-embedding-004",
-        model_type=ModelType.embedding,
-        metadata={"embedding_dimension": 768, "context_length": 2048},
-    ),
-]
+MODEL_ENTRIES = (
+    [ProviderModelEntry(provider_model_id=m) for m in LLM_MODEL_IDS]
+    + [
+        ProviderModelEntry(
+            provider_model_id="gemini/text-embedding-004",
+            model_type=ModelType.embedding,
+            metadata={"embedding_dimension": 768, "context_length": 2048},
+        ),
+    ]
+    + SAFETY_MODELS_ENTRIES
+)
--- a/llama_stack/providers/remote/inference/groq/models.py
+++ b/llama_stack/providers/remote/inference/groq/models.py
@@ -10,6 +10,8 @@ from llama_stack.providers.utils.inference.model_registry import (
    build_model_entry,
 )

+SAFETY_MODELS_ENTRIES = []
+
 MODEL_ENTRIES = [
    build_hf_repo_model_entry(
        "groq/llama3-8b-8192",
@@ -51,4 +53,4 @@ MODEL_ENTRIES = [
        "groq/meta-llama/llama-4-maverick-17b-128e-instruct",
        CoreModelId.llama4_maverick_17b_128e_instruct.value,
    ),
-]
+] + SAFETY_MODELS_ENTRIES
--- a/llama_stack/providers/remote/inference/nvidia/models.py
+++ b/llama_stack/providers/remote/inference/nvidia/models.py
@@ -11,6 +11,9 @@ from llama_stack.providers.utils.inference.model_registry import (
    build_hf_repo_model_entry,
 )

+SAFETY_MODELS_ENTRIES = []
+
+# https://docs.nvidia.com/nim/large-language-models/latest/supported-llm-agnostic-architectures.html
 MODEL_ENTRIES = [
    build_hf_repo_model_entry(
        "meta/llama3-8b-instruct",
@@ -99,4 +102,4 @@ MODEL_ENTRIES = [
    ),
    # TODO(mf): how do we handle Nemotron models?
    # "Llama3.1-Nemotron-51B-Instruct" -> "meta/llama-3.1-nemotron-51b-instruct",
-]
+] + SAFETY_MODELS_ENTRIES
--- a/llama_stack/providers/remote/inference/openai/models.py
+++ b/llama_stack/providers/remote/inference/openai/models.py
@@ -48,16 +48,20 @@ EMBEDDING_MODEL_IDS: dict[str, EmbeddingModelInfo] = {
    "text-embedding-3-small": EmbeddingModelInfo(1536, 8192),
    "text-embedding-3-large": EmbeddingModelInfo(3072, 8192),
 }
+SAFETY_MODELS_ENTRIES = []

-
-MODEL_ENTRIES = [ProviderModelEntry(provider_model_id=m) for m in LLM_MODEL_IDS] + [
-    ProviderModelEntry(
-        provider_model_id=model_id,
-        model_type=ModelType.embedding,
-        metadata={
-            "embedding_dimension": model_info.embedding_dimension,
-            "context_length": model_info.context_length,
-        },
-    )
-    for model_id, model_info in EMBEDDING_MODEL_IDS.items()
-]
+MODEL_ENTRIES = (
+    [ProviderModelEntry(provider_model_id=m) for m in LLM_MODEL_IDS]
+    + [
+        ProviderModelEntry(
+            provider_model_id=model_id,
+            model_type=ModelType.embedding,
+            metadata={
+                "embedding_dimension": model_info.embedding_dimension,
+                "context_length": model_info.context_length,
+            },
+        )
+        for model_id, model_info in EMBEDDING_MODEL_IDS.items()
+    ]
+    + SAFETY_MODELS_ENTRIES
+)
--- a/llama_stack/providers/remote/inference/runpod/runpod.py
+++ b/llama_stack/providers/remote/inference/runpod/runpod.py
@@ -11,7 +11,7 @@ from llama_stack.apis.inference import *  # noqa: F403
 from llama_stack.apis.inference import OpenAIEmbeddingsResponse

 # from llama_stack.providers.datatypes import ModelsProtocolPrivate
-from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
+from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper, build_hf_repo_model_entry
 from llama_stack.providers.utils.inference.openai_compat import (
    OpenAIChatCompletionToLlamaStackMixin,
    OpenAICompletionToLlamaStackMixin,
@@ -25,6 +25,8 @@ from llama_stack.providers.utils.inference.prompt_adapter import (

 from .config import RunpodImplConfig

+# https://docs.runpod.io/serverless/vllm/overview#compatible-models
+# https://github.com/runpod-workers/worker-vllm/blob/main/README.md#compatible-model-architectures
 RUNPOD_SUPPORTED_MODELS = {
    "Llama3.1-8B": "meta-llama/Llama-3.1-8B",
    "Llama3.1-70B": "meta-llama/Llama-3.1-70B",
@@ -40,6 +42,14 @@ RUNPOD_SUPPORTED_MODELS = {
    "Llama3.2-3B": "meta-llama/Llama-3.2-3B",
 }

+SAFETY_MODELS_ENTRIES = []
+
+# Create MODEL_ENTRIES from RUNPOD_SUPPORTED_MODELS for compatibility with starter template
+MODEL_ENTRIES = [
+    build_hf_repo_model_entry(provider_model_id, model_descriptor)
+    for provider_model_id, model_descriptor in RUNPOD_SUPPORTED_MODELS.items()
+] + SAFETY_MODELS_ENTRIES
+

 class RunpodInferenceAdapter(
    ModelRegistryHelper,
@@ -61,25 +71,25 @@ class RunpodInferenceAdapter(
        self,
        model: str,
        content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
    ) -> AsyncGenerator:
        raise NotImplementedError()

    async def chat_completion(
        self,
        model: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
    ) -> AsyncGenerator:
        if sampling_params is None:
            sampling_params = SamplingParams()
@@ -129,10 +139,10 @@ class RunpodInferenceAdapter(
    async def embeddings(
        self,
        model: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
    ) -> EmbeddingsResponse:
        raise NotImplementedError()

--- a/llama_stack/providers/remote/inference/sambanova/models.py
+++ b/llama_stack/providers/remote/inference/sambanova/models.py
@@ -9,6 +9,14 @@ from llama_stack.providers.utils.inference.model_registry import (
    build_hf_repo_model_entry,
 )

+SAFETY_MODELS_ENTRIES = [
+    build_hf_repo_model_entry(
+        "sambanova/Meta-Llama-Guard-3-8B",
+        CoreModelId.llama_guard_3_8b.value,
+    ),
+]
+
+
 MODEL_ENTRIES = [
    build_hf_repo_model_entry(
        "sambanova/Meta-Llama-3.1-8B-Instruct",
@@ -46,8 +54,4 @@ MODEL_ENTRIES = [
        "sambanova/Llama-4-Maverick-17B-128E-Instruct",
        CoreModelId.llama4_maverick_17b_128e_instruct.value,
    ),
-    build_hf_repo_model_entry(
-        "sambanova/Meta-Llama-Guard-3-8B",
-        CoreModelId.llama_guard_3_8b.value,
-    ),
-]
+] + SAFETY_MODELS_ENTRIES
--- a/llama_stack/providers/remote/inference/together/models.py
+++ b/llama_stack/providers/remote/inference/together/models.py
@@ -11,6 +11,16 @@ from llama_stack.providers.utils.inference.model_registry import (
    build_hf_repo_model_entry,
 )

+SAFETY_MODELS_ENTRIES = [
+    build_hf_repo_model_entry(
+        "meta-llama/Llama-Guard-3-8B",
+        CoreModelId.llama_guard_3_8b.value,
+    ),
+    build_hf_repo_model_entry(
+        "meta-llama/Llama-Guard-3-11B-Vision-Turbo",
+        CoreModelId.llama_guard_3_11b_vision.value,
+    ),
+]
 MODEL_ENTRIES = [
    build_hf_repo_model_entry(
        "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
@@ -40,14 +50,6 @@ MODEL_ENTRIES = [
        "meta-llama/Llama-3.3-70B-Instruct-Turbo",
        CoreModelId.llama3_3_70b_instruct.value,
    ),
-    build_hf_repo_model_entry(
-        "meta-llama/Meta-Llama-Guard-3-8B",
-        CoreModelId.llama_guard_3_8b.value,
-    ),
-    build_hf_repo_model_entry(
-        "meta-llama/Llama-Guard-3-11B-Vision-Turbo",
-        CoreModelId.llama_guard_3_11b_vision.value,
-    ),
    ProviderModelEntry(
        provider_model_id="togethercomputer/m2-bert-80M-8k-retrieval",
        model_type=ModelType.embedding,
@@ -78,4 +80,4 @@ MODEL_ENTRIES = [
            "together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
        ],
    ),
-]
+] + SAFETY_MODELS_ENTRIES
--- a/llama_stack/providers/remote/vector_io/milvus/__init__.py
+++ b/llama_stack/providers/remote/vector_io/milvus/__init__.py
@@ -14,6 +14,6 @@ async def get_adapter_impl(config: MilvusVectorIOConfig, deps: dict[Api, Provide

    assert isinstance(config, MilvusVectorIOConfig), f"Unexpected config type: {type(config)}"

-    impl = MilvusVectorIOAdapter(config, deps[Api.inference])
+    impl = MilvusVectorIOAdapter(config, deps[Api.inference], deps.get(Api.files, None))
    await impl.initialize()
    return impl
--- a/llama_stack/providers/remote/vector_io/milvus/config.py
+++ b/llama_stack/providers/remote/vector_io/milvus/config.py
@@ -8,6 +8,7 @@ from typing import Any

 from pydantic import BaseModel, ConfigDict, Field

+from llama_stack.providers.utils.kvstore.config import KVStoreConfig
 from llama_stack.schema_utils import json_schema_type


@@ -16,6 +17,7 @@ class MilvusVectorIOConfig(BaseModel):
    uri: str = Field(description="The URI of the Milvus server")
    token: str | None = Field(description="The token of the Milvus server")
    consistency_level: str = Field(description="The consistency level of the Milvus server", default="Strong")
+    kvstore: KVStoreConfig | None = Field(description="Config for KV store backend (SQLite only for now)", default=None)

    # This configuration allows additional fields to be passed through to the underlying Milvus client.
    # See the [Milvus](https://milvus.io/docs/install-overview.md) documentation for more details about Milvus in general.