feat(starter)!: simplify starter distro; litellm model registry changes (#2916)
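Two changes recur throughout this diff. First, every `sample_run_config` default switches from `${env.VAR}` to `${env.VAR:=}`: the bash-style `:=` form supplies an empty-string default, so a generated run.yaml no longer fails at startup just because an optional provider's API key is unset. Second, adapters built on `LiteLLMOpenAIMixin` now pass an explicit `litellm_provider_name`, and their model registries drop the per-provider prefixes (`anthropic/`, `gemini/`, `groq/`, ...). The sketch below is illustrative only, not llama-stack's actual substitution code; the `substitute_env` helper is hypothetical and exists just to show the intended semantics of the two placeholder forms.

    # Hypothetical sketch (assumption: NOT the actual llama-stack resolver) showing
    # why "${env.VAR:=}" is a safer sample-config default than "${env.VAR}".
    import os
    import re

    _ENV_PATTERN = re.compile(r"\$\{env\.(?P<name>\w+)(?P<default>:=(?P<value>[^}]*))?\}")

    def substitute_env(raw: str) -> str:
        def _replace(match: re.Match) -> str:
            name = match.group("name")
            if name in os.environ:
                return os.environ[name]
            if match.group("default") is not None:
                # ":=..." present: fall back to the (possibly empty) default
                return match.group("value") or ""
            raise ValueError(f"Environment variable '{name}' is not set")

        return _ENV_PATTERN.sub(_replace, raw)

    # With ANTHROPIC_API_KEY unset:
    #   substitute_env("${env.ANTHROPIC_API_KEY:=}")  -> ""      (new default)
    #   substitute_env("${env.ANTHROPIC_API_KEY}")    -> raises  (old default)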

Ashwin Bharambe 2025-07-25 15:02:04 -07:00 committed by GitHub
parent 3344d8a9e5
commit 9583f468f8
64 changed files with 2027 additions and 4092 deletions

View file

@@ -15,6 +15,7 @@ class AnthropicInferenceAdapter(LiteLLMOpenAIMixin):
         LiteLLMOpenAIMixin.__init__(
             self,
             MODEL_ENTRIES,
+            litellm_provider_name="anthropic",
             api_key_from_config=config.api_key,
             provider_data_api_key_field="anthropic_api_key",
         )
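The diff doesn't show LiteLLMOpenAIMixin's internals; presumably the mixin now prepends `litellm_provider_name` to the model ID when dispatching to litellm, which is why the `anthropic/`, `gemini/`, and `groq/` prefixes disappear from the model entry files below. A hypothetical sketch of that responsibility shift:

    # Hypothetical sketch -- the real LiteLLMOpenAIMixin is not shown in this diff.
    # It illustrates why registry entries can drop their "anthropic/"-style
    # prefixes: the provider name is now prepended in exactly one place.

    def litellm_model_id(litellm_provider_name: str, provider_model_id: str) -> str:
        """Build the 'provider/model' id litellm expects from an unprefixed entry."""
        return f"{litellm_provider_name}/{provider_model_id}"

    # litellm_model_id("anthropic", "claude-3-5-haiku-latest")
    # -> "anthropic/claude-3-5-haiku-latest"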

View file

@@ -26,7 +26,7 @@ class AnthropicConfig(BaseModel):
     )

     @classmethod
-    def sample_run_config(cls, api_key: str = "${env.ANTHROPIC_API_KEY}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.ANTHROPIC_API_KEY:=}", **kwargs) -> dict[str, Any]:
         return {
             "api_key": api_key,
         }

View file

@@ -10,9 +10,9 @@ from llama_stack.providers.utils.inference.model_registry import (
 )

 LLM_MODEL_IDS = [
-    "anthropic/claude-3-5-sonnet-latest",
-    "anthropic/claude-3-7-sonnet-latest",
-    "anthropic/claude-3-5-haiku-latest",
+    "claude-3-5-sonnet-latest",
+    "claude-3-7-sonnet-latest",
+    "claude-3-5-haiku-latest",
 ]

 SAFETY_MODELS_ENTRIES = []
@@ -21,17 +21,17 @@ MODEL_ENTRIES = (
     [ProviderModelEntry(provider_model_id=m) for m in LLM_MODEL_IDS]
     + [
         ProviderModelEntry(
-            provider_model_id="anthropic/voyage-3",
+            provider_model_id="voyage-3",
             model_type=ModelType.embedding,
             metadata={"embedding_dimension": 1024, "context_length": 32000},
         ),
         ProviderModelEntry(
-            provider_model_id="anthropic/voyage-3-lite",
+            provider_model_id="voyage-3-lite",
             model_type=ModelType.embedding,
             metadata={"embedding_dimension": 512, "context_length": 32000},
         ),
         ProviderModelEntry(
-            provider_model_id="anthropic/voyage-code-3",
+            provider_model_id="voyage-code-3",
             model_type=ModelType.embedding,
             metadata={"embedding_dimension": 1024, "context_length": 32000},
         ),

View file

@@ -63,18 +63,20 @@ class BedrockInferenceAdapter(
     def __init__(self, config: BedrockConfig) -> None:
         ModelRegistryHelper.__init__(self, MODEL_ENTRIES)
         self._config = config
-        self._client = create_bedrock_client(config)
+        self._client = None
+
+    @property
+    def client(self) -> BaseClient:
+        if self._client is None:
+            self._client = create_bedrock_client(self._config)
+        return self._client

     async def initialize(self) -> None:
         pass

     async def shutdown(self) -> None:
-        self.client.close()
+        if self._client is not None:
+            self._client.close()

     async def completion(
         self,

View file

@@ -65,6 +65,7 @@ class CerebrasInferenceAdapter(
         )

         self.config = config
+        # TODO: make this use provider data, etc. like other providers
         self.client = AsyncCerebras(
             base_url=self.config.base_url,
             api_key=self.config.api_key.get_secret_value(),

View file

@@ -26,7 +26,7 @@ class CerebrasImplConfig(BaseModel):
     )

     @classmethod
-    def sample_run_config(cls, api_key: str = "${env.CEREBRAS_API_KEY}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.CEREBRAS_API_KEY:=}", **kwargs) -> dict[str, Any]:
         return {
             "base_url": DEFAULT_BASE_URL,
             "api_key": api_key,

View file

@@ -25,8 +25,8 @@ class DatabricksImplConfig(BaseModel):
     @classmethod
     def sample_run_config(
         cls,
-        url: str = "${env.DATABRICKS_URL}",
-        api_token: str = "${env.DATABRICKS_API_TOKEN}",
+        url: str = "${env.DATABRICKS_URL:=}",
+        api_token: str = "${env.DATABRICKS_API_TOKEN:=}",
         **kwargs: Any,
     ) -> dict[str, Any]:
         return {

View file

@@ -24,7 +24,7 @@ class FireworksImplConfig(RemoteInferenceProviderConfig):
     )

     @classmethod
-    def sample_run_config(cls, api_key: str = "${env.FIREWORKS_API_KEY}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.FIREWORKS_API_KEY:=}", **kwargs) -> dict[str, Any]:
         return {
             "url": "https://api.fireworks.ai/inference/v1",
             "api_key": api_key,

View file

@@ -26,7 +26,7 @@ class GeminiConfig(BaseModel):
     )

     @classmethod
-    def sample_run_config(cls, api_key: str = "${env.GEMINI_API_KEY}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.GEMINI_API_KEY:=}", **kwargs) -> dict[str, Any]:
         return {
             "api_key": api_key,
         }

View file

@@ -15,6 +15,7 @@ class GeminiInferenceAdapter(LiteLLMOpenAIMixin):
         LiteLLMOpenAIMixin.__init__(
             self,
             MODEL_ENTRIES,
+            litellm_provider_name="gemini",
             api_key_from_config=config.api_key,
             provider_data_api_key_field="gemini_api_key",
         )

View file

@@ -10,11 +10,11 @@ from llama_stack.providers.utils.inference.model_registry import (
 )

 LLM_MODEL_IDS = [
-    "gemini/gemini-1.5-flash",
-    "gemini/gemini-1.5-pro",
-    "gemini/gemini-2.0-flash",
-    "gemini/gemini-2.5-flash",
-    "gemini/gemini-2.5-pro",
+    "gemini-1.5-flash",
+    "gemini-1.5-pro",
+    "gemini-2.0-flash",
+    "gemini-2.5-flash",
+    "gemini-2.5-pro",
 ]

 SAFETY_MODELS_ENTRIES = []
@@ -23,7 +23,7 @@ MODEL_ENTRIES = (
     [ProviderModelEntry(provider_model_id=m) for m in LLM_MODEL_IDS]
     + [
         ProviderModelEntry(
-            provider_model_id="gemini/text-embedding-004",
+            provider_model_id="text-embedding-004",
             model_type=ModelType.embedding,
             metadata={"embedding_dimension": 768, "context_length": 2048},
         ),

View file

@@ -32,7 +32,7 @@ class GroqConfig(BaseModel):
     )

     @classmethod
-    def sample_run_config(cls, api_key: str = "${env.GROQ_API_KEY}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.GROQ_API_KEY:=}", **kwargs) -> dict[str, Any]:
         return {
             "url": "https://api.groq.com",
             "api_key": api_key,

View file

@@ -34,6 +34,7 @@ class GroqInferenceAdapter(LiteLLMOpenAIMixin):
         LiteLLMOpenAIMixin.__init__(
             self,
             model_entries=MODEL_ENTRIES,
+            litellm_provider_name="groq",
             api_key_from_config=config.api_key,
             provider_data_api_key_field="groq_api_key",
         )
@@ -96,7 +97,7 @@ class GroqInferenceAdapter(LiteLLMOpenAIMixin):
             tool_choice = "required"

         params = await prepare_openai_completion_params(
-            model=model_obj.provider_resource_id.replace("groq/", ""),
+            model=model_obj.provider_resource_id,
             messages=messages,
             frequency_penalty=frequency_penalty,
             function_call=function_call,

View file

@@ -14,19 +14,19 @@ SAFETY_MODELS_ENTRIES = []
 MODEL_ENTRIES = [
     build_hf_repo_model_entry(
-        "groq/llama3-8b-8192",
+        "llama3-8b-8192",
         CoreModelId.llama3_1_8b_instruct.value,
     ),
     build_model_entry(
-        "groq/llama-3.1-8b-instant",
+        "llama-3.1-8b-instant",
         CoreModelId.llama3_1_8b_instruct.value,
     ),
     build_hf_repo_model_entry(
-        "groq/llama3-70b-8192",
+        "llama3-70b-8192",
         CoreModelId.llama3_70b_instruct.value,
     ),
     build_hf_repo_model_entry(
-        "groq/llama-3.3-70b-versatile",
+        "llama-3.3-70b-versatile",
         CoreModelId.llama3_3_70b_instruct.value,
     ),
     # Groq only contains a preview version for llama-3.2-3b
@@ -34,23 +34,15 @@ MODEL_ENTRIES = [
     # to pass the test fixture
     # TODO(aidand): Replace this with a stable model once Groq supports it
     build_hf_repo_model_entry(
-        "groq/llama-3.2-3b-preview",
+        "llama-3.2-3b-preview",
         CoreModelId.llama3_2_3b_instruct.value,
     ),
     build_hf_repo_model_entry(
-        "groq/llama-4-scout-17b-16e-instruct",
+        "meta-llama/llama-4-scout-17b-16e-instruct",
         CoreModelId.llama4_scout_17b_16e_instruct.value,
     ),
-    build_hf_repo_model_entry(
-        "groq/meta-llama/llama-4-scout-17b-16e-instruct",
-        CoreModelId.llama4_scout_17b_16e_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "groq/llama-4-maverick-17b-128e-instruct",
-        CoreModelId.llama4_maverick_17b_128e_instruct.value,
-    ),
     build_hf_repo_model_entry(
-        "groq/meta-llama/llama-4-maverick-17b-128e-instruct",
+        "meta-llama/llama-4-maverick-17b-128e-instruct",
         CoreModelId.llama4_maverick_17b_128e_instruct.value,
     ),
 ] + SAFETY_MODELS_ENTRIES

View file

@@ -32,6 +32,7 @@ class LlamaCompatInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
         LiteLLMOpenAIMixin.__init__(
             self,
             model_entries=MODEL_ENTRIES,
+            litellm_provider_name="llama",
             api_key_from_config=config.api_key,
             provider_data_api_key_field="llama_api_key",
             openai_compat_api_base=config.openai_compat_api_base,

View file

@@ -166,7 +166,7 @@ class OllamaInferenceAdapter(
         ]
         for m in response.models:
             # kill embedding models since we don't know dimensions for them
-            if m.details.family in ["bert"]:
+            if "bert" in m.details.family:
                 continue

             models.append(
                 Model(
@@ -420,9 +420,6 @@ class OllamaInferenceAdapter(
         except ValueError:
             pass  # Ignore statically unknown model, will check live listing

-        if model.provider_resource_id is None:
-            raise ValueError("Model provider_resource_id cannot be None")
-
         if model.model_type == ModelType.embedding:
             response = await self.client.list()
             if model.provider_resource_id not in [m.model for m in response.models]:
@@ -433,9 +430,9 @@ class OllamaInferenceAdapter(
         # - models not currently running are run by the ollama server as needed
         response = await self.client.list()
         available_models = [m.model for m in response.models]
-        provider_resource_id = self.register_helper.get_provider_model_id(model.provider_resource_id)
-        if provider_resource_id is None:
-            provider_resource_id = model.provider_resource_id
+        provider_resource_id = model.provider_resource_id
+        assert provider_resource_id is not None  # mypy
         if provider_resource_id not in available_models:
             available_models_latest = [m.model.split(":latest")[0] for m in response.models]
             if provider_resource_id in available_models_latest:
@@ -443,7 +440,9 @@ class OllamaInferenceAdapter(
                     f"Imprecise provider resource id was used but 'latest' is available in Ollama - using '{model.provider_resource_id}:latest'"
                 )
                 return model
-            raise UnsupportedModelError(model.provider_resource_id, available_models)
+            raise UnsupportedModelError(provider_resource_id, available_models)

+        # mutating this should be considered an anti-pattern
+        model.provider_resource_id = provider_resource_id
         return model

View file

@@ -26,7 +26,7 @@ class OpenAIConfig(BaseModel):
     )

     @classmethod
-    def sample_run_config(cls, api_key: str = "${env.OPENAI_API_KEY}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.OPENAI_API_KEY:=}", **kwargs) -> dict[str, Any]:
         return {
             "api_key": api_key,
         }

View file

@@ -45,6 +45,7 @@ class OpenAIInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
         LiteLLMOpenAIMixin.__init__(
             self,
             MODEL_ENTRIES,
+            litellm_provider_name="openai",
             api_key_from_config=config.api_key,
             provider_data_api_key_field="openai_api_key",
         )

View file

@@ -30,7 +30,7 @@ class SambaNovaImplConfig(BaseModel):
     )

     @classmethod
-    def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY:=}", **kwargs) -> dict[str, Any]:
         return {
             "url": "https://api.sambanova.ai/v1",
             "api_key": api_key,

View file

@@ -9,49 +9,20 @@ from llama_stack.providers.utils.inference.model_registry import (
     build_hf_repo_model_entry,
 )

-SAFETY_MODELS_ENTRIES = [
-    build_hf_repo_model_entry(
-        "sambanova/Meta-Llama-Guard-3-8B",
-        CoreModelId.llama_guard_3_8b.value,
-    ),
-]
+SAFETY_MODELS_ENTRIES = []

 MODEL_ENTRIES = [
     build_hf_repo_model_entry(
-        "sambanova/Meta-Llama-3.1-8B-Instruct",
+        "Meta-Llama-3.1-8B-Instruct",
         CoreModelId.llama3_1_8b_instruct.value,
     ),
-    build_hf_repo_model_entry(
-        "sambanova/Meta-Llama-3.1-405B-Instruct",
-        CoreModelId.llama3_1_405b_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "sambanova/Meta-Llama-3.2-1B-Instruct",
-        CoreModelId.llama3_2_1b_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "sambanova/Meta-Llama-3.2-3B-Instruct",
-        CoreModelId.llama3_2_3b_instruct.value,
-    ),
     build_hf_repo_model_entry(
-        "sambanova/Meta-Llama-3.3-70B-Instruct",
+        "Meta-Llama-3.3-70B-Instruct",
         CoreModelId.llama3_3_70b_instruct.value,
     ),
-    build_hf_repo_model_entry(
-        "sambanova/Llama-3.2-11B-Vision-Instruct",
-        CoreModelId.llama3_2_11b_vision_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "sambanova/Llama-3.2-90B-Vision-Instruct",
-        CoreModelId.llama3_2_90b_vision_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "sambanova/Llama-4-Scout-17B-16E-Instruct",
-        CoreModelId.llama4_scout_17b_16e_instruct.value,
-    ),
     build_hf_repo_model_entry(
-        "sambanova/Llama-4-Maverick-17B-128E-Instruct",
+        "Llama-4-Maverick-17B-128E-Instruct",
         CoreModelId.llama4_maverick_17b_128e_instruct.value,
     ),
 ] + SAFETY_MODELS_ENTRIES

View file

@@ -182,6 +182,7 @@ class SambaNovaInferenceAdapter(LiteLLMOpenAIMixin):
         LiteLLMOpenAIMixin.__init__(
             self,
             model_entries=MODEL_ENTRIES,
+            litellm_provider_name="sambanova",
             api_key_from_config=self.config.api_key.get_secret_value() if self.config.api_key else None,
             provider_data_api_key_field="sambanova_api_key",
         )

View file

@@ -19,7 +19,7 @@ class TGIImplConfig(BaseModel):
     @classmethod
     def sample_run_config(
         cls,
-        url: str = "${env.TGI_URL}",
+        url: str = "${env.TGI_URL:=}",
         **kwargs,
     ):
         return {

View file

@@ -305,6 +305,8 @@ class _HfAdapter(
 class TGIAdapter(_HfAdapter):
     async def initialize(self, config: TGIImplConfig) -> None:
+        if not config.url:
+            raise ValueError("You must provide a URL in run.yaml (or via the TGI_URL environment variable) to use TGI.")
         log.info(f"Initializing TGI client with url={config.url}")
         self.client = AsyncInferenceClient(
             model=config.url,

View file

@@ -27,5 +27,5 @@ class TogetherImplConfig(RemoteInferenceProviderConfig):
     def sample_run_config(cls, **kwargs) -> dict[str, Any]:
         return {
             "url": "https://api.together.xyz/v1",
-            "api_key": "${env.TOGETHER_API_KEY}",
+            "api_key": "${env.TOGETHER_API_KEY:=}",
         }

View file

@@ -69,15 +69,9 @@ MODEL_ENTRIES = [
     build_hf_repo_model_entry(
         "meta-llama/Llama-4-Scout-17B-16E-Instruct",
         CoreModelId.llama4_scout_17b_16e_instruct.value,
-        additional_aliases=[
-            "together/meta-llama/Llama-4-Scout-17B-16E-Instruct",
-        ],
     ),
     build_hf_repo_model_entry(
         "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
         CoreModelId.llama4_maverick_17b_128e_instruct.value,
-        additional_aliases=[
-            "together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
-        ],
     ),
 ] + SAFETY_MODELS_ENTRIES

View file

@@ -299,7 +299,10 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         self.client = None

     async def initialize(self) -> None:
-        pass
+        if not self.config.url:
+            raise ValueError(
+                "You must provide a URL in run.yaml (or via the VLLM_URL environment variable) to use vLLM."
+            )

     async def should_refresh_models(self) -> bool:
         return self.config.refresh_models
@@ -337,9 +340,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
             HealthResponse: A dictionary containing the health status.
         """
         try:
-            if not self.config.url:
-                return HealthResponse(status=HealthStatus.ERROR, message="vLLM URL is not set")
             client = self._create_client() if self.client is None else self.client
             _ = [m async for m in client.models.list()]  # Ensure the client is initialized
             return HealthResponse(status=HealthStatus.OK)
@@ -355,11 +355,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         if self.client is not None:
             return

-        if not self.config.url:
-            raise ValueError(
-                "You must provide a vLLM URL in the run.yaml file (or set the VLLM_URL environment variable)"
-            )
-
         log.info(f"Initializing vLLM client with base_url={self.config.url}")
         self.client = self._create_client()

View file

@@ -30,7 +30,7 @@ class SambaNovaSafetyConfig(BaseModel):
     )

     @classmethod
-    def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY:=}", **kwargs) -> dict[str, Any]:
         return {
             "url": "https://api.sambanova.ai/v1",
             "api_key": api_key,