feat(starter)!: simplify starter distro; litellm model registry changes (#2916)

2025-12-03 18:00:36 +00:00 · 2025-07-25 15:02:04 -07:00 · 2025-07-25 15:02:04 -07:00 · 9583f468f8
commit 9583f468f8
parent 3344d8a9e5
64 changed files with 2027 additions and 4092 deletions
--- a/llama_stack/providers/datatypes.py
+++ b/llama_stack/providers/datatypes.py
@ -43,6 +43,9 @@ class ModelsProtocolPrivate(Protocol):
       -> Provider uses provider-model-id for inference
    """

+    # this should be called `on_model_register` or something like that.
+    # the provider should _not_ be able to change the object in this
+    # callback
    async def register_model(self, model: Model) -> Model: ...

    async def unregister_model(self, model_id: str) -> None: ...
--- a/llama_stack/providers/inline/safety/llama_guard/llama_guard.py
+++ b/llama_stack/providers/inline/safety/llama_guard/llama_guard.py
@ -146,9 +146,9 @@ class LlamaGuardSafetyImpl(Safety, ShieldsProtocolPrivate):
        pass

    async def register_shield(self, shield: Shield) -> None:
-        # Allow any model to be registered as a shield
-        # The model will be validated during runtime when making inference calls
-        pass
+        model_id = shield.provider_resource_id
+        if not model_id:
+            raise ValueError("Llama Guard shield must have a model id")

    async def run_shield(
        self,
--- a/llama_stack/providers/remote/inference/anthropic/anthropic.py
+++ b/llama_stack/providers/remote/inference/anthropic/anthropic.py
@ -15,6 +15,7 @@ class AnthropicInferenceAdapter(LiteLLMOpenAIMixin):
        LiteLLMOpenAIMixin.__init__(
            self,
            MODEL_ENTRIES,
+            litellm_provider_name="anthropic",
            api_key_from_config=config.api_key,
            provider_data_api_key_field="anthropic_api_key",
        )
--- a/llama_stack/providers/remote/inference/anthropic/config.py
+++ b/llama_stack/providers/remote/inference/anthropic/config.py
@ -26,7 +26,7 @@ class AnthropicConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.ANTHROPIC_API_KEY}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.ANTHROPIC_API_KEY:=}", **kwargs) -> dict[str, Any]:
        return {
            "api_key": api_key,
        }
--- a/llama_stack/providers/remote/inference/anthropic/models.py
+++ b/llama_stack/providers/remote/inference/anthropic/models.py
@ -10,9 +10,9 @@ from llama_stack.providers.utils.inference.model_registry import (
 )

 LLM_MODEL_IDS = [
-    "anthropic/claude-3-5-sonnet-latest",
-    "anthropic/claude-3-7-sonnet-latest",
-    "anthropic/claude-3-5-haiku-latest",
+    "claude-3-5-sonnet-latest",
+    "claude-3-7-sonnet-latest",
+    "claude-3-5-haiku-latest",
 ]

 SAFETY_MODELS_ENTRIES = []
@ -21,17 +21,17 @@ MODEL_ENTRIES = (
    [ProviderModelEntry(provider_model_id=m) for m in LLM_MODEL_IDS]
    + [
        ProviderModelEntry(
-            provider_model_id="anthropic/voyage-3",
+            provider_model_id="voyage-3",
            model_type=ModelType.embedding,
            metadata={"embedding_dimension": 1024, "context_length": 32000},
        ),
        ProviderModelEntry(
-            provider_model_id="anthropic/voyage-3-lite",
+            provider_model_id="voyage-3-lite",
            model_type=ModelType.embedding,
            metadata={"embedding_dimension": 512, "context_length": 32000},
        ),
        ProviderModelEntry(
-            provider_model_id="anthropic/voyage-code-3",
+            provider_model_id="voyage-code-3",
            model_type=ModelType.embedding,
            metadata={"embedding_dimension": 1024, "context_length": 32000},
        ),
--- a/llama_stack/providers/remote/inference/bedrock/bedrock.py
+++ b/llama_stack/providers/remote/inference/bedrock/bedrock.py
@ -63,18 +63,20 @@ class BedrockInferenceAdapter(
    def __init__(self, config: BedrockConfig) -> None:
        ModelRegistryHelper.__init__(self, MODEL_ENTRIES)
        self._config = config
-
-        self._client = create_bedrock_client(config)
+        self._client = None

    @property
    def client(self) -> BaseClient:
+        if self._client is None:
+            self._client = create_bedrock_client(self._config)
        return self._client

    async def initialize(self) -> None:
        pass

    async def shutdown(self) -> None:
-        self.client.close()
+        if self._client is not None:
+            self._client.close()

    async def completion(
        self,
--- a/llama_stack/providers/remote/inference/cerebras/cerebras.py
+++ b/llama_stack/providers/remote/inference/cerebras/cerebras.py
@ -65,6 +65,7 @@ class CerebrasInferenceAdapter(
        )
        self.config = config

+        # TODO: make this use provider data, etc. like other providers
        self.client = AsyncCerebras(
            base_url=self.config.base_url,
            api_key=self.config.api_key.get_secret_value(),
--- a/llama_stack/providers/remote/inference/cerebras/config.py
+++ b/llama_stack/providers/remote/inference/cerebras/config.py
@ -26,7 +26,7 @@ class CerebrasImplConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.CEREBRAS_API_KEY}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.CEREBRAS_API_KEY:=}", **kwargs) -> dict[str, Any]:
        return {
            "base_url": DEFAULT_BASE_URL,
            "api_key": api_key,
--- a/llama_stack/providers/remote/inference/databricks/config.py
+++ b/llama_stack/providers/remote/inference/databricks/config.py
@ -25,8 +25,8 @@ class DatabricksImplConfig(BaseModel):
    @classmethod
    def sample_run_config(
        cls,
-        url: str = "${env.DATABRICKS_URL}",
-        api_token: str = "${env.DATABRICKS_API_TOKEN}",
+        url: str = "${env.DATABRICKS_URL:=}",
+        api_token: str = "${env.DATABRICKS_API_TOKEN:=}",
        **kwargs: Any,
    ) -> dict[str, Any]:
        return {
--- a/llama_stack/providers/remote/inference/fireworks/config.py
+++ b/llama_stack/providers/remote/inference/fireworks/config.py
@ -24,7 +24,7 @@ class FireworksImplConfig(RemoteInferenceProviderConfig):
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.FIREWORKS_API_KEY}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.FIREWORKS_API_KEY:=}", **kwargs) -> dict[str, Any]:
        return {
            "url": "https://api.fireworks.ai/inference/v1",
            "api_key": api_key,
--- a/llama_stack/providers/remote/inference/gemini/config.py
+++ b/llama_stack/providers/remote/inference/gemini/config.py
@ -26,7 +26,7 @@ class GeminiConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.GEMINI_API_KEY}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.GEMINI_API_KEY:=}", **kwargs) -> dict[str, Any]:
        return {
            "api_key": api_key,
        }
--- a/llama_stack/providers/remote/inference/gemini/gemini.py
+++ b/llama_stack/providers/remote/inference/gemini/gemini.py
@ -15,6 +15,7 @@ class GeminiInferenceAdapter(LiteLLMOpenAIMixin):
        LiteLLMOpenAIMixin.__init__(
            self,
            MODEL_ENTRIES,
+            litellm_provider_name="gemini",
            api_key_from_config=config.api_key,
            provider_data_api_key_field="gemini_api_key",
        )
--- a/llama_stack/providers/remote/inference/gemini/models.py
+++ b/llama_stack/providers/remote/inference/gemini/models.py
@ -10,11 +10,11 @@ from llama_stack.providers.utils.inference.model_registry import (
 )

 LLM_MODEL_IDS = [
-    "gemini/gemini-1.5-flash",
-    "gemini/gemini-1.5-pro",
-    "gemini/gemini-2.0-flash",
-    "gemini/gemini-2.5-flash",
-    "gemini/gemini-2.5-pro",
+    "gemini-1.5-flash",
+    "gemini-1.5-pro",
+    "gemini-2.0-flash",
+    "gemini-2.5-flash",
+    "gemini-2.5-pro",
 ]

 SAFETY_MODELS_ENTRIES = []
@ -23,7 +23,7 @@ MODEL_ENTRIES = (
    [ProviderModelEntry(provider_model_id=m) for m in LLM_MODEL_IDS]
    + [
        ProviderModelEntry(
-            provider_model_id="gemini/text-embedding-004",
+            provider_model_id="text-embedding-004",
            model_type=ModelType.embedding,
            metadata={"embedding_dimension": 768, "context_length": 2048},
        ),
--- a/llama_stack/providers/remote/inference/groq/config.py
+++ b/llama_stack/providers/remote/inference/groq/config.py
@ -32,7 +32,7 @@ class GroqConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.GROQ_API_KEY}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.GROQ_API_KEY:=}", **kwargs) -> dict[str, Any]:
        return {
            "url": "https://api.groq.com",
            "api_key": api_key,
--- a/llama_stack/providers/remote/inference/groq/groq.py
+++ b/llama_stack/providers/remote/inference/groq/groq.py
@ -34,6 +34,7 @@ class GroqInferenceAdapter(LiteLLMOpenAIMixin):
        LiteLLMOpenAIMixin.__init__(
            self,
            model_entries=MODEL_ENTRIES,
+            litellm_provider_name="groq",
            api_key_from_config=config.api_key,
            provider_data_api_key_field="groq_api_key",
        )
@ -96,7 +97,7 @@ class GroqInferenceAdapter(LiteLLMOpenAIMixin):
            tool_choice = "required"

        params = await prepare_openai_completion_params(
-            model=model_obj.provider_resource_id.replace("groq/", ""),
+            model=model_obj.provider_resource_id,
            messages=messages,
            frequency_penalty=frequency_penalty,
            function_call=function_call,
--- a/llama_stack/providers/remote/inference/groq/models.py
+++ b/llama_stack/providers/remote/inference/groq/models.py
@ -14,19 +14,19 @@ SAFETY_MODELS_ENTRIES = []

 MODEL_ENTRIES = [
    build_hf_repo_model_entry(
-        "groq/llama3-8b-8192",
+        "llama3-8b-8192",
        CoreModelId.llama3_1_8b_instruct.value,
    ),
    build_model_entry(
-        "groq/llama-3.1-8b-instant",
+        "llama-3.1-8b-instant",
        CoreModelId.llama3_1_8b_instruct.value,
    ),
    build_hf_repo_model_entry(
-        "groq/llama3-70b-8192",
+        "llama3-70b-8192",
        CoreModelId.llama3_70b_instruct.value,
    ),
    build_hf_repo_model_entry(
-        "groq/llama-3.3-70b-versatile",
+        "llama-3.3-70b-versatile",
        CoreModelId.llama3_3_70b_instruct.value,
    ),
    # Groq only contains a preview version for llama-3.2-3b
@ -34,23 +34,15 @@ MODEL_ENTRIES = [
    # to pass the test fixture
    # TODO(aidand): Replace this with a stable model once Groq supports it
    build_hf_repo_model_entry(
-        "groq/llama-3.2-3b-preview",
+        "llama-3.2-3b-preview",
        CoreModelId.llama3_2_3b_instruct.value,
    ),
    build_hf_repo_model_entry(
-        "groq/llama-4-scout-17b-16e-instruct",
+        "meta-llama/llama-4-scout-17b-16e-instruct",
        CoreModelId.llama4_scout_17b_16e_instruct.value,
    ),
    build_hf_repo_model_entry(
-        "groq/meta-llama/llama-4-scout-17b-16e-instruct",
-        CoreModelId.llama4_scout_17b_16e_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "groq/llama-4-maverick-17b-128e-instruct",
-        CoreModelId.llama4_maverick_17b_128e_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "groq/meta-llama/llama-4-maverick-17b-128e-instruct",
+        "meta-llama/llama-4-maverick-17b-128e-instruct",
        CoreModelId.llama4_maverick_17b_128e_instruct.value,
    ),
 ] + SAFETY_MODELS_ENTRIES
--- a/llama_stack/providers/remote/inference/llama_openai_compat/llama.py
+++ b/llama_stack/providers/remote/inference/llama_openai_compat/llama.py
@ -32,6 +32,7 @@ class LlamaCompatInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
        LiteLLMOpenAIMixin.__init__(
            self,
            model_entries=MODEL_ENTRIES,
+            litellm_provider_name="llama",
            api_key_from_config=config.api_key,
            provider_data_api_key_field="llama_api_key",
            openai_compat_api_base=config.openai_compat_api_base,
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@ -166,7 +166,7 @@ class OllamaInferenceAdapter(
        ]
        for m in response.models:
            # kill embedding models since we don't know dimensions for them
-            if m.details.family in ["bert"]:
+            if "bert" in m.details.family:
                continue
            models.append(
                Model(
@ -420,9 +420,6 @@ class OllamaInferenceAdapter(
        except ValueError:
            pass  # Ignore statically unknown model, will check live listing

-        if model.provider_resource_id is None:
-            raise ValueError("Model provider_resource_id cannot be None")
-
        if model.model_type == ModelType.embedding:
            response = await self.client.list()
            if model.provider_resource_id not in [m.model for m in response.models]:
@ -433,9 +430,9 @@ class OllamaInferenceAdapter(
        #  - models not currently running are run by the ollama server as needed
        response = await self.client.list()
        available_models = [m.model for m in response.models]
-        provider_resource_id = self.register_helper.get_provider_model_id(model.provider_resource_id)
-        if provider_resource_id is None:
-            provider_resource_id = model.provider_resource_id
+
+        provider_resource_id = model.provider_resource_id
+        assert provider_resource_id is not None  # mypy
        if provider_resource_id not in available_models:
            available_models_latest = [m.model.split(":latest")[0] for m in response.models]
            if provider_resource_id in available_models_latest:
@ -443,7 +440,9 @@ class OllamaInferenceAdapter(
                    f"Imprecise provider resource id was used but 'latest' is available in Ollama - using '{model.provider_resource_id}:latest'"
                )
                return model
-            raise UnsupportedModelError(model.provider_resource_id, available_models)
+            raise UnsupportedModelError(provider_resource_id, available_models)
+
+        # mutating this should be considered an anti-pattern
        model.provider_resource_id = provider_resource_id

        return model
--- a/llama_stack/providers/remote/inference/openai/config.py
+++ b/llama_stack/providers/remote/inference/openai/config.py
@ -26,7 +26,7 @@ class OpenAIConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.OPENAI_API_KEY}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.OPENAI_API_KEY:=}", **kwargs) -> dict[str, Any]:
        return {
            "api_key": api_key,
        }
--- a/llama_stack/providers/remote/inference/openai/openai.py
+++ b/llama_stack/providers/remote/inference/openai/openai.py
@ -45,6 +45,7 @@ class OpenAIInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
        LiteLLMOpenAIMixin.__init__(
            self,
            MODEL_ENTRIES,
+            litellm_provider_name="openai",
            api_key_from_config=config.api_key,
            provider_data_api_key_field="openai_api_key",
        )
--- a/llama_stack/providers/remote/inference/sambanova/config.py
+++ b/llama_stack/providers/remote/inference/sambanova/config.py
@ -30,7 +30,7 @@ class SambaNovaImplConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY:=}", **kwargs) -> dict[str, Any]:
        return {
            "url": "https://api.sambanova.ai/v1",
            "api_key": api_key,
--- a/llama_stack/providers/remote/inference/sambanova/models.py
+++ b/llama_stack/providers/remote/inference/sambanova/models.py
@ -9,49 +9,20 @@ from llama_stack.providers.utils.inference.model_registry import (
    build_hf_repo_model_entry,
 )

-SAFETY_MODELS_ENTRIES = [
-    build_hf_repo_model_entry(
-        "sambanova/Meta-Llama-Guard-3-8B",
-        CoreModelId.llama_guard_3_8b.value,
-    ),
-]
+SAFETY_MODELS_ENTRIES = []


 MODEL_ENTRIES = [
    build_hf_repo_model_entry(
-        "sambanova/Meta-Llama-3.1-8B-Instruct",
+        "Meta-Llama-3.1-8B-Instruct",
        CoreModelId.llama3_1_8b_instruct.value,
    ),
    build_hf_repo_model_entry(
-        "sambanova/Meta-Llama-3.1-405B-Instruct",
-        CoreModelId.llama3_1_405b_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "sambanova/Meta-Llama-3.2-1B-Instruct",
-        CoreModelId.llama3_2_1b_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "sambanova/Meta-Llama-3.2-3B-Instruct",
-        CoreModelId.llama3_2_3b_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "sambanova/Meta-Llama-3.3-70B-Instruct",
+        "Meta-Llama-3.3-70B-Instruct",
        CoreModelId.llama3_3_70b_instruct.value,
    ),
    build_hf_repo_model_entry(
-        "sambanova/Llama-3.2-11B-Vision-Instruct",
-        CoreModelId.llama3_2_11b_vision_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "sambanova/Llama-3.2-90B-Vision-Instruct",
-        CoreModelId.llama3_2_90b_vision_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "sambanova/Llama-4-Scout-17B-16E-Instruct",
-        CoreModelId.llama4_scout_17b_16e_instruct.value,
-    ),
-    build_hf_repo_model_entry(
-        "sambanova/Llama-4-Maverick-17B-128E-Instruct",
+        "Llama-4-Maverick-17B-128E-Instruct",
        CoreModelId.llama4_maverick_17b_128e_instruct.value,
    ),
 ] + SAFETY_MODELS_ENTRIES
--- a/llama_stack/providers/remote/inference/sambanova/sambanova.py
+++ b/llama_stack/providers/remote/inference/sambanova/sambanova.py
@ -182,6 +182,7 @@ class SambaNovaInferenceAdapter(LiteLLMOpenAIMixin):
        LiteLLMOpenAIMixin.__init__(
            self,
            model_entries=MODEL_ENTRIES,
+            litellm_provider_name="sambanova",
            api_key_from_config=self.config.api_key.get_secret_value() if self.config.api_key else None,
            provider_data_api_key_field="sambanova_api_key",
        )
--- a/llama_stack/providers/remote/inference/tgi/config.py
+++ b/llama_stack/providers/remote/inference/tgi/config.py
@ -19,7 +19,7 @@ class TGIImplConfig(BaseModel):
    @classmethod
    def sample_run_config(
        cls,
-        url: str = "${env.TGI_URL}",
+        url: str = "${env.TGI_URL:=}",
        **kwargs,
    ):
        return {
--- a/llama_stack/providers/remote/inference/tgi/tgi.py
+++ b/llama_stack/providers/remote/inference/tgi/tgi.py
@ -305,6 +305,8 @@ class _HfAdapter(

 class TGIAdapter(_HfAdapter):
    async def initialize(self, config: TGIImplConfig) -> None:
+        if not config.url:
+            raise ValueError("You must provide a URL in run.yaml (or via the TGI_URL environment variable) to use TGI.")
        log.info(f"Initializing TGI client with url={config.url}")
        self.client = AsyncInferenceClient(
            model=config.url,
--- a/llama_stack/providers/remote/inference/together/config.py
+++ b/llama_stack/providers/remote/inference/together/config.py
@ -27,5 +27,5 @@ class TogetherImplConfig(RemoteInferenceProviderConfig):
    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
        return {
            "url": "https://api.together.xyz/v1",
-            "api_key": "${env.TOGETHER_API_KEY}",
+            "api_key": "${env.TOGETHER_API_KEY:=}",
        }
--- a/llama_stack/providers/remote/inference/together/models.py
+++ b/llama_stack/providers/remote/inference/together/models.py
@ -69,15 +69,9 @@ MODEL_ENTRIES = [
    build_hf_repo_model_entry(
        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
        CoreModelId.llama4_scout_17b_16e_instruct.value,
-        additional_aliases=[
-            "together/meta-llama/Llama-4-Scout-17B-16E-Instruct",
-        ],
    ),
    build_hf_repo_model_entry(
        "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
        CoreModelId.llama4_maverick_17b_128e_instruct.value,
-        additional_aliases=[
-            "together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
-        ],
    ),
 ] + SAFETY_MODELS_ENTRIES
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@ -299,7 +299,10 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
        self.client = None

    async def initialize(self) -> None:
-        pass
+        if not self.config.url:
+            raise ValueError(
+                "You must provide a URL in run.yaml (or via the VLLM_URL environment variable) to use vLLM."
+            )

    async def should_refresh_models(self) -> bool:
        return self.config.refresh_models
@ -337,9 +340,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
            HealthResponse: A dictionary containing the health status.
        """
        try:
-            if not self.config.url:
-                return HealthResponse(status=HealthStatus.ERROR, message="vLLM URL is not set")
-
            client = self._create_client() if self.client is None else self.client
            _ = [m async for m in client.models.list()]  # Ensure the client is initialized
            return HealthResponse(status=HealthStatus.OK)
@ -355,11 +355,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
        if self.client is not None:
            return

-        if not self.config.url:
-            raise ValueError(
-                "You must provide a vLLM URL in the run.yaml file (or set the VLLM_URL environment variable)"
-            )
-
        log.info(f"Initializing vLLM client with base_url={self.config.url}")
        self.client = self._create_client()

--- a/llama_stack/providers/remote/safety/sambanova/config.py
+++ b/llama_stack/providers/remote/safety/sambanova/config.py
@ -30,7 +30,7 @@ class SambaNovaSafetyConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY}", **kwargs) -> dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY:=}", **kwargs) -> dict[str, Any]:
        return {
            "url": "https://api.sambanova.ai/v1",
            "api_key": api_key,
--- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py
+++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
@ -68,11 +68,14 @@ class LiteLLMOpenAIMixin(
    def __init__(
        self,
        model_entries,
+        litellm_provider_name: str,
        api_key_from_config: str | None,
        provider_data_api_key_field: str,
        openai_compat_api_base: str | None = None,
    ):
        ModelRegistryHelper.__init__(self, model_entries)
+
+        self.litellm_provider_name = litellm_provider_name
        self.api_key_from_config = api_key_from_config
        self.provider_data_api_key_field = provider_data_api_key_field
        self.api_base = openai_compat_api_base
@ -91,7 +94,11 @@ class LiteLLMOpenAIMixin(
    def get_litellm_model_name(self, model_id: str) -> str:
        # users may be using openai/ prefix in their model names. the openai/models.py did this by default.
        # model_id.startswith("openai/") is for backwards compatibility.
-        return "openai/" + model_id if self.is_openai_compat and not model_id.startswith("openai/") else model_id
+        return (
+            f"{self.litellm_provider_name}/{model_id}"
+            if self.is_openai_compat and not model_id.startswith(self.litellm_provider_name)
+            else model_id
+        )

    async def completion(
        self,
--- a/llama_stack/providers/utils/inference/model_registry.py
+++ b/llama_stack/providers/utils/inference/model_registry.py
@ -50,7 +50,8 @@ def build_hf_repo_model_entry(
    additional_aliases: list[str] | None = None,
 ) -> ProviderModelEntry:
    aliases = [
-        get_huggingface_repo(model_descriptor),
+        # NOTE: avoid HF aliases because they _cannot_ be unique across providers
+        # get_huggingface_repo(model_descriptor),
    ]
    if additional_aliases:
        aliases.extend(additional_aliases)
@ -75,7 +76,9 @@ class ModelRegistryHelper(ModelsProtocolPrivate):
    __provider_id__: str

    def __init__(self, model_entries: list[ProviderModelEntry], allowed_models: list[str] | None = None):
+        self.model_entries = model_entries
        self.allowed_models = allowed_models
+
        self.alias_to_provider_id_map = {}
        self.provider_id_to_llama_model_map = {}
        for entry in model_entries:
@ -98,7 +101,7 @@ class ModelRegistryHelper(ModelsProtocolPrivate):
                    continue
                models.append(
                    Model(
-                        model_id=id,
+                        identifier=id,
                        provider_resource_id=entry.provider_model_id,
                        model_type=ModelType.llm,
                        metadata=entry.metadata,
@ -185,8 +188,8 @@ class ModelRegistryHelper(ModelsProtocolPrivate):
        return model

    async def unregister_model(self, model_id: str) -> None:
-        # TODO: should we block unregistering base supported provider model IDs?
-        if model_id not in self.alias_to_provider_id_map:
-            raise ValueError(f"Model id '{model_id}' is not registered.")
-
-        del self.alias_to_provider_id_map[model_id]
+        # model_id is the identifier, not the provider_resource_id
+        # unfortunately, this ID can be of the form provider_id/model_id which
+        # we never registered. TODO: fix this by significantly rewriting
+        # registration and registry helper
+        pass