get fireworks and together adapters working

Dinesh Yeduguru 2024-11-12 13:07:35 -08:00
parent 25d8ab0e14
commit 8de4cee373
8 changed files with 205 additions and 86 deletions


@@ -11,7 +11,10 @@ from botocore.client import BaseClient
 from llama_models.llama3.api.chat_format import ChatFormat
 from llama_models.llama3.api.tokenizer import Tokenizer
-from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
+from llama_stack.providers.utils.inference.model_registry import (
+    ModelAlias,
+    ModelRegistryHelper,
+)
 from llama_stack.apis.inference import *  # noqa: F403
@@ -19,19 +22,26 @@ from llama_stack.providers.remote.inference.bedrock.config import BedrockConfig
 from llama_stack.providers.utils.bedrock.client import create_bedrock_client
-BEDROCK_SUPPORTED_MODELS = {
-    "Llama3.1-8B-Instruct": "meta.llama3-1-8b-instruct-v1:0",
-    "Llama3.1-70B-Instruct": "meta.llama3-1-70b-instruct-v1:0",
-    "Llama3.1-405B-Instruct": "meta.llama3-1-405b-instruct-v1:0",
-}
+model_aliases = [
+    ModelAlias(
+        provider_model_id="meta.llama3-1-8b-instruct-v1:0",
+        aliases=["Llama3.1-8B"],
+    ),
+    ModelAlias(
+        provider_model_id="meta.llama3-1-70b-instruct-v1:0",
+        aliases=["Llama3.1-70B"],
+    ),
+    ModelAlias(
+        provider_model_id="meta.llama3-1-405b-instruct-v1:0",
+        aliases=["Llama3.1-405B"],
+    ),
+]
 # NOTE: this is not quite tested after the recent refactors
 class BedrockInferenceAdapter(ModelRegistryHelper, Inference):
     def __init__(self, config: BedrockConfig) -> None:
-        ModelRegistryHelper.__init__(
-            self, stack_to_provider_models_map=BEDROCK_SUPPORTED_MODELS
-        )
+        ModelRegistryHelper.__init__(self, model_aliases)
         self._config = config
         self._client = create_bedrock_client(config)
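The constructor change above (and the matching ones in the Fireworks and Together diffs below) hinges on the new ModelAlias / ModelRegistryHelper interface, whose definition is not part of this commit view. Here is a minimal sketch of what the call sites imply; get_llama_model appears in the diffs below, while alias_to_provider_id_map, provider_id_to_llama_model_map, and get_provider_model_id are assumed names, not necessarily the actual llama-stack implementation:

from dataclasses import dataclass, field
from typing import Dict, List, Optional


@dataclass
class ModelAlias:
    # Provider-native model id, e.g. "meta.llama3-1-8b-instruct-v1:0".
    provider_model_id: str
    # User-facing names that should resolve to provider_model_id.
    aliases: List[str] = field(default_factory=list)
    # Canonical core model id (a CoreModelId value); optional, since the
    # Bedrock aliases above do not set it.
    llama_model: Optional[str] = None


class ModelRegistryHelper:
    def __init__(self, model_aliases: List[ModelAlias]) -> None:
        self.alias_to_provider_id_map: Dict[str, str] = {}
        self.provider_id_to_llama_model_map: Dict[str, str] = {}
        for alias in model_aliases:
            for name in alias.aliases:
                self.alias_to_provider_id_map[name] = alias.provider_model_id
            # Let the provider-native id resolve to itself as well.
            self.alias_to_provider_id_map[alias.provider_model_id] = alias.provider_model_id
            if alias.llama_model:
                self.provider_id_to_llama_model_map[alias.provider_model_id] = alias.llama_model

    def get_provider_model_id(self, identifier: str) -> str:
        return self.alias_to_provider_id_map[identifier]

    def get_llama_model(self, provider_model_id: str) -> Optional[str]:
        # Used by _get_params below to pick the prompt template for a
        # provider-specific model id.
        return self.provider_id_to_llama_model_map.get(provider_model_id)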


@@ -37,7 +37,7 @@ DATABRICKS_SUPPORTED_MODELS = {
 class DatabricksInferenceAdapter(ModelRegistryHelper, Inference):
     def __init__(self, config: DatabricksImplConfig) -> None:
         ModelRegistryHelper.__init__(
-            self, stack_to_provider_models_map=DATABRICKS_SUPPORTED_MODELS
+            self, provider_to_common_model_aliases_map=DATABRICKS_SUPPORTED_MODELS
         )
         self.config = config
         self.formatter = ChatFormat(Tokenizer.get_instance())


@@ -7,14 +7,17 @@
 from typing import AsyncGenerator
 from fireworks.client import Fireworks
+from llama_models.datatypes import CoreModelId
 from llama_models.llama3.api.chat_format import ChatFormat
 from llama_models.llama3.api.datatypes import Message
 from llama_models.llama3.api.tokenizer import Tokenizer
 from llama_stack.apis.inference import *  # noqa: F403
 from llama_stack.distribution.request_headers import NeedsRequestProviderData
-from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
+from llama_stack.providers.utils.inference.model_registry import (
+    ModelAlias,
+    ModelRegistryHelper,
+)
 from llama_stack.providers.utils.inference.openai_compat import (
     get_sampling_options,
     process_chat_completion_response,
@@ -31,25 +34,61 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
 from .config import FireworksImplConfig
-FIREWORKS_SUPPORTED_MODELS = {
-    "Llama3.1-8B-Instruct": "fireworks/llama-v3p1-8b-instruct",
-    "Llama3.1-70B-Instruct": "fireworks/llama-v3p1-70b-instruct",
-    "Llama3.1-405B-Instruct": "fireworks/llama-v3p1-405b-instruct",
-    "Llama3.2-1B-Instruct": "fireworks/llama-v3p2-1b-instruct",
-    "Llama3.2-3B-Instruct": "fireworks/llama-v3p2-3b-instruct",
-    "Llama3.2-11B-Vision-Instruct": "fireworks/llama-v3p2-11b-vision-instruct",
-    "Llama3.2-90B-Vision-Instruct": "fireworks/llama-v3p2-90b-vision-instruct",
-    "Llama-Guard-3-8B": "fireworks/llama-guard-3-8b",
-}
+model_aliases = [
+    ModelAlias(
+        provider_model_id="fireworks/llama-v3p1-8b-instruct",
+        aliases=["Llama3.1-8B-Instruct"],
+        llama_model=CoreModelId.llama3_1_8b_instruct.value,
+    ),
+    ModelAlias(
+        provider_model_id="fireworks/llama-v3p1-70b-instruct",
+        aliases=["Llama3.1-70B-Instruct"],
+        llama_model=CoreModelId.llama3_1_70b_instruct.value,
+    ),
+    ModelAlias(
+        provider_model_id="fireworks/llama-v3p1-405b-instruct",
+        aliases=["Llama3.1-405B-Instruct"],
+        llama_model=CoreModelId.llama3_1_405b_instruct.value,
+    ),
+    ModelAlias(
+        provider_model_id="fireworks/llama-v3p2-1b-instruct",
+        aliases=["Llama3.2-1B-Instruct"],
+        llama_model=CoreModelId.llama3_2_1b_instruct.value,
+    ),
+    ModelAlias(
+        provider_model_id="fireworks/llama-v3p2-3b-instruct",
+        aliases=["Llama3.2-3B-Instruct"],
+        llama_model=CoreModelId.llama3_2_3b_instruct.value,
+    ),
+    ModelAlias(
+        provider_model_id="fireworks/llama-v3p2-11b-vision-instruct",
+        aliases=["Llama3.2-11B-Vision-Instruct"],
+        llama_model=CoreModelId.llama3_2_11b_vision_instruct.value,
+    ),
+    ModelAlias(
+        provider_model_id="fireworks/llama-v3p2-90b-vision-instruct",
+        aliases=["Llama3.2-90B-Vision-Instruct"],
+        llama_model=CoreModelId.llama3_2_90b_vision_instruct.value,
+    ),
+    ModelAlias(
+        provider_model_id="fireworks/llama-guard-3-8b",
+        aliases=["Llama-Guard-3-8B"],
+        llama_model=CoreModelId.llama_guard_3_8b.value,
+    ),
+    ModelAlias(
+        provider_model_id="fireworks/llama-guard-3-11b-vision",
+        aliases=["Llama-Guard-3-11B-Vision"],
+        llama_model=CoreModelId.llama_guard_3_11b_vision.value,
+    ),
+]
 class FireworksInferenceAdapter(
     ModelRegistryHelper, Inference, NeedsRequestProviderData
 ):
     def __init__(self, config: FireworksImplConfig) -> None:
-        ModelRegistryHelper.__init__(
-            self, stack_to_provider_models_map=FIREWORKS_SUPPORTED_MODELS
-        )
+        ModelRegistryHelper.__init__(self, model_aliases)
         self.config = config
         self.formatter = ChatFormat(Tokenizer.get_instance())
@@ -81,8 +120,9 @@ class FireworksInferenceAdapter(
         stream: Optional[bool] = False,
         logprobs: Optional[LogProbConfig] = None,
     ) -> AsyncGenerator:
+        model = await self.model_store.get_model(model_id)
         request = CompletionRequest(
-            model=model_id,
+            model=model.provider_resource_id,
             content=content,
             sampling_params=sampling_params,
             response_format=response_format,
@@ -148,8 +188,9 @@ class FireworksInferenceAdapter(
         stream: Optional[bool] = False,
         logprobs: Optional[LogProbConfig] = None,
     ) -> AsyncGenerator:
+        model = await self.model_store.get_model(model_id)
         request = ChatCompletionRequest(
-            model=model_id,
+            model=model.provider_resource_id,
             messages=messages,
             sampling_params=sampling_params,
             tools=tools or [],
@@ -207,7 +248,7 @@ class FireworksInferenceAdapter(
                 ]
             else:
                 input_dict["prompt"] = chat_completion_request_to_prompt(
-                    request, self.formatter
+                    request, self.get_llama_model(request.model), self.formatter
                 )
         else:
             assert (
@@ -221,7 +262,7 @@ class FireworksInferenceAdapter(
             input_dict["prompt"] = input_dict["prompt"][len("<|begin_of_text|>") :]
         return {
-            "model": self.map_to_provider_model(request.model),
+            "model": request.model,
             **input_dict,
             "stream": request.stream,
             **self._build_options(request.sampling_params, request.response_format),
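Two related changes run through the Fireworks diff above and the Together diff below: completion and chat-completion now resolve the incoming model_id through self.model_store before building the request, and _get_params sends request.model to the provider verbatim instead of calling map_to_provider_model. A hedged sketch of that flow; model_store, get_model, and provider_resource_id are taken from the call sites, but the Model shape shown here is an assumption:

from dataclasses import dataclass


@dataclass
class Model:
    identifier: str  # user-facing id, e.g. "Llama3.1-8B-Instruct"
    provider_resource_id: str  # provider-native id, e.g. "fireworks/llama-v3p1-8b-instruct"


async def resolve_provider_model_id(adapter, model_id: str) -> str:
    # The adapter no longer maps names itself: the model store returns a
    # registered Model whose provider_resource_id is already the provider's
    # native id, so request.model can go into the payload unchanged.
    model = await adapter.model_store.get_model(model_id)
    return model.provider_resource_id

Passing self.get_llama_model(request.model) into chat_completion_request_to_prompt follows the same logic: the prompt adapter presumably needs the canonical Llama model id, not the provider-specific one, to choose the right chat template.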


@@ -6,6 +6,8 @@
 from typing import AsyncGenerator
+from llama_models.datatypes import CoreModelId
 from llama_models.llama3.api.chat_format import ChatFormat
 from llama_models.llama3.api.datatypes import Message
@@ -15,7 +17,10 @@ from together import Together
 from llama_stack.apis.inference import *  # noqa: F403
 from llama_stack.distribution.request_headers import NeedsRequestProviderData
-from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
+from llama_stack.providers.utils.inference.model_registry import (
+    ModelAlias,
+    ModelRegistryHelper,
+)
 from llama_stack.providers.utils.inference.openai_compat import (
     get_sampling_options,
     process_chat_completion_response,
@@ -33,25 +38,55 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
 from .config import TogetherImplConfig
-TOGETHER_SUPPORTED_MODELS = {
-    "Llama3.1-8B-Instruct": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
-    "Llama3.1-70B-Instruct": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
-    "Llama3.1-405B-Instruct": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
-    "Llama3.2-3B-Instruct": "meta-llama/Llama-3.2-3B-Instruct-Turbo",
-    "Llama3.2-11B-Vision-Instruct": "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
-    "Llama3.2-90B-Vision-Instruct": "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
-    "Llama-Guard-3-8B": "meta-llama/Meta-Llama-Guard-3-8B",
-    "Llama-Guard-3-11B-Vision": "meta-llama/Llama-Guard-3-11B-Vision-Turbo",
-}
+model_aliases = [
+    ModelAlias(
+        provider_model_id="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+        aliases=["Llama3.1-8B-Instruct"],
+        llama_model=CoreModelId.llama3_1_8b_instruct.value,
+    ),
+    ModelAlias(
+        provider_model_id="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+        aliases=["Llama3.1-70B-Instruct"],
+        llama_model=CoreModelId.llama3_1_70b_instruct.value,
+    ),
+    ModelAlias(
+        provider_model_id="meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+        aliases=["Llama3.1-405B-Instruct"],
+        llama_model=CoreModelId.llama3_1_405b_instruct.value,
+    ),
+    ModelAlias(
+        provider_model_id="meta-llama/Llama-3.2-3B-Instruct-Turbo",
+        aliases=["Llama3.2-3B-Instruct"],
+        llama_model=CoreModelId.llama3_2_3b_instruct.value,
+    ),
+    ModelAlias(
+        provider_model_id="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
+        aliases=["Llama3.2-11B-Vision-Instruct"],
+        llama_model=CoreModelId.llama3_2_11b_vision_instruct.value,
+    ),
+    ModelAlias(
+        provider_model_id="meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
+        aliases=["Llama3.2-90B-Vision-Instruct"],
+        llama_model=CoreModelId.llama3_2_90b_vision_instruct.value,
+    ),
+    ModelAlias(
+        provider_model_id="meta-llama/Meta-Llama-Guard-3-8B",
+        aliases=["Llama-Guard-3-8B"],
+        llama_model=CoreModelId.llama_guard_3_8b.value,
+    ),
+    ModelAlias(
+        provider_model_id="meta-llama/Llama-Guard-3-11B-Vision-Turbo",
+        aliases=["Llama-Guard-3-11B-Vision"],
+        llama_model=CoreModelId.llama_guard_3_11b_vision.value,
+    ),
+]
 class TogetherInferenceAdapter(
     ModelRegistryHelper, Inference, NeedsRequestProviderData
 ):
     def __init__(self, config: TogetherImplConfig) -> None:
-        ModelRegistryHelper.__init__(
-            self, stack_to_provider_models_map=TOGETHER_SUPPORTED_MODELS
-        )
+        ModelRegistryHelper.__init__(self, model_aliases)
         self.config = config
         self.formatter = ChatFormat(Tokenizer.get_instance())
@@ -70,8 +105,9 @@ class TogetherInferenceAdapter(
         stream: Optional[bool] = False,
         logprobs: Optional[LogProbConfig] = None,
     ) -> AsyncGenerator:
+        model = await self.model_store.get_model(model_id)
         request = CompletionRequest(
-            model=model_id,
+            model=model.provider_resource_id,
             content=content,
             sampling_params=sampling_params,
             response_format=response_format,
@@ -145,8 +181,9 @@ class TogetherInferenceAdapter(
         stream: Optional[bool] = False,
         logprobs: Optional[LogProbConfig] = None,
     ) -> AsyncGenerator:
+        model = await self.model_store.get_model(model_id)
         request = ChatCompletionRequest(
-            model=model_id,
+            model=model.provider_resource_id,
             messages=messages,
             sampling_params=sampling_params,
             tools=tools or [],
@@ -204,7 +241,7 @@ class TogetherInferenceAdapter(
                 ]
             else:
                 input_dict["prompt"] = chat_completion_request_to_prompt(
-                    request, self.formatter
+                    request, self.get_llama_model(request.model), self.formatter
                 )
         else:
             assert (
@@ -213,7 +250,7 @@ class TogetherInferenceAdapter(
             input_dict["prompt"] = completion_request_to_prompt(request, self.formatter)
         return {
-            "model": self.map_to_provider_model(request.model),
+            "model": request.model,
             **input_dict,
             "stream": request.stream,
             **self._build_options(request.sampling_params, request.response_format),
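Putting the pieces together, a usage example against the registry sketch shown after the Bedrock diff, using the Together aliases above (get_provider_model_id remains an assumed name):

from llama_models.datatypes import CoreModelId

helper = ModelRegistryHelper(model_aliases)

# The user-facing alias and the provider-native id resolve to the same
# Together model id.
assert helper.get_provider_model_id("Llama3.1-8B-Instruct") == (
    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
)
assert helper.get_provider_model_id("meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo") == (
    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
)

# The provider id maps back to the canonical core model, which the prompt
# adapter uses to pick the chat template.
assert helper.get_llama_model("meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo") == (
    CoreModelId.llama3_1_8b_instruct.value
)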