Add a RoutableProvider protocol, support for multiple routing keys (#163)

* Update configure.py to use multiple routing keys for safety * Refactor distribution/datatypes into a providers/datatypes * Cleanup
2024-09-30 17:30:21 -07:00 · 2024-09-30 17:30:21 -07:00 · eb2d8a31a5
commit eb2d8a31a5
parent 73decb3781
24 changed files with 600 additions and 577 deletions
--- a/llama_stack/providers/adapters/inference/bedrock/bedrock.py
+++ b/llama_stack/providers/adapters/inference/bedrock/bedrock.py
@ -12,20 +12,21 @@ from botocore.config import Config

 from llama_models.llama3.api.chat_format import ChatFormat
 from llama_models.llama3.api.tokenizer import Tokenizer
-from llama_models.sku_list import resolve_model
+
+from llama_stack.providers.utils.inference.routable import RoutableProviderForModels

 from llama_stack.apis.inference import *  # noqa: F403
 from llama_stack.providers.adapters.inference.bedrock.config import BedrockConfig

-# mapping of Model SKUs to ollama models
+
 BEDROCK_SUPPORTED_MODELS = {
-    "Meta-Llama3.1-8B-Instruct": "meta.llama3-1-8b-instruct-v1:0",
-    "Meta-Llama3.1-70B-Instruct": "meta.llama3-1-70b-instruct-v1:0",
-    "Meta-Llama3.1-405B-Instruct": "meta.llama3-1-405b-instruct-v1:0",
+    "Llama3.1-8B-Instruct": "meta.llama3-1-8b-instruct-v1:0",
+    "Llama3.1-70B-Instruct": "meta.llama3-1-70b-instruct-v1:0",
+    "Llama3.1-405B-Instruct": "meta.llama3-1-405b-instruct-v1:0",
 }


-class BedrockInferenceAdapter(Inference):
+class BedrockInferenceAdapter(Inference, RoutableProviderForModels):

    @staticmethod
    def _create_bedrock_client(config: BedrockConfig) -> BaseClient:
@ -68,6 +69,9 @@ class BedrockInferenceAdapter(Inference):
        return boto3_session.client("bedrock-runtime", config=boto3_config)

    def __init__(self, config: BedrockConfig) -> None:
+        RoutableProviderForModels.__init__(
+            self, stack_to_provider_models_map=BEDROCK_SUPPORTED_MODELS
+        )
        self._config = config

        self._client = BedrockInferenceAdapter._create_bedrock_client(config)
@ -94,22 +98,6 @@ class BedrockInferenceAdapter(Inference):
    ) -> Union[CompletionResponse, CompletionResponseStreamChunk]:
        raise NotImplementedError()

-    @staticmethod
-    def resolve_bedrock_model(model_name: str) -> str:
-        model = resolve_model(model_name)
-        assert (
-            model is not None
-            and model.descriptor(shorten_default_variant=True)
-            in BEDROCK_SUPPORTED_MODELS
-        ), (
-            f"Unsupported model: {model_name}, use one of the supported models: "
-            f"{','.join(BEDROCK_SUPPORTED_MODELS.keys())}"
-        )
-
-        return BEDROCK_SUPPORTED_MODELS.get(
-            model.descriptor(shorten_default_variant=True)
-        )
-
    @staticmethod
    def _bedrock_stop_reason_to_stop_reason(bedrock_stop_reason: str) -> StopReason:
        if bedrock_stop_reason == "max_tokens":
@ -350,7 +338,7 @@ class BedrockInferenceAdapter(Inference):
    ) -> (
        AsyncGenerator
    ):  # Union[ChatCompletionResponse, ChatCompletionResponseStreamChunk]:
-        bedrock_model = BedrockInferenceAdapter.resolve_bedrock_model(model)
+        bedrock_model = self.map_to_provider_model(model)
        inference_config = BedrockInferenceAdapter.get_bedrock_inference_config(
            sampling_params
        )
--- a/llama_stack/providers/adapters/inference/fireworks/fireworks.py
+++ b/llama_stack/providers/adapters/inference/fireworks/fireworks.py
@ -12,7 +12,8 @@ from llama_models.llama3.api.chat_format import ChatFormat

 from llama_models.llama3.api.datatypes import Message, StopReason
 from llama_models.llama3.api.tokenizer import Tokenizer
-from llama_models.sku_list import resolve_model
+
+from llama_stack.providers.utils.inference.routable import RoutableProviderForModels

 from llama_stack.apis.inference import *  # noqa: F403
 from llama_stack.providers.utils.inference.augment_messages import (
@ -21,6 +22,7 @@ from llama_stack.providers.utils.inference.augment_messages import (

 from .config import FireworksImplConfig

+
 FIREWORKS_SUPPORTED_MODELS = {
    "Llama3.1-8B-Instruct": "fireworks/llama-v3p1-8b-instruct",
    "Llama3.1-70B-Instruct": "fireworks/llama-v3p1-70b-instruct",
@ -28,8 +30,11 @@ FIREWORKS_SUPPORTED_MODELS = {
 }


-class FireworksInferenceAdapter(Inference):
+class FireworksInferenceAdapter(Inference, RoutableProviderForModels):
    def __init__(self, config: FireworksImplConfig) -> None:
+        RoutableProviderForModels.__init__(
+            self, stack_to_provider_models_map=FIREWORKS_SUPPORTED_MODELS
+        )
        self.config = config
        tokenizer = Tokenizer.get_instance()
        self.formatter = ChatFormat(tokenizer)
@ -65,18 +70,6 @@ class FireworksInferenceAdapter(Inference):

        return fireworks_messages

-    def resolve_fireworks_model(self, model_name: str) -> str:
-        model = resolve_model(model_name)
-        assert (
-            model is not None
-            and model.descriptor(shorten_default_variant=True)
-            in FIREWORKS_SUPPORTED_MODELS
-        ), f"Unsupported model: {model_name}, use one of the supported models: {','.join(FIREWORKS_SUPPORTED_MODELS.keys())}"
-
-        return FIREWORKS_SUPPORTED_MODELS.get(
-            model.descriptor(shorten_default_variant=True)
-        )
-
    def get_fireworks_chat_options(self, request: ChatCompletionRequest) -> dict:
        options = {}
        if request.sampling_params is not None:
@ -112,7 +105,7 @@ class FireworksInferenceAdapter(Inference):

        # accumulate sampling params and other options to pass to fireworks
        options = self.get_fireworks_chat_options(request)
-        fireworks_model = self.resolve_fireworks_model(request.model)
+        fireworks_model = self.map_to_provider_model(request.model)

        if not request.stream:
            r = await self.client.chat.completions.acreate(
--- a/llama_stack/providers/adapters/inference/ollama/ollama.py
+++ b/llama_stack/providers/adapters/inference/ollama/ollama.py
@ -11,7 +11,6 @@ import httpx
 from llama_models.llama3.api.chat_format import ChatFormat
 from llama_models.llama3.api.datatypes import Message, StopReason
 from llama_models.llama3.api.tokenizer import Tokenizer
-from llama_models.sku_list import resolve_model

 from ollama import AsyncClient

@ -19,6 +18,7 @@ from llama_stack.apis.inference import *  # noqa: F403
 from llama_stack.providers.utils.inference.augment_messages import (
    augment_messages_for_tools,
 )
+from llama_stack.providers.utils.inference.routable import RoutableProviderForModels

 # TODO: Eventually this will move to the llama cli model list command
 # mapping of Model SKUs to ollama models
@ -29,8 +29,11 @@ OLLAMA_SUPPORTED_SKUS = {
 }


-class OllamaInferenceAdapter(Inference):
+class OllamaInferenceAdapter(Inference, RoutableProviderForModels):
    def __init__(self, url: str) -> None:
+        RoutableProviderForModels.__init__(
+            self, stack_to_provider_models_map=OLLAMA_SUPPORTED_SKUS
+        )
        self.url = url
        tokenizer = Tokenizer.get_instance()
        self.formatter = ChatFormat(tokenizer)
@ -72,15 +75,6 @@ class OllamaInferenceAdapter(Inference):

        return ollama_messages

-    def resolve_ollama_model(self, model_name: str) -> str:
-        model = resolve_model(model_name)
-        assert (
-            model is not None
-            and model.descriptor(shorten_default_variant=True) in OLLAMA_SUPPORTED_SKUS
-        ), f"Unsupported model: {model_name}, use one of the supported models: {','.join(OLLAMA_SUPPORTED_SKUS.keys())}"
-
-        return OLLAMA_SUPPORTED_SKUS.get(model.descriptor(shorten_default_variant=True))
-
    def get_ollama_chat_options(self, request: ChatCompletionRequest) -> dict:
        options = {}
        if request.sampling_params is not None:
@ -120,7 +114,7 @@ class OllamaInferenceAdapter(Inference):
        messages = augment_messages_for_tools(request)
        # accumulate sampling params and other options to pass to ollama
        options = self.get_ollama_chat_options(request)
-        ollama_model = self.resolve_ollama_model(request.model)
+        ollama_model = self.map_to_provider_model(request.model)

        res = await self.client.ps()
        need_model_pull = True
--- a/llama_stack/providers/adapters/inference/tgi/tgi.py
+++ b/llama_stack/providers/adapters/inference/tgi/tgi.py
@ -13,6 +13,8 @@ from llama_models.llama3.api.chat_format import ChatFormat
 from llama_models.llama3.api.datatypes import StopReason
 from llama_models.llama3.api.tokenizer import Tokenizer

+from llama_stack.distribution.datatypes import RoutableProvider
+
 from llama_stack.apis.inference import *  # noqa: F403
 from llama_stack.providers.utils.inference.augment_messages import (
    augment_messages_for_tools,
@ -23,7 +25,7 @@ from .config import InferenceAPIImplConfig, InferenceEndpointImplConfig, TGIImpl
 logger = logging.getLogger(__name__)


-class _HfAdapter(Inference):
+class _HfAdapter(Inference, RoutableProvider):
    client: AsyncInferenceClient
    max_tokens: int
    model_id: str
@ -32,6 +34,11 @@ class _HfAdapter(Inference):
        self.tokenizer = Tokenizer.get_instance()
        self.formatter = ChatFormat(self.tokenizer)

+    async def validate_routing_keys(self, routing_keys: list[str]) -> None:
+        # these are the model names the Llama Stack will use to route requests to this provider
+        # perform validation here if necessary
+        pass
+
    async def shutdown(self) -> None:
        pass

--- a/llama_stack/providers/adapters/inference/together/config.py
+++ b/llama_stack/providers/adapters/inference/together/config.py
@ -4,6 +4,8 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from typing import Optional
+
 from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field

@ -14,7 +16,7 @@ class TogetherImplConfig(BaseModel):
        default="https://api.together.xyz/v1",
        description="The URL for the Together AI server",
    )
-    api_key: str = Field(
-        default="",
+    api_key: Optional[str] = Field(
+        default=None,
        description="The Together AI API Key",
    )
--- a/llama_stack/providers/adapters/inference/together/together.py
+++ b/llama_stack/providers/adapters/inference/together/together.py
@ -10,7 +10,6 @@ from llama_models.llama3.api.chat_format import ChatFormat

 from llama_models.llama3.api.datatypes import Message, StopReason
 from llama_models.llama3.api.tokenizer import Tokenizer
-from llama_models.sku_list import resolve_model

 from together import Together

@ -19,9 +18,11 @@ from llama_stack.distribution.request_headers import NeedsRequestProviderData
 from llama_stack.providers.utils.inference.augment_messages import (
    augment_messages_for_tools,
 )
+from llama_stack.providers.utils.inference.routable import RoutableProviderForModels

 from .config import TogetherImplConfig

+
 TOGETHER_SUPPORTED_MODELS = {
    "Llama3.1-8B-Instruct": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    "Llama3.1-70B-Instruct": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
@ -32,8 +33,13 @@ TOGETHER_SUPPORTED_MODELS = {
 }


-class TogetherInferenceAdapter(Inference, NeedsRequestProviderData):
+class TogetherInferenceAdapter(
+    Inference, NeedsRequestProviderData, RoutableProviderForModels
+):
    def __init__(self, config: TogetherImplConfig) -> None:
+        RoutableProviderForModels.__init__(
+            self, stack_to_provider_models_map=TOGETHER_SUPPORTED_MODELS
+        )
        self.config = config
        tokenizer = Tokenizer.get_instance()
        self.formatter = ChatFormat(tokenizer)
@ -69,18 +75,6 @@ class TogetherInferenceAdapter(Inference, NeedsRequestProviderData):

        return together_messages

-    def resolve_together_model(self, model_name: str) -> str:
-        model = resolve_model(model_name)
-        assert (
-            model is not None
-            and model.descriptor(shorten_default_variant=True)
-            in TOGETHER_SUPPORTED_MODELS
-        ), f"Unsupported model: {model_name}, use one of the supported models: {','.join(TOGETHER_SUPPORTED_MODELS.keys())}"
-
-        return TOGETHER_SUPPORTED_MODELS.get(
-            model.descriptor(shorten_default_variant=True)
-        )
-
    def get_together_chat_options(self, request: ChatCompletionRequest) -> dict:
        options = {}
        if request.sampling_params is not None:
@ -103,12 +97,15 @@ class TogetherInferenceAdapter(Inference, NeedsRequestProviderData):
    ) -> AsyncGenerator:

        together_api_key = None
-        provider_data = self.get_request_provider_data()
-        if provider_data is None or not provider_data.together_api_key:
-            raise ValueError(
-                'Pass Together API Key in the header X-LlamaStack-ProviderData as { "together_api_key": <your api key>}'
-            )
-        together_api_key = provider_data.together_api_key
+        if self.config.api_key is not None:
+            together_api_key = self.config.api_key
+        else:
+            provider_data = self.get_request_provider_data()
+            if provider_data is None or not provider_data.together_api_key:
+                raise ValueError(
+                    'Pass Together API Key in the header X-LlamaStack-ProviderData as { "together_api_key": <your api key>}'
+                )
+            together_api_key = provider_data.together_api_key

        client = Together(api_key=together_api_key)
        # wrapper request to make it easier to pass around (internal only, not exposed to API)
@ -125,7 +122,7 @@ class TogetherInferenceAdapter(Inference, NeedsRequestProviderData):

        # accumulate sampling params and other options to pass to together
        options = self.get_together_chat_options(request)
-        together_model = self.resolve_together_model(request.model)
+        together_model = self.map_to_provider_model(request.model)
        messages = augment_messages_for_tools(request)

        if not request.stream:
@ -171,17 +168,10 @@ class TogetherInferenceAdapter(Inference, NeedsRequestProviderData):
                stream=True,
                **options,
            ):
-                if chunk.choices[0].finish_reason:
-                    if (
-                        stop_reason is None and chunk.choices[0].finish_reason == "stop"
-                    ) or (
-                        stop_reason is None and chunk.choices[0].finish_reason == "eos"
-                    ):
+                if finish_reason := chunk.choices[0].finish_reason:
+                    if stop_reason is None and finish_reason in ["stop", "eos"]:
                        stop_reason = StopReason.end_of_turn
-                    elif (
-                        stop_reason is None
-                        and chunk.choices[0].finish_reason == "length"
-                    ):
+                    elif stop_reason is None and finish_reason == "length":
                        stop_reason = StopReason.out_of_tokens
                    break