feat: inference passthrough provider (#1166)

## What does this PR do? In this PR, we implement a passthrough inference provider that works for any endpoints that respect llama stack inference API definition. ## Test Plan config some endpoint that respect llama stack inference API definition and got the inference results successfully <img width="1268" alt="Screenshot 2025-02-19 at 8 52 51 PM" src="https://github.com/user-attachments/assets/447816e4-ea7a-4365-b90c-386dc7dcf4a1" />
2025-02-19 21:47:00 -08:00 · 2025-02-19 21:47:00 -08:00 · 2b995c22eb
commit 2b995c22eb
parent d39f8de619
6 changed files with 364 additions and 0 deletions
--- a/llama_stack/providers/remote/inference/passthrough/passthrough.py
+++ b/llama_stack/providers/remote/inference/passthrough/passthrough.py
@ -0,0 +1,148 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import AsyncGenerator, List, Optional
+
+from llama_stack_client import LlamaStackClient
+
+from llama_stack.apis.common.content_types import InterleavedContent
+from llama_stack.apis.inference import (
+    EmbeddingsResponse,
+    Inference,
+    LogProbConfig,
+    Message,
+    ResponseFormat,
+    SamplingParams,
+    ToolChoice,
+    ToolConfig,
+    ToolDefinition,
+    ToolPromptFormat,
+)
+from llama_stack.apis.models import Model
+from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
+
+from .config import PassthroughImplConfig
+
+
+class PassthroughInferenceAdapter(Inference):
+    def __init__(self, config: PassthroughImplConfig) -> None:
+        ModelRegistryHelper.__init__(self, [])
+        self.config = config
+
+    async def initialize(self) -> None:
+        pass
+
+    async def shutdown(self) -> None:
+        pass
+
+    async def unregister_model(self, model_id: str) -> None:
+        pass
+
+    async def register_model(self, model: Model) -> Model:
+        return model
+
+    def _get_client(self) -> LlamaStackClient:
+        passthrough_url = None
+        passthrough_api_key = None
+        provider_data = None
+
+        if self.config.url is not None:
+            passthrough_url = self.config.url
+        else:
+            provider_data = self.get_request_provider_data()
+            if provider_data is None or not provider_data.passthrough_url:
+                raise ValueError(
+                    'Pass url of the passthrough endpoint in the header X-LlamaStack-Provider-Data as { "passthrough_url": <your passthrough url>}'
+                )
+            passthrough_url = provider_data.passthrough_url
+
+        if self.config.api_key is not None:
+            passthrough_api_key = self.config.api_key.get_secret_value()
+        else:
+            provider_data = self.get_request_provider_data()
+            if provider_data is None or not provider_data.passthrough_api_key:
+                raise ValueError(
+                    'Pass API Key for the passthrough endpoint in the header X-LlamaStack-Provider-Data as { "passthrough_api_key": <your api key>}'
+                )
+            passthrough_api_key = provider_data.passthrough_api_key
+
+        return LlamaStackClient(
+            base_url=passthrough_url,
+            api_key=passthrough_api_key,
+            provider_data=provider_data,
+        )
+
+    async def completion(
+        self,
+        model_id: str,
+        content: InterleavedContent,
+        sampling_params: Optional[SamplingParams] = SamplingParams(),
+        response_format: Optional[ResponseFormat] = None,
+        stream: Optional[bool] = False,
+        logprobs: Optional[LogProbConfig] = None,
+    ) -> AsyncGenerator:
+        client = self._get_client()
+        model = await self.model_store.get_model(model_id)
+
+        params = {
+            "model_id": model.provider_resource_id,
+            "content": content,
+            "sampling_params": sampling_params,
+            "response_format": response_format,
+            "stream": stream,
+            "logprobs": logprobs,
+        }
+
+        params = {key: value for key, value in params.items() if value is not None}
+
+        # only pass through the not None params
+        return client.inference.completion(**params)
+
+    async def chat_completion(
+        self,
+        model_id: str,
+        messages: List[Message],
+        sampling_params: Optional[SamplingParams] = SamplingParams(),
+        tools: Optional[List[ToolDefinition]] = None,
+        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
+        tool_prompt_format: Optional[ToolPromptFormat] = None,
+        response_format: Optional[ResponseFormat] = None,
+        stream: Optional[bool] = False,
+        logprobs: Optional[LogProbConfig] = None,
+        tool_config: Optional[ToolConfig] = None,
+    ) -> AsyncGenerator:
+        client = self._get_client()
+        model = await self.model_store.get_model(model_id)
+
+        params = {
+            "model_id": model.provider_resource_id,
+            "messages": messages,
+            "sampling_params": sampling_params,
+            "tools": tools,
+            "tool_choice": tool_choice,
+            "tool_prompt_format": tool_prompt_format,
+            "response_format": response_format,
+            "stream": stream,
+            "logprobs": logprobs,
+        }
+
+        params = {key: value for key, value in params.items() if value is not None}
+
+        # only pass through the not None params
+        return client.inference.chat_completion(**params)
+
+    async def embeddings(
+        self,
+        model_id: str,
+        contents: List[InterleavedContent],
+    ) -> EmbeddingsResponse:
+        client = self._get_client()
+        model = await self.model_store.get_model(model_id)
+
+        return client.inference.embeddings(
+            model_id=model.provider_resource_id,
+            contents=contents,
+        )