API Updates (#73)

* API Keys passed from Client instead of distro configuration * delete distribution registry * Rename the "package" word away * Introduce a "Router" layer for providers Some providers need to be factorized and considered as thin routing layers on top of other providers. Consider two examples: - The inference API should be a routing layer over inference providers, routed using the "model" key - The memory banks API is another instance where various memory bank types will be provided by independent providers (e.g., a vector store is served by Chroma while a keyvalue memory can be served by Redis or PGVector) This commit introduces a generalized routing layer for this purpose. * update `apis_to_serve` * llama_toolchain -> llama_stack * Codemod from llama_toolchain -> llama_stack - added providers/registry - cleaned up api/ subdirectories and moved impls away - restructured api/api.py - from llama_stack.apis.<api> import foo should work now - update imports to do llama_stack.apis.<api> - update many other imports - added __init__, fixed some registry imports - updated registry imports - create_agentic_system -> create_agent - AgenticSystem -> Agent * Moved some stuff out of common/; re-generated OpenAPI spec * llama-toolchain -> llama-stack (hyphens) * add control plane API * add redis adapter + sqlite provider * move core -> distribution * Some more toolchain -> stack changes * small naming shenanigans * Removing custom tool and agent utilities and moving them client side * Move control plane to distribution server for now * Remove control plane from API list * no codeshield dependency randomly plzzzzz * Add "fire" as a dependency * add back event loggers * stack configure fixes * use brave instead of bing in the example client * add init file so it gets packaged * add init files so it gets packaged * Update MANIFEST * bug fix --------- Co-authored-by: Hardik Shah <hjshah@fb.com> Co-authored-by: Xi Yan <xiyan@meta.com> Co-authored-by: Ashwin Bharambe <ashwin@meta.com>
2024-09-17 19:51:35 -07:00 · 2024-09-17 19:51:35 -07:00 · 9487ad8294
commit 9487ad8294
parent f294eac5f5
213 changed files with 1725 additions and 1204 deletions
--- a/llama_stack/providers/adapters/inference/init.py
+++ b/llama_stack/providers/adapters/inference/init.py
@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
--- a/llama_stack/providers/adapters/inference/fireworks/init.py
+++ b/llama_stack/providers/adapters/inference/fireworks/init.py
@ -0,0 +1,18 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .config import FireworksImplConfig
+
+
+async def get_adapter_impl(config: FireworksImplConfig, _deps):
+    from .fireworks import FireworksInferenceAdapter
+
+    assert isinstance(
+        config, FireworksImplConfig
+    ), f"Unexpected config type: {type(config)}"
+    impl = FireworksInferenceAdapter(config)
+    await impl.initialize()
+    return impl
--- a/llama_stack/providers/adapters/inference/fireworks/config.py
+++ b/llama_stack/providers/adapters/inference/fireworks/config.py
@ -0,0 +1,20 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_models.schema_utils import json_schema_type
+from pydantic import BaseModel, Field
+
+
+@json_schema_type
+class FireworksImplConfig(BaseModel):
+    url: str = Field(
+        default="https://api.fireworks.ai/inference",
+        description="The URL for the Fireworks server",
+    )
+    api_key: str = Field(
+        default="",
+        description="The Fireworks.ai API Key",
+    )
--- a/llama_stack/providers/adapters/inference/fireworks/fireworks.py
+++ b/llama_stack/providers/adapters/inference/fireworks/fireworks.py
@ -0,0 +1,245 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import AsyncGenerator
+
+from llama_models.llama3.api.chat_format import ChatFormat
+
+from llama_models.llama3.api.datatypes import Message, StopReason
+from llama_models.llama3.api.tokenizer import Tokenizer
+from llama_models.sku_list import resolve_model
+
+from fireworks.client import Fireworks
+
+from llama_stack.apis.inference import *  # noqa: F403
+from llama_stack.providers.utils.inference.prepare_messages import prepare_messages
+
+from .config import FireworksImplConfig
+
+FIREWORKS_SUPPORTED_MODELS = {
+    "Meta-Llama3.1-8B-Instruct": "fireworks/llama-v3p1-8b-instruct",
+    "Meta-Llama3.1-70B-Instruct": "fireworks/llama-v3p1-70b-instruct",
+    "Meta-Llama3.1-405B-Instruct": "fireworks/llama-v3p1-405b-instruct",
+}
+
+
+class FireworksInferenceAdapter(Inference):
+    def __init__(self, config: FireworksImplConfig) -> None:
+        self.config = config
+        tokenizer = Tokenizer.get_instance()
+        self.formatter = ChatFormat(tokenizer)
+
+    @property
+    def client(self) -> Fireworks:
+        return Fireworks(api_key=self.config.api_key)
+
+    async def initialize(self) -> None:
+        return
+
+    async def shutdown(self) -> None:
+        pass
+
+    async def completion(self, request: CompletionRequest) -> AsyncGenerator:
+        raise NotImplementedError()
+
+    def _messages_to_fireworks_messages(self, messages: list[Message]) -> list:
+        fireworks_messages = []
+        for message in messages:
+            if message.role == "ipython":
+                role = "tool"
+            else:
+                role = message.role
+            fireworks_messages.append({"role": role, "content": message.content})
+
+        return fireworks_messages
+
+    def resolve_fireworks_model(self, model_name: str) -> str:
+        model = resolve_model(model_name)
+        assert (
+            model is not None
+            and model.descriptor(shorten_default_variant=True)
+            in FIREWORKS_SUPPORTED_MODELS
+        ), f"Unsupported model: {model_name}, use one of the supported models: {','.join(FIREWORKS_SUPPORTED_MODELS.keys())}"
+
+        return FIREWORKS_SUPPORTED_MODELS.get(
+            model.descriptor(shorten_default_variant=True)
+        )
+
+    def get_fireworks_chat_options(self, request: ChatCompletionRequest) -> dict:
+        options = {}
+        if request.sampling_params is not None:
+            for attr in {"temperature", "top_p", "top_k", "max_tokens"}:
+                if getattr(request.sampling_params, attr):
+                    options[attr] = getattr(request.sampling_params, attr)
+
+        return options
+
+    async def chat_completion(
+        self,
+        model: str,
+        messages: List[Message],
+        sampling_params: Optional[SamplingParams] = SamplingParams(),
+        tools: Optional[List[ToolDefinition]] = None,
+        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
+        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
+        stream: Optional[bool] = False,
+        logprobs: Optional[LogProbConfig] = None,
+    ) -> AsyncGenerator:
+        request = ChatCompletionRequest(
+            model=model,
+            messages=messages,
+            sampling_params=sampling_params,
+            tools=tools or [],
+            tool_choice=tool_choice,
+            tool_prompt_format=tool_prompt_format,
+            stream=stream,
+            logprobs=logprobs,
+        )
+
+        messages = prepare_messages(request)
+
+        # accumulate sampling params and other options to pass to fireworks
+        options = self.get_fireworks_chat_options(request)
+        fireworks_model = self.resolve_fireworks_model(request.model)
+
+        if not request.stream:
+            r = await self.client.chat.completions.acreate(
+                model=fireworks_model,
+                messages=self._messages_to_fireworks_messages(messages),
+                stream=False,
+                **options,
+            )
+            stop_reason = None
+            if r.choices[0].finish_reason:
+                if r.choices[0].finish_reason == "stop":
+                    stop_reason = StopReason.end_of_turn
+                elif r.choices[0].finish_reason == "length":
+                    stop_reason = StopReason.out_of_tokens
+
+            completion_message = self.formatter.decode_assistant_message_from_content(
+                r.choices[0].message.content, stop_reason
+            )
+
+            yield ChatCompletionResponse(
+                completion_message=completion_message,
+                logprobs=None,
+            )
+        else:
+            yield ChatCompletionResponseStreamChunk(
+                event=ChatCompletionResponseEvent(
+                    event_type=ChatCompletionResponseEventType.start,
+                    delta="",
+                )
+            )
+
+            buffer = ""
+            ipython = False
+            stop_reason = None
+
+            async for chunk in self.client.chat.completions.acreate(
+                model=fireworks_model,
+                messages=self._messages_to_fireworks_messages(messages),
+                stream=True,
+                **options,
+            ):
+                if chunk.choices[0].finish_reason:
+                    if stop_reason is None and chunk.choices[0].finish_reason == "stop":
+                        stop_reason = StopReason.end_of_turn
+                    elif (
+                        stop_reason is None
+                        and chunk.choices[0].finish_reason == "length"
+                    ):
+                        stop_reason = StopReason.out_of_tokens
+                    break
+
+                text = chunk.choices[0].delta.content
+                if text is None:
+                    continue
+
+                # check if its a tool call ( aka starts with <|python_tag|> )
+                if not ipython and text.startswith("<|python_tag|>"):
+                    ipython = True
+                    yield ChatCompletionResponseStreamChunk(
+                        event=ChatCompletionResponseEvent(
+                            event_type=ChatCompletionResponseEventType.progress,
+                            delta=ToolCallDelta(
+                                content="",
+                                parse_status=ToolCallParseStatus.started,
+                            ),
+                        )
+                    )
+                    buffer += text
+                    continue
+
+                if ipython:
+                    if text == "<|eot_id|>":
+                        stop_reason = StopReason.end_of_turn
+                        text = ""
+                        continue
+                    elif text == "<|eom_id|>":
+                        stop_reason = StopReason.end_of_message
+                        text = ""
+                        continue
+
+                    buffer += text
+                    delta = ToolCallDelta(
+                        content=text,
+                        parse_status=ToolCallParseStatus.in_progress,
+                    )
+
+                    yield ChatCompletionResponseStreamChunk(
+                        event=ChatCompletionResponseEvent(
+                            event_type=ChatCompletionResponseEventType.progress,
+                            delta=delta,
+                            stop_reason=stop_reason,
+                        )
+                    )
+                else:
+                    buffer += text
+                    yield ChatCompletionResponseStreamChunk(
+                        event=ChatCompletionResponseEvent(
+                            event_type=ChatCompletionResponseEventType.progress,
+                            delta=text,
+                            stop_reason=stop_reason,
+                        )
+                    )
+
+            # parse tool calls and report errors
+            message = self.formatter.decode_assistant_message_from_content(
+                buffer, stop_reason
+            )
+            parsed_tool_calls = len(message.tool_calls) > 0
+            if ipython and not parsed_tool_calls:
+                yield ChatCompletionResponseStreamChunk(
+                    event=ChatCompletionResponseEvent(
+                        event_type=ChatCompletionResponseEventType.progress,
+                        delta=ToolCallDelta(
+                            content="",
+                            parse_status=ToolCallParseStatus.failure,
+                        ),
+                        stop_reason=stop_reason,
+                    )
+                )
+
+            for tool_call in message.tool_calls:
+                yield ChatCompletionResponseStreamChunk(
+                    event=ChatCompletionResponseEvent(
+                        event_type=ChatCompletionResponseEventType.progress,
+                        delta=ToolCallDelta(
+                            content=tool_call,
+                            parse_status=ToolCallParseStatus.success,
+                        ),
+                        stop_reason=stop_reason,
+                    )
+                )
+
+            yield ChatCompletionResponseStreamChunk(
+                event=ChatCompletionResponseEvent(
+                    event_type=ChatCompletionResponseEventType.complete,
+                    delta="",
+                    stop_reason=stop_reason,
+                )
+            )
--- a/llama_stack/providers/adapters/inference/ollama/init.py
+++ b/llama_stack/providers/adapters/inference/ollama/init.py
@ -0,0 +1,15 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.distribution.datatypes import RemoteProviderConfig
+
+
+async def get_adapter_impl(config: RemoteProviderConfig, _deps):
+    from .ollama import OllamaInferenceAdapter
+
+    impl = OllamaInferenceAdapter(config.url)
+    await impl.initialize()
+    return impl
--- a/llama_stack/providers/adapters/inference/ollama/ollama.py
+++ b/llama_stack/providers/adapters/inference/ollama/ollama.py
@ -0,0 +1,261 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import AsyncGenerator
+
+import httpx
+
+from llama_models.llama3.api.chat_format import ChatFormat
+from llama_models.llama3.api.datatypes import Message, StopReason
+from llama_models.llama3.api.tokenizer import Tokenizer
+from llama_models.sku_list import resolve_model
+
+from ollama import AsyncClient
+
+from llama_stack.apis.inference import *  # noqa: F403
+from llama_stack.providers.utils.inference.prepare_messages import prepare_messages
+
+# TODO: Eventually this will move to the llama cli model list command
+# mapping of Model SKUs to ollama models
+OLLAMA_SUPPORTED_SKUS = {
+    # "Meta-Llama3.1-8B-Instruct": "llama3.1",
+    "Meta-Llama3.1-8B-Instruct": "llama3.1:8b-instruct-fp16",
+    "Meta-Llama3.1-70B-Instruct": "llama3.1:70b-instruct-fp16",
+}
+
+
+class OllamaInferenceAdapter(Inference):
+    def __init__(self, url: str) -> None:
+        self.url = url
+        tokenizer = Tokenizer.get_instance()
+        self.formatter = ChatFormat(tokenizer)
+
+    @property
+    def client(self) -> AsyncClient:
+        return AsyncClient(host=self.url)
+
+    async def initialize(self) -> None:
+        try:
+            await self.client.ps()
+        except httpx.ConnectError as e:
+            raise RuntimeError(
+                "Ollama Server is not running, start it using `ollama serve` in a separate terminal"
+            ) from e
+
+    async def shutdown(self) -> None:
+        pass
+
+    async def completion(self, request: CompletionRequest) -> AsyncGenerator:
+        raise NotImplementedError()
+
+    def _messages_to_ollama_messages(self, messages: list[Message]) -> list:
+        ollama_messages = []
+        for message in messages:
+            if message.role == "ipython":
+                role = "tool"
+            else:
+                role = message.role
+            ollama_messages.append({"role": role, "content": message.content})
+
+        return ollama_messages
+
+    def resolve_ollama_model(self, model_name: str) -> str:
+        model = resolve_model(model_name)
+        assert (
+            model is not None
+            and model.descriptor(shorten_default_variant=True) in OLLAMA_SUPPORTED_SKUS
+        ), f"Unsupported model: {model_name}, use one of the supported models: {','.join(OLLAMA_SUPPORTED_SKUS.keys())}"
+
+        return OLLAMA_SUPPORTED_SKUS.get(model.descriptor(shorten_default_variant=True))
+
+    def get_ollama_chat_options(self, request: ChatCompletionRequest) -> dict:
+        options = {}
+        if request.sampling_params is not None:
+            for attr in {"temperature", "top_p", "top_k", "max_tokens"}:
+                if getattr(request.sampling_params, attr):
+                    options[attr] = getattr(request.sampling_params, attr)
+            if (
+                request.sampling_params.repetition_penalty is not None
+                and request.sampling_params.repetition_penalty != 1.0
+            ):
+                options["repeat_penalty"] = request.sampling_params.repetition_penalty
+
+        return options
+
+    async def chat_completion(
+        self,
+        model: str,
+        messages: List[Message],
+        sampling_params: Optional[SamplingParams] = SamplingParams(),
+        tools: Optional[List[ToolDefinition]] = None,
+        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
+        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
+        stream: Optional[bool] = False,
+        logprobs: Optional[LogProbConfig] = None,
+    ) -> AsyncGenerator:
+        request = ChatCompletionRequest(
+            model=model,
+            messages=messages,
+            sampling_params=sampling_params,
+            tools=tools or [],
+            tool_choice=tool_choice,
+            tool_prompt_format=tool_prompt_format,
+            stream=stream,
+            logprobs=logprobs,
+        )
+
+        messages = prepare_messages(request)
+        # accumulate sampling params and other options to pass to ollama
+        options = self.get_ollama_chat_options(request)
+        ollama_model = self.resolve_ollama_model(request.model)
+
+        res = await self.client.ps()
+        need_model_pull = True
+        for r in res["models"]:
+            if ollama_model == r["model"]:
+                need_model_pull = False
+                break
+
+        if need_model_pull:
+            print(f"Pulling model: {ollama_model}")
+            status = await self.client.pull(ollama_model)
+            assert (
+                status["status"] == "success"
+            ), f"Failed to pull model {self.model} in ollama"
+
+        if not request.stream:
+            r = await self.client.chat(
+                model=ollama_model,
+                messages=self._messages_to_ollama_messages(messages),
+                stream=False,
+                options=options,
+            )
+            stop_reason = None
+            if r["done"]:
+                if r["done_reason"] == "stop":
+                    stop_reason = StopReason.end_of_turn
+                elif r["done_reason"] == "length":
+                    stop_reason = StopReason.out_of_tokens
+
+            completion_message = self.formatter.decode_assistant_message_from_content(
+                r["message"]["content"], stop_reason
+            )
+            yield ChatCompletionResponse(
+                completion_message=completion_message,
+                logprobs=None,
+            )
+        else:
+            yield ChatCompletionResponseStreamChunk(
+                event=ChatCompletionResponseEvent(
+                    event_type=ChatCompletionResponseEventType.start,
+                    delta="",
+                )
+            )
+            stream = await self.client.chat(
+                model=ollama_model,
+                messages=self._messages_to_ollama_messages(messages),
+                stream=True,
+                options=options,
+            )
+
+            buffer = ""
+            ipython = False
+            stop_reason = None
+
+            async for chunk in stream:
+                if chunk["done"]:
+                    if stop_reason is None and chunk["done_reason"] == "stop":
+                        stop_reason = StopReason.end_of_turn
+                    elif stop_reason is None and chunk["done_reason"] == "length":
+                        stop_reason = StopReason.out_of_tokens
+                    break
+
+                text = chunk["message"]["content"]
+
+                # check if its a tool call ( aka starts with <|python_tag|> )
+                if not ipython and text.startswith("<|python_tag|>"):
+                    ipython = True
+                    yield ChatCompletionResponseStreamChunk(
+                        event=ChatCompletionResponseEvent(
+                            event_type=ChatCompletionResponseEventType.progress,
+                            delta=ToolCallDelta(
+                                content="",
+                                parse_status=ToolCallParseStatus.started,
+                            ),
+                        )
+                    )
+                    buffer += text
+                    continue
+
+                if ipython:
+                    if text == "<|eot_id|>":
+                        stop_reason = StopReason.end_of_turn
+                        text = ""
+                        continue
+                    elif text == "<|eom_id|>":
+                        stop_reason = StopReason.end_of_message
+                        text = ""
+                        continue
+
+                    buffer += text
+                    delta = ToolCallDelta(
+                        content=text,
+                        parse_status=ToolCallParseStatus.in_progress,
+                    )
+
+                    yield ChatCompletionResponseStreamChunk(
+                        event=ChatCompletionResponseEvent(
+                            event_type=ChatCompletionResponseEventType.progress,
+                            delta=delta,
+                            stop_reason=stop_reason,
+                        )
+                    )
+                else:
+                    buffer += text
+                    yield ChatCompletionResponseStreamChunk(
+                        event=ChatCompletionResponseEvent(
+                            event_type=ChatCompletionResponseEventType.progress,
+                            delta=text,
+                            stop_reason=stop_reason,
+                        )
+                    )
+
+            # parse tool calls and report errors
+            message = self.formatter.decode_assistant_message_from_content(
+                buffer, stop_reason
+            )
+            parsed_tool_calls = len(message.tool_calls) > 0
+            if ipython and not parsed_tool_calls:
+                yield ChatCompletionResponseStreamChunk(
+                    event=ChatCompletionResponseEvent(
+                        event_type=ChatCompletionResponseEventType.progress,
+                        delta=ToolCallDelta(
+                            content="",
+                            parse_status=ToolCallParseStatus.failure,
+                        ),
+                        stop_reason=stop_reason,
+                    )
+                )
+
+            for tool_call in message.tool_calls:
+                yield ChatCompletionResponseStreamChunk(
+                    event=ChatCompletionResponseEvent(
+                        event_type=ChatCompletionResponseEventType.progress,
+                        delta=ToolCallDelta(
+                            content=tool_call,
+                            parse_status=ToolCallParseStatus.success,
+                        ),
+                        stop_reason=stop_reason,
+                    )
+                )
+
+            yield ChatCompletionResponseStreamChunk(
+                event=ChatCompletionResponseEvent(
+                    event_type=ChatCompletionResponseEventType.complete,
+                    delta="",
+                    stop_reason=stop_reason,
+                )
+            )
--- a/llama_stack/providers/adapters/inference/tgi/init.py
+++ b/llama_stack/providers/adapters/inference/tgi/init.py
@ -0,0 +1,24 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .config import TGIImplConfig
+from .tgi import InferenceEndpointAdapter, TGIAdapter
+
+
+async def get_adapter_impl(config: TGIImplConfig, _deps):
+    assert isinstance(config, TGIImplConfig), f"Unexpected config type: {type(config)}"
+
+    if config.url is not None:
+        impl = TGIAdapter(config)
+    elif config.is_inference_endpoint():
+        impl = InferenceEndpointAdapter(config)
+    else:
+        raise ValueError(
+            "Invalid configuration. Specify either an URL or HF Inference Endpoint details (namespace and endpoint name)."
+        )
+
+    await impl.initialize()
+    return impl
--- a/llama_stack/providers/adapters/inference/tgi/config.py
+++ b/llama_stack/providers/adapters/inference/tgi/config.py
@ -0,0 +1,29 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Optional
+
+from llama_models.schema_utils import json_schema_type
+from pydantic import BaseModel, Field
+
+
+@json_schema_type
+class TGIImplConfig(BaseModel):
+    url: Optional[str] = Field(
+        default=None,
+        description="The URL for the local TGI endpoint (e.g., http://localhost:8080)",
+    )
+    api_token: Optional[str] = Field(
+        default=None,
+        description="The HF token for Hugging Face Inference Endpoints (will default to locally saved token if not provided)",
+    )
+    hf_endpoint_name: Optional[str] = Field(
+        default=None,
+        description="The name of the Hugging Face Inference Endpoint : can be either in the format of '{namespace}/{endpoint_name}' (namespace can be the username or organization name) or just '{endpoint_name}' if logged into the same account as the namespace",
+    )
+
+    def is_inference_endpoint(self) -> bool:
+        return self.hf_endpoint_name is not None
--- a/llama_stack/providers/adapters/inference/tgi/tgi.py
+++ b/llama_stack/providers/adapters/inference/tgi/tgi.py
@ -0,0 +1,295 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+
+from typing import Any, AsyncGenerator, Dict
+
+import requests
+
+from huggingface_hub import HfApi, InferenceClient
+from llama_models.llama3.api.chat_format import ChatFormat
+from llama_models.llama3.api.datatypes import StopReason
+from llama_models.llama3.api.tokenizer import Tokenizer
+from llama_stack.apis.inference import *  # noqa: F403
+from llama_stack.providers.utils.inference.prepare_messages import prepare_messages
+
+from .config import TGIImplConfig
+
+HF_SUPPORTED_MODELS = {
+    "Meta-Llama3.1-8B-Instruct": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    "Meta-Llama3.1-70B-Instruct": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+    "Meta-Llama3.1-405B-Instruct": "meta-llama/Meta-Llama-3.1-405B-Instruct",
+}
+
+
+class TGIAdapter(Inference):
+    def __init__(self, config: TGIImplConfig) -> None:
+        self.config = config
+        self.tokenizer = Tokenizer.get_instance()
+        self.formatter = ChatFormat(self.tokenizer)
+
+    @property
+    def client(self) -> InferenceClient:
+        return InferenceClient(model=self.config.url, token=self.config.api_token)
+
+    def _get_endpoint_info(self) -> Dict[str, Any]:
+        return {
+            **self.client.get_endpoint_info(),
+            "inference_url": self.config.url,
+        }
+
+    async def initialize(self) -> None:
+        try:
+            info = self._get_endpoint_info()
+            if "model_id" not in info:
+                raise RuntimeError("Missing model_id in model info")
+            if "max_total_tokens" not in info:
+                raise RuntimeError("Missing max_total_tokens in model info")
+            self.max_tokens = info["max_total_tokens"]
+
+            model_id = info["model_id"]
+            model_name = next(
+                (name for name, id in HF_SUPPORTED_MODELS.items() if id == model_id),
+                None,
+            )
+            if model_name is None:
+                raise RuntimeError(
+                    f"TGI is serving model: {model_id}, use one of the supported models: {', '.join(HF_SUPPORTED_MODELS.values())}"
+                )
+            self.model_name = model_name
+            self.inference_url = info["inference_url"]
+        except Exception as e:
+            import traceback
+
+            traceback.print_exc()
+            raise RuntimeError(f"Error initializing TGIAdapter: {e}") from e
+
+    async def shutdown(self) -> None:
+        pass
+
+    async def completion(self, request: CompletionRequest) -> AsyncGenerator:
+        raise NotImplementedError()
+
+    def get_chat_options(self, request: ChatCompletionRequest) -> dict:
+        options = {}
+        if request.sampling_params is not None:
+            for attr in {"temperature", "top_p", "top_k", "max_tokens"}:
+                if getattr(request.sampling_params, attr):
+                    options[attr] = getattr(request.sampling_params, attr)
+
+        return options
+
+    async def chat_completion(
+        self,
+        model: str,
+        messages: List[Message],
+        sampling_params: Optional[SamplingParams] = SamplingParams(),
+        tools: Optional[List[ToolDefinition]] = None,
+        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
+        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
+        stream: Optional[bool] = False,
+        logprobs: Optional[LogProbConfig] = None,
+    ) -> AsyncGenerator:
+        request = ChatCompletionRequest(
+            model=model,
+            messages=messages,
+            sampling_params=sampling_params,
+            tools=tools or [],
+            tool_choice=tool_choice,
+            tool_prompt_format=tool_prompt_format,
+            stream=stream,
+            logprobs=logprobs,
+        )
+
+        messages = prepare_messages(request)
+        model_input = self.formatter.encode_dialog_prompt(messages)
+        prompt = self.tokenizer.decode(model_input.tokens)
+
+        input_tokens = len(model_input.tokens)
+        max_new_tokens = min(
+            request.sampling_params.max_tokens or (self.max_tokens - input_tokens),
+            self.max_tokens - input_tokens - 1,
+        )
+
+        print(f"Calculated max_new_tokens: {max_new_tokens}")
+
+        assert (
+            request.model == self.model_name
+        ), f"Model mismatch, expected {self.model_name}, got {request.model}"
+
+        options = self.get_chat_options(request)
+        if not request.stream:
+            response = self.client.text_generation(
+                prompt=prompt,
+                stream=False,
+                details=True,
+                max_new_tokens=max_new_tokens,
+                stop_sequences=["<|eom_id|>", "<|eot_id|>"],
+                **options,
+            )
+            stop_reason = None
+            if response.details.finish_reason:
+                if response.details.finish_reason == "stop":
+                    stop_reason = StopReason.end_of_turn
+                elif response.details.finish_reason == "length":
+                    stop_reason = StopReason.out_of_tokens
+
+            completion_message = self.formatter.decode_assistant_message_from_content(
+                response.generated_text,
+                stop_reason,
+            )
+            yield ChatCompletionResponse(
+                completion_message=completion_message,
+                logprobs=None,
+            )
+
+        else:
+            yield ChatCompletionResponseStreamChunk(
+                event=ChatCompletionResponseEvent(
+                    event_type=ChatCompletionResponseEventType.start,
+                    delta="",
+                )
+            )
+            buffer = ""
+            ipython = False
+            stop_reason = None
+            tokens = []
+
+            for response in self.client.text_generation(
+                prompt=prompt,
+                stream=True,
+                details=True,
+                max_new_tokens=max_new_tokens,
+                stop_sequences=["<|eom_id|>", "<|eot_id|>"],
+                **options,
+            ):
+                token_result = response.token
+
+                buffer += token_result.text
+                tokens.append(token_result.id)
+
+                if not ipython and buffer.startswith("<|python_tag|>"):
+                    ipython = True
+                    yield ChatCompletionResponseStreamChunk(
+                        event=ChatCompletionResponseEvent(
+                            event_type=ChatCompletionResponseEventType.progress,
+                            delta=ToolCallDelta(
+                                content="",
+                                parse_status=ToolCallParseStatus.started,
+                            ),
+                        )
+                    )
+                    buffer = buffer[len("<|python_tag|>") :]
+                    continue
+
+                if token_result.text == "<|eot_id|>":
+                    stop_reason = StopReason.end_of_turn
+                    text = ""
+                elif token_result.text == "<|eom_id|>":
+                    stop_reason = StopReason.end_of_message
+                    text = ""
+                else:
+                    text = token_result.text
+
+                if ipython:
+                    delta = ToolCallDelta(
+                        content=text,
+                        parse_status=ToolCallParseStatus.in_progress,
+                    )
+                else:
+                    delta = text
+
+                if stop_reason is None:
+                    yield ChatCompletionResponseStreamChunk(
+                        event=ChatCompletionResponseEvent(
+                            event_type=ChatCompletionResponseEventType.progress,
+                            delta=delta,
+                            stop_reason=stop_reason,
+                        )
+                    )
+
+            if stop_reason is None:
+                stop_reason = StopReason.out_of_tokens
+
+            # parse tool calls and report errors
+            message = self.formatter.decode_assistant_message(tokens, stop_reason)
+            parsed_tool_calls = len(message.tool_calls) > 0
+            if ipython and not parsed_tool_calls:
+                yield ChatCompletionResponseStreamChunk(
+                    event=ChatCompletionResponseEvent(
+                        event_type=ChatCompletionResponseEventType.progress,
+                        delta=ToolCallDelta(
+                            content="",
+                            parse_status=ToolCallParseStatus.failure,
+                        ),
+                        stop_reason=stop_reason,
+                    )
+                )
+
+            for tool_call in message.tool_calls:
+                yield ChatCompletionResponseStreamChunk(
+                    event=ChatCompletionResponseEvent(
+                        event_type=ChatCompletionResponseEventType.progress,
+                        delta=ToolCallDelta(
+                            content=tool_call,
+                            parse_status=ToolCallParseStatus.success,
+                        ),
+                        stop_reason=stop_reason,
+                    )
+                )
+
+            yield ChatCompletionResponseStreamChunk(
+                event=ChatCompletionResponseEvent(
+                    event_type=ChatCompletionResponseEventType.complete,
+                    delta="",
+                    stop_reason=stop_reason,
+                )
+            )
+
+
+class InferenceEndpointAdapter(TGIAdapter):
+    def __init__(self, config: TGIImplConfig) -> None:
+        super().__init__(config)
+        self.config.url = self._construct_endpoint_url()
+
+    def _construct_endpoint_url(self) -> str:
+        hf_endpoint_name = self.config.hf_endpoint_name
+        assert hf_endpoint_name.count("/") <= 1, (
+            "Endpoint name must be in the format of 'namespace/endpoint_name' "
+            "or 'endpoint_name'"
+        )
+        if "/" not in hf_endpoint_name:
+            hf_namespace: str = self.get_namespace()
+            endpoint_path = f"{hf_namespace}/{hf_endpoint_name}"
+        else:
+            endpoint_path = hf_endpoint_name
+        return f"https://api.endpoints.huggingface.cloud/v2/endpoint/{endpoint_path}"
+
+    def get_namespace(self) -> str:
+        return HfApi().whoami()["name"]
+
+    @property
+    def client(self) -> InferenceClient:
+        return InferenceClient(model=self.inference_url, token=self.config.api_token)
+
+    def _get_endpoint_info(self) -> Dict[str, Any]:
+        headers = {
+            "accept": "application/json",
+            "authorization": f"Bearer {self.config.api_token}",
+        }
+        response = requests.get(self.config.url, headers=headers)
+        response.raise_for_status()
+        endpoint_info = response.json()
+        return {
+            "inference_url": endpoint_info["status"]["url"],
+            "model_id": endpoint_info["model"]["repository"],
+            "max_total_tokens": int(
+                endpoint_info["model"]["image"]["custom"]["env"]["MAX_TOTAL_TOKENS"]
+            ),
+        }
+
+    async def initialize(self) -> None:
+        await super().initialize()
--- a/llama_stack/providers/adapters/inference/together/init.py
+++ b/llama_stack/providers/adapters/inference/together/init.py
@ -0,0 +1,18 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .config import TogetherImplConfig
+
+
+async def get_adapter_impl(config: TogetherImplConfig, _deps):
+    from .together import TogetherInferenceAdapter
+
+    assert isinstance(
+        config, TogetherImplConfig
+    ), f"Unexpected config type: {type(config)}"
+    impl = TogetherInferenceAdapter(config)
+    await impl.initialize()
+    return impl
--- a/llama_stack/providers/adapters/inference/together/config.py
+++ b/llama_stack/providers/adapters/inference/together/config.py
@ -0,0 +1,20 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_models.schema_utils import json_schema_type
+from pydantic import BaseModel, Field
+
+
+@json_schema_type
+class TogetherImplConfig(BaseModel):
+    url: str = Field(
+        default="https://api.together.xyz/v1",
+        description="The URL for the Together AI server",
+    )
+    api_key: str = Field(
+        default="",
+        description="The Together AI API Key",
+    )
--- a/llama_stack/providers/adapters/inference/together/together.py
+++ b/llama_stack/providers/adapters/inference/together/together.py
@ -0,0 +1,252 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import AsyncGenerator
+
+from llama_models.llama3.api.chat_format import ChatFormat
+
+from llama_models.llama3.api.datatypes import Message, StopReason
+from llama_models.llama3.api.tokenizer import Tokenizer
+from llama_models.sku_list import resolve_model
+
+from together import Together
+
+from llama_stack.apis.inference import *  # noqa: F403
+from llama_stack.providers.utils.inference.prepare_messages import prepare_messages
+
+from .config import TogetherImplConfig
+
+TOGETHER_SUPPORTED_MODELS = {
+    "Meta-Llama3.1-8B-Instruct": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+    "Meta-Llama3.1-70B-Instruct": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+    "Meta-Llama3.1-405B-Instruct": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+}
+
+
+class TogetherInferenceAdapter(Inference):
+    def __init__(self, config: TogetherImplConfig) -> None:
+        self.config = config
+        tokenizer = Tokenizer.get_instance()
+        self.formatter = ChatFormat(tokenizer)
+
+    @property
+    def client(self) -> Together:
+        return Together(api_key=self.config.api_key)
+
+    async def initialize(self) -> None:
+        return
+
+    async def shutdown(self) -> None:
+        pass
+
+    async def completion(self, request: CompletionRequest) -> AsyncGenerator:
+        raise NotImplementedError()
+
+    def _messages_to_together_messages(self, messages: list[Message]) -> list:
+        together_messages = []
+        for message in messages:
+            if message.role == "ipython":
+                role = "tool"
+            else:
+                role = message.role
+            together_messages.append({"role": role, "content": message.content})
+
+        return together_messages
+
+    def resolve_together_model(self, model_name: str) -> str:
+        model = resolve_model(model_name)
+        assert (
+            model is not None
+            and model.descriptor(shorten_default_variant=True)
+            in TOGETHER_SUPPORTED_MODELS
+        ), f"Unsupported model: {model_name}, use one of the supported models: {','.join(TOGETHER_SUPPORTED_MODELS.keys())}"
+
+        return TOGETHER_SUPPORTED_MODELS.get(
+            model.descriptor(shorten_default_variant=True)
+        )
+
+    def get_together_chat_options(self, request: ChatCompletionRequest) -> dict:
+        options = {}
+        if request.sampling_params is not None:
+            for attr in {"temperature", "top_p", "top_k", "max_tokens"}:
+                if getattr(request.sampling_params, attr):
+                    options[attr] = getattr(request.sampling_params, attr)
+
+        return options
+
+    async def chat_completion(
+        self,
+        model: str,
+        messages: List[Message],
+        sampling_params: Optional[SamplingParams] = SamplingParams(),
+        tools: Optional[List[ToolDefinition]] = None,
+        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
+        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
+        stream: Optional[bool] = False,
+        logprobs: Optional[LogProbConfig] = None,
+    ) -> AsyncGenerator:
+        # wrapper request to make it easier to pass around (internal only, not exposed to API)
+        request = ChatCompletionRequest(
+            model=model,
+            messages=messages,
+            sampling_params=sampling_params,
+            tools=tools or [],
+            tool_choice=tool_choice,
+            tool_prompt_format=tool_prompt_format,
+            stream=stream,
+            logprobs=logprobs,
+        )
+
+        # accumulate sampling params and other options to pass to together
+        options = self.get_together_chat_options(request)
+        together_model = self.resolve_together_model(request.model)
+        messages = prepare_messages(request)
+
+        if not request.stream:
+            # TODO: might need to add back an async here
+            r = self.client.chat.completions.create(
+                model=together_model,
+                messages=self._messages_to_together_messages(messages),
+                stream=False,
+                **options,
+            )
+            stop_reason = None
+            if r.choices[0].finish_reason:
+                if (
+                    r.choices[0].finish_reason == "stop"
+                    or r.choices[0].finish_reason == "eos"
+                ):
+                    stop_reason = StopReason.end_of_turn
+                elif r.choices[0].finish_reason == "length":
+                    stop_reason = StopReason.out_of_tokens
+
+            completion_message = self.formatter.decode_assistant_message_from_content(
+                r.choices[0].message.content, stop_reason
+            )
+            yield ChatCompletionResponse(
+                completion_message=completion_message,
+                logprobs=None,
+            )
+        else:
+            yield ChatCompletionResponseStreamChunk(
+                event=ChatCompletionResponseEvent(
+                    event_type=ChatCompletionResponseEventType.start,
+                    delta="",
+                )
+            )
+
+            buffer = ""
+            ipython = False
+            stop_reason = None
+
+            for chunk in self.client.chat.completions.create(
+                model=together_model,
+                messages=self._messages_to_together_messages(messages),
+                stream=True,
+                **options,
+            ):
+                if chunk.choices[0].finish_reason:
+                    if (
+                        stop_reason is None and chunk.choices[0].finish_reason == "stop"
+                    ) or (
+                        stop_reason is None and chunk.choices[0].finish_reason == "eos"
+                    ):
+                        stop_reason = StopReason.end_of_turn
+                    elif (
+                        stop_reason is None
+                        and chunk.choices[0].finish_reason == "length"
+                    ):
+                        stop_reason = StopReason.out_of_tokens
+                    break
+
+                text = chunk.choices[0].delta.content
+                if text is None:
+                    continue
+
+                # check if its a tool call ( aka starts with <|python_tag|> )
+                if not ipython and text.startswith("<|python_tag|>"):
+                    ipython = True
+                    yield ChatCompletionResponseStreamChunk(
+                        event=ChatCompletionResponseEvent(
+                            event_type=ChatCompletionResponseEventType.progress,
+                            delta=ToolCallDelta(
+                                content="",
+                                parse_status=ToolCallParseStatus.started,
+                            ),
+                        )
+                    )
+                    buffer += text
+                    continue
+
+                if ipython:
+                    if text == "<|eot_id|>":
+                        stop_reason = StopReason.end_of_turn
+                        text = ""
+                        continue
+                    elif text == "<|eom_id|>":
+                        stop_reason = StopReason.end_of_message
+                        text = ""
+                        continue
+
+                    buffer += text
+                    delta = ToolCallDelta(
+                        content=text,
+                        parse_status=ToolCallParseStatus.in_progress,
+                    )
+
+                    yield ChatCompletionResponseStreamChunk(
+                        event=ChatCompletionResponseEvent(
+                            event_type=ChatCompletionResponseEventType.progress,
+                            delta=delta,
+                            stop_reason=stop_reason,
+                        )
+                    )
+                else:
+                    buffer += text
+                    yield ChatCompletionResponseStreamChunk(
+                        event=ChatCompletionResponseEvent(
+                            event_type=ChatCompletionResponseEventType.progress,
+                            delta=text,
+                            stop_reason=stop_reason,
+                        )
+                    )
+
+            # parse tool calls and report errors
+            message = self.formatter.decode_assistant_message_from_content(
+                buffer, stop_reason
+            )
+            parsed_tool_calls = len(message.tool_calls) > 0
+            if ipython and not parsed_tool_calls:
+                yield ChatCompletionResponseStreamChunk(
+                    event=ChatCompletionResponseEvent(
+                        event_type=ChatCompletionResponseEventType.progress,
+                        delta=ToolCallDelta(
+                            content="",
+                            parse_status=ToolCallParseStatus.failure,
+                        ),
+                        stop_reason=stop_reason,
+                    )
+                )
+
+            for tool_call in message.tool_calls:
+                yield ChatCompletionResponseStreamChunk(
+                    event=ChatCompletionResponseEvent(
+                        event_type=ChatCompletionResponseEventType.progress,
+                        delta=ToolCallDelta(
+                            content=tool_call,
+                            parse_status=ToolCallParseStatus.success,
+                        ),
+                        stop_reason=stop_reason,
+                    )
+                )
+
+            yield ChatCompletionResponseStreamChunk(
+                event=ChatCompletionResponseEvent(
+                    event_type=ChatCompletionResponseEventType.complete,
+                    delta="",
+                    stop_reason=stop_reason,
+                )
+            )