Merge branch 'main' into nvidia-e2e-notebook

2025-07-21 03:59:42 +00:00 · 2025-04-15 08:38:41 -04:00 · 2025-04-15 08:38:41 -04:00 · 7cdd2a0410
commit 7cdd2a0410
parent 1a76c55df4 83b5523e2d
264 changed files with 229042 additions and 8445 deletions
--- a/llama_stack/providers/datatypes.py
+++ b/llama_stack/providers/datatypes.py
@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from enum import Enum
 from typing import Any, List, Optional, Protocol
 from urllib.parse import urlparse

@ -201,3 +202,12 @@ def remote_provider_spec(
        adapter=adapter,
        api_dependencies=api_dependencies or [],
    )
+
+
+class HealthStatus(str, Enum):
+    OK = "OK"
+    ERROR = "Error"
+    NOT_IMPLEMENTED = "Not Implemented"
+
+
+HealthResponse = dict[str, Any]
--- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
+++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
@ -52,6 +52,7 @@ from llama_stack.apis.inference import (
    StopReason,
    SystemMessage,
    ToolDefinition,
+    ToolParamDefinition,
    ToolResponse,
    ToolResponseMessage,
    UserMessage,
@ -63,7 +64,6 @@ from llama_stack.log import get_logger
 from llama_stack.models.llama.datatypes import (
    BuiltinTool,
    ToolCall,
-    ToolParamDefinition,
 )
 from llama_stack.providers.utils.kvstore import KVStore
 from llama_stack.providers.utils.telemetry import tracing
@ -89,7 +89,6 @@ class ChatAgent(ShieldRunnerMixin):
        self,
        agent_id: str,
        agent_config: AgentConfig,
-        tempdir: str,
        inference_api: Inference,
        safety_api: Safety,
        tool_runtime_api: ToolRuntime,
@ -99,7 +98,6 @@ class ChatAgent(ShieldRunnerMixin):
    ):
        self.agent_id = agent_id
        self.agent_config = agent_config
-        self.tempdir = tempdir
        self.inference_api = inference_api
        self.safety_api = safety_api
        self.vector_io_api = vector_io_api
@ -255,7 +253,7 @@ class ChatAgent(ShieldRunnerMixin):
                    )
                )
            )
-            input_messages = last_turn_messages
+            input_messages = last_turn.input_messages

            turn_id = request.turn_id
            start_time = last_turn.started_at
--- a/llama_stack/providers/inline/agents/meta_reference/agents.py
+++ b/llama_stack/providers/inline/agents/meta_reference/agents.py
@ -7,7 +7,6 @@
 import json
 import logging
 import shutil
-import tempfile
 import uuid
 from typing import AsyncGenerator, List, Optional, Union

@ -64,7 +63,6 @@ class MetaReferenceAgentsImpl(Agents):
        self.tool_groups_api = tool_groups_api

        self.in_memory_store = InmemoryKVStoreImpl()
-        self.tempdir = tempfile.mkdtemp()

    async def initialize(self) -> None:
        self.persistence_store = await kvstore_impl(self.config.persistence_store)
@ -107,7 +105,6 @@ class MetaReferenceAgentsImpl(Agents):
        return ChatAgent(
            agent_id=agent_id,
            agent_config=agent_config,
-            tempdir=self.tempdir,
            inference_api=self.inference_api,
            safety_api=self.safety_api,
            vector_io_api=self.vector_io_api,
--- a/llama_stack/providers/inline/inference/meta_reference/init.py
+++ b/llama_stack/providers/inline/inference/meta_reference/init.py
@ -4,13 +4,13 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, Union
+from typing import Any, Dict

-from .config import MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig
+from .config import MetaReferenceInferenceConfig


 async def get_provider_impl(
-    config: Union[MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig],
+    config: MetaReferenceInferenceConfig,
    _deps: Dict[str, Any],
 ):
    from .inference import MetaReferenceInferenceImpl
--- a/llama_stack/providers/inline/inference/meta_reference/common.py
+++ b/llama_stack/providers/inline/inference/meta_reference/common.py
@ -5,19 +5,10 @@
 # the root directory of this source tree.

 from pathlib import Path
-from typing import List, Optional
-
-from pydantic import BaseModel

 from llama_stack.distribution.utils.model_utils import model_local_dir


-class TokenResult(BaseModel):
-    token: int
-    text: str
-    logprobs: Optional[List[float]] = None
-
-
 def model_checkpoint_dir(model_id) -> str:
    checkpoint_dir = Path(model_local_dir(model_id))

--- a/llama_stack/providers/inline/inference/meta_reference/config.py
+++ b/llama_stack/providers/inline/inference/meta_reference/config.py
@ -21,6 +21,7 @@ class MetaReferenceInferenceConfig(BaseModel):
    torch_seed: Optional[int] = None
    max_seq_len: int = 4096
    max_batch_size: int = 1
+    model_parallel_size: Optional[int] = None

    # when this is False, we assume that the distributed process group is setup by someone
    # outside of this code (e.g., when run inside `torchrun`). that is useful for clients
@ -31,6 +32,8 @@ class MetaReferenceInferenceConfig(BaseModel):
    # can override by specifying the directory explicitly
    checkpoint_dir: Optional[str] = None

+    quantization: Optional[QuantizationConfig] = None
+
    @field_validator("model")
    @classmethod
    def validate_model(cls, model: str) -> str:
@ -47,27 +50,19 @@ class MetaReferenceInferenceConfig(BaseModel):
        cls,
        model: str = "Llama3.2-3B-Instruct",
        checkpoint_dir: str = "${env.CHECKPOINT_DIR:null}",
+        quantization_type: str = "${env.QUANTIZATION_TYPE:bf16}",
+        model_parallel_size: str = "${env.MODEL_PARALLEL_SIZE:0}",
+        max_batch_size: str = "${env.MAX_BATCH_SIZE:1}",
+        max_seq_len: str = "${env.MAX_SEQ_LEN:4096}",
        **kwargs,
    ) -> Dict[str, Any]:
        return {
            "model": model,
-            "max_seq_len": 4096,
            "checkpoint_dir": checkpoint_dir,
+            "quantization": {
+                "type": quantization_type,
+            },
+            "model_parallel_size": model_parallel_size,
+            "max_batch_size": max_batch_size,
+            "max_seq_len": max_seq_len,
        }
-
-
-class MetaReferenceQuantizedInferenceConfig(MetaReferenceInferenceConfig):
-    quantization: QuantizationConfig
-
-    @classmethod
-    def sample_run_config(
-        cls,
-        model: str = "Llama3.2-3B-Instruct",
-        checkpoint_dir: str = "${env.CHECKPOINT_DIR:null}",
-        **kwargs,
-    ) -> Dict[str, Any]:
-        config = super().sample_run_config(model, checkpoint_dir, **kwargs)
-        config["quantization"] = {
-            "type": "fp8",
-        }
-        return config
--- a/llama_stack/providers/inline/inference/meta_reference/generators.py
+++ b/llama_stack/providers/inline/inference/meta_reference/generators.py
@ -0,0 +1,212 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import math
+from typing import Generator, List, Optional, Tuple
+
+import torch
+from lmformatenforcer import JsonSchemaParser, TokenEnforcer, TokenEnforcerTokenizerData
+
+from llama_stack.apis.inference import (
+    GreedySamplingStrategy,
+    JsonSchemaResponseFormat,
+    ResponseFormat,
+    SamplingParams,
+    TopPSamplingStrategy,
+)
+from llama_stack.models.llama.datatypes import QuantizationMode
+from llama_stack.models.llama.llama3.generation import Llama3
+from llama_stack.models.llama.llama3.tokenizer import Tokenizer as Llama3Tokenizer
+from llama_stack.models.llama.llama4.generation import Llama4
+from llama_stack.models.llama.llama4.tokenizer import Tokenizer as Llama4Tokenizer
+from llama_stack.models.llama.sku_types import Model, ModelFamily
+from llama_stack.providers.utils.inference.prompt_adapter import (
+    ChatCompletionRequestWithRawContent,
+    CompletionRequestWithRawContent,
+    get_default_tool_prompt_format,
+)
+
+from .common import model_checkpoint_dir
+from .config import MetaReferenceInferenceConfig
+from .inference import resolve_model
+
+Tokenizer = Llama4Tokenizer | Llama3Tokenizer
+
+
+class LogitsProcessor:
+    def __init__(self, token_enforcer: TokenEnforcer):
+        self.token_enforcer = token_enforcer
+        self.mask: Optional[torch.Tensor] = None
+
+    def __call__(self, tokens: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
+        token_sequence = tokens[0, :].tolist()
+        allowed_tokens = self.token_enforcer.get_allowed_tokens(token_sequence)
+
+        if self.mask is not None:
+            self.mask.fill_(-math.inf)
+        else:
+            self.mask = torch.full_like(scores, -math.inf)
+
+        self.mask[:, :, allowed_tokens] = 0
+        scores = scores + self.mask
+        return scores
+
+
+def get_logits_processor(
+    tokenizer: Tokenizer,
+    vocab_size: int,
+    response_format: Optional[ResponseFormat],
+) -> Optional["LogitsProcessor"]:
+    if response_format is None:
+        return None
+
+    if not isinstance(response_format, JsonSchemaResponseFormat):
+        raise ValueError(f"Unsupported response format type {response_format.type}")
+
+    parser = JsonSchemaParser(response_format.json_schema)
+    data = TokenEnforcerTokenizerData(
+        _build_regular_tokens_list(tokenizer, vocab_size),
+        tokenizer.decode,
+        tokenizer.stop_tokens,
+    )
+    token_enforcer = TokenEnforcer(data, parser)
+    return LogitsProcessor(token_enforcer)
+
+
+def _build_regular_tokens_list(tokenizer: Tokenizer, vocab_size: int) -> List[Tuple[int, str, bool]]:
+    token_0 = tokenizer.encode("0", bos=False, eos=False)[-1]
+    regular_tokens = []
+
+    special_token_ids = set(tokenizer.special_tokens.values())
+    for token_idx in range(vocab_size):
+        if token_idx in special_token_ids:
+            continue
+
+        # Decode the token after a leading "0" token and drop that first character, so a word-start token keeps its leading space.
+        decoded_after_0 = tokenizer.decode([token_0, token_idx])[1:]
+        decoded_regular = tokenizer.decode([token_idx])
+        is_word_start_token = len(decoded_after_0) > len(decoded_regular)
+        regular_tokens.append((token_idx, decoded_after_0, is_word_start_token))
+    return regular_tokens
+
+
+def _infer_sampling_params(sampling_params: SamplingParams):
+    if isinstance(sampling_params.strategy, GreedySamplingStrategy):
+        temperature = 0.0
+        top_p = 1.0
+    elif isinstance(sampling_params.strategy, TopPSamplingStrategy):
+        temperature = sampling_params.strategy.temperature or 1.0
+        top_p = sampling_params.strategy.top_p or 1.0
+    else:
+        raise ValueError(f"Unsupported sampling strategy {sampling_params.strategy}")
+    return temperature, top_p
+
+
+def _infer_tool_prompt_format(request: ChatCompletionRequestWithRawContent):
+    tool_config = request.tool_config
+    if tool_config is not None and tool_config.tool_prompt_format is not None:
+        return tool_config.tool_prompt_format
+    else:
+        return get_default_tool_prompt_format(request.model)
+
+
+class LlamaGenerator:
+    def __init__(
+        self,
+        config: MetaReferenceInferenceConfig,
+        model_id: str,
+        llama_model: Model,
+    ):
+        if config.checkpoint_dir and config.checkpoint_dir != "null":
+            ckpt_dir = config.checkpoint_dir
+        else:
+            resolved_model = resolve_model(model_id)
+            if resolved_model is None:
+                # if the model is not a native llama model, get the default checkpoint_dir based on model id
+                ckpt_dir = model_checkpoint_dir(model_id)
+            else:
+                # if the model is a native llama model, get the default checkpoint_dir based on the resolved model's descriptor()
+                ckpt_dir = model_checkpoint_dir(resolved_model.descriptor())
+
+        if config.quantization:
+            if config.quantization.type == "fp8_mixed":
+                quantization_mode = QuantizationMode.fp8_mixed
+            elif config.quantization.type == "int4_mixed":
+                quantization_mode = QuantizationMode.int4_mixed
+            elif config.quantization.type == "bf16":
+                quantization_mode = None
+            else:
+                raise ValueError(f"Unsupported quantization mode {config.quantization}")
+        else:
+            quantization_mode = None
+
+        cls = Llama4 if llama_model.model_family == ModelFamily.llama4 else Llama3
+        self.inner_generator = cls.build(
+            ckpt_dir=ckpt_dir,
+            max_seq_len=config.max_seq_len,
+            max_batch_size=config.max_batch_size,
+            world_size=config.model_parallel_size or llama_model.pth_file_count,
+            quantization_mode=quantization_mode,
+        )
+
+        self.tokenizer = self.inner_generator.tokenizer
+        self.args = self.inner_generator.args
+        self.formatter = self.inner_generator.formatter
+
+    def completion(
+        self,
+        request_batch: List[CompletionRequestWithRawContent],
+    ) -> Generator:
+        first_request = request_batch[0]
+        sampling_params = first_request.sampling_params or SamplingParams()
+        max_gen_len = sampling_params.max_tokens
+        if max_gen_len is None or max_gen_len == 0 or max_gen_len >= self.args.max_seq_len:
+            max_gen_len = self.args.max_seq_len - 1
+
+        temperature, top_p = _infer_sampling_params(sampling_params)
+        for result in self.inner_generator.generate(
+            llm_inputs=[self.formatter.encode_content(request.content) for request in request_batch],
+            max_gen_len=max_gen_len,
+            temperature=temperature,
+            top_p=top_p,
+            logprobs=bool(first_request.logprobs),
+            echo=False,
+            logits_processor=get_logits_processor(
+                self.tokenizer,
+                self.args.vocab_size,
+                first_request.response_format,
+            ),
+        ):
+            yield result
+
+    def chat_completion(
+        self,
+        request_batch: List[ChatCompletionRequestWithRawContent],
+    ) -> Generator:
+        first_request = request_batch[0]
+        sampling_params = first_request.sampling_params or SamplingParams()
+        max_gen_len = sampling_params.max_tokens
+        if max_gen_len is None or max_gen_len == 0 or max_gen_len >= self.args.max_seq_len:
+            max_gen_len = self.args.max_seq_len - 1
+
+        temperature, top_p = _infer_sampling_params(sampling_params)
+        for result in self.inner_generator.generate(
+            llm_inputs=[
+                self.formatter.encode_dialog_prompt(request.messages, _infer_tool_prompt_format(request))
+                for request in request_batch
+            ],
+            max_gen_len=max_gen_len,
+            temperature=temperature,
+            top_p=top_p,
+            logprobs=bool(first_request.logprobs),
+            echo=False,
+            logits_processor=get_logits_processor(
+                self.tokenizer,
+                self.args.vocab_size,
+                first_request.response_format,
+            ),
+        ):
+            yield result
--- a/llama_stack/providers/inline/inference/meta_reference/inference.py
+++ b/llama_stack/providers/inline/inference/meta_reference/inference.py
@ -5,15 +5,20 @@
 # the root directory of this source tree.

 import asyncio
-import logging
+import os
 from typing import AsyncGenerator, List, Optional, Union

+from pydantic import BaseModel
+from termcolor import cprint
+
 from llama_stack.apis.common.content_types import (
    TextDelta,
    ToolCallDelta,
    ToolCallParseStatus,
 )
 from llama_stack.apis.inference import (
+    BatchChatCompletionResponse,
+    BatchCompletionResponse,
    ChatCompletionRequest,
    ChatCompletionResponse,
    ChatCompletionResponseEvent,
@ -28,18 +33,23 @@ from llama_stack.apis.inference import (
    LogProbConfig,
    Message,
    ResponseFormat,
+    SamplingParams,
+    StopReason,
    TokenLogProbs,
    ToolChoice,
    ToolConfig,
-)
-from llama_stack.apis.models import Model, ModelType
-from llama_stack.models.llama.datatypes import (
-    SamplingParams,
-    StopReason,
    ToolDefinition,
    ToolPromptFormat,
+    UserMessage,
 )
+from llama_stack.apis.models import Model, ModelType
+from llama_stack.log import get_logger
+from llama_stack.models.llama.llama3.chat_format import ChatFormat as Llama3ChatFormat
+from llama_stack.models.llama.llama3.tokenizer import Tokenizer as Llama3Tokenizer
+from llama_stack.models.llama.llama4.chat_format import ChatFormat as Llama4ChatFormat
+from llama_stack.models.llama.llama4.tokenizer import Tokenizer as Llama4Tokenizer
 from llama_stack.models.llama.sku_list import resolve_model
+from llama_stack.models.llama.sku_types import ModelFamily
 from llama_stack.providers.datatypes import ModelsProtocolPrivate
 from llama_stack.providers.utils.inference.embedding_mixin import (
    SentenceTransformerEmbeddingMixin,
@ -48,6 +58,10 @@ from llama_stack.providers.utils.inference.model_registry import (
    ModelRegistryHelper,
    build_hf_repo_model_entry,
 )
+from llama_stack.providers.utils.inference.openai_compat import (
+    OpenAIChatCompletionToLlamaStackMixin,
+    OpenAICompletionToLlamaStackMixin,
+)
 from llama_stack.providers.utils.inference.prompt_adapter import (
    augment_content_with_response_format_prompt,
    chat_completion_request_to_messages,
@ -55,16 +69,22 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
 )

 from .config import MetaReferenceInferenceConfig
-from .llama3.generation import Llama3
+from .generators import LlamaGenerator
 from .model_parallel import LlamaModelParallelGenerator

-log = logging.getLogger(__name__)
+log = get_logger(__name__, category="inference")
 # there's a single model parallel process running serving the model. for now,
 # we don't support multiple concurrent requests to this process.
 SEMAPHORE = asyncio.Semaphore(1)


+def llama_builder_fn(config: MetaReferenceInferenceConfig, model_id: str, llama_model: Model) -> LlamaGenerator:
+    return LlamaGenerator(config, model_id, llama_model)
+
+
 class MetaReferenceInferenceImpl(
+    OpenAICompletionToLlamaStackMixin,
+    OpenAIChatCompletionToLlamaStackMixin,
    SentenceTransformerEmbeddingMixin,
    Inference,
    ModelsProtocolPrivate,
@ -77,29 +97,10 @@ class MetaReferenceInferenceImpl(
    async def initialize(self) -> None:
        pass

-    async def load_model(self, model_id, llama_model) -> None:
-        log.info(f"Loading model `{model_id}`")
-        if self.config.create_distributed_process_group:
-            self.generator = LlamaModelParallelGenerator(self.config, model_id, llama_model)
-            self.generator.start()
-        else:
-            self.generator = Llama3.build(self.config, model_id, llama_model)
-
-        self.model_id = model_id
-        self.llama_model = llama_model
-
    async def shutdown(self) -> None:
        if self.config.create_distributed_process_group:
            self.generator.stop()

-    def check_model(self, request) -> None:
-        if self.model_id is None or self.llama_model is None:
-            raise RuntimeError(
-                "No avaible model yet, please register your requested model or add your model in the resouces first"
-            )
-        elif request.model != self.model_id:
-            raise RuntimeError(f"Model mismatch: request model: {request.model} != loaded model: {self.model_id}")
-
    async def unregister_model(self, model_id: str) -> None:
        pass

@ -127,11 +128,58 @@ class MetaReferenceInferenceImpl(
        if model.model_type == ModelType.embedding:
            self._load_sentence_transformer_model(model.provider_resource_id)

+        # TODO: remove this; model metadata is not a proper way to request that
+        # loading be skipped ("skip_load"), so this check should go away
        if "skip_load" in model.metadata and model.metadata["skip_load"]:
            return model
+
        await self.load_model(model.identifier, llama_model)
        return model

+    async def load_model(self, model_id, llama_model) -> None:
+        log.info(f"Loading model `{model_id}`")
+
+        builder_params = [self.config, model_id, llama_model]
+
+        if self.config.create_distributed_process_group:
+            self.generator = LlamaModelParallelGenerator(
+                model_parallel_size=self.config.model_parallel_size or llama_model.pth_file_count,
+                builder_fn=llama_builder_fn,
+                builder_params=builder_params,
+                formatter=(
+                    Llama4ChatFormat(Llama4Tokenizer.get_instance())
+                    if llama_model.model_family == ModelFamily.llama4
+                    else Llama3ChatFormat(Llama3Tokenizer.get_instance())
+                ),
+            )
+            self.generator.start()
+        else:
+            self.generator = llama_builder_fn(*builder_params)
+
+        self.model_id = model_id
+        self.llama_model = llama_model
+
+        log.info("Warming up...")
+        await self.completion(
+            model_id=model_id,
+            content="Hello, world!",
+            sampling_params=SamplingParams(max_tokens=10),
+        )
+        await self.chat_completion(
+            model_id=model_id,
+            messages=[UserMessage(content="Hi how are you?")],
+            sampling_params=SamplingParams(max_tokens=20),
+        )
+        log.info("Warmed up!")
+
+    def check_model(self, request) -> None:
+        if self.model_id is None or self.llama_model is None:
+            raise RuntimeError(
+                "No avaible model yet, please register your requested model or add your model in the resouces first"
+            )
+        elif request.model != self.model_id:
+            raise RuntimeError(f"Model mismatch: request model: {request.model} != loaded model: {self.model_id}")
+
    async def completion(
        self,
        model_id: str,
@ -161,17 +209,55 @@ class MetaReferenceInferenceImpl(
        if request.stream:
            return self._stream_completion(request)
        else:
-            return await self._nonstream_completion(request)
+            results = await self._nonstream_completion([request])
+            return results[0]
+
+    async def batch_completion(
+        self,
+        model_id: str,
+        content_batch: List[InterleavedContent],
+        sampling_params: Optional[SamplingParams] = None,
+        response_format: Optional[ResponseFormat] = None,
+        stream: Optional[bool] = False,
+        logprobs: Optional[LogProbConfig] = None,
+    ) -> BatchCompletionResponse:
+        if sampling_params is None:
+            sampling_params = SamplingParams()
+        if logprobs:
+            assert logprobs.top_k == 1, f"Unexpected top_k={logprobs.top_k}"
+
+        content_batch = [
+            augment_content_with_response_format_prompt(response_format, content) for content in content_batch
+        ]
+
+        request_batch = []
+        for content in content_batch:
+            request = CompletionRequest(
+                model=model_id,
+                content=content,
+                sampling_params=sampling_params,
+                response_format=response_format,
+                stream=stream,
+                logprobs=logprobs,
+            )
+            self.check_model(request)
+            request = await convert_request_to_raw(request)
+            request_batch.append(request)
+
+        results = await self._nonstream_completion(request_batch)
+        return BatchCompletionResponse(batch=results)

    async def _stream_completion(self, request: CompletionRequest) -> AsyncGenerator:
+        tokenizer = self.generator.formatter.tokenizer
+
        def impl():
            stop_reason = None

            for token_result in self.generator.completion(request):
-                if token_result.text == "<|eot_id|>":
+                if token_result.token == tokenizer.eot_id:
                    stop_reason = StopReason.end_of_turn
                    text = ""
-                elif token_result.text == "<|eom_id|>":
+                elif token_result.token == tokenizer.eom_id:
                    stop_reason = StopReason.end_of_message
                    text = ""
                else:
@ -204,37 +290,54 @@ class MetaReferenceInferenceImpl(
            for x in impl():
                yield x

-    async def _nonstream_completion(self, request: CompletionRequest) -> CompletionResponse:
+    async def _nonstream_completion(self, request_batch: List[CompletionRequest]) -> List[CompletionResponse]:
+        tokenizer = self.generator.formatter.tokenizer
+
+        first_request = request_batch[0]
+
+        class ItemState(BaseModel):
+            tokens: List[int] = []
+            logprobs: List[TokenLogProbs] = []
+            stop_reason: StopReason | None = None
+            finished: bool = False
+
        def impl():
-            tokens = []
-            logprobs = []
-            stop_reason = None
+            states = [ItemState() for _ in request_batch]

-            for token_result in self.generator.completion(request):
-                tokens.append(token_result.token)
-                if token_result.text == "<|eot_id|>":
-                    stop_reason = StopReason.end_of_turn
-                elif token_result.text == "<|eom_id|>":
-                    stop_reason = StopReason.end_of_message
+            results = []
+            for token_results in self.generator.completion(request_batch):
+                for result in token_results:
+                    idx = result.batch_idx
+                    state = states[idx]
+                    if state.finished or result.ignore_token:
+                        continue

-                if request.logprobs:
-                    assert len(token_result.logprobs) == 1
+                    state.finished = result.finished
+                    if first_request.logprobs:
+                        state.logprobs.append(TokenLogProbs(logprobs_by_token={result.text: result.logprobs[0]}))

-                    logprobs.append(TokenLogProbs(logprobs_by_token={token_result.text: token_result.logprobs[0]}))
+                    state.tokens.append(result.token)
+                    if result.token == tokenizer.eot_id:
+                        state.stop_reason = StopReason.end_of_turn
+                    elif result.token == tokenizer.eom_id:
+                        state.stop_reason = StopReason.end_of_message

-            if stop_reason is None:
-                stop_reason = StopReason.out_of_tokens
+            for state in states:
+                if state.stop_reason is None:
+                    state.stop_reason = StopReason.out_of_tokens

-            content = self.generator.formatter.tokenizer.decode(tokens)
-            if content.endswith("<|eot_id|>"):
-                content = content[: -len("<|eot_id|>")]
-            elif content.endswith("<|eom_id|>"):
-                content = content[: -len("<|eom_id|>")]
-            return CompletionResponse(
-                content=content,
-                stop_reason=stop_reason,
-                logprobs=logprobs if request.logprobs else None,
-            )
+                if state.tokens[-1] in self.generator.formatter.tokenizer.stop_tokens:
+                    state.tokens = state.tokens[:-1]
+                content = self.generator.formatter.tokenizer.decode(state.tokens)
+                results.append(
+                    CompletionResponse(
+                        content=content,
+                        stop_reason=state.stop_reason,
+                        logprobs=state.logprobs if first_request.logprobs else None,
+                    )
+                )
+
+            return results

        if self.config.create_distributed_process_group:
            async with SEMAPHORE:
@ -269,7 +372,7 @@ class MetaReferenceInferenceImpl(
            response_format=response_format,
            stream=stream,
            logprobs=logprobs,
-            tool_config=tool_config,
+            tool_config=tool_config or ToolConfig(),
        )
        self.check_model(request)

@ -285,39 +388,110 @@ class MetaReferenceInferenceImpl(
        if request.stream:
            return self._stream_chat_completion(request)
        else:
-            return await self._nonstream_chat_completion(request)
+            results = await self._nonstream_chat_completion([request])
+            return results[0]

-    async def _nonstream_chat_completion(self, request: ChatCompletionRequest) -> ChatCompletionResponse:
-        def impl():
-            tokens = []
-            logprobs = []
-            stop_reason = None
+    async def batch_chat_completion(
+        self,
+        model_id: str,
+        messages_batch: List[List[Message]],
+        sampling_params: Optional[SamplingParams] = None,
+        response_format: Optional[ResponseFormat] = None,
+        tools: Optional[List[ToolDefinition]] = None,
+        stream: Optional[bool] = False,
+        logprobs: Optional[LogProbConfig] = None,
+        tool_config: Optional[ToolConfig] = None,
+    ) -> BatchChatCompletionResponse:
+        if sampling_params is None:
+            sampling_params = SamplingParams()
+        if logprobs:
+            assert logprobs.top_k == 1, f"Unexpected top_k={logprobs.top_k}"

-            for token_result in self.generator.chat_completion(request):
-                tokens.append(token_result.token)
-
-                if token_result.text == "<|eot_id|>":
-                    stop_reason = StopReason.end_of_turn
-                elif token_result.text == "<|eom_id|>":
-                    stop_reason = StopReason.end_of_message
-
-                if request.logprobs:
-                    assert len(token_result.logprobs) == 1
-
-                    logprobs.append(TokenLogProbs(logprobs_by_token={token_result.text: token_result.logprobs[0]}))
-
-            if stop_reason is None:
-                stop_reason = StopReason.out_of_tokens
-
-            raw_message = self.generator.formatter.decode_assistant_message(tokens, stop_reason)
-            return ChatCompletionResponse(
-                completion_message=CompletionMessage(
-                    content=raw_message.content,
-                    stop_reason=raw_message.stop_reason,
-                    tool_calls=raw_message.tool_calls,
-                ),
-                logprobs=logprobs if request.logprobs else None,
+        # wrapper request to make it easier to pass around (internal only, not exposed to API)
+        request_batch = []
+        for messages in messages_batch:
+            request = ChatCompletionRequest(
+                model=model_id,
+                messages=messages,
+                sampling_params=sampling_params,
+                tools=tools or [],
+                response_format=response_format,
+                logprobs=logprobs,
+                tool_config=tool_config or ToolConfig(),
            )
+            self.check_model(request)
+
+            # augment and rewrite messages depending on the model
+            request.messages = chat_completion_request_to_messages(request, self.llama_model.core_model_id.value)
+            # download media and convert to raw content so we can send it to the model
+            request = await convert_request_to_raw(request)
+            request_batch.append(request)
+
+        if self.config.create_distributed_process_group:
+            if SEMAPHORE.locked():
+                raise RuntimeError("Only one concurrent request is supported")
+
+        results = await self._nonstream_chat_completion(request_batch)
+        return BatchChatCompletionResponse(batch=results)
+
+    async def _nonstream_chat_completion(
+        self, request_batch: List[ChatCompletionRequest]
+    ) -> List[ChatCompletionResponse]:
+        tokenizer = self.generator.formatter.tokenizer
+
+        first_request = request_batch[0]
+
+        class ItemState(BaseModel):
+            tokens: List[int] = []
+            logprobs: List[TokenLogProbs] = []
+            stop_reason: StopReason | None = None
+            finished: bool = False
+
+        def impl():
+            states = [ItemState() for _ in request_batch]
+
+            for token_results in self.generator.chat_completion(request_batch):
+                first = token_results[0]
+                if not first.finished and not first.ignore_token:
+                    if os.environ.get("LLAMA_MODELS_DEBUG", "0") in ("1", "2"):
+                        cprint(first.text, "cyan", end="")
+                    if os.environ.get("LLAMA_MODELS_DEBUG", "0") == "2":
+                        cprint(f"<{first.token}>", "magenta", end="")
+
+                for result in token_results:
+                    idx = result.batch_idx
+                    state = states[idx]
+                    if state.finished or result.ignore_token:
+                        continue
+
+                    state.finished = result.finished
+                    if first_request.logprobs:
+                        state.logprobs.append(TokenLogProbs(logprobs_by_token={result.text: result.logprobs[0]}))
+
+                    state.tokens.append(result.token)
+                    if result.token == tokenizer.eot_id:
+                        state.stop_reason = StopReason.end_of_turn
+                    elif result.token == tokenizer.eom_id:
+                        state.stop_reason = StopReason.end_of_message
+
+            results = []
+            for state in states:
+                if state.stop_reason is None:
+                    state.stop_reason = StopReason.out_of_tokens
+
+                raw_message = self.generator.formatter.decode_assistant_message(state.tokens, state.stop_reason)
+                results.append(
+                    ChatCompletionResponse(
+                        completion_message=CompletionMessage(
+                            content=raw_message.content,
+                            stop_reason=raw_message.stop_reason,
+                            tool_calls=raw_message.tool_calls,
+                        ),
+                        logprobs=state.logprobs if first_request.logprobs else None,
+                    )
+                )
+
+            return results

        if self.config.create_distributed_process_group:
            async with SEMAPHORE:
@ -326,6 +500,8 @@ class MetaReferenceInferenceImpl(
            return impl()

    async def _stream_chat_completion(self, request: ChatCompletionRequest) -> AsyncGenerator:
+        tokenizer = self.generator.formatter.tokenizer
+
        def impl():
            yield ChatCompletionResponseStreamChunk(
                event=ChatCompletionResponseEvent(
@ -340,6 +516,25 @@ class MetaReferenceInferenceImpl(
            ipython = False

            for token_result in self.generator.chat_completion(request):
+                if os.environ.get("LLAMA_MODELS_DEBUG", "0") == "1":
+                    cprint(token_result.text, "cyan", end="")
+                if os.environ.get("LLAMA_MODELS_DEBUG", "0") == "2":
+                    cprint(f"<{token_result.token}>", "magenta", end="")
+
+                if token_result.token == tokenizer.eot_id:
+                    stop_reason = StopReason.end_of_turn
+                    text = ""
+                elif token_result.token == tokenizer.eom_id:
+                    stop_reason = StopReason.end_of_message
+                    text = ""
+                else:
+                    text = token_result.text
+
+                if request.logprobs:
+                    assert len(token_result.logprobs) == 1
+
+                    logprobs.append(TokenLogProbs(logprobs_by_token={token_result.text: token_result.logprobs[0]}))
+
                tokens.append(token_result.token)

                if not ipython and token_result.text.startswith("<|python_tag|>"):
@ -355,10 +550,10 @@ class MetaReferenceInferenceImpl(
                    )
                    continue

-                if token_result.text == "<|eot_id|>":
+                if token_result.token == tokenizer.eot_id:
                    stop_reason = StopReason.end_of_turn
                    text = ""
-                elif token_result.text == "<|eom_id|>":
+                elif token_result.token == tokenizer.eom_id:
                    stop_reason = StopReason.end_of_message
                    text = ""
                else:
--- a/llama_stack/providers/inline/inference/meta_reference/llama3/args.py
+++ b/llama_stack/providers/inline/inference/meta_reference/llama3/args.py
@ -1,82 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# top-level folder for each specific model found within the models/ directory at
-# the top-level of this source tree.
-
-from dataclasses import dataclass
-from enum import Enum
-from typing import Optional
-
-
-class QuantizationScheme(Enum):
-    int4_weight_int8_dynamic_activation = "int4_weight_int8_dynamic_activation"
-
-
-@dataclass
-class QuantizationArgs:
-    scheme: Optional[QuantizationScheme] = None
-    group_size: Optional[int] = None
-    spinquant: bool = False
-
-    def __init__(self, **kwargs):
-        for k, v in kwargs.items():
-            if k == "scheme":
-                setattr(self, k, QuantizationScheme(v))
-            else:
-                if hasattr(self, k):
-                    setattr(self, k, v)
-
-
-@dataclass
-class LoRAArgs:
-    rank: int
-    scale: float
-
-
-@dataclass
-class ModelArgs:
-    dim: int = 4096
-    n_layers: int = 32
-    n_heads: int = 32
-    n_kv_heads: Optional[int] = None
-    vocab_size: int = -1
-    multiple_of: int = 256  # make SwiGLU hidden layer size multiple of large power of 2
-    ffn_dim_multiplier: Optional[float] = None
-    norm_eps: float = 1e-5
-    rope_theta: float = 500000
-    use_scaled_rope: bool = False
-
-    max_batch_size: int = 32
-    max_seq_len: int = 2048
-
-    # vision model params
-    vision_chunk_size: int = -1  # image resolution for image models
-    vision_max_num_chunks: int = 4
-    vision_num_cross_attention_layers: int = -1
-
-    quantization_args: Optional[QuantizationArgs] = None
-    lora_args: Optional[LoRAArgs] = None
-
-    def __init__(self, **kwargs):
-        for k, v in kwargs.items():
-            if k == "lora_args":
-                setattr(self, k, LoRAArgs(**v))
-            elif k == "quantization_args":
-                setattr(self, k, QuantizationArgs(**v))
-            else:
-                if hasattr(self, k):
-                    setattr(self, k, v)
-
-        if self.n_kv_heads is None:
-            self.n_kv_heads = self.n_heads
-        assert self.n_kv_heads <= self.n_heads
-        assert self.n_heads % self.n_kv_heads == 0
-        assert self.dim % self.n_heads == 0
--- a/llama_stack/providers/inline/inference/meta_reference/llama3/generation.py
+++ b/llama_stack/providers/inline/inference/meta_reference/llama3/generation.py
@ -1,483 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
-
-import json
-import logging
-import math
-import os
-import sys
-import time
-from pathlib import Path
-from typing import Generator, List, Optional, Tuple, Union
-
-import torch
-import torch.nn.functional as F
-from fairscale.nn.model_parallel.initialize import (
-    get_model_parallel_rank,
-    initialize_model_parallel,
-    model_parallel_is_initialized,
-)
-from lmformatenforcer import JsonSchemaParser, TokenEnforcer, TokenEnforcerTokenizerData
-
-from llama_stack.apis.inference import (
-    Fp8QuantizationConfig,
-    Int4QuantizationConfig,
-    ResponseFormat,
-    ResponseFormatType,
-)
-from llama_stack.models.llama.datatypes import (
-    GreedySamplingStrategy,
-    Model,
-    SamplingParams,
-    TopPSamplingStrategy,
-)
-from llama_stack.models.llama.llama3.chat_format import ChatFormat, LLMInput
-from llama_stack.models.llama.llama3.tokenizer import Tokenizer
-from llama_stack.models.llama.sku_list import resolve_model
-from llama_stack.providers.utils.inference.prompt_adapter import (
-    ChatCompletionRequestWithRawContent,
-    CompletionRequestWithRawContent,
-)
-
-from ..common import TokenResult, model_checkpoint_dir
-from ..config import MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig
-from .args import ModelArgs
-from .model import Transformer
-from .multimodal.model import CrossAttentionTransformer
-
-log = logging.getLogger(__name__)
-
-
-class Llama3:
-    @staticmethod
-    def build(
-        config: Union[MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig],
-        model_id: str,
-        llama_model: Model,
-    ):
-        """
-        Build a Llama instance by initializing and loading a model checkpoint.
-
-        Note:
-            This method initializes the distributed process group, sets the device to CUDA,
-            and loads the pre-trained model and tokenizer.
-        """
-        if "DEVICE" in os.environ:
-            device = os.environ.get("DEVICE")
-            if device == "cuda":
-                assert torch.cuda.is_available(), "PyTorch CUDA backend not available"
-            if device == "xpu":
-                assert torch.xpu.is_available(), "PyTorch XPU backend not available"
-        else:
-            if torch.cuda.is_available():
-                device = "cuda"
-            elif torch.xpu.is_available():
-                device = "xpu"
-            else:
-                device = "cpu"
-        log.info(f"Using {device} device")
-
-        llama_model_id = llama_model.core_model_id.value
-        if not torch.distributed.is_initialized():
-            if device == "cuda":
-                torch.distributed.init_process_group("nccl")
-            else:
-                torch.distributed.init_process_group("gloo")
-
-        model_parallel_size = llama_model.pth_file_count
-
-        if not model_parallel_is_initialized():
-            initialize_model_parallel(model_parallel_size)
-
-        local_rank = int(os.environ.get("LOCAL_RANK", 0))
-        if device == "cuda":
-            torch.cuda.set_device(local_rank)
-        elif device == "xpu":
-            torch.xpu.set_device(local_rank)
-
-        # seed must be the same in all processes
-        if config.torch_seed is not None:
-            torch.manual_seed(config.torch_seed)
-
-        if local_rank > 0:
-            sys.stdout = open(os.devnull, "w")
-
-        start_time = time.time()
-        if config.checkpoint_dir and config.checkpoint_dir != "null":
-            ckpt_dir = config.checkpoint_dir
-        else:
-            resolved_model = resolve_model(model_id)
-            if resolved_model is None:
-                # if the model is not a native llama model, get the default checkpoint_dir based on model id
-                ckpt_dir = model_checkpoint_dir(model_id)
-            else:
-                # if the model is a native llama model, get the default checkpoint_dir based on model core_model_id value
-                ckpt_dir = model_checkpoint_dir(resolved_model.descriptor())
-
-        checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
-        assert len(checkpoints) > 0, f"no checkpoint files found in {ckpt_dir}"
-        assert model_parallel_size == len(checkpoints), (
-            f"Loading a checkpoint for MP={len(checkpoints)} but world size is {model_parallel_size}"
-        )
-        ckpt_path = checkpoints[get_model_parallel_rank()]
-        state_dict = torch.load(ckpt_path, map_location="cpu", weights_only=True)
-        with open(Path(ckpt_dir) / "params.json", "r") as f:
-            params = json.loads(f.read())
-
-        if "model" in params:
-            params = params["model"]
-
-        model_args: ModelArgs = ModelArgs(
-            max_seq_len=config.max_seq_len,
-            max_batch_size=config.max_batch_size,
-            **params,
-        )
-
-        tokenizer = Tokenizer.get_instance()
-        assert model_args.vocab_size == tokenizer.n_words, (
-            f"model_args vocab = {model_args.vocab_size} but tokenizer vocab = {tokenizer.n_words}"
-        )
-
-        if isinstance(config, MetaReferenceQuantizedInferenceConfig):
-            if isinstance(config.quantization, Fp8QuantizationConfig):
-                from ..quantization.loader import convert_to_fp8_quantized_model
-
-                # load on CPU in bf16 so that fp8 conversion does not find an
-                # unexpected (fp32, e.g.) datatype
-                torch.set_default_tensor_type(torch.BFloat16Tensor)
-                if model_args.vision_chunk_size > 0:
-                    model = CrossAttentionTransformer(model_args)
-                    model.setup_cache(model_args.max_batch_size, torch.bfloat16)
-                else:
-                    model = Transformer(model_args)
-                model.load_state_dict(state_dict, strict=False)
-                model = convert_to_fp8_quantized_model(model, config, ckpt_dir)
-            elif isinstance(config.quantization, Int4QuantizationConfig):
-                from ..quantization.loader import convert_to_int4_quantized_model
-
-                model = Transformer(model_args)
-                model = convert_to_int4_quantized_model(model, model_args, config)
-                model.load_state_dict(state_dict, strict=True)
-
-                if model_args.quantization_args is not None and model_args.quantization_args.spinquant:
-                    # Add a wrapper for adding hadamard transform for spinquant.
-                    # This needs to be done after loading the state dict otherwise an error will be raised while
-                    # loading the state dict.
-                    from ..quantization.hadamard_utils import (
-                        add_hadamard_transform_for_spinquant,
-                    )
-
-                    add_hadamard_transform_for_spinquant(model)
-            else:
-                raise NotImplementedError("Currently int4 and fp8 are the only supported quantization methods.")
-        else:
-            if device == "cuda":
-                if torch.cuda.is_bf16_supported():
-                    torch.set_default_tensor_type(torch.cuda.BFloat16Tensor)
-                else:
-                    torch.set_default_tensor_type(torch.cuda.HalfTensor)
-            else:
-                torch.set_default_device(device)
-                if device == "xpu" and torch.xpu.is_bf16_supported():
-                    torch.set_default_dtype(torch.bfloat16)
-                else:
-                    torch.set_default_dtype(torch.half)
-            if model_args.vision_chunk_size > 0:
-                model = CrossAttentionTransformer(model_args)
-                model.setup_cache(model_args.max_batch_size, torch.bfloat16)
-            else:
-                model = Transformer(model_args)
-            model.load_state_dict(state_dict, strict=False)
-
-        model.to(device)
-
-        log.info(f"Loaded in {time.time() - start_time:.2f} seconds")
-        return Llama3(model, tokenizer, model_args, llama_model_id)
-
-    def __init__(
-        self,
-        model: Transformer,
-        tokenizer: Tokenizer,
-        args: ModelArgs,
-        llama_model: str,
-    ):
-        self.args = args
-        self.model = model
-        self.tokenizer = tokenizer
-        self.formatter = ChatFormat(tokenizer)
-        self.llama_model = llama_model
-
-    @torch.inference_mode()
-    def generate(
-        self,
-        model_input: LLMInput,
-        max_gen_len: int,
-        temperature: float = 0.6,
-        top_p: float = 0.9,
-        logprobs: bool = False,
-        echo: bool = False,
-        include_stop_token: bool = False,
-        print_input_tokens: bool = False,
-        logits_processor: Optional["LogitsProcessor"] = None,
-    ) -> Generator:
-        params = self.model.params
-
-        if print_input_tokens:
-            input_tokens = [self.formatter.vision_token if t == 128256 else t for t in model_input.tokens]
-            log.info("Input to model -> " + self.tokenizer.decode(input_tokens))
-        prompt_tokens = [model_input.tokens]
-
-        bsz = 1
-        assert bsz <= params.max_batch_size, (bsz, params.max_batch_size)
-
-        min_prompt_len = min(len(t) for t in prompt_tokens)
-        max_prompt_len = max(len(t) for t in prompt_tokens)
-
-        if max_prompt_len >= params.max_seq_len:
-            log.error(f"Out of token budget {max_prompt_len} vs {params.max_seq_len}")
-            return
-
-        total_len = min(max_gen_len + max_prompt_len, params.max_seq_len)
-
-        is_vision = isinstance(self.model, CrossAttentionTransformer)
-        if is_vision:
-            images = model_input.vision.images if model_input.vision is not None else []
-            mask = model_input.vision.mask if model_input.vision is not None else []
-
-            # the method works for bsz > 1 so add a batch dimension
-            xattn_caches, cross_attention_masks, full_text_row_masked_out_mask = self.model.compute_vision_tokens_masks(
-                batch_images=[images],
-                batch_masks=[mask],
-                total_len=total_len,
-            )
-
-        pad_id = self.tokenizer.pad_id
-        tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long)
-        for k, t in enumerate(prompt_tokens):
-            tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long)
-        if logprobs:
-            token_logprobs = torch.zeros_like(tokens)
-
-        prev_pos = 0
-        eos_reached = torch.tensor([False] * bsz)
-        input_text_mask = tokens != pad_id
-        if min_prompt_len == total_len:
-            # TODO(ashwin): unify this branch with the one below and figure out multimodal crap
-            logits = self.model.forward(tokens, prev_pos)
-            token_logprobs = -F.cross_entropy(
-                input=logits.transpose(1, 2),
-                target=tokens,
-                reduction="none",
-                ignore_index=pad_id,
-            )
-
-        stop_tokens = torch.tensor(self.tokenizer.stop_tokens)
-        for cur_pos in range(min_prompt_len, total_len):
-            if is_vision:
-                position_ids = torch.arange(prev_pos, cur_pos, dtype=torch.long)
-                logits = self.model.forward(
-                    position_ids,
-                    tokens,
-                    cross_attention_masks,
-                    full_text_row_masked_out_mask,
-                    xattn_caches,
-                )
-            else:
-                logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos)
-
-            if logits_processor is not None:
-                logits = logits_processor.process_logits(tokens[:, :cur_pos], logits)
-
-            if temperature > 0:
-                probs = torch.softmax(logits[:, -1] / temperature, dim=-1)
-                next_token = sample_top_p(probs, top_p)
-            else:
-                next_token = torch.argmax(logits[:, -1], dim=-1)
-
-            next_token = next_token.reshape(-1)
-            # only replace token if prompt has already been generated
-            next_token = torch.where(input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token)
-            tokens[:, cur_pos] = next_token
-
-            target = tokens[:, prev_pos + 1 : cur_pos + 1]
-            if is_vision:
-                # the logits space (num_classes) is designed to never contain a media_token
-                # however our input token stream does contain them. we need to nuke them here
-                # or else the CUDA kernels will crash with an illegal memory access
-                vision_tokens = [self.tokenizer.special_tokens["<|image|>"], 128256]
-                masks = [target.eq(t) for t in vision_tokens]
-                if len(masks) > 1:
-                    mask = torch.logical_or(*masks)
-                else:
-                    mask = masks[0]
-                target[mask] = 0
-
-            if logprobs:
-                token_logprobs[:, prev_pos + 1 : cur_pos + 1] = -F.cross_entropy(
-                    input=logits.transpose(1, 2),
-                    target=tokens[:, prev_pos + 1 : cur_pos + 1],
-                    reduction="none",
-                    ignore_index=pad_id,
-                )
-            eos_reached |= (~input_text_mask[:, cur_pos]) & (torch.isin(next_token, stop_tokens))
-            yield TokenResult(
-                token=next_token[0].item(),
-                text=self.tokenizer.decode(next_token.tolist()),
-                logprobs=(token_logprobs[:, cur_pos : cur_pos + 1][0].tolist() if logprobs else None),
-            )
-
-            prev_pos = cur_pos
-            if all(eos_reached):
-                break
-
-    def completion(
-        self,
-        request: CompletionRequestWithRawContent,
-    ) -> Generator:
-        sampling_params = request.sampling_params
-        max_gen_len = sampling_params.max_tokens
-        if max_gen_len is None or max_gen_len == 0 or max_gen_len >= self.model.params.max_seq_len:
-            max_gen_len = self.model.params.max_seq_len - 1
-
-        model_input = self.formatter.encode_content(request.content)
-        temperature, top_p = _infer_sampling_params(sampling_params)
-        yield from self.generate(
-            model_input=model_input,
-            max_gen_len=max_gen_len,
-            temperature=temperature,
-            top_p=top_p,
-            logprobs=bool(request.logprobs),
-            include_stop_token=True,
-            logits_processor=get_logits_processor(
-                self.tokenizer,
-                self.args.vocab_size,
-                request.response_format,
-            ),
-        )
-
-    def chat_completion(
-        self,
-        request: ChatCompletionRequestWithRawContent,
-    ) -> Generator:
-        sampling_params = request.sampling_params
-        max_gen_len = sampling_params.max_tokens
-        if max_gen_len is None or max_gen_len == 0 or max_gen_len >= self.model.params.max_seq_len:
-            max_gen_len = self.model.params.max_seq_len - 1
-
-        temperature, top_p = _infer_sampling_params(sampling_params)
-        yield from self.generate(
-            model_input=self.formatter.encode_dialog_prompt(
-                request.messages,
-                request.tool_config.tool_prompt_format,
-            ),
-            max_gen_len=max_gen_len,
-            temperature=temperature,
-            top_p=top_p,
-            logprobs=bool(request.logprobs),
-            include_stop_token=True,
-            logits_processor=get_logits_processor(
-                self.tokenizer,
-                self.args.vocab_size,
-                request.response_format,
-            ),
-        )
-
-
-def sample_top_p(probs, p):
-    """
-    Perform top-p (nucleus) sampling on a probability distribution.
-
-    Args:
-        probs (torch.Tensor): Probability distribution tensor.
-        p (float): Probability threshold for top-p sampling.
-
-    Returns:
-        torch.Tensor: Sampled token indices.
-
-    Note:
-        Top-p sampling selects the smallest set of tokens whose cumulative probability mass
-        exceeds the threshold p. The distribution is renormalized based on the selected tokens.
-    """
-    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
-    probs_sum = torch.cumsum(probs_sort, dim=-1)
-    mask = probs_sum - probs_sort > p
-    probs_sort[mask] = 0.0
-    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
-    next_token = torch.multinomial(probs_sort, num_samples=1)
-    next_token = torch.gather(probs_idx, -1, next_token)
-    return next_token
-
-
-class LogitsProcessor:
-    def __init__(self, token_enforcer: TokenEnforcer):
-        self.token_enforcer = token_enforcer
-        self.mask: Optional[torch.Tensor] = None
-
-    def process_logits(self, tokens: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
-        token_sequence = tokens[0, :].tolist()
-        allowed_tokens = self.token_enforcer.get_allowed_tokens(token_sequence)
-
-        if self.mask is not None:
-            self.mask.fill_(-math.inf)
-        else:
-            self.mask = torch.full_like(scores, -math.inf)
-
-        self.mask[:, :, allowed_tokens] = 0
-        scores = scores + self.mask
-        return scores
-
-
-def get_logits_processor(
-    tokenizer: Tokenizer,
-    vocab_size: int,
-    response_format: Optional[ResponseFormat],
-) -> Optional["LogitsProcessor"]:
-    if response_format is None:
-        return None
-
-    if response_format.type != ResponseFormatType.json_schema.value:
-        raise ValueError(f"Unsupported response format type {response_format.type}")
-
-    parser = JsonSchemaParser(response_format.json_schema)
-    data = TokenEnforcerTokenizerData(
-        _build_regular_tokens_list(tokenizer, vocab_size),
-        tokenizer.decode,
-        tokenizer.stop_tokens,
-    )
-    token_enforcer = TokenEnforcer(data, parser)
-    return LogitsProcessor(token_enforcer)
-
-
-def _build_regular_tokens_list(tokenizer: Tokenizer, vocab_size: int) -> List[Tuple[int, str, bool]]:
-    token_0 = tokenizer.encode("0", bos=False, eos=False)[-1]
-    regular_tokens = []
-
-    special_token_ids = set(tokenizer.special_tokens.values())
-    for token_idx in range(vocab_size):
-        if token_idx in special_token_ids:
-            continue
-
-        # We prepend token 0 and skip the first letter of the result to get a space if the token is a start word.
-        decoded_after_0 = tokenizer.decode([token_0, token_idx])[1:]
-        decoded_regular = tokenizer.decode([token_idx])
-        is_word_start_token = len(decoded_after_0) > len(decoded_regular)
-        regular_tokens.append((token_idx, decoded_after_0, is_word_start_token))
-    return regular_tokens
-
-
-def _infer_sampling_params(sampling_params: SamplingParams):
-    if isinstance(sampling_params.strategy, GreedySamplingStrategy):
-        temperature = 0.0
-        top_p = 1.0
-    elif isinstance(sampling_params.strategy, TopPSamplingStrategy):
-        temperature = sampling_params.strategy.temperature
-        top_p = sampling_params.strategy.top_p
-    else:
-        raise ValueError(f"Unsupported sampling strategy {sampling_params.strategy}")
-    return temperature, top_p
--- a/llama_stack/providers/inline/inference/meta_reference/llama3/model.py
+++ b/llama_stack/providers/inline/inference/meta_reference/llama3/model.py
@ -1,311 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# top-level folder for each specific model found within the models/ directory at
-# the top-level of this source tree.
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
-
-import math
-from typing import Optional, Tuple
-
-import fairscale.nn.model_parallel.initialize as fs_init
-import torch
-import torch.nn.functional as F
-from fairscale.nn.model_parallel.layers import (
-    ColumnParallelLinear,
-    RowParallelLinear,
-    VocabParallelEmbedding,
-)
-from torch import nn
-
-from .args import ModelArgs
-
-
-class RMSNorm(torch.nn.Module):
-    def __init__(self, dim: int, eps: float = 1e-6):
-        super().__init__()
-        self.eps = eps
-        self.weight = nn.Parameter(torch.ones(dim))
-
-    def _norm(self, x):
-        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
-
-    def forward(self, x):
-        output = self._norm(x.float()).type_as(x)
-        return output * self.weight
-
-
-def apply_scaling(freqs: torch.Tensor) -> torch.Tensor:
-    # Values obtained from grid search
-    scale_factor = 8
-    low_freq_factor = 1
-    high_freq_factor = 4
-    old_context_len = 8192  # original llama3 length
-
-    low_freq_wavelen = old_context_len / low_freq_factor
-    high_freq_wavelen = old_context_len / high_freq_factor
-
-    wavelen = 2 * torch.pi / freqs
-    new_freqs = torch.where(wavelen > low_freq_wavelen, freqs / scale_factor, freqs)
-    smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
-    return torch.where(
-        (wavelen >= high_freq_wavelen) & (wavelen <= low_freq_wavelen),
-        (1 - smooth) * new_freqs / scale_factor + smooth * new_freqs,
-        new_freqs,
-    )
-
-
-def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, use_scaled: bool = False):
-    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
-    t = torch.arange(end, device=freqs.device, dtype=torch.float32)
-    if use_scaled:
-        freqs = apply_scaling(freqs)
-    freqs = torch.outer(t, freqs)
-    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex64
-    return freqs_cis
-
-
-def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
-    ndim = x.ndim
-    assert 0 <= 1 < ndim
-    assert freqs_cis.shape == (x.shape[1], x.shape[-1])
-    shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
-    return freqs_cis.view(*shape)
-
-
-def apply_rotary_emb(
-    xq: torch.Tensor,
-    xk: torch.Tensor,
-    freqs_cis: torch.Tensor,
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
-    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
-    freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
-    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
-    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
-    return xq_out.type_as(xq), xk_out.type_as(xk)
-
-
-def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
-    """torch.repeat_interleave(x, dim=2, repeats=n_rep)"""
-    bs, slen, n_kv_heads, head_dim = x.shape
-    if n_rep == 1:
-        return x
-    return (
-        x[:, :, :, None, :]
-        .expand(bs, slen, n_kv_heads, n_rep, head_dim)
-        .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
-    )
-
-
-class Attention(nn.Module):
-    def __init__(self, args: ModelArgs):
-        super().__init__()
-        self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
-        model_parallel_size = fs_init.get_model_parallel_world_size()
-        self.n_local_heads = args.n_heads // model_parallel_size
-        self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
-        self.n_rep = self.n_local_heads // self.n_local_kv_heads
-        self.head_dim = args.dim // args.n_heads
-
-        self.wq = ColumnParallelLinear(
-            args.dim,
-            args.n_heads * self.head_dim,
-            bias=False,
-            gather_output=False,
-            init_method=lambda x: x,
-        )
-        self.wk = ColumnParallelLinear(
-            args.dim,
-            self.n_kv_heads * self.head_dim,
-            bias=False,
-            gather_output=False,
-            init_method=lambda x: x,
-        )
-        self.wv = ColumnParallelLinear(
-            args.dim,
-            self.n_kv_heads * self.head_dim,
-            bias=False,
-            gather_output=False,
-            init_method=lambda x: x,
-        )
-        self.wo = RowParallelLinear(
-            args.n_heads * self.head_dim,
-            args.dim,
-            bias=False,
-            input_is_parallel=True,
-            init_method=lambda x: x,
-        )
-
-        self.cache_k = torch.zeros(
-            (
-                args.max_batch_size,
-                args.max_seq_len,
-                self.n_local_kv_heads,
-                self.head_dim,
-            )
-        )
-        self.cache_v = torch.zeros(
-            (
-                args.max_batch_size,
-                args.max_seq_len,
-                self.n_local_kv_heads,
-                self.head_dim,
-            )
-        )
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        start_pos: int,
-        freqs_cis: torch.Tensor,
-        mask: Optional[torch.Tensor],
-    ):
-        bsz, seqlen, _ = x.shape
-        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
-
-        xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
-        xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
-        xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
-
-        xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)
-
-        self.cache_k = self.cache_k.to(xq)
-        self.cache_v = self.cache_v.to(xq)
-
-        self.cache_k[:bsz, start_pos : start_pos + seqlen] = xk
-        self.cache_v[:bsz, start_pos : start_pos + seqlen] = xv
-
-        keys = self.cache_k[:bsz, : start_pos + seqlen]
-        values = self.cache_v[:bsz, : start_pos + seqlen]
-
-        # repeat k/v heads if n_kv_heads < n_heads
-        keys = repeat_kv(keys, self.n_rep)  # (bs, cache_len + seqlen, n_local_heads, head_dim)
-        values = repeat_kv(values, self.n_rep)  # (bs, cache_len + seqlen, n_local_heads, head_dim)
-
-        xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
-        keys = keys.transpose(1, 2)  # (bs, n_local_heads, cache_len + seqlen, head_dim)
-        values = values.transpose(1, 2)  # (bs, n_local_heads, cache_len + seqlen, head_dim)
-        scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim)
-        if mask is not None:
-            scores = scores + mask  # (bs, n_local_heads, seqlen, cache_len + seqlen)
-        scores = F.softmax(scores.float(), dim=-1).type_as(xq)
-        output = torch.matmul(scores, values)  # (bs, n_local_heads, seqlen, head_dim)
-        output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
-        return self.wo(output)
-
-
-class FeedForward(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        hidden_dim: int,
-        multiple_of: int,
-        ffn_dim_multiplier: Optional[float],
-    ):
-        super().__init__()
-        hidden_dim = int(2 * hidden_dim / 3)
-        # custom dim factor multiplier
-        if ffn_dim_multiplier is not None:
-            hidden_dim = int(ffn_dim_multiplier * hidden_dim)
-        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
-
-        self.w1 = ColumnParallelLinear(dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x)
-        self.w2 = RowParallelLinear(hidden_dim, dim, bias=False, input_is_parallel=True, init_method=lambda x: x)
-        self.w3 = ColumnParallelLinear(dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x)
-
-    def forward(self, x):
-        return self.w2(F.silu(self.w1(x)) * self.w3(x))
-
-
-class TransformerBlock(nn.Module):
-    def __init__(self, layer_id: int, args: ModelArgs):
-        super().__init__()
-        self.n_heads = args.n_heads
-        self.dim = args.dim
-        self.head_dim = args.dim // args.n_heads
-        self.attention = Attention(args)
-        self.feed_forward = FeedForward(
-            dim=args.dim,
-            hidden_dim=4 * args.dim,
-            multiple_of=args.multiple_of,
-            ffn_dim_multiplier=args.ffn_dim_multiplier,
-        )
-        self.layer_id = layer_id
-        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
-        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        start_pos: int,
-        freqs_cis: torch.Tensor,
-        mask: Optional[torch.Tensor],
-    ):
-        h = x + self.attention(self.attention_norm(x), start_pos, freqs_cis, mask)
-        out = h + self.feed_forward(self.ffn_norm(h))
-        return out
-
-
-class Transformer(nn.Module):
-    def __init__(self, params: ModelArgs):
-        super().__init__()
-        self.params = params
-        self.vocab_size = params.vocab_size
-        self.n_layers = params.n_layers
-
-        self.tok_embeddings = VocabParallelEmbedding(params.vocab_size, params.dim, init_method=lambda x: x)
-
-        self.layers = torch.nn.ModuleList()
-        for layer_id in range(params.n_layers):
-            self.layers.append(TransformerBlock(layer_id, params))
-
-        self.norm = RMSNorm(params.dim, eps=params.norm_eps)
-        self.output = ColumnParallelLinear(params.dim, params.vocab_size, bias=False, init_method=lambda x: x)
-
-        self.freqs_cis = precompute_freqs_cis(
-            params.dim // params.n_heads,
-            params.max_seq_len * 2,
-            params.rope_theta,
-            params.use_scaled_rope,
-        )
-
-    @torch.inference_mode()
-    def forward(self, tokens: torch.Tensor, start_pos: int):
-        _bsz, seqlen = tokens.shape
-        h = self.tok_embeddings(tokens)
-        self.freqs_cis = self.freqs_cis.to(h.device)
-        freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen]
-
-        mask = None
-        if seqlen > 1:
-            mask = torch.full((seqlen, seqlen), float("-inf"), device=tokens.device)
-
-            mask = torch.triu(mask, diagonal=1)
-
-            # https://github.com/pytorch/pytorch/issues/100005
-            # torch.triu is buggy when the device is mps: filled values are
-            # nan instead of 0.
-            if mask.device.type == torch.device("mps").type:
-                mask = torch.nan_to_num(mask, nan=0.0)
-
-            # When performing key-value caching, we compute the attention scores
-            # only for the new sequence. Thus, the matrix of scores is of size
-            # (seqlen, cache_len + seqlen), and the only masked entries are (i, j) for
-            # j > cache_len + i, since row i corresponds to token cache_len + i.
-            mask = torch.hstack([torch.zeros((seqlen, start_pos), device=tokens.device), mask]).type_as(h)
-
-        for layer in self.layers:
-            h = layer(h, start_pos, freqs_cis, mask)
-        h = self.norm(h)
-        output = self.output(h).float()
-        return output
--- a/llama_stack/providers/inline/inference/meta_reference/llama3/multimodal/init.py
+++ b/llama_stack/providers/inline/inference/meta_reference/llama3/multimodal/init.py
@ -1,12 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# top-level folder for each specific model found within the models/ directory at
-# the top-level of this source tree.
--- a/llama_stack/providers/inline/inference/meta_reference/llama3/multimodal/encoder_utils.py
+++ b/llama_stack/providers/inline/inference/meta_reference/llama3/multimodal/encoder_utils.py
@ -1,179 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# top-level folder for each specific model found within the models/ directory at
-# the top-level of this source tree.
-
-# Copyright (c) Meta Platforms, Inc. and its affiliates.
-import math
-from logging import getLogger
-
-import torch
-import torch.nn.functional as F
-
-from .utils import get_negative_inf_value, to_2tuple
-
-logger = getLogger()
-
-
-def resize_local_position_embedding(orig_pos_embed, grid_size):
-    """
-    Resize position embedding for vision encoder.
-    Original position embedding is [n_tiles * n_tiles + 1, dim]
-    New position embedding will be [grid_size[0] * grid_size[1] + 1, dim]
-    """
-    new_grid_size = to_2tuple(grid_size)
-    orig_grid_size = to_2tuple(int(math.sqrt(len(orig_pos_embed) - 1)))
-
-    new_pos_emb_tok, new_pos_emb_img = (
-        orig_pos_embed[:1],
-        orig_pos_embed[1:],
-    )
-    logger.info(f"resizing position embedding grid-size from {orig_grid_size} to {new_grid_size}")
-
-    new_pos_emb_img = new_pos_emb_img.reshape(1, orig_grid_size[0], orig_grid_size[1], -1).permute(0, 3, 1, 2)
-
-    new_pos_emb_img = F.interpolate(
-        new_pos_emb_img,
-        size=new_grid_size,
-        mode="bilinear",
-        align_corners=True,
-    )
-    new_pos_emb_img = new_pos_emb_img.permute(0, 2, 3, 1).reshape(1, new_grid_size[0] * new_grid_size[1], -1)[0]
-    new_pos_embed = torch.cat([new_pos_emb_tok, new_pos_emb_img], dim=0)
-    return new_pos_embed
-
-
-def initialize_global_position_embedding_from_local(pos_and_cls_embed, grid_size, x_scale, y_scale):
-    """
-    Takes a local position embedding for vision encoder and uses it
-    to initialize the global position embedding.
-    Input: local position embedding of shape [grid_size[0] * grid_size[1] + 1, dim]
-    Returns: global position embedding of shape [x_scale, y_scale, grid_size[0] * grid_size[1] + 1, dim]
-    Here x_scale and y_scale are the number of tiles along x-axis and y-axis respectively.
-    """
-    pos_embed = pos_and_cls_embed[1:]
-    cls_embed = pos_and_cls_embed[0].view(1, 1, 1, -1)
-    grid_size = to_2tuple(grid_size)
-    new_pos_emb_img = pos_embed.reshape(1, grid_size[0], grid_size[1], -1).permute(0, 3, 1, 2)
-    new_grid_size = (x_scale * grid_size[0], y_scale * grid_size[1])
-    new_pos_emb_img = F.interpolate(
-        new_pos_emb_img,
-        size=new_grid_size,
-        mode="bilinear",
-        align_corners=True,
-    )
-    new_pos_emb_img = new_pos_emb_img.permute(0, 2, 3, 1)
-    new_pos_emb_img = new_pos_emb_img.view(x_scale, grid_size[0], y_scale, grid_size[1], -1)
-    new_pos_emb_img = new_pos_emb_img.permute(0, 2, 1, 3, 4).contiguous()
-    new_pos_emb_img = new_pos_emb_img.reshape(x_scale, y_scale, grid_size[0] * grid_size[1], -1)
-    cls_embed = cls_embed.expand(x_scale, y_scale, -1, -1)
-    pos_and_cls_embed = torch.cat([cls_embed, new_pos_emb_img], dim=2)
-    return pos_and_cls_embed
-
-
-def resize_global_position_embedding(pos_and_cls_embed, grid_size, x_scale, y_scale):
-    """
-    Takes a global position embedding for vision encoder and resizes it to new size.
-    Input: global position embedding of shape [x_old, y_old, old_grid_size[0] * old_grid_size[1] + 1, dim]
-    Returns: global position embedding of shape [x_scale, y_scale, grid_size[0] * grid_size[1] + 1, dim]
-    Here x_scale and y_scale are the number of tiles along x-axis and y-axis respectively.
-    """
-    # first remove cls token
-    pos_embed = pos_and_cls_embed[:, :, 1:]
-    cls_embed = pos_and_cls_embed[:, :, 0].unsqueeze(2)
-
-    xs_old, ys_old, ntok, dim = pos_embed.shape
-    old_grid_size = int(math.sqrt(ntok))
-
-    # move to correct form for interpolation
-    pos_embed = pos_embed.view(xs_old, ys_old, old_grid_size, old_grid_size, dim)
-    pos_embed = pos_embed.permute(0, 2, 1, 3, 4).contiguous()
-    pos_embed = pos_embed.view(xs_old * old_grid_size, ys_old * old_grid_size, dim)
-    pos_embed = pos_embed.unsqueeze(0)
-
-    # interpolate
-    new_size = (grid_size[0] * x_scale, grid_size[1] * y_scale)
-    pos_embed = pos_embed.permute(0, 3, 1, 2)
-    pos_embed_resized = F.interpolate(
-        pos_embed,
-        size=new_size,
-        mode="bilinear",
-        align_corners=True,
-    )
-    pos_embed = pos_embed_resized.permute(0, 2, 3, 1)[0]
-
-    # move it back in place
-    pos_embed = pos_embed.view(x_scale, grid_size[0], y_scale, grid_size[1], dim)
-    pos_embed = pos_embed.permute(0, 2, 1, 3, 4).contiguous()
-    pos_embed = pos_embed.view(x_scale, y_scale, grid_size[0] * grid_size[1], dim)
-
-    # interpolate cls token
-    cls_embed = cls_embed.permute(2, 3, 0, 1)
-    cls_embed_resized = F.interpolate(
-        cls_embed,
-        size=(x_scale, y_scale),
-        mode="bilinear",
-        align_corners=True,
-    )
-    cls_embed = cls_embed_resized.permute(2, 3, 0, 1)
-    # add cls token back in
-    pos_and_cls_embed = torch.cat([cls_embed, pos_embed], dim=2)
-
-    return pos_and_cls_embed
-
-
-def build_encoder_attention_mask(
-    x: torch.Tensor,
-    ar: torch.Tensor,
-    ntok: int,
-    num_chunks: int,
-    n_heads: int,
-):
-    """
-    Build vision encoder attention mask that omits padding tokens.
-    """
-    masks = []
-    for arx in ar:
-        mask_i = torch.ones((num_chunks, x.shape[2], 1), dtype=x.dtype)
-        mask_i[: arx[0] * arx[1], :ntok] = 0
-        mask_i = mask_i.view(num_chunks * x.shape[2], -1)
-        mask_i = mask_i @ mask_i.T * get_negative_inf_value(x.dtype)
-        mask_i = mask_i.unsqueeze(0)
-        masks.append(mask_i)
-    masks = torch.stack(masks).to(x.device).expand(-1, n_heads, -1, -1)
-    return masks
-
-
-def expand_num_tokens_to_mult8(x):
-    num_pad_tokens = 8 - (x.shape[-2] % 8)
-    if num_pad_tokens == 0:
-        return x, 0
-    else:
-        return (
-            torch.cat(
-                [
-                    x,
-                    torch.zeros(
-                        (x.shape[0], x.shape[1], num_pad_tokens, x.shape[-1]),
-                        dtype=x.dtype,
-                        device=x.device,
-                    ),
-                ],
-                dim=-2,
-            ),
-            num_pad_tokens,
-        )
-
-
-def contract_num_tokens_from_mult8(x, num_pad_tokens):
-    if num_pad_tokens == 0:
-        return x
-    return x[:, :, :-num_pad_tokens]
--- a/llama_stack/providers/inline/inference/meta_reference/llama3/multimodal/image_transform.py
+++ b/llama_stack/providers/inline/inference/meta_reference/llama3/multimodal/image_transform.py
@ -1,408 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# top-level folder for each specific model found within the models/ directory at
-# the top-level of this source tree.
-
-import math
-from collections import defaultdict
-from logging import getLogger
-from typing import Any, Optional, Set, Tuple
-
-import torch
-import torchvision.transforms as tv
-from PIL import Image
-from torchvision.transforms import functional as F
-
-IMAGE_RES = 224
-
-logger = getLogger()
-
-
-class VariableSizeImageTransform(object):
-    """
-    This class accepts images of any size and dynamically resize, pads and chunks it
-    based on the image aspect ratio and the number of image chunks we allow.
-
-    The algorithm will NOT distort the image fit a certain aspect ratio, because
-    that leads to a significant degradation in image quality.
-
-    It can be summarized in 6 steps:
-    1. Find all possible canvas combinations of max_num_chunks;
-    2. Find the best canvas to fit the image;
-    3. Resize without distortion
-    4. Pad
-    5. Normalize
-    6. Chunk
-
-    For example, if an input image is of size 300x800, patch_size of 224,
-    and max_num_chunks = 8, it will find the closest aspect ratio that
-    is allowed within 8 image chunks, with some restrictions.
-    In this case, 2:4 = 2 horizontal patches and 4 vertical patches,
-    giving a total of 8 chunks.
-
-    If resize_to_max_canvas, the image will be resized (without distortion),
-    to the largest possible resolution. In this case, 388:896, and padded to 448:896,
-    where we maintain the original aspect ratio and pad with zeros value for the rest.
-    This approach minimizes the amount of padding required for any arbitrary resolution.
-
-    However, if limit_upscaling_to_patch_size is set to True,
-    the upscaling will be limited to the patch size. In the example above,
-    the image would remain 300x800 (no upscaling), and then padded to 448:896.
-
-    The final output will therefore be of shape (8, 3, 224, 224), where 2x4
-    patches are coming from the resizing and chunking.
-    """
-
-    def __init__(self, size: int = IMAGE_RES) -> None:
-        self.size = size
-        logger.info(f"VariableSizeImageTransform size: {self.size}")
-        self.to_tensor = tv.ToTensor()
-        self._mean = (0.48145466, 0.4578275, 0.40821073)
-        self._std = (0.26862954, 0.26130258, 0.27577711)
-        self.normalize = tv.Normalize(
-            mean=self._mean,
-            std=self._std,
-            inplace=True,
-        )
-        self.resample = tv.InterpolationMode.BILINEAR
-
-    @staticmethod
-    def get_factors(n: int) -> Set[int]:
-        """
-        Calculate all factors of a given number, i.e. a dividor that leaves
-        no remainder. For example, if n=12, it will return {1, 2, 3, 4, 6, 12}.
-
-        Args:
-            n (int): The number to find factors for.
-
-        Returns:
-            set: A set containing all factors of the number.
-        """
-        factors_set = set()
-
-        for i in range(1, int(n**0.5) + 1):
-            if n % i == 0:
-                factors_set.add(i)
-                factors_set.add(n // i)
-        return factors_set
-
-    def find_supported_resolutions(self, max_num_chunks: int, patch_size: int) -> torch.Tensor:
-        """
-        Computes all of the allowed resoltuions for a fixed number of chunks
-        and patch_size. Useful for when dividing an image into chunks.
-
-        Args:
-            max_num_chunks (int): Maximum number of chunks for processing.
-            patch_size (int): Size of the side of the patch.
-
-        Returns:
-            torch.Tensor: List of possible resolutions as tuples (height, width).
-
-        Example:
-            >>> max_num_chunks = 5
-            >>> patch_size = 224
-            >>> find_supported_resolutions(max_num_chunks, patch_size)
-            tensor([(224, 896), (448, 448), (224, 224), (896, 224), (224, 672),
-            (672, 224), (224, 448), (448, 224)])
-
-            Given max_num_chunks=4, patch_size=224, it will create a dictionary:
-            {
-            0.25: [(1, 4)],
-            1.0: [(2, 2), (1, 1)],
-            4.0: [(4, 1)],
-            0.33: [(1, 3)],
-            3.0: [(3, 1)],
-            0.5: [(1, 2)],
-            2.0: [(2, 1)]
-            }
-
-            and return the resolutions multiplied by the patch_size:
-            [(1*224, 4*224), (2*224, 2*224), ..., (2*224, 1*224)]
-        """
-        asp_dict = defaultdict(list)
-        for chunk_size in range(max_num_chunks, 0, -1):
-            _factors = sorted(self.get_factors(chunk_size))
-            _asp_ratios = [(factor, chunk_size // factor) for factor in _factors]
-            for height, width in _asp_ratios:
-                ratio_float = height / width
-                asp_dict[ratio_float].append((height, width))
-
-        # get the resolutions multiplied by the patch_size
-        possible_resolutions = []
-        for value in asp_dict.values():
-            for height, depth in value:
-                possible_resolutions.append((height * patch_size, depth * patch_size))
-
-        return possible_resolutions
-
-    @staticmethod
-    def get_max_res_without_distortion(
-        image_size: Tuple[int, int],
-        target_size: Tuple[int, int],
-    ) -> Tuple[int, int]:
-        """
-        Determines the maximum resolution to which an image can be resized to without distorting its
-        aspect ratio, based on the target resolution.
-
-        Args:
-            image_size (Tuple[int, int]): The original resolution of the image (height, width).
-            target_resolution (Tuple[int, int]): The desired resolution to fit the image into (height, width).
-        Returns:
-            Tuple[int, int]: The optimal dimensions (height, width) to which the image should be resized.
-        Example:
-            >>> _get_max_res_without_distortion([200, 300], target_size = [450, 200])
-            (134, 200)
-            >>> _get_max_res_without_distortion([800, 600], target_size = [450, 1300])
-            (450, 338)
-        """
-
-        original_width, original_height = image_size
-        target_width, target_height = target_size
-
-        scale_w = target_width / original_width
-        scale_h = target_height / original_height
-
-        if scale_w < scale_h:
-            new_width = target_width
-            new_height = min(math.floor(original_height * scale_w), target_height)
-        else:
-            new_height = target_height
-            new_width = min(math.floor(original_width * scale_h), target_width)
-
-        return new_width, new_height
-
-    def _pad(self, image: Image.Image, target_size) -> Image.Image:
-        new_width, new_height = target_size
-        new_im = Image.new(mode="RGB", size=(new_width, new_height), color=(0, 0, 0))  # type: ignore
-        new_im.paste(image)
-        return new_im
-
-    def _split(self, image: torch.Tensor, ncw: int, nch: int) -> torch.Tensor:
-        # Split image into number of required tiles (width x height)
-        num_channels, height, width = image.size()
-        image = image.view(num_channels, nch, height // nch, ncw, width // ncw)
-        # Permute dimensions to reorder the axes
-        image = image.permute(1, 3, 0, 2, 4).contiguous()
-        # Reshape into the desired output shape (batch_size * 4, num_channels, width/2, height/2)
-        image = image.view(ncw * nch, num_channels, height // nch, width // ncw)
-        return image
-
-    def resize_without_distortion(
-        self,
-        image: torch.Tensor,
-        target_size: Tuple[int, int],
-        max_upscaling_size: Optional[int],
-    ) -> torch.Tensor:
-        """
-        Used to resize an image to target_resolution, without distortion.
-
-        If target_size requires upscaling the image, the user can set max_upscaling_size to
-        limit the upscaling to a maximum size. In this case, since we rescale without distortion,
-        modifying target_size works as a boundary for the image's largest side.
-
-        Args:
-            resample (str): Resampling method used when resizing images.
-                Supports "nearest", "nearest_exact", "bilinear", "bicubic".
-            max_upscaling_size (int): The maximum size to upscale the image to.
-                If None, there is no limit.
-        Examples:
-        >>> target_size = (1000, 1200)
-        >>> max_upscaling_size = 600
-        >>> image_size = (400, 200)
-        >>> resize_without_distortion(image_size, target_size, max_upscaling_size)
-        (600, 300)  # new_size_without_distortion
-
-        >>> target_size = (1000, 1200)
-        >>> max_upscaling_size = 600
-        >>> image_size = (2000, 200)
-        >>> resize_without_distortion(image_size, target_size, max_upscaling_size)
-        (1000, 100)  # new_size_without_distortion
-
-        >>> target_size = (1000, 1200)
-        >>> max_upscaling_size = 2000
-        >>> image_size = (400, 200)
-        >>> resize_without_distortion(image_size, target_size, max_upscaling_size)
-        (1000, 500)  # new_size_without_distortion
-
-        >>> target_size = (1000, 1200)
-        >>> max_upscaling_size = None
-        >>> image_size = (400, 200)
-        >>> resize_without_distortion(image_size, target_size, max_upscaling_size)
-        (1000, 500)  # new_size_without_distortion
-        """
-
-        image_width, image_height = image.size
-        image_size = (image_width, image_height)
-
-        # If target_size requires upscaling, we might want to limit the upscaling to max_upscaling_size
-        if max_upscaling_size is not None:
-            new_target_width = min(max(image_width, max_upscaling_size), target_size[0])
-            new_target_height = min(max(image_height, max_upscaling_size), target_size[1])
-            target_size = (new_target_width, new_target_height)
-
-        # resize to target_size while preserving aspect ratio
-        new_size_without_distortion = self.get_max_res_without_distortion(image_size, target_size)
-
-        image = F.resize(
-            image,
-            (new_size_without_distortion[1], new_size_without_distortion[0]),
-            interpolation=self.resample,
-        )
-
-        return image
-
-    def get_best_fit(
-        self,
-        image_size: Tuple[int, int],
-        possible_resolutions: torch.Tensor,
-        resize_to_max_canvas: bool = False,
-    ) -> Tuple[int, int]:
-        """
-        Determines the best canvas possible from a list of possible resolutions to, without distortion,
-        resize an image to.
-
-        For each possible resolution, calculates the scaling factors for
-        width and height, and selects the smallest one, which is the limiting side.
-        E.g. to match the canvas you can upscale height by 2x, and width by 1.5x,
-        therefore, the maximum upscaling you can do is min(2, 1.5) = 1.5.
-
-        If upscaling is possible (any of the scaling factors is greater than 1),
-        then picks the smallest upscaling factor > 1, unless resize_to_max_canvas is True.
-
-        If upscaling is not possible, then picks the largest scaling factor <= 1, i.e.
-        reduce downscaling as much as possible.
-
-        If there are multiple resolutions with the same max scale, we pick the one with the lowest area,
-        to minimize padding. E.g., the same image can be upscaled to 224x224 and 224x448, but the latter
-        has more padding.
-
-        Args:
-            image_size (Tuple[int, int]): A tuple containing the height and width of the image.
-            possible_resolutions (torch.Tensor): A tensor of shape (N, 2) where each
-                row represents a possible resolution (height, width).
-            use_max_upscaling (bool): If True, will return the largest upscaling resolution.
-
-        Returns:
-            List[int]: The best resolution [height, width] for the given image.
-
-        Example:
-            >>> image_size = (200, 300)
-            >>> possible_resolutions = torch.tensor([[224, 672],
-            ...                                     [672, 224],
-            ...                                     [224, 448],
-            ...                                     [448, 224],
-            ...                                     [224, 224]])
-            >>> _get_smallest_upscaling_possibility(image_size, possible_resolutions)
-            [224, 448]
-
-            We have:
-                scale_w = tensor([2.2400, 0.7467, 1.4933, 0.7467, 0.7467])
-                scale_h = tensor([1.1200, 3.3600, 1.1200, 2.2400, 1.1200])
-                scales = tensor([1.1200, 0.7467, 1.1200, 0.7467, 0.7467])
-            Only one of the scales > 1:
-                upscaling_possible = tensor([1.1200, 1.1200])
-                smallest_rescale = tensor(1.1200)
-            So we pick the resolution with the smallest smallest area:
-                areas = tensor([150528, 100352]) # [672, 224], [224, 448]
-                optimal_canvas = tensor([224, 448])
-        """
-
-        original_width, original_height = image_size
-
-        # get all possible resolutions heights/widths
-        target_widths, target_heights = (
-            possible_resolutions[:, 0],
-            possible_resolutions[:, 1],
-        )
-
-        # get scaling factors to resize the image without distortion
-        scale_w = target_widths / original_width
-        scale_h = target_heights / original_height
-
-        # get the min scale between width and height (limiting side -> no distortion)
-        scales = torch.where(scale_w > scale_h, scale_h, scale_w)
-
-        # filter only scales that allow upscaling
-        upscaling_options = scales[scales >= 1]
-        if len(upscaling_options) > 0:
-            if resize_to_max_canvas:
-                selected_scale = torch.max(upscaling_options)
-            else:
-                selected_scale = torch.min(upscaling_options)
-        else:
-            # no upscaling possible,
-            # get the minimum downscaling (max scale for scales<1)
-            downscaling_options = scales[scales < 1]
-            selected_scale = torch.max(downscaling_options)
-
-        # get all resolutions that support this scaling factor,
-        # e.g. you can upscale to 224x224, 224x448, 224x672 without distortion
-        chosen_canvas = possible_resolutions[scales == selected_scale]
-
-        # if there are multiple resolutions,
-        # get the one with minimum area to reduce padding
-        if len(chosen_canvas) > 1:
-            areas = chosen_canvas[:, 0] * chosen_canvas[:, 1]
-            optimal_idx = torch.argmin(areas)
-            optimal_canvas = chosen_canvas[optimal_idx]
-        else:
-            optimal_canvas = chosen_canvas[0]
-
-        return tuple(optimal_canvas.tolist())
-
-    def __call__(
-        self,
-        image: Image.Image,
-        max_num_chunks: int,
-        normalize_img: bool = True,
-        resize_to_max_canvas: bool = False,
-    ) -> Tuple[Any, Any]:
-        """
-        Args:
-            image (PIL.Image): Image to be resized.
-            max_num_chunks (int): Maximum number of chunks to split the image into.
-            normalize_img (bool): Whether to normalize the image.
-            resize_to_max_canvas (bool): Whether to resize the image to the maximum canvas size.
-            If True, picks the canvas the allows the largest resizing without distortion.
-            If False, downsample as little as possible, including no resizing at all,
-            but never upsample, unless the image is smaller than the patch size.
-        """
-        assert max_num_chunks > 0
-        assert isinstance(image, Image.Image), type(image)
-        w, h = image.size
-
-        possible_resolutions = self.find_supported_resolutions(max_num_chunks=max_num_chunks, patch_size=self.size)
-        possible_resolutions = torch.tensor(possible_resolutions)
-
-        best_resolution = self.get_best_fit(
-            image_size=(w, h),
-            possible_resolutions=possible_resolutions,
-            resize_to_max_canvas=resize_to_max_canvas,
-        )
-
-        max_upscaling_size = None if resize_to_max_canvas else self.size
-        image = self.resize_without_distortion(image, best_resolution, max_upscaling_size)
-        image = self._pad(image, best_resolution)
-
-        image = self.to_tensor(image)
-
-        if normalize_img:
-            image = self.normalize(image)
-
-        ratio_w, ratio_h = (
-            best_resolution[0] // self.size,
-            best_resolution[1] // self.size,
-        )
-
-        image = self._split(image, ratio_w, ratio_h)  # type: ignore
-
-        ar = (ratio_h, ratio_w)
-        return image, ar
--- a/llama_stack/providers/inline/inference/meta_reference/llama3/multimodal/model.py
+++ b/llama_stack/providers/inline/inference/meta_reference/llama3/multimodal/model.py
--- a/llama_stack/providers/inline/inference/meta_reference/llama3/multimodal/utils.py
+++ b/llama_stack/providers/inline/inference/meta_reference/llama3/multimodal/utils.py
@ -1,26 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# top-level folder for each specific model found within the models/ directory at
-# the top-level of this source tree.
-
-import collections
-
-import torch
-
-
-def get_negative_inf_value(dtype):
-    return torch.finfo(dtype).min
-
-
-def to_2tuple(x):
-    if isinstance(x, collections.abc.Iterable):
-        return x
-    return (x, x)
--- a/llama_stack/providers/inline/inference/meta_reference/model_parallel.py
+++ b/llama_stack/providers/inline/inference/meta_reference/model_parallel.py
@ -4,23 +4,17 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-import os
 from copy import deepcopy
 from functools import partial
-from typing import Any, Generator
+from typing import Any, Callable, Generator, List

-from llama_stack.models.llama.datatypes import Model
-from llama_stack.models.llama.llama3.chat_format import ChatFormat
-from llama_stack.models.llama.llama3.tokenizer import Tokenizer
-from llama_stack.models.llama.sku_list import resolve_model
+from llama_stack.models.llama.llama3.chat_format import ChatFormat as Llama3ChatFormat
+from llama_stack.models.llama.llama4.chat_format import ChatFormat as Llama4ChatFormat
 from llama_stack.providers.utils.inference.prompt_adapter import (
    ChatCompletionRequestWithRawContent,
    CompletionRequestWithRawContent,
 )

-from .common import model_checkpoint_dir
-from .config import MetaReferenceInferenceConfig
-from .llama3.generation import Llama3
 from .parallel_utils import ModelParallelProcessGroup


@ -29,21 +23,20 @@ class ModelRunner:
        self.llama = llama

    # the `task` object is the same that is sent to `ModelParallelProcessGroup.run_inference()`
-    def __call__(self, req: Any):
-        if isinstance(req, ChatCompletionRequestWithRawContent):
-            return self.llama.chat_completion(req)
-        elif isinstance(req, CompletionRequestWithRawContent):
-            return self.llama.completion(req)
+    def __call__(self, task: Any):
+        if task[0] == "chat_completion":
+            return self.llama.chat_completion(task[1])
+        elif task[0] == "completion":
+            return self.llama.completion(task[1])
        else:
-            raise ValueError(f"Unexpected task type {type(req)}")
+            raise ValueError(f"Unexpected task type {task[0]}")


 def init_model_cb(
-    config: MetaReferenceInferenceConfig,
-    model_id: str,
-    llama_model: Model,
+    builder_fn: Callable,
+    params: list[Any],
 ):
-    llama = Llama3.build(config, model_id, llama_model)
+    llama = builder_fn(*params)
    return ModelRunner(llama)


@ -60,25 +53,15 @@ class LlamaModelParallelGenerator:

    def __init__(
        self,
-        config: MetaReferenceInferenceConfig,
-        model_id: str,
-        llama_model: Model,
+        model_parallel_size: int,
+        builder_fn: Callable,
+        builder_params: list[Any],
+        formatter: Llama3ChatFormat | Llama4ChatFormat,
    ):
-        self.config = config
-        self.model_id = model_id
-        self.llama_model = llama_model
-
-        # this is a hack because Agent's loop uses this to tokenize and check if input is too long
-        # while the tool-use loop is going
-        resolved_model = resolve_model(model_id)
-        if resolved_model is None:
-            # if the model is not a native llama model, get the default checkpoint_dir based on model id
-            checkpoint_dir = model_checkpoint_dir(model_id)
-        else:
-            # if the model is a native llama model, get the default checkpoint_dir based on model core_model_id value
-            checkpoint_dir = model_checkpoint_dir(resolved_model.descriptor())
-        tokenizer_path = os.path.join(checkpoint_dir, "tokenizer.model")
-        self.formatter = ChatFormat(Tokenizer(tokenizer_path))
+        self.model_parallel_size = model_parallel_size
+        self.builder_fn = builder_fn
+        self.builder_params = builder_params
+        self.formatter = formatter

    def start(self):
        self.__enter__()
@ -87,11 +70,9 @@ class LlamaModelParallelGenerator:
        self.__exit__(None, None, None)

    def __enter__(self):
-        model_parallel_size = self.llama_model.pth_file_count
-
        self.group = ModelParallelProcessGroup(
-            model_parallel_size,
-            init_model_cb=partial(init_model_cb, self.config, self.model_id, self.llama_model),
+            self.model_parallel_size,
+            init_model_cb=partial(init_model_cb, self.builder_fn, self.builder_params),
        )
        self.group.start()
        return self
@ -101,16 +82,16 @@ class LlamaModelParallelGenerator:

    def completion(
        self,
-        request: CompletionRequestWithRawContent,
+        request_batch: List[CompletionRequestWithRawContent],
    ) -> Generator:
-        req_obj = deepcopy(request)
-        gen = self.group.run_inference(req_obj)
+        req_obj = deepcopy(request_batch)
+        gen = self.group.run_inference(("completion", req_obj))
        yield from gen

    def chat_completion(
        self,
-        request: ChatCompletionRequestWithRawContent,
+        request_batch: List[ChatCompletionRequestWithRawContent],
    ) -> Generator:
-        req_obj = deepcopy(request)
-        gen = self.group.run_inference(req_obj)
+        req_obj = deepcopy(request_batch)
+        gen = self.group.run_inference(("chat_completion", req_obj))
        yield from gen
--- a/llama_stack/providers/inline/inference/meta_reference/parallel_utils.py
+++ b/llama_stack/providers/inline/inference/meta_reference/parallel_utils.py
@ -19,7 +19,7 @@ import tempfile
 import time
 import uuid
 from enum import Enum
-from typing import Callable, Generator, Literal, Optional, Union
+from typing import Callable, Generator, List, Literal, Optional, Tuple, Union

 import torch
 import zmq
@ -32,13 +32,12 @@ from pydantic import BaseModel, Field
 from torch.distributed.launcher.api import LaunchConfig, elastic_launch
 from typing_extensions import Annotated

+from llama_stack.models.llama.datatypes import GenerationResult
 from llama_stack.providers.utils.inference.prompt_adapter import (
    ChatCompletionRequestWithRawContent,
    CompletionRequestWithRawContent,
 )

-from .common import TokenResult
-
 log = logging.getLogger(__name__)


@ -70,12 +69,12 @@ class CancelSentinel(BaseModel):

 class TaskRequest(BaseModel):
    type: Literal[ProcessingMessageName.task_request] = ProcessingMessageName.task_request
-    task: Union[CompletionRequestWithRawContent, ChatCompletionRequestWithRawContent]
+    task: Tuple[str, List[CompletionRequestWithRawContent] | List[ChatCompletionRequestWithRawContent]]


 class TaskResponse(BaseModel):
    type: Literal[ProcessingMessageName.task_response] = ProcessingMessageName.task_response
-    result: TokenResult
+    result: List[GenerationResult]


 class ExceptionResponse(BaseModel):
@ -332,7 +331,7 @@ class ModelParallelProcessGroup:

    def run_inference(
        self,
-        req: Union[CompletionRequestWithRawContent, ChatCompletionRequestWithRawContent],
+        req: Tuple[str, List[CompletionRequestWithRawContent] | List[ChatCompletionRequestWithRawContent]],
    ) -> Generator:
        assert not self.running, "inference already running"

--- a/llama_stack/providers/inline/inference/meta_reference/quantization/init.py
+++ b/llama_stack/providers/inline/inference/meta_reference/quantization/init.py
@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/llama_stack/providers/inline/inference/meta_reference/quantization/fp8_impls.py
+++ b/llama_stack/providers/inline/inference/meta_reference/quantization/fp8_impls.py
@ -1,177 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
-
-import collections
-import logging
-from typing import Optional, Type
-
-log = logging.getLogger(__name__)
-
-try:
-    import fbgemm_gpu.experimental.gen_ai  # noqa: F401
-
-    log.info("Using efficient FP8 operators in FBGEMM.")
-except ImportError:
-    log.error("No efficient FP8 operators. Please install FBGEMM in fp8_requirements.txt.")
-    raise
-
-import torch
-from torch import Tensor, nn
-
-
-class Fp8ScaledWeights:
-    # TODO: Ugly trick so torch allows us to replace parameters
-    # with our custom Fp8Weights instance. Do this properly.
-    @property
-    def __class__(self) -> Type[nn.parameter.Parameter]:
-        return nn.Parameter
-
-    @property
-    def grad_fn(self) -> None:
-        return None
-
-
-# pyre-fixme[4]: Attribute annotation cannot be `Any`.
-# pyre-fixme[2]: Parameter annotation cannot be `Any`.
-class Fp8RowwiseWeights(
-    Fp8ScaledWeights,
-    collections.namedtuple(
-        "Fp8RowwiseWeights",
-        ["weight", "scale", "shape", "activation_scale_ub"],
-    ),
-):
-    pass
-
-
-def ffn_swiglu(
-    x: Tensor,
-    w1: Fp8RowwiseWeights,
-    w3: Fp8RowwiseWeights,
-    w2: Fp8RowwiseWeights,
-    num_tokens: Optional[Tensor] = None,
-    is_memory_bounded: bool = False,
-) -> Tensor:
-    if isinstance(w1, Fp8ScaledWeights) and isinstance(w3, Fp8ScaledWeights) and isinstance(w2, Fp8ScaledWeights):
-        return ffn_swiglu_fp8_dynamic(x, w1, w3, w2, w1.activation_scale_ub, num_tokens, is_memory_bounded)
-
-    (B, T, D) = x.shape  # noqa: N806
-    (HD_L, D_) = w1.shape  # noqa: N806
-    assert D_ == D
-
-    assert isinstance(w1, Tensor)
-    assert isinstance(w3, Tensor)
-    x1 = x.view(B * T, D) @ w1.T
-    x2 = x.view(B * T, D) @ w3.T
-    z = torch.nn.functional.silu(x1) * x2
-    del x1, x2
-    assert isinstance(w2, Tensor)
-    return (z @ w2.T).view(B, T, D)
-
-
-@torch.inference_mode()
-def quantize_fp8(
-    w: Tensor,
-    fp8_activation_scale_ub: float,
-    output_device: Optional[torch.device] = None,
-) -> Fp8RowwiseWeights:
-    """Quantize [n, k] weight tensor.
-
-    Args:
-        w (Tensor): [n, k] input high precision tensor to quantize.
-        fp8_activation_scale_ub (float): Upper bound for activation max.
-    """
-    activation_scale_ub = torch.tensor(
-        [fp8_activation_scale_ub],
-        dtype=torch.float,
-        device="cuda",
-    )
-    wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_row(w)
-    del w
-    return Fp8RowwiseWeights(
-        weight=wq,
-        scale=w_scale,
-        shape=wq.shape,
-        activation_scale_ub=activation_scale_ub,
-    )
-
-
-@torch.inference_mode()
-def load_fp8(
-    w: Tensor,
-    w_scale: Tensor,
-    fp8_activation_scale_ub: float,
-) -> Fp8RowwiseWeights:
-    """Load FP8 [n, k] weight tensor.
-
-    Args:
-        w (Tensor): [n, k] input FP8.
-        fp8_activation_scale_ub (float): Upper bound for activation max.
-    """
-    activation_scale_ub = torch.tensor(
-        [fp8_activation_scale_ub],
-        dtype=torch.float,
-        device="cuda",
-    )
-    return Fp8RowwiseWeights(
-        weight=w.to(torch.float8_e4m3fn).to(device="cuda"),
-        scale=w_scale.to(device="cuda"),
-        shape=w.shape,
-        activation_scale_ub=activation_scale_ub,
-    )
-
-
-def fc_fp8_dynamic(
-    x: Tensor,
-    w: Fp8RowwiseWeights,
-    activation_scale_ub: Optional[Tensor] = None,
-    num_tokens: Optional[Tensor] = None,
-    is_memory_bounded: bool = False,
-) -> Tensor:
-    """
-    Single w8a8 fc layer with dynamic row-wise scaling.
-    """
-    if isinstance(w, Fp8RowwiseWeights):
-        xq, x_scale = torch.ops.fbgemm.quantize_fp8_per_row(x, num_tokens, activation_scale_ub)
-        y = torch.ops.fbgemm.f8f8bf16_rowwise(xq, w.weight, x_scale, w.scale, use_fast_accum=True)
-    del xq
-    return y
-
-
-def ffn_swiglu_fp8_dynamic(
-    x: Tensor,
-    w1: Fp8RowwiseWeights,
-    w3: Fp8RowwiseWeights,
-    w2: Fp8RowwiseWeights,
-    activation_scale_ub: Optional[Tensor] = None,
-    num_tokens: Optional[Tensor] = None,
-    is_memory_bounded: bool = False,
-) -> Tensor:
-    (B, T, D) = x.shape  # noqa: N806
-    HD_L = w1.shape[0]  # noqa: N806
-    assert HD_L == w3.shape[0]
-    x1 = fc_fp8_dynamic(
-        x.view(B * T, D),
-        w1,
-        activation_scale_ub,
-        num_tokens,
-        is_memory_bounded,
-    )
-    x2 = fc_fp8_dynamic(
-        x.view(B * T, D),
-        w3,
-        activation_scale_ub,
-        num_tokens,
-        is_memory_bounded,
-    )
-    z = torch.nn.functional.silu(x1) * x2
-    del x1, x2
-
-    z_ = fc_fp8_dynamic(z, w2, activation_scale_ub, num_tokens, is_memory_bounded)
-
-    return z_.view(B, T, D)
--- a/llama_stack/providers/inline/inference/meta_reference/quantization/fp8_txest_disabled.py
+++ b/llama_stack/providers/inline/inference/meta_reference/quantization/fp8_txest_disabled.py
@ -1,78 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
-
-# The file gets a special treatment for now?
-# ruff: noqa: N803
-
-import unittest
-
-import torch
-from fp8_impls import FfnQuantizeMode, ffn_swiglu_fp8_dynamic, quantize_fp8
-from hypothesis import given, settings
-from hypothesis import strategies as st
-from torch import Tensor
-
-
-@unittest.skipIf(
-    not torch.cuda.is_available() or torch.cuda.get_device_properties(torch.cuda.current_device()).major < 9,
-    "Skip when H100 is not available",
-)
-class FP8Tests(unittest.TestCase):
-    @settings(deadline=None)
-    @given(
-        D=st.sampled_from([4096, 8192]),
-        HD_L=st.sampled_from([1280, 2560]),
-        B=st.sampled_from([1, 2]),
-        T=st.sampled_from([2048, 4096]),
-        UB=st.sampled_from([1000, 10000]),
-    )
-    def test_fp8_ffn(
-        self,
-        D: int,  # noqa
-        HD_L: int,
-        B: int,
-        T: int,
-        UB: float,
-    ) -> None:
-        x = torch.randn(size=(B, T, D), dtype=torch.bfloat16, device="cuda") * 0.1
-        w1 = torch.randn(size=(HD_L, D), dtype=torch.bfloat16, device="cuda") * 0.01
-        w3 = torch.randn(size=(HD_L, D), dtype=torch.bfloat16, device="cuda") * 0.01
-        w2 = torch.randn(size=(D, HD_L), dtype=torch.bfloat16, device="cuda") * 0.1
-
-        x_q = quantize_fp8(x, UB, mode=FfnQuantizeMode.FP8_ROWWISE)
-        w1_q = quantize_fp8(w1, UB, mode=FfnQuantizeMode.FP8_ROWWISE)
-        w3_q = quantize_fp8(w3, UB, mode=FfnQuantizeMode.FP8_ROWWISE)
-        w2_q = quantize_fp8(w2, UB, mode=FfnQuantizeMode.FP8_ROWWISE)
-
-        def ref_ffn(x: Tensor, w1: Tensor, w3: Tensor, w2: Tensor) -> Tensor:
-            (B, T, D) = x.shape  # noqa: N806
-            (HD_L, D_) = w1.shape  # noqa: N806
-            assert D_ == D
-
-            x1 = x.view(B * T, D) @ w1.T
-            x2 = x.view(B * T, D) @ w3.T
-
-            z = torch.nn.functional.silu(x1) * x2
-            return (z @ w2.T).view(B, T, D).to(torch.bfloat16)
-
-        v = ffn_swiglu_fp8_dynamic(x, w1_q, w3_q, w2_q)
-
-        # Fake quant
-        x = x_q.weight.bfloat16() * x_q.scale.unsqueeze(-1)
-        w1 = w1_q.weight.bfloat16() * w1_q.scale.unsqueeze(-1)
-        w3 = w3_q.weight.bfloat16() * w3_q.scale.unsqueeze(-1)
-        w2 = w2_q.weight.bfloat16() * w2_q.scale.unsqueeze(-1)
-
-        v_ref = ref_ffn(x, w1, w3, w2)
-
-        torch.testing.assert_close(v_ref, v, atol=4.0e-3, rtol=4.0e-3)
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/llama_stack/providers/inline/inference/meta_reference/quantization/hadamard_utils.py
+++ b/llama_stack/providers/inline/inference/meta_reference/quantization/hadamard_utils.py
@ -1,86 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import math
-import re
-
-import torch
-from torch import nn
-
-
-def hadamard_transform(x: torch.Tensor) -> torch.Tensor:
-    """Hadamard transform.
-
-    This function performs the Hadamard transform on the input tensor 'x'.
-    The Hadamard transform is a linear transformation that multiplies the input
-    tensor by the Hadamard matrix of dimension n x n, where n is the size of
-    the last dimension of the input tensor.
-    """
-    *_, n = x.shape
-    m = int(math.log2(n))
-    assert n == 1 << m, "n must be a power of 2"
-    x = x[..., None]
-    inv_sqrt2 = 0.5**0.5
-    for _ in range(m):
-        top = x[..., ::2, :] + x[..., 1::2, :]
-        bot = x[..., ::2, :] - x[..., 1::2, :]
-        x = torch.cat((top, bot), dim=-1)
-        x *= inv_sqrt2
-    res = x.squeeze(-2)
-    return res
-
-
-class HadamardModule(torch.nn.Module):
-    """A module that applies the Hadamard transform to the input tensor.
-
-    Args:
-        group_size: The size of the groups that the input tensor will be divided into
-            before applying the Hadamard transform.
-    """
-
-    def __init__(self, group_size: int) -> None:
-        super().__init__()
-        self.group_size = group_size
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        reshape_back = False
-        orig_shape = x.shape
-        if self.group_size != x.shape[-1]:
-            reshape_back = True
-            x = x.reshape(-1, x.shape[-1] // self.group_size, self.group_size)
-        x = hadamard_transform(x)
-        if reshape_back:
-            x = x.reshape(orig_shape)
-        return x
-
-
-def add_hadamard_transform_for_spinquant(model: torch.nn.Module, prefix: str = "") -> None:
-    """
-    Adds a Hadamard transform to the last linear layer of each feedforward network (FFN) in the model.
-    This function recursively traverses the model's children and looks for layers that match the pattern
-    "layers.<digit>.feed_forward.w2", where <digit> is one or more digits. When such a layer is found,
-    it is replaced with a new sequential module that consists of a HadamardModule followed by the original
-    layer. The HadamardModule applies the Hadamard transform to the input tensor.
-
-    See `SpinQuant <https://arxiv.org/abs/2405.16406>_` paper for more details.
-
-    Args:
-        model: An instance of 'torch.nn.Module' (e.g., Transformer model).
-        prefix: A string prefix to add to the full name of each child module.
-
-    Returns:
-        None
-    """
-
-    pattern_last_linear_ffn = r"layers.\d+.feed_forward.w2"
-    for module_name, module in model.named_children():
-        child_full_name = prefix + "." + module_name
-        if re.search(pattern_last_linear_ffn, child_full_name):
-            new_module = nn.Sequential(HadamardModule(group_size=module.in_features), module)
-            del module
-            setattr(model, module_name, new_module)
-        else:
-            add_hadamard_transform_for_spinquant(module, (prefix + "." if prefix else prefix) + module_name)
--- a/llama_stack/providers/inline/inference/meta_reference/quantization/loader.py
+++ b/llama_stack/providers/inline/inference/meta_reference/quantization/loader.py
@ -1,320 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
-
-import logging
-import os
-from typing import Any, Dict, List, Optional
-
-import torch
-from fairscale.nn.model_parallel.initialize import get_model_parallel_rank
-from fairscale.nn.model_parallel.layers import ColumnParallelLinear, RowParallelLinear
-from fairscale.nn.model_parallel.mappings import reduce_from_model_parallel_region
-from torch import Tensor, nn
-from torchao.quantization.GPTQ import Int8DynActInt4WeightLinear
-
-from llama_stack.apis.inference import QuantizationType
-from llama_stack.models.llama.datatypes import CheckpointQuantizationFormat
-from llama_stack.models.llama.sku_list import resolve_model
-
-from ...llama3.args import ModelArgs
-from ...llama3.model import Transformer, TransformerBlock
-from ..config import MetaReferenceQuantizedInferenceConfig
-
-log = logging.getLogger(__name__)
-
-
-def swiglu_wrapper(
-    self,
-    x: Tensor,
-):
-    from .fp8_impls import ffn_swiglu
-
-    out = ffn_swiglu(x, self.w1.weight, self.w3.weight, self.w2.weight)
-    return reduce_from_model_parallel_region(out)
-
-
-def convert_to_fp8_quantized_model(
-    model: Transformer,
-    config: MetaReferenceQuantizedInferenceConfig,
-    checkpoint_dir: str,
-    fp8_activation_scale_ub: Optional[float] = 1200.0,
-) -> Transformer:
-    if config.quantization.type == QuantizationType.bf16.value:
-        return model
-
-    elif config.quantization.type != QuantizationType.fp8.value:
-        raise ValueError("Only FP8 quantization is supported")
-
-    from .fp8_impls import Fp8ScaledWeights, load_fp8, quantize_fp8
-
-    llama_model = resolve_model(config.model)
-    assert llama_model is not None, f"Model {config.model} not found"
-
-    # Move weights to GPU with quantization
-    if llama_model.quantization_format == CheckpointQuantizationFormat.fp8_mixed.value:
-        log.info("Loading fp8 scales...")
-        fp8_scales_path = os.path.join(checkpoint_dir, f"fp8_scales_{get_model_parallel_rank()}.pt")
-        assert os.path.isfile(fp8_scales_path), f"fp8_scales_path not found for rank {get_model_parallel_rank()}"
-        fp8_scales = torch.load(fp8_scales_path, weights_only=True)
-
-        for block in model.layers:
-            if isinstance(block, TransformerBlock):
-                if block.layer_id == 0 or block.layer_id == (model.n_layers - 1):
-                    continue
-
-                block.feed_forward.forward = swiglu_wrapper.__get__(block.feed_forward)
-                for key in ("w1", "w3", "w2"):
-                    param = getattr(block.feed_forward, key)
-                    param.weight = load_fp8(
-                        param.weight,
-                        fp8_scales[f"{block.layer_id}_feed_forward.{key}_{get_model_parallel_rank()}"],
-                        fp8_activation_scale_ub,
-                    )
-    else:
-        log.info("Quantizing fp8 weights from bf16...")
-        for block in model.layers:
-            if isinstance(block, TransformerBlock):
-                if block.layer_id == 0 or block.layer_id == (model.n_layers - 1):
-                    continue
-                block.feed_forward.forward = swiglu_wrapper.__get__(block.feed_forward)
-                for key in ("w1", "w3", "w2"):
-                    param = getattr(block.feed_forward, key)
-                    param.weight = quantize_fp8(
-                        param.weight,
-                        fp8_activation_scale_ub,
-                        output_device=torch.device("cuda"),
-                    )
-
-    for _, parameter in model.named_parameters():
-        if not isinstance(parameter, Fp8ScaledWeights):
-            parameter.data = parameter.to(device="cuda")
-    return model
-
-
-class Int8DynActInt4WeightLinearLoRA(Int8DynActInt4WeightLinear):
-    """
-    Int8DynActInt4WeightLinear with LoRA adaptor.
-
-    Args:
-        in_features: Number of input features.
-        out_features: Number of output features.
-        bias: Whether to use bias.
-        device: Device to use.
-        group_size: Group size for quantization.
-        precision: Precision of quantization.
-        scales_precision: Precision of scales.
-        lora_rank: Rank of LoRA adaptor.
-        lora_scale: Scale of LoRA adaptor.
-    """
-
-    def __init__(
-        self,
-        in_features: int,
-        out_features: int,
-        bias=False,
-        device=None,
-        # quantization parameters
-        group_size: int = 256,
-        precision: torch.dtype = torch.float32,
-        scales_precision: torch.dtype = torch.float32,
-        # LoRA parameters
-        lora_rank: Optional[int] = None,
-        lora_scale: Optional[float] = None,
-    ) -> None:
-        super().__init__(
-            in_features,
-            out_features,
-            bias=bias,
-            device=device,
-            groupsize=group_size,
-            precision=precision,
-            scales_precision=scales_precision,
-        )
-        if lora_rank is not None:
-            assert lora_scale is not None, "Please specify lora scale for LoRA."
-            # Low-rank adaptation. See paper for more details: https://arxiv.org/abs/2106.09685
-            self.adaptor = nn.Sequential()
-            self.adaptor.add_module("A", nn.Linear(in_features, lora_rank, bias=False))
-            self.adaptor.add_module("B", nn.Linear(lora_rank, out_features, bias=False))
-            self.lora_scale = lora_scale
-        else:
-            self.adaptor = None
-            self.lora_scale = None
-        self._register_load_state_dict_pre_hook(self.load_hook)
-
-    def load_hook(
-        self,
-        state_dict: Dict[str, Any],
-        prefix: str,
-        local_metadata: Dict[str, Any],
-        strict: bool,
-        missing_keys: List[str],
-        unexpected_keys: List[str],
-        error_msgs: List[str],
-    ) -> None:
-        """A hook to load the quantized weights from the state dict."""
-        if prefix + "zeros" not in state_dict:
-            # Zero-point may not be saved in the state dict. In this case, we assume it's zero.
-            assert prefix + "scales" in state_dict
-            state_dict[prefix + "zeros"] = torch.zeros_like(state_dict[prefix + "scales"])
-
-    def forward(self, input_: torch.Tensor) -> torch.Tensor:
-        module_out = super().forward(input_)
-        if self.adaptor is not None:
-            adaptor_out = self.adaptor(input_) * self.lora_scale
-            return module_out + adaptor_out
-        return module_out
-
-
-class Int8WeightEmbedding(torch.nn.Embedding):
-    """An embedding layer to load int8 weights.
-
-    Args:
-        num_embeddings: Number of embeddings.
-        embedding_dim: Embedding dimension.
-        padding_idx: Padding index.
-    """
-
-    def __init__(
-        self,
-        num_embeddings: int,
-        embedding_dim: int,
-        padding_idx: int,
-        device=None,
-    ) -> None:
-        super().__init__(num_embeddings, embedding_dim, padding_idx, device=device)
-
-        self._register_load_state_dict_pre_hook(self.load_hook)
-
-    def load_hook(
-        self,
-        state_dict: Dict[str, Any],
-        prefix: str,
-        local_metadata: Dict[str, Any],
-        strict: bool,
-        missing_keys: List[str],
-        unexpected_keys: List[str],
-        error_msgs: List[str],
-    ) -> None:
-        """A hook to load the quantized embedding weight and scales from the state dict."""
-        weights = state_dict.pop(prefix + "weight")
-        scales = state_dict.pop(prefix + "scales")
-        state_dict[prefix + "weight"] = weights * scales
-
-
-class Int8WeightLinear(torch.nn.Linear):
-    """A linear layer to load int8 weights.
-
-    Args:
-        in_features: Number of input features.
-        out_features: Number of output features.
-        bias: Whether to use bias.
-    """
-
-    def __init__(self, in_features: int, out_features: int, bias: bool = True, device=None) -> None:
-        super().__init__(in_features, out_features, bias, device=device)
-
-        self._register_load_state_dict_pre_hook(self.load_hook)
-
-    def load_hook(
-        self,
-        state_dict: Dict[str, Any],
-        prefix: str,
-        local_metadata: Dict[str, Any],
-        strict: bool,
-        missing_keys: List[str],
-        unexpected_keys: List[str],
-        error_msgs: List[str],
-    ) -> None:
-        """A hook to load the quantized linear weight and scales from the state dict."""
-        weights = state_dict.pop(prefix + "weight")
-        scales = state_dict.pop(prefix + "scales")
-        state_dict[prefix + "weight"] = weights * scales
-
-
-def _prepare_model_int4_weight_int8_dynamic_activation(
-    model: torch.nn.Module,
-    group_size: int,
-    lora_rank: Optional[int],
-    lora_scale: Optional[float],
-):
-    """Prepare the model for int4 weight and int8 dynamic activation quantization.
-
-    Note that the weights of embedding and output layers are quantized to int8.
-    """
-    device = None
-    for module_name, module in model.named_children():
-        if module_name == "output":
-            quantized_module = Int8WeightLinear(
-                in_features=module.in_features,
-                out_features=module.out_features,
-                bias=module.bias,
-                device=device,
-            )
-            del module
-            setattr(model, module_name, quantized_module)
-        elif module_name == "tok_embeddings":
-            quantized_module = Int8WeightEmbedding(
-                num_embeddings=module.num_embeddings,
-                embedding_dim=module.embedding_dim,
-                padding_idx=module.padding_idx,
-                device=device,
-            )
-            del module
-            setattr(model, module_name, quantized_module)
-        elif isinstance(module, (ColumnParallelLinear, RowParallelLinear, nn.Linear)):
-            quantized_module = Int8DynActInt4WeightLinearLoRA(
-                in_features=module.in_features,
-                out_features=module.out_features,
-                bias=False,
-                group_size=group_size,
-                lora_rank=lora_rank,
-                lora_scale=lora_scale,
-                device=device,
-            )
-            del module
-            setattr(model, module_name, quantized_module)
-        else:
-            _prepare_model_int4_weight_int8_dynamic_activation(module, group_size, lora_rank, lora_scale)
-
-    return model
-
-
-def convert_to_int4_quantized_model(
-    model: Transformer,
-    model_args: ModelArgs,
-    config: MetaReferenceQuantizedInferenceConfig,
-) -> Transformer:
-    """Convert the model to int4 quantized model."""
-
-    if model_args.quantization_args is None:
-        raise ValueError("'quantization_args' cannot be None. Please specify it.")
-
-    quantization_args = model_args.quantization_args
-
-    if quantization_args.scheme.value != "int4_weight_int8_dynamic_activation":
-        raise NotImplementedError(
-            "Only int4 quantization with 'int4_weight_int8_dynamic_activation' scheme is supported."
-        )
-
-    group_size = model_args.quantization_args.group_size
-    if group_size is None:
-        raise ValueError("'group_size' cannot be None in 'quantization_args'. Please specify it.")
-
-    if model_args.lora_args is None:
-        # Certain quantized models (e.g., SpinQuant) may not have LoRA.
-        lora_rank = None
-        lora_scale = None
-    else:
-        lora_rank = model_args.lora_args.rank
-        lora_scale = model_args.lora_args.scale
-
-    _prepare_model_int4_weight_int8_dynamic_activation(model, group_size, lora_rank, lora_scale)
-    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-    return model.to(device)
--- a/llama_stack/providers/inline/inference/meta_reference/quantization/scripts/init.py
+++ b/llama_stack/providers/inline/inference/meta_reference/quantization/scripts/init.py
@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/llama_stack/providers/inline/inference/meta_reference/quantization/scripts/quantize_checkpoint.py
+++ b/llama_stack/providers/inline/inference/meta_reference/quantization/scripts/quantize_checkpoint.py
@ -1,152 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
-
-import json
-import logging
-import os
-import shutil
-import sys
-from pathlib import Path
-from typing import Optional
-
-import fire
-import torch
-from fairscale.nn.model_parallel.initialize import (
-    get_model_parallel_rank,
-    initialize_model_parallel,
-    model_parallel_is_initialized,
-)
-from torch.nn.parameter import Parameter
-
-from llama_stack.models.llama.llama3.tokenizer import Tokenizer
-from llama_stack.providers.inline.inference.meta_reference.llama3.args import ModelArgs
-from llama_stack.providers.inline.inference.meta_reference.llama3.model import Transformer, TransformerBlock
-from llama_stack.providers.inline.inference.meta_reference.quantization.fp8_impls import (
-    quantize_fp8,
-)
-
-log = logging.getLogger(__name__)
-
-
-def main(
-    ckpt_dir: str,
-    tokenizer_path: str,
-    quantized_ckpt_dir: str,
-    max_seq_len: Optional[int] = 512,
-    max_batch_size: Optional[int] = 4,
-    model_parallel_size: Optional[int] = None,
-    fp8_activation_scale_ub: Optional[float] = 1200.0,
-    seed: int = 1,
-):
-    """ """
-    if not os.path.exists(quantized_ckpt_dir):
-        os.makedirs(quantized_ckpt_dir)
-        shutil.copy(
-            os.path.join(ckpt_dir, "params.json"),
-            os.path.join(quantized_ckpt_dir, "params.json"),
-        )
-        shutil.copy(
-            os.path.join(ckpt_dir, "tokenizer.model"),
-            os.path.join(quantized_ckpt_dir, "tokenizer.model"),
-        )
-
-    if not torch.distributed.is_initialized():
-        torch.distributed.init_process_group("nccl")
-        if not model_parallel_is_initialized():
-            if model_parallel_size is None:
-                model_parallel_size = int(os.environ.get("WORLD_SIZE", 1))
-            initialize_model_parallel(model_parallel_size)
-
-        local_rank = int(os.environ.get("LOCAL_RANK", 0))
-        torch.cuda.set_device(local_rank)
-
-        # seed must be the same in all processes
-        torch.manual_seed(seed)
-
-        if local_rank > 0:
-            sys.stdout = open(os.devnull, "w")
-
-        checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
-        assert len(checkpoints) > 0, f"no checkpoint files found in {ckpt_dir}"
-        assert model_parallel_size == len(checkpoints), (
-            f"Loading a checkpoint for MP={len(checkpoints)} but world size is {model_parallel_size}"
-        )
-        ckpt_path = checkpoints[get_model_parallel_rank()]
-        checkpoint = torch.load(ckpt_path, map_location="cpu", weights_only=True)
-        with open(Path(ckpt_dir) / "params.json", "r") as f:
-            params = json.loads(f.read())
-
-        model_args: ModelArgs = ModelArgs(
-            max_seq_len=max_seq_len,
-            max_batch_size=max_batch_size,
-            **params,
-        )
-        tokenizer = Tokenizer(model_path=tokenizer_path)
-        assert model_args.vocab_size == tokenizer.n_words, (
-            f"model_args vocab = {model_args.vocab_size} but tokenizer vocab = {tokenizer.n_words}"
-        )
-
-        # load on CPU in bf16 so that fp8 conversion does not find an unexpected (fp32, e.g.) datatype
-        torch.set_default_tensor_type(torch.BFloat16Tensor)
-
-        model = Transformer(model_args)
-        model.load_state_dict(checkpoint, strict=False)
-
-        if torch.cuda.is_bf16_supported():
-            torch.set_default_tensor_type(torch.cuda.BFloat16Tensor)
-        else:
-            torch.set_default_tensor_type(torch.cuda.HalfTensor)
-
-        log.info(ckpt_path)
-        assert quantized_ckpt_dir is not None, "QUantized checkpoint directory should not be None"
-        fp8_scales = {}
-        for block in model.layers:
-            if isinstance(block, TransformerBlock):
-                if block.layer_id == 0 or block.layer_id == (model.n_layers - 1):
-                    continue
-
-                fp8_weight = quantize_fp8(
-                    block.feed_forward.w1.weight,
-                    fp8_activation_scale_ub,
-                    output_device=torch.device("cpu"),
-                )
-                with torch.inference_mode():
-                    block.feed_forward.w1.weight = Parameter(fp8_weight.weight)
-                fp8_scales[f"{block.layer_id}_feed_forward.w1_{get_model_parallel_rank()}"] = fp8_weight.scale
-
-                fp8_weight = quantize_fp8(
-                    block.feed_forward.w3.weight,
-                    fp8_activation_scale_ub,
-                    output_device=torch.device("cpu"),
-                )
-                with torch.inference_mode():
-                    block.feed_forward.w3.weight = Parameter(fp8_weight.weight)
-                fp8_scales[f"{block.layer_id}_feed_forward.w3_{get_model_parallel_rank()}"] = fp8_weight.scale
-
-                fp8_weight = quantize_fp8(
-                    block.feed_forward.w2.weight,
-                    fp8_activation_scale_ub,
-                    output_device=torch.device("cpu"),
-                )
-                with torch.inference_mode():
-                    block.feed_forward.w2.weight = Parameter(fp8_weight.weight)
-                fp8_scales[f"{block.layer_id}_feed_forward.w2_{get_model_parallel_rank()}"] = fp8_weight.scale
-
-        fp8_scales_path = os.path.join(quantized_ckpt_dir, f"fp8_scales_{get_model_parallel_rank()}.pt")
-        torch.save(fp8_scales, fp8_scales_path)
-
-        ckpt_path = os.path.join(
-            quantized_ckpt_dir,
-            "consolidated.{:02d}.pth".format(get_model_parallel_rank()),
-        )
-        torch.save(model.state_dict(), ckpt_path)
-
-
-if __name__ == "__main__":
-    fire.Fire(main)
--- a/llama_stack/providers/inline/inference/meta_reference/quantization/scripts/run_quantize_checkpoint.sh
+++ b/llama_stack/providers/inline/inference/meta_reference/quantization/scripts/run_quantize_checkpoint.sh
@ -1,31 +0,0 @@
-#!/bin/bash
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-set -euo pipefail
-set -x
-
-cd $(dirname "$(realpath "$0")")
-
-MASTER_HOST=$1
-RUN_ID=$2
-CKPT_DIR=$3
-QUANT_CKPT_DIR=$4
-TOKENIZER_PATH=$5
-NNODES=$6
-NPROC=$7
-
-echo $MASTER_HOST, $RUN_ID, $CKPT_DIR, $QUANT_CKPT_DIR
-
-NCCL_NET=Socket NCCL_SOCKET_IFNAME=eth TIKTOKEN_CACHE_DIR="" PYTHONPATH="/home/$USER/llama-stack" \
-  torchrun \
-   --nnodes=$NNODES --nproc_per_node=$NPROC \
-   --rdzv_id=$RUN_ID \
-   --rdzv_conf='timeout=120' \
-   --rdzv_backend=c10d \
-   --rdzv_endpoint="${MASTER_HOST}:29502" \
-   quantize_checkpoint.py $CKPT_DIR $TOKENIZER_PATH $QUANT_CKPT_DIR
--- a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
+++ b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
@ -10,6 +10,7 @@ from typing import AsyncGenerator, List, Optional, Union
 from llama_stack.apis.inference import (
    CompletionResponse,
    Inference,
+    InterleavedContent,
    LogProbConfig,
    Message,
    ResponseFormat,
@ -23,6 +24,10 @@ from llama_stack.providers.datatypes import Model, ModelsProtocolPrivate
 from llama_stack.providers.utils.inference.embedding_mixin import (
    SentenceTransformerEmbeddingMixin,
 )
+from llama_stack.providers.utils.inference.openai_compat import (
+    OpenAIChatCompletionToLlamaStackMixin,
+    OpenAICompletionToLlamaStackMixin,
+)

 from .config import SentenceTransformersInferenceConfig

@ -30,6 +35,8 @@ log = logging.getLogger(__name__)


 class SentenceTransformersInferenceImpl(
+    OpenAIChatCompletionToLlamaStackMixin,
+    OpenAICompletionToLlamaStackMixin,
    SentenceTransformerEmbeddingMixin,
    Inference,
    ModelsProtocolPrivate,
@ -74,3 +81,25 @@ class SentenceTransformersInferenceImpl(
        tool_config: Optional[ToolConfig] = None,
    ) -> AsyncGenerator:
        raise ValueError("Sentence transformers don't support chat completion")
+
+    async def batch_completion(
+        self,
+        model_id: str,
+        content_batch: List[InterleavedContent],
+        sampling_params: Optional[SamplingParams] = None,
+        response_format: Optional[ResponseFormat] = None,
+        logprobs: Optional[LogProbConfig] = None,
+    ):
+        raise NotImplementedError("Batch completion is not supported for Sentence Transformers")
+
+    async def batch_chat_completion(
+        self,
+        model_id: str,
+        messages_batch: List[List[Message]],
+        sampling_params: Optional[SamplingParams] = None,
+        tools: Optional[List[ToolDefinition]] = None,
+        tool_config: Optional[ToolConfig] = None,
+        response_format: Optional[ResponseFormat] = None,
+        logprobs: Optional[LogProbConfig] = None,
+    ):
+        raise NotImplementedError("Batch chat completion is not supported for Sentence Transformers")
--- a/llama_stack/providers/inline/inference/vllm/openai_utils.py
+++ b/llama_stack/providers/inline/inference/vllm/openai_utils.py
@ -14,9 +14,10 @@ from llama_stack.apis.inference import (
    JsonSchemaResponseFormat,
    Message,
    ToolChoice,
+    ToolDefinition,
    UserMessage,
 )
-from llama_stack.models.llama.datatypes import BuiltinTool, ToolDefinition
+from llama_stack.models.llama.datatypes import BuiltinTool
 from llama_stack.providers.utils.inference.openai_compat import (
    convert_message_to_openai_dict,
    get_sampling_options,
--- a/llama_stack/providers/inline/inference/vllm/vllm.py
+++ b/llama_stack/providers/inline/inference/vllm/vllm.py
@ -46,6 +46,8 @@ from llama_stack.apis.inference import (
    TokenLogProbs,
    ToolChoice,
    ToolConfig,
+    TopKSamplingStrategy,
+    TopPSamplingStrategy,
 )
 from llama_stack.apis.models import Model
 from llama_stack.log import get_logger
@ -55,8 +57,6 @@ from llama_stack.models.llama.datatypes import (
    ToolCall,
    ToolDefinition,
    ToolPromptFormat,
-    TopKSamplingStrategy,
-    TopPSamplingStrategy,
 )
 from llama_stack.models.llama.llama3.chat_format import ChatFormat
 from llama_stack.models.llama.llama3.tokenizer import Tokenizer
@ -66,8 +66,10 @@ from llama_stack.providers.utils.inference.model_registry import (
    ModelsProtocolPrivate,
 )
 from llama_stack.providers.utils.inference.openai_compat import (
+    OpenAIChatCompletionToLlamaStackMixin,
    OpenAICompatCompletionChoice,
    OpenAICompatCompletionResponse,
+    OpenAICompletionToLlamaStackMixin,
    get_stop_reason,
    process_chat_completion_stream_response,
 )
@ -172,7 +174,12 @@ def _convert_sampling_params(
    return vllm_sampling_params


-class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
+class VLLMInferenceImpl(
+    Inference,
+    OpenAIChatCompletionToLlamaStackMixin,
+    OpenAICompletionToLlamaStackMixin,
+    ModelsProtocolPrivate,
+):
    """
    vLLM-based inference model adapter for Llama Stack with support for multiple models.

--- a/llama_stack/providers/inline/post_training/torchtune/common/utils.py
+++ b/llama_stack/providers/inline/post_training/torchtune/common/utils.py
@ -22,8 +22,8 @@ from torchtune.models.llama3_2 import lora_llama3_2_3b
 from torchtune.modules.transforms import Transform

 from llama_stack.apis.post_training import DatasetFormat
-from llama_stack.models.llama.datatypes import Model
 from llama_stack.models.llama.sku_list import resolve_model
+from llama_stack.models.llama.sku_types import Model

 BuildLoraModelCallable = Callable[..., torch.nn.Module]
 BuildTokenizerCallable = Callable[..., Llama3Tokenizer]
--- a/llama_stack/providers/inline/post_training/torchtune/post_training.py
+++ b/llama_stack/providers/inline/post_training/torchtune/post_training.py
@ -3,13 +3,14 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from datetime import datetime, timezone
+from enum import Enum
 from typing import Any, Dict, Optional

 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
 from llama_stack.apis.post_training import (
    AlgorithmConfig,
+    Checkpoint,
    DPOAlignmentConfig,
    JobStatus,
    ListPostTrainingJobsResponse,
@ -25,9 +26,19 @@ from llama_stack.providers.inline.post_training.torchtune.config import (
 from llama_stack.providers.inline.post_training.torchtune.recipes.lora_finetuning_single_device import (
    LoraFinetuningSingleDevice,
 )
+from llama_stack.providers.utils.scheduler import JobArtifact, Scheduler
+from llama_stack.providers.utils.scheduler import JobStatus as SchedulerJobStatus
 from llama_stack.schema_utils import webmethod


+class TrainingArtifactType(Enum):
+    CHECKPOINT = "checkpoint"
+    RESOURCES_STATS = "resources_stats"
+
+
+_JOB_TYPE_SUPERVISED_FINE_TUNE = "supervised-fine-tune"
+
+
 class TorchtunePostTrainingImpl:
    def __init__(
        self,
@ -38,13 +49,27 @@ class TorchtunePostTrainingImpl:
        self.config = config
        self.datasetio_api = datasetio_api
        self.datasets_api = datasets
+        self._scheduler = Scheduler()

-        # TODO: assume sync job, will need jobs API for async scheduling
-        self.jobs = {}
-        self.checkpoints_dict = {}
+    async def shutdown(self) -> None:
+        await self._scheduler.shutdown()

-    async def shutdown(self):
-        pass
+    @staticmethod
+    def _checkpoint_to_artifact(checkpoint: Checkpoint) -> JobArtifact:
+        return JobArtifact(
+            type=TrainingArtifactType.CHECKPOINT.value,
+            name=checkpoint.identifier,
+            uri=checkpoint.path,
+            metadata=dict(checkpoint),
+        )
+
+    @staticmethod
+    def _resources_stats_to_artifact(resources_stats: Dict[str, Any]) -> JobArtifact:
+        return JobArtifact(
+            type=TrainingArtifactType.RESOURCES_STATS.value,
+            name=TrainingArtifactType.RESOURCES_STATS.value,
+            metadata=resources_stats,
+        )

    async def supervised_fine_tune(
        self,
@ -56,20 +81,11 @@ class TorchtunePostTrainingImpl:
        checkpoint_dir: Optional[str],
        algorithm_config: Optional[AlgorithmConfig],
    ) -> PostTrainingJob:
-        if job_uuid in self.jobs:
-            raise ValueError(f"Job {job_uuid} already exists")
-
-        post_training_job = PostTrainingJob(job_uuid=job_uuid)
-
-        job_status_response = PostTrainingJobStatusResponse(
-            job_uuid=job_uuid,
-            status=JobStatus.scheduled,
-            scheduled_at=datetime.now(timezone.utc),
-        )
-        self.jobs[job_uuid] = job_status_response
-
        if isinstance(algorithm_config, LoraFinetuningConfig):
-            try:
+
+            async def handler(on_log_message_cb, on_status_change_cb, on_artifact_collected_cb):
+                on_log_message_cb("Starting Lora finetuning")
+
                recipe = LoraFinetuningSingleDevice(
                    self.config,
                    job_uuid,
@ -82,26 +98,22 @@ class TorchtunePostTrainingImpl:
                    self.datasetio_api,
                    self.datasets_api,
                )
-
-                job_status_response.status = JobStatus.in_progress
-                job_status_response.started_at = datetime.now(timezone.utc)
-
                await recipe.setup()
+
                resources_allocated, checkpoints = await recipe.train()

-                self.checkpoints_dict[job_uuid] = checkpoints
-                job_status_response.resources_allocated = resources_allocated
-                job_status_response.checkpoints = checkpoints
-                job_status_response.status = JobStatus.completed
-                job_status_response.completed_at = datetime.now(timezone.utc)
+                on_artifact_collected_cb(self._resources_stats_to_artifact(resources_allocated))
+                for checkpoint in checkpoints:
+                    artifact = self._checkpoint_to_artifact(checkpoint)
+                    on_artifact_collected_cb(artifact)

-            except Exception:
-                job_status_response.status = JobStatus.failed
-                raise
+                on_status_change_cb(SchedulerJobStatus.completed)
+                on_log_message_cb("Lora finetuning completed")
        else:
            raise NotImplementedError()

-        return post_training_job
+        job_uuid = self._scheduler.schedule(_JOB_TYPE_SUPERVISED_FINE_TUNE, job_uuid, handler)
+        return PostTrainingJob(job_uuid=job_uuid)

    async def preference_optimize(
        self,
@ -114,19 +126,55 @@ class TorchtunePostTrainingImpl:
    ) -> PostTrainingJob: ...

    async def get_training_jobs(self) -> ListPostTrainingJobsResponse:
-        return ListPostTrainingJobsResponse(data=[PostTrainingJob(job_uuid=uuid_) for uuid_ in self.jobs])
+        return ListPostTrainingJobsResponse(
+            data=[PostTrainingJob(job_uuid=job.id) for job in self._scheduler.get_jobs()]
+        )
+
+    @staticmethod
+    def _get_artifacts_metadata_by_type(job, artifact_type):
+        return [artifact.metadata for artifact in job.artifacts if artifact.type == artifact_type]
+
+    @classmethod
+    def _get_checkpoints(cls, job):
+        return cls._get_artifacts_metadata_by_type(job, TrainingArtifactType.CHECKPOINT.value)
+
+    @classmethod
+    def _get_resources_allocated(cls, job):
+        data = cls._get_artifacts_metadata_by_type(job, TrainingArtifactType.RESOURCES_STATS.value)
+        return data[0] if data else None

    @webmethod(route="/post-training/job/status")
    async def get_training_job_status(self, job_uuid: str) -> Optional[PostTrainingJobStatusResponse]:
-        return self.jobs.get(job_uuid, None)
+        job = self._scheduler.get_job(job_uuid)
+
+        match job.status:
+            # TODO: Add support for other statuses to API
+            case SchedulerJobStatus.new | SchedulerJobStatus.scheduled:
+                status = JobStatus.scheduled
+            case SchedulerJobStatus.running:
+                status = JobStatus.in_progress
+            case SchedulerJobStatus.completed:
+                status = JobStatus.completed
+            case SchedulerJobStatus.failed:
+                status = JobStatus.failed
+            case _:
+                raise NotImplementedError()
+
+        return PostTrainingJobStatusResponse(
+            job_uuid=job_uuid,
+            status=status,
+            scheduled_at=job.scheduled_at,
+            started_at=job.started_at,
+            completed_at=job.completed_at,
+            checkpoints=self._get_checkpoints(job),
+            resources_allocated=self._get_resources_allocated(job),
+        )

    @webmethod(route="/post-training/job/cancel")
    async def cancel_training_job(self, job_uuid: str) -> None:
-        raise NotImplementedError("Job cancel is not implemented yet")
+        self._scheduler.cancel(job_uuid)

    @webmethod(route="/post-training/job/artifacts")
    async def get_training_job_artifacts(self, job_uuid: str) -> Optional[PostTrainingJobArtifactsResponse]:
-        if job_uuid in self.checkpoints_dict:
-            checkpoints = self.checkpoints_dict.get(job_uuid, [])
-            return PostTrainingJobArtifactsResponse(job_uuid=job_uuid, checkpoints=checkpoints)
-        return None
+        job = self._scheduler.get_job(job_uuid)
+        return PostTrainingJobArtifactsResponse(job_uuid=job_uuid, checkpoints=self._get_checkpoints(job))
--- a/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py
+++ b/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py
@ -38,6 +38,8 @@ from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
 from llama_stack.apis.post_training import (
    Checkpoint,
+    DataConfig,
+    EfficiencyConfig,
    LoraFinetuningConfig,
    OptimizerConfig,
    QATFinetuningConfig,
@ -89,6 +91,10 @@ class LoraFinetuningSingleDevice:
        datasetio_api: DatasetIO,
        datasets_api: Datasets,
    ) -> None:
+        assert isinstance(training_config.data_config, DataConfig), "DataConfig must be initialized"
+
+        assert isinstance(training_config.efficiency_config, EfficiencyConfig), "EfficiencyConfig must be initialized"
+
        self.job_uuid = job_uuid
        self.training_config = training_config
        if not isinstance(algorithm_config, LoraFinetuningConfig):
@ -188,6 +194,7 @@ class LoraFinetuningSingleDevice:
        self._tokenizer = await self._setup_tokenizer()
        log.info("Tokenizer is initialized.")

+        assert isinstance(self.training_config.optimizer_config, OptimizerConfig), "OptimizerConfig must be initialized"
        self._optimizer = await self._setup_optimizer(optimizer_config=self.training_config.optimizer_config)
        log.info("Optimizer is initialized.")

@ -195,6 +202,8 @@ class LoraFinetuningSingleDevice:
        self._model.set_num_output_chunks(self._loss_fn.num_output_chunks)
        log.info("Loss is initialized.")

+        assert isinstance(self.training_config.data_config, DataConfig), "DataConfig must be initialized"
+
        self._training_sampler, self._training_dataloader = await self._setup_data(
            dataset_id=self.training_config.data_config.dataset_id,
            tokenizer=self._tokenizer,
@ -452,6 +461,7 @@ class LoraFinetuningSingleDevice:
        """
        The core training loop.
        """
+        assert isinstance(self.training_config.data_config, DataConfig), "DataConfig must be initialized"
        # Initialize tokens count and running loss (for grad accumulation)
        t0 = time.perf_counter()
        running_loss: float = 0.0
--- a/llama_stack/providers/inline/safety/llama_guard/llama_guard.py
+++ b/llama_stack/providers/inline/safety/llama_guard/llama_guard.py
@ -10,7 +10,6 @@ from typing import Any, Dict, List, Optional

 from llama_stack.apis.common.content_types import ImageContentItem, TextContentItem
 from llama_stack.apis.inference import (
-    ChatCompletionResponseEventType,
    Inference,
    Message,
    UserMessage,
@ -23,7 +22,8 @@ from llama_stack.apis.safety import (
 )
 from llama_stack.apis.shields import Shield
 from llama_stack.distribution.datatypes import Api
-from llama_stack.models.llama.datatypes import CoreModelId, Role
+from llama_stack.models.llama.datatypes import Role
+from llama_stack.models.llama.sku_types import CoreModelId
 from llama_stack.providers.datatypes import ShieldsProtocolPrivate
 from llama_stack.providers.utils.inference.prompt_adapter import (
    interleaved_content_as_str,
@ -238,16 +238,12 @@ class LlamaGuardShield:
            shield_input_message = self.build_text_shield_input(messages)

        # TODO: llama-stack inference protocol has issues with non-streaming inference code
-        content = ""
-        async for chunk in await self.inference_api.chat_completion(
+        response = await self.inference_api.chat_completion(
            model_id=self.model,
            messages=[shield_input_message],
-            stream=True,
-        ):
-            event = chunk.event
-            if event.event_type == ChatCompletionResponseEventType.progress and event.delta.type == "text":
-                content += event.delta.text
-
+            stream=False,
+        )
+        content = response.completion_message.content
        content = content.strip()
        return self.get_shield_response(content)

--- a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py
+++ b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py
@ -126,7 +126,7 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):
    def _log_unstructured(self, event: UnstructuredLogEvent, ttl_seconds: int) -> None:
        with self._lock:
            # Use global storage instead of instance storage
-            span_id = event.span_id
+            span_id = int(event.span_id, 16)
            span = _GLOBAL_STORAGE["active_spans"].get(span_id)

            if span:
--- a/llama_stack/providers/registry/inference.py
+++ b/llama_stack/providers/registry/inference.py
@ -24,6 +24,8 @@ META_REFERENCE_DEPS = [
    "zmq",
    "lm-format-enforcer",
    "sentence-transformers",
+    "torchao==0.8.0",
+    "fbgemm-gpu-genai==1.1.2",
 ]


@ -36,19 +38,6 @@ def available_providers() -> List[ProviderSpec]:
            module="llama_stack.providers.inline.inference.meta_reference",
            config_class="llama_stack.providers.inline.inference.meta_reference.MetaReferenceInferenceConfig",
        ),
-        InlineProviderSpec(
-            api=Api.inference,
-            provider_type="inline::meta-reference-quantized",
-            pip_packages=(
-                META_REFERENCE_DEPS
-                + [
-                    "fbgemm-gpu",
-                    "torchao==0.5.0",
-                ]
-            ),
-            module="llama_stack.providers.inline.inference.meta_reference",
-            config_class="llama_stack.providers.inline.inference.meta_reference.MetaReferenceQuantizedInferenceConfig",
-        ),
        InlineProviderSpec(
            api=Api.inference,
            provider_type="inline::vllm",
@ -228,6 +217,56 @@ def available_providers() -> List[ProviderSpec]:
                provider_data_validator="llama_stack.providers.remote.inference.groq.config.GroqProviderDataValidator",
            ),
        ),
+        remote_provider_spec(
+            api=Api.inference,
+            adapter=AdapterSpec(
+                adapter_type="fireworks-openai-compat",
+                pip_packages=["litellm"],
+                module="llama_stack.providers.remote.inference.fireworks_openai_compat",
+                config_class="llama_stack.providers.remote.inference.fireworks_openai_compat.config.FireworksCompatConfig",
+                provider_data_validator="llama_stack.providers.remote.inference.fireworks_openai_compat.config.FireworksProviderDataValidator",
+            ),
+        ),
+        remote_provider_spec(
+            api=Api.inference,
+            adapter=AdapterSpec(
+                adapter_type="together-openai-compat",
+                pip_packages=["litellm"],
+                module="llama_stack.providers.remote.inference.together_openai_compat",
+                config_class="llama_stack.providers.remote.inference.together_openai_compat.config.TogetherCompatConfig",
+                provider_data_validator="llama_stack.providers.remote.inference.together_openai_compat.config.TogetherProviderDataValidator",
+            ),
+        ),
+        remote_provider_spec(
+            api=Api.inference,
+            adapter=AdapterSpec(
+                adapter_type="groq-openai-compat",
+                pip_packages=["litellm"],
+                module="llama_stack.providers.remote.inference.groq_openai_compat",
+                config_class="llama_stack.providers.remote.inference.groq_openai_compat.config.GroqCompatConfig",
+                provider_data_validator="llama_stack.providers.remote.inference.groq_openai_compat.config.GroqProviderDataValidator",
+            ),
+        ),
+        remote_provider_spec(
+            api=Api.inference,
+            adapter=AdapterSpec(
+                adapter_type="sambanova-openai-compat",
+                pip_packages=["litellm"],
+                module="llama_stack.providers.remote.inference.sambanova_openai_compat",
+                config_class="llama_stack.providers.remote.inference.sambanova_openai_compat.config.SambaNovaCompatConfig",
+                provider_data_validator="llama_stack.providers.remote.inference.sambanova_openai_compat.config.SambaNovaProviderDataValidator",
+            ),
+        ),
+        remote_provider_spec(
+            api=Api.inference,
+            adapter=AdapterSpec(
+                adapter_type="cerebras-openai-compat",
+                pip_packages=["litellm"],
+                module="llama_stack.providers.remote.inference.cerebras_openai_compat",
+                config_class="llama_stack.providers.remote.inference.cerebras_openai_compat.config.CerebrasCompatConfig",
+                provider_data_validator="llama_stack.providers.remote.inference.cerebras_openai_compat.config.CerebrasProviderDataValidator",
+            ),
+        ),
        remote_provider_spec(
            api=Api.inference,
            adapter=AdapterSpec(
--- a/llama_stack/providers/remote/inference/bedrock/bedrock.py
+++ b/llama_stack/providers/remote/inference/bedrock/bedrock.py
@ -36,8 +36,10 @@ from llama_stack.providers.utils.inference.model_registry import (
    ModelRegistryHelper,
 )
 from llama_stack.providers.utils.inference.openai_compat import (
+    OpenAIChatCompletionToLlamaStackMixin,
    OpenAICompatCompletionChoice,
    OpenAICompatCompletionResponse,
+    OpenAICompletionToLlamaStackMixin,
    get_sampling_strategy_options,
    process_chat_completion_response,
    process_chat_completion_stream_response,
@ -51,7 +53,12 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
 from .models import MODEL_ENTRIES


-class BedrockInferenceAdapter(ModelRegistryHelper, Inference):
+class BedrockInferenceAdapter(
+    ModelRegistryHelper,
+    Inference,
+    OpenAIChatCompletionToLlamaStackMixin,
+    OpenAICompletionToLlamaStackMixin,
+):
    def __init__(self, config: BedrockConfig) -> None:
        ModelRegistryHelper.__init__(self, MODEL_ENTRIES)
        self._config = config
--- a/llama_stack/providers/remote/inference/bedrock/models.py
+++ b/llama_stack/providers/remote/inference/bedrock/models.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.models.llama.datatypes import CoreModelId
+from llama_stack.models.llama.sku_types import CoreModelId
 from llama_stack.providers.utils.inference.model_registry import (
    build_hf_repo_model_entry,
 )
--- a/llama_stack/providers/remote/inference/cerebras/cerebras.py
+++ b/llama_stack/providers/remote/inference/cerebras/cerebras.py
@ -28,12 +28,14 @@ from llama_stack.apis.inference import (
    ToolConfig,
    ToolDefinition,
    ToolPromptFormat,
+    TopKSamplingStrategy,
 )
-from llama_stack.models.llama.datatypes import TopKSamplingStrategy
 from llama_stack.providers.utils.inference.model_registry import (
    ModelRegistryHelper,
 )
 from llama_stack.providers.utils.inference.openai_compat import (
+    OpenAIChatCompletionToLlamaStackMixin,
+    OpenAICompletionToLlamaStackMixin,
    get_sampling_options,
    process_chat_completion_response,
    process_chat_completion_stream_response,
@ -49,7 +51,12 @@ from .config import CerebrasImplConfig
 from .models import MODEL_ENTRIES


-class CerebrasInferenceAdapter(ModelRegistryHelper, Inference):
+class CerebrasInferenceAdapter(
+    ModelRegistryHelper,
+    Inference,
+    OpenAIChatCompletionToLlamaStackMixin,
+    OpenAICompletionToLlamaStackMixin,
+):
    def __init__(self, config: CerebrasImplConfig) -> None:
        ModelRegistryHelper.__init__(
            self,
--- a/llama_stack/providers/remote/inference/cerebras/models.py
+++ b/llama_stack/providers/remote/inference/cerebras/models.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.models.llama.datatypes import CoreModelId
+from llama_stack.models.llama.sku_types import CoreModelId
 from llama_stack.providers.utils.inference.model_registry import (
    build_hf_repo_model_entry,
 )
--- a/llama_stack/providers/remote/inference/cerebras_openai_compat/init.py
+++ b/llama_stack/providers/remote/inference/cerebras_openai_compat/init.py
@ -0,0 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.inference import Inference
+
+from .config import CerebrasCompatConfig
+
+
+async def get_adapter_impl(config: CerebrasCompatConfig, _deps) -> Inference:
+    """Build the Cerebras OpenAI-compat inference adapter for ``config``.
+
+    ``_deps`` is accepted but not used by this factory.
+    """
+    # Deferred import: the adapter module is only loaded when an adapter is requested.
+    from .cerebras import CerebrasCompatInferenceAdapter
+
+    return CerebrasCompatInferenceAdapter(config)
--- a/llama_stack/providers/remote/inference/cerebras_openai_compat/cerebras.py
+++ b/llama_stack/providers/remote/inference/cerebras_openai_compat/cerebras.py
@ -0,0 +1,30 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.providers.remote.inference.cerebras_openai_compat.config import CerebrasCompatConfig
+from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
+
+from ..cerebras.models import MODEL_ENTRIES
+
+
+class CerebrasCompatInferenceAdapter(LiteLLMOpenAIMixin):
+    """Inference adapter configured from a ``CerebrasCompatConfig``.
+
+    The class adds no request logic of its own: it hands the Cerebras model
+    entries, the configured API key, the ``cerebras_api_key`` provider-data field
+    name and the OpenAI-compat base URL to ``LiteLLMOpenAIMixin``.
+    """
+
+    # ``__init__`` stores the configuration as ``self.config``; annotate that name
+    # (the earlier ``_config`` annotation referred to an attribute that was never set).
+    config: CerebrasCompatConfig
+
+    def __init__(self, config: CerebrasCompatConfig):
+        LiteLLMOpenAIMixin.__init__(
+            self,
+            model_entries=MODEL_ENTRIES,
+            api_key_from_config=config.api_key,
+            provider_data_api_key_field="cerebras_api_key",
+            openai_compat_api_base=config.openai_compat_api_base,
+        )
+        self.config = config
+
+    async def initialize(self):
+        """Delegate initialization to the mixin."""
+        await super().initialize()
+
+    async def shutdown(self):
+        """Delegate shutdown to the mixin."""
+        await super().shutdown()
--- a/llama_stack/providers/remote/inference/cerebras_openai_compat/config.py
+++ b/llama_stack/providers/remote/inference/cerebras_openai_compat/config.py
@ -0,0 +1,38 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any, Dict, Optional
+
+from pydantic import BaseModel, Field
+
+from llama_stack.schema_utils import json_schema_type
+
+
+class CerebrasProviderDataValidator(BaseModel):
+    """Pydantic model with a single optional ``cerebras_api_key`` field."""
+
+    cerebras_api_key: Optional[str] = Field(default=None, description="API key for Cerebras models")
+
+
+@json_schema_type
+class CerebrasCompatConfig(BaseModel):
+    """Settings for the Cerebras OpenAI-compat adapter: optional API key plus API base URL."""
+
+    api_key: Optional[str] = Field(default=None, description="The Cerebras API key")
+    openai_compat_api_base: str = Field(
+        default="https://api.cerebras.ai/v1",
+        description="The URL for the Cerebras API server",
+    )
+
+    @classmethod
+    def sample_run_config(cls, api_key: str = "${env.CEREBRAS_API_KEY}", **kwargs) -> Dict[str, Any]:
+        """Return a sample run-config mapping with the default base URL and ``api_key``.
+
+        Additional keyword arguments are accepted and ignored.
+        """
+        sample: Dict[str, Any] = {"openai_compat_api_base": "https://api.cerebras.ai/v1"}
+        sample["api_key"] = api_key
+        return sample
--- a/llama_stack/providers/remote/inference/databricks/databricks.py
+++ b/llama_stack/providers/remote/inference/databricks/databricks.py
@ -28,12 +28,14 @@ from llama_stack.apis.inference import (
    ToolDefinition,
    ToolPromptFormat,
 )
-from llama_stack.models.llama.datatypes import CoreModelId
+from llama_stack.models.llama.sku_types import CoreModelId
 from llama_stack.providers.utils.inference.model_registry import (
    ModelRegistryHelper,
    build_hf_repo_model_entry,
 )
 from llama_stack.providers.utils.inference.openai_compat import (
+    OpenAIChatCompletionToLlamaStackMixin,
+    OpenAICompletionToLlamaStackMixin,
    get_sampling_options,
    process_chat_completion_response,
    process_chat_completion_stream_response,
@ -56,7 +58,12 @@ model_entries = [
 ]


-class DatabricksInferenceAdapter(ModelRegistryHelper, Inference):
+class DatabricksInferenceAdapter(
+    ModelRegistryHelper,
+    Inference,
+    OpenAIChatCompletionToLlamaStackMixin,
+    OpenAICompletionToLlamaStackMixin,
+):
    def __init__(self, config: DatabricksImplConfig) -> None:
        ModelRegistryHelper.__init__(self, model_entries=model_entries)
        self.config = config
--- a/llama_stack/providers/remote/inference/fireworks/fireworks.py
+++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py
@ -4,9 +4,10 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import AsyncGenerator, List, Optional, Union
+from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union

 from fireworks.client import Fireworks
+from openai import AsyncOpenAI

 from llama_stack.apis.common.content_types import (
    InterleavedContent,
@ -31,14 +32,23 @@ from llama_stack.apis.inference import (
    ToolDefinition,
    ToolPromptFormat,
 )
+from llama_stack.apis.inference.inference import (
+    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
+    OpenAICompletion,
+    OpenAIMessageParam,
+    OpenAIResponseFormatParam,
+)
 from llama_stack.distribution.request_headers import NeedsRequestProviderData
 from llama_stack.log import get_logger
 from llama_stack.providers.utils.inference.model_registry import (
    ModelRegistryHelper,
 )
 from llama_stack.providers.utils.inference.openai_compat import (
+    OpenAIChatCompletionToLlamaStackMixin,
    convert_message_to_openai_dict,
    get_sampling_options,
+    prepare_openai_completion_params,
    process_chat_completion_response,
    process_chat_completion_stream_response,
    process_completion_response,
@ -81,10 +91,16 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
                )
            return provider_data.fireworks_api_key

+    def _get_base_url(self) -> str:
+        """Return the fixed base URL used for the Fireworks OpenAI client."""
+        return "https://api.fireworks.ai/inference/v1"
+
    def _get_client(self) -> Fireworks:
        fireworks_api_key = self._get_api_key()
        return Fireworks(api_key=fireworks_api_key)

+    def _get_openai_client(self) -> AsyncOpenAI:
+        """Construct an ``AsyncOpenAI`` client for the Fireworks base URL.
+
+        A new client is built on every call, using the key returned by
+        ``_get_api_key()``.
+        """
+        return AsyncOpenAI(base_url=self._get_base_url(), api_key=self._get_api_key())
+
    async def completion(
        self,
        model_id: str,
@ -268,3 +284,114 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv

        embeddings = [data.embedding for data in response.data]
        return EmbeddingsResponse(embeddings=embeddings)
+
+    async def openai_completion(
+        self,
+        model: str,
+        prompt: Union[str, List[str], List[int], List[List[int]]],
+        best_of: Optional[int] = None,
+        echo: Optional[bool] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+        guided_choice: Optional[List[str]] = None,
+        prompt_logprobs: Optional[int] = None,
+    ) -> OpenAICompletion:
+        """Send an OpenAI-style text completion request to Fireworks.
+
+        ``model`` is looked up in the model store and its ``provider_resource_id``
+        is what gets sent. ``guided_choice`` and ``prompt_logprobs`` are accepted
+        by the signature but are not forwarded to the request.
+        """
+        model_obj = await self.model_store.get_model(model)
+
+        # Fireworks prepends BOS on its own, so strip a leading one from string prompts.
+        if isinstance(prompt, str):
+            prompt = prompt.removeprefix("<|begin_of_text|>")
+
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id,
+            prompt=prompt,
+            best_of=best_of,
+            echo=echo,
+            frequency_penalty=frequency_penalty,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_tokens=max_tokens,
+            n=n,
+            presence_penalty=presence_penalty,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            top_p=top_p,
+            user=user,
+        )
+
+        client = self._get_openai_client()
+        return await client.completions.create(**params)
+
+    async def openai_chat_completion(
+        self,
+        model: str,
+        messages: List[OpenAIMessageParam],
+        frequency_penalty: Optional[float] = None,
+        function_call: Optional[Union[str, Dict[str, Any]]] = None,
+        functions: Optional[List[Dict[str, Any]]] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_completion_tokens: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        parallel_tool_calls: Optional[bool] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[OpenAIResponseFormatParam] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        """Handle an OpenAI-style chat completion request for a Fireworks model.
+
+        The request parameters are prepared once (without ``model``) and then sent
+        down one of two paths:
+
+        * if ``get_llama_model`` recognises the provider model id, the call is
+          delegated to ``OpenAIChatCompletionToLlamaStackMixin`` using the
+          caller-supplied ``model`` identifier;
+        * otherwise the request goes to the Fireworks OpenAI client using the
+          model's ``provider_resource_id``.
+        """
+        model_obj = await self.model_store.get_model(model)
+        # NOTE: ``model`` is deliberately left out here; each branch below supplies its own.
+        params = await prepare_openai_completion_params(
+            messages=messages,
+            frequency_penalty=frequency_penalty,
+            function_call=function_call,
+            functions=functions,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_completion_tokens=max_completion_tokens,
+            max_tokens=max_tokens,
+            n=n,
+            parallel_tool_calls=parallel_tool_calls,
+            presence_penalty=presence_penalty,
+            response_format=response_format,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            tool_choice=tool_choice,
+            tools=tools,
+            top_logprobs=top_logprobs,
+            top_p=top_p,
+            user=user,
+        )
+
+        # Divert Llama Models through Llama Stack inference APIs because
+        # Fireworks chat completions OpenAI-compatible API does not support
+        # tool calls properly.
+        llama_model = self.get_llama_model(model_obj.provider_resource_id)
+        if llama_model:
+            return await OpenAIChatCompletionToLlamaStackMixin.openai_chat_completion(self, model=model, **params)
+
+        return await self._get_openai_client().chat.completions.create(model=model_obj.provider_resource_id, **params)
--- a/llama_stack/providers/remote/inference/fireworks/models.py
+++ b/llama_stack/providers/remote/inference/fireworks/models.py
@ -5,7 +5,7 @@
 # the root directory of this source tree.

 from llama_stack.apis.models.models import ModelType
-from llama_stack.models.llama.datatypes import CoreModelId
+from llama_stack.models.llama.sku_types import CoreModelId
 from llama_stack.providers.utils.inference.model_registry import (
    ProviderModelEntry,
    build_hf_repo_model_entry,
@ -48,6 +48,14 @@ MODEL_ENTRIES = [
        "accounts/fireworks/models/llama-guard-3-11b-vision",
        CoreModelId.llama_guard_3_11b_vision.value,
    ),
+    build_hf_repo_model_entry(
+        "accounts/fireworks/models/llama4-scout-instruct-basic",
+        CoreModelId.llama4_scout_17b_16e_instruct.value,
+    ),
+    build_hf_repo_model_entry(
+        "accounts/fireworks/models/llama4-maverick-instruct-basic",
+        CoreModelId.llama4_maverick_17b_128e_instruct.value,
+    ),
    ProviderModelEntry(
        provider_model_id="nomic-ai/nomic-embed-text-v1.5",
        model_type=ModelType.embedding,
--- a/llama_stack/providers/remote/inference/fireworks_openai_compat/init.py
+++ b/llama_stack/providers/remote/inference/fireworks_openai_compat/init.py
@ -0,0 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.inference import Inference
+
+from .config import FireworksCompatConfig
+
+
+async def get_adapter_impl(config: FireworksCompatConfig, _deps) -> Inference:
+    """Build the Fireworks OpenAI-compat inference adapter for ``config``.
+
+    ``_deps`` is accepted but not used by this factory.
+    """
+    # Deferred import: the adapter module is only loaded when an adapter is requested.
+    from .fireworks import FireworksCompatInferenceAdapter
+
+    return FireworksCompatInferenceAdapter(config)
--- a/llama_stack/providers/remote/inference/fireworks_openai_compat/config.py
+++ b/llama_stack/providers/remote/inference/fireworks_openai_compat/config.py
@ -0,0 +1,38 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any, Dict, Optional
+
+from pydantic import BaseModel, Field
+
+from llama_stack.schema_utils import json_schema_type
+
+
+class FireworksProviderDataValidator(BaseModel):
+    """Pydantic model with a single optional ``fireworks_api_key`` field."""
+
+    fireworks_api_key: Optional[str] = Field(default=None, description="API key for Fireworks models")
+
+
+@json_schema_type
+class FireworksCompatConfig(BaseModel):
+    """Settings for the Fireworks OpenAI-compat adapter: optional API key plus API base URL."""
+
+    api_key: Optional[str] = Field(default=None, description="The Fireworks API key")
+    openai_compat_api_base: str = Field(
+        default="https://api.fireworks.ai/inference/v1",
+        description="The URL for the Fireworks API server",
+    )
+
+    @classmethod
+    def sample_run_config(cls, api_key: str = "${env.FIREWORKS_API_KEY}", **kwargs) -> Dict[str, Any]:
+        """Return a sample run-config mapping with the default base URL and ``api_key``.
+
+        Additional keyword arguments are accepted and ignored.
+        """
+        sample: Dict[str, Any] = {"openai_compat_api_base": "https://api.fireworks.ai/inference/v1"}
+        sample["api_key"] = api_key
+        return sample
--- a/llama_stack/providers/remote/inference/fireworks_openai_compat/fireworks.py
+++ b/llama_stack/providers/remote/inference/fireworks_openai_compat/fireworks.py
@ -0,0 +1,30 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.providers.remote.inference.fireworks_openai_compat.config import FireworksCompatConfig
+from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
+
+from ..fireworks.models import MODEL_ENTRIES
+
+
+class FireworksCompatInferenceAdapter(LiteLLMOpenAIMixin):
+    """Inference adapter configured from a ``FireworksCompatConfig``.
+
+    The class adds no request logic of its own: it hands the Fireworks model
+    entries, the configured API key, the ``fireworks_api_key`` provider-data field
+    name and the OpenAI-compat base URL to ``LiteLLMOpenAIMixin``.
+    """
+
+    # ``__init__`` stores the configuration as ``self.config``; annotate that name
+    # (the earlier ``_config`` annotation referred to an attribute that was never set).
+    config: FireworksCompatConfig
+
+    def __init__(self, config: FireworksCompatConfig):
+        LiteLLMOpenAIMixin.__init__(
+            self,
+            model_entries=MODEL_ENTRIES,
+            api_key_from_config=config.api_key,
+            provider_data_api_key_field="fireworks_api_key",
+            openai_compat_api_base=config.openai_compat_api_base,
+        )
+        self.config = config
+
+    async def initialize(self):
+        """Delegate initialization to the mixin."""
+        await super().initialize()
+
+    async def shutdown(self):
+        """Delegate shutdown to the mixin."""
+        await super().shutdown()
--- a/llama_stack/providers/remote/inference/groq/groq.py
+++ b/llama_stack/providers/remote/inference/groq/groq.py
@ -4,8 +4,24 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from typing import Any, AsyncIterator, Dict, List, Optional, Union
+
+from openai import AsyncOpenAI
+
+from llama_stack.apis.inference.inference import (
+    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
+    OpenAIChoiceDelta,
+    OpenAIChunkChoice,
+    OpenAIMessageParam,
+    OpenAIResponseFormatParam,
+    OpenAISystemMessageParam,
+)
 from llama_stack.providers.remote.inference.groq.config import GroqConfig
 from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
+from llama_stack.providers.utils.inference.openai_compat import (
+    prepare_openai_completion_params,
+)

 from .models import MODEL_ENTRIES

@ -21,9 +37,129 @@ class GroqInferenceAdapter(LiteLLMOpenAIMixin):
            provider_data_api_key_field="groq_api_key",
        )
        self.config = config
+        self._openai_client = None

    async def initialize(self):
        await super().initialize()

    async def shutdown(self):
        await super().shutdown()
+        # Close the cached OpenAI client (if one was created) and clear the
+        # cache so a later _get_openai_client() call builds a fresh client.
+        if self._openai_client:
+            await self._openai_client.close()
+            self._openai_client = None
+
+    def _get_openai_client(self) -> AsyncOpenAI:
+        """Return the cached ``AsyncOpenAI`` client, creating it on first use.
+
+        The client targets ``{config.url}/openai/v1`` and uses the API key from
+        the adapter configuration.
+        """
+        if self._openai_client:
+            return self._openai_client
+
+        client = AsyncOpenAI(
+            base_url=f"{self.config.url}/openai/v1",
+            api_key=self.config.api_key,
+        )
+        self._openai_client = client
+        return client
+
+    async def openai_chat_completion(
+        self,
+        model: str,
+        messages: List[OpenAIMessageParam],
+        frequency_penalty: Optional[float] = None,
+        function_call: Optional[Union[str, Dict[str, Any]]] = None,
+        functions: Optional[List[Dict[str, Any]]] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_completion_tokens: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        parallel_tool_calls: Optional[bool] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[OpenAIResponseFormatParam] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        """Run an OpenAI-style chat completion through Groq's OpenAI endpoint.
+
+        Groq-specific adjustments applied before the request is sent:
+
+        * a ``json_schema`` response format is converted to ``json_object`` and the
+          schema is described in the leading system message instead;
+        * when tools are supplied and ``tool_choice`` is unset or ``"auto"``, it is
+          forced to ``"required"``;
+        * a streaming request that also sets ``response_format`` is sent
+          non-streaming and the complete response is replayed as a single chunk.
+
+        The caller's ``messages`` list and ``response_format`` object are not
+        modified; adjusted copies are sent instead.
+
+        Returns the client response as-is, or, for the replayed-stream case, an
+        async generator yielding one ``OpenAIChatCompletionChunk``.
+        """
+        model_obj = await self.model_store.get_model(model)
+
+        # Groq does not support json_schema response format, so we need to convert it to json_object
+        if response_format and response_format.type == "json_schema":
+            schema = response_format.json_schema.get("schema", {})
+            # Work on copies: mutating the caller's objects would leak this
+            # workaround into their request (and re-append the instructions if
+            # the same objects are sent again).
+            response_format = response_format.model_copy(update={"type": "json_object", "json_schema": None})
+            json_instructions = f"\nYour response should be a JSON object that matches the following schema: {schema}"
+            messages = list(messages)
+            if messages and messages[0].role == "system":
+                messages[0] = messages[0].model_copy(update={"content": messages[0].content + json_instructions})
+            else:
+                messages.insert(0, OpenAISystemMessageParam(content=json_instructions))
+
+        # Groq returns a 400 error if tools are provided but none are called
+        # So, set tool_choice to "required" to attempt to force a call
+        if tools and (not tool_choice or tool_choice == "auto"):
+            tool_choice = "required"
+
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id.replace("groq/", ""),
+            messages=messages,
+            frequency_penalty=frequency_penalty,
+            function_call=function_call,
+            functions=functions,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_completion_tokens=max_completion_tokens,
+            max_tokens=max_tokens,
+            n=n,
+            parallel_tool_calls=parallel_tool_calls,
+            presence_penalty=presence_penalty,
+            response_format=response_format,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            tool_choice=tool_choice,
+            tools=tools,
+            top_logprobs=top_logprobs,
+            top_p=top_p,
+            user=user,
+        )
+
+        # Groq does not support streaming requests that set response_format
+        fake_stream = False
+        if stream and response_format:
+            params["stream"] = False
+            fake_stream = True
+
+        response = await self._get_openai_client().chat.completions.create(**params)
+
+        if fake_stream:
+            # Repackage every choice of the complete response as a chunk choice
+            # so the caller still receives a stream-shaped result.
+            chunk_choices = []
+            for choice in response.choices:
+                delta = OpenAIChoiceDelta(
+                    content=choice.message.content,
+                    role=choice.message.role,
+                    tool_calls=choice.message.tool_calls,
+                )
+                chunk_choice = OpenAIChunkChoice(
+                    delta=delta,
+                    finish_reason=choice.finish_reason,
+                    index=choice.index,
+                    logprobs=None,
+                )
+                chunk_choices.append(chunk_choice)
+            chunk = OpenAIChatCompletionChunk(
+                id=response.id,
+                choices=chunk_choices,
+                object="chat.completion.chunk",
+                created=response.created,
+                model=response.model,
+            )
+
+            async def _fake_stream_generator():
+                yield chunk
+
+            return _fake_stream_generator()
+        else:
+            return response
--- a/llama_stack/providers/remote/inference/groq/models.py
+++ b/llama_stack/providers/remote/inference/groq/models.py
@ -35,4 +35,20 @@ MODEL_ENTRIES = [
        "groq/llama-3.2-3b-preview",
        CoreModelId.llama3_2_3b_instruct.value,
    ),
+    build_hf_repo_model_entry(
+        "groq/llama-4-scout-17b-16e-instruct",
+        CoreModelId.llama4_scout_17b_16e_instruct.value,
+    ),
+    build_hf_repo_model_entry(
+        "groq/meta-llama/llama-4-scout-17b-16e-instruct",
+        CoreModelId.llama4_scout_17b_16e_instruct.value,
+    ),
+    build_hf_repo_model_entry(
+        "groq/llama-4-maverick-17b-128e-instruct",
+        CoreModelId.llama4_maverick_17b_128e_instruct.value,
+    ),
+    build_hf_repo_model_entry(
+        "groq/meta-llama/llama-4-maverick-17b-128e-instruct",
+        CoreModelId.llama4_maverick_17b_128e_instruct.value,
+    ),
 ]
--- a/llama_stack/providers/remote/inference/groq_openai_compat/init.py
+++ b/llama_stack/providers/remote/inference/groq_openai_compat/init.py
@ -0,0 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.inference import Inference
+
+from .config import GroqCompatConfig
+
+
+async def get_adapter_impl(config: GroqCompatConfig, _deps) -> Inference:
+    """Build the Groq OpenAI-compat inference adapter for ``config``.
+
+    ``_deps`` is accepted but not used by this factory.
+    """
+    # Deferred import: the adapter module is only loaded when an adapter is requested.
+    from .groq import GroqCompatInferenceAdapter
+
+    return GroqCompatInferenceAdapter(config)
--- a/llama_stack/providers/remote/inference/groq_openai_compat/config.py
+++ b/llama_stack/providers/remote/inference/groq_openai_compat/config.py
@ -0,0 +1,38 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any, Dict, Optional
+
+from pydantic import BaseModel, Field
+
+from llama_stack.schema_utils import json_schema_type
+
+
+class GroqProviderDataValidator(BaseModel):
+    """Pydantic model with a single optional ``groq_api_key`` field."""
+
+    groq_api_key: Optional[str] = Field(default=None, description="API key for Groq models")
+
+
+@json_schema_type
+class GroqCompatConfig(BaseModel):
+    """Settings for the Groq OpenAI-compat adapter: optional API key plus API base URL."""
+
+    api_key: Optional[str] = Field(default=None, description="The Groq API key")
+    openai_compat_api_base: str = Field(
+        default="https://api.groq.com/openai/v1",
+        description="The URL for the Groq API server",
+    )
+
+    @classmethod
+    def sample_run_config(cls, api_key: str = "${env.GROQ_API_KEY}", **kwargs) -> Dict[str, Any]:
+        """Return a sample run-config mapping with the default base URL and ``api_key``.
+
+        Additional keyword arguments are accepted and ignored.
+        """
+        sample: Dict[str, Any] = {"openai_compat_api_base": "https://api.groq.com/openai/v1"}
+        sample["api_key"] = api_key
+        return sample
--- a/llama_stack/providers/remote/inference/groq_openai_compat/groq.py
+++ b/llama_stack/providers/remote/inference/groq_openai_compat/groq.py
@ -0,0 +1,30 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.providers.remote.inference.groq_openai_compat.config import GroqCompatConfig
+from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
+
+from ..groq.models import MODEL_ENTRIES
+
+
+class GroqCompatInferenceAdapter(LiteLLMOpenAIMixin):
+    """Inference adapter configured from a ``GroqCompatConfig``.
+
+    The class adds no request logic of its own: it hands the Groq model entries,
+    the configured API key, the ``groq_api_key`` provider-data field name and the
+    OpenAI-compat base URL to ``LiteLLMOpenAIMixin``.
+    """
+
+    # ``__init__`` stores the configuration as ``self.config``; annotate that name
+    # (the earlier ``_config`` annotation referred to an attribute that was never set).
+    config: GroqCompatConfig
+
+    def __init__(self, config: GroqCompatConfig):
+        LiteLLMOpenAIMixin.__init__(
+            self,
+            model_entries=MODEL_ENTRIES,
+            api_key_from_config=config.api_key,
+            provider_data_api_key_field="groq_api_key",
+            openai_compat_api_base=config.openai_compat_api_base,
+        )
+        self.config = config
+
+    async def initialize(self):
+        """Delegate initialization to the mixin."""
+        await super().initialize()
+
+    async def shutdown(self):
+        """Delegate shutdown to the mixin."""
+        await super().shutdown()
--- a/llama_stack/providers/remote/inference/nvidia/models.py
+++ b/llama_stack/providers/remote/inference/nvidia/models.py
@ -5,7 +5,7 @@
 # the root directory of this source tree.

 from llama_stack.apis.models import ModelType
-from llama_stack.models.llama.datatypes import CoreModelId
+from llama_stack.models.llama.sku_types import CoreModelId
 from llama_stack.providers.utils.inference.model_registry import (
    ProviderModelEntry,
    build_hf_repo_model_entry,
--- a/llama_stack/providers/remote/inference/nvidia/nvidia.py
+++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py
@ -7,7 +7,7 @@
 import logging
 import warnings
 from functools import lru_cache
-from typing import AsyncIterator, List, Optional, Union
+from typing import Any, AsyncIterator, Dict, List, Optional, Union

 from openai import APIConnectionError, AsyncOpenAI, BadRequestError

@ -29,21 +29,27 @@ from llama_stack.apis.inference import (
    LogProbConfig,
    Message,
    ResponseFormat,
+    SamplingParams,
    TextTruncation,
    ToolChoice,
    ToolConfig,
-)
-from llama_stack.models.llama.datatypes import (
-    SamplingParams,
    ToolDefinition,
-    ToolPromptFormat,
 )
+from llama_stack.apis.inference.inference import (
+    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
+    OpenAICompletion,
+    OpenAIMessageParam,
+    OpenAIResponseFormatParam,
+)
+from llama_stack.models.llama.datatypes import ToolPromptFormat
 from llama_stack.providers.utils.inference.model_registry import (
    ModelRegistryHelper,
 )
 from llama_stack.providers.utils.inference.openai_compat import (
    convert_openai_chat_completion_choice,
    convert_openai_chat_completion_stream,
+    prepare_openai_completion_params,
 )
 from llama_stack.providers.utils.inference.prompt_adapter import content_has_media

@ -265,3 +271,111 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
        else:
            # we pass n=1 to get only one completion
            return convert_openai_chat_completion_choice(response.choices[0])
+
+    async def openai_completion(
+        self,
+        model: str,
+        prompt: Union[str, List[str], List[int], List[List[int]]],
+        best_of: Optional[int] = None,
+        echo: Optional[bool] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+        guided_choice: Optional[List[str]] = None,
+        prompt_logprobs: Optional[int] = None,
+    ) -> OpenAICompletion:
+        """Send an OpenAI-style text completion request to the NVIDIA endpoint.
+
+        ``model`` is mapped to a provider model id, which is used both in the
+        request and to select the client. ``guided_choice`` and
+        ``prompt_logprobs`` are accepted by the signature but are not forwarded.
+
+        Raises:
+            ConnectionError: if the client raises ``APIConnectionError``.
+        """
+        provider_model_id = self.get_provider_model_id(model)
+
+        params = await prepare_openai_completion_params(
+            model=provider_model_id,
+            prompt=prompt,
+            best_of=best_of,
+            echo=echo,
+            frequency_penalty=frequency_penalty,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_tokens=max_tokens,
+            n=n,
+            presence_penalty=presence_penalty,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            top_p=top_p,
+            user=user,
+        )
+
+        try:
+            return await self._get_client(provider_model_id).completions.create(**params)
+        except APIConnectionError as e:
+            # Re-raise as the builtin ConnectionError, naming the configured URL.
+            raise ConnectionError(f"Failed to connect to NVIDIA NIM at {self._config.url}: {e}") from e
+
+    async def openai_chat_completion(
+        self,
+        model: str,
+        messages: List[OpenAIMessageParam],
+        frequency_penalty: Optional[float] = None,
+        function_call: Optional[Union[str, Dict[str, Any]]] = None,
+        functions: Optional[List[Dict[str, Any]]] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_completion_tokens: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        parallel_tool_calls: Optional[bool] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[OpenAIResponseFormatParam] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        """Send an OpenAI-style chat completion request to the NVIDIA endpoint.
+
+        ``model`` is mapped to a provider model id, which is used both in the
+        request and to select the client; every other argument is passed through
+        ``prepare_openai_completion_params`` into the request.
+
+        Raises:
+            ConnectionError: if the client raises ``APIConnectionError``.
+        """
+        provider_model_id = self.get_provider_model_id(model)
+
+        params = await prepare_openai_completion_params(
+            model=provider_model_id,
+            messages=messages,
+            frequency_penalty=frequency_penalty,
+            function_call=function_call,
+            functions=functions,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_completion_tokens=max_completion_tokens,
+            max_tokens=max_tokens,
+            n=n,
+            parallel_tool_calls=parallel_tool_calls,
+            presence_penalty=presence_penalty,
+            response_format=response_format,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            tool_choice=tool_choice,
+            tools=tools,
+            top_logprobs=top_logprobs,
+            top_p=top_p,
+            user=user,
+        )
+
+        try:
+            return await self._get_client(provider_model_id).chat.completions.create(**params)
+        except APIConnectionError as e:
+            # Re-raise as the builtin ConnectionError, naming the configured URL.
+            raise ConnectionError(f"Failed to connect to NVIDIA NIM at {self._config.url}: {e}") from e
--- a/llama_stack/providers/remote/inference/nvidia/openai_utils.py
+++ b/llama_stack/providers/remote/inference/nvidia/openai_utils.py
@ -19,11 +19,9 @@ from llama_stack.apis.inference import (
    CompletionRequest,
    CompletionResponse,
    CompletionResponseStreamChunk,
+    GreedySamplingStrategy,
    JsonSchemaResponseFormat,
    TokenLogProbs,
-)
-from llama_stack.models.llama.datatypes import (
-    GreedySamplingStrategy,
    TopKSamplingStrategy,
    TopPSamplingStrategy,
 )
--- a/llama_stack/providers/remote/inference/ollama/models.py
+++ b/llama_stack/providers/remote/inference/ollama/models.py
@ -5,7 +5,7 @@
 # the root directory of this source tree.

 from llama_stack.apis.models.models import ModelType
-from llama_stack.models.llama.datatypes import CoreModelId
+from llama_stack.models.llama.sku_types import CoreModelId
 from llama_stack.providers.utils.inference.model_registry import (
    ProviderModelEntry,
    build_hf_repo_model_entry,
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@ -5,10 +5,11 @@
 # the root directory of this source tree.


-from typing import Any, AsyncGenerator, List, Optional, Union
+from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union

 import httpx
 from ollama import AsyncClient
+from openai import AsyncOpenAI

 from llama_stack.apis.common.content_types import (
    ImageContentItem,
@ -38,9 +39,20 @@ from llama_stack.apis.inference import (
    ToolDefinition,
    ToolPromptFormat,
 )
+from llama_stack.apis.inference.inference import (
+    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
+    OpenAICompletion,
+    OpenAIMessageParam,
+    OpenAIResponseFormatParam,
+)
 from llama_stack.apis.models import Model, ModelType
 from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import ModelsProtocolPrivate
+from llama_stack.providers.datatypes import (
+    HealthResponse,
+    HealthStatus,
+    ModelsProtocolPrivate,
+)
 from llama_stack.providers.utils.inference.model_registry import (
    ModelRegistryHelper,
 )
@ -67,7 +79,10 @@ from .models import model_entries
 logger = get_logger(name=__name__, category="inference")


-class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
+class OllamaInferenceAdapter(
+    Inference,
+    ModelsProtocolPrivate,
+):
    def __init__(self, url: str) -> None:
        self.register_helper = ModelRegistryHelper(model_entries)
        self.url = url
@ -76,10 +91,25 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
    def client(self) -> AsyncClient:
        return AsyncClient(host=self.url)

+    @property
+    def openai_client(self) -> AsyncOpenAI:
+        return AsyncOpenAI(base_url=f"{self.url.rstrip('/')}/v1", api_key="ollama")  # tolerate a trailing "/"
+
    async def initialize(self) -> None:
        logger.info(f"checking connectivity to Ollama at `{self.url}`...")
+        await self.health()
+
+    async def health(self) -> HealthResponse:
+        """
+        Performs a health check by verifying connectivity to the Ollama server.
+        This method is used by initialize() and the Provider API to verify that the service is running
+        correctly.
+        Returns:
+            HealthResponse: A dictionary containing the health status.
+        """
        try:
            await self.client.ps()
+            return HealthResponse(status=HealthStatus.OK)
        except httpx.ConnectError as e:
            raise RuntimeError(
                "Ollama Server is not running, start it using `ollama serve` in a separate terminal"
@ -307,17 +337,155 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
        if model.model_type == ModelType.embedding:
            logger.info(f"Pulling embedding model `{model.provider_resource_id}` if necessary...")
            await self.client.pull(model.provider_resource_id)
-            response = await self.client.list()
-        else:
-            response = await self.client.ps()
+        # we use list() here instead of ps() -
+        #  - ps() only lists running models, not available models
+        #  - models not currently running are run by the ollama server as needed
+        response = await self.client.list()
        available_models = [m["model"] for m in response["models"]]
        if model.provider_resource_id not in available_models:
+            available_models_latest = [m["model"].split(":latest")[0] for m in response["models"]]
+            if model.provider_resource_id in available_models_latest:
+                logger.warning(
+                    f"Imprecise provider resource id was used but 'latest' is available in Ollama - using '{model.provider_resource_id}:latest'"
+                )
+                return model
            raise ValueError(
                f"Model '{model.provider_resource_id}' is not available in Ollama. Available models: {', '.join(available_models)}"
            )

        return model

+    async def openai_completion(
+        self,
+        model: str,
+        prompt: Union[str, List[str], List[int], List[List[int]]],
+        best_of: Optional[int] = None,
+        echo: Optional[bool] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+        guided_choice: Optional[List[str]] = None,
+        prompt_logprobs: Optional[int] = None,
+    ) -> OpenAICompletion:
+        if not isinstance(prompt, str):
+            raise ValueError("Ollama does not support non-string prompts for completion")
+
+        model_obj = await self._get_model(model)
+        params = {
+            k: v
+            for k, v in {
+                "model": model_obj.provider_resource_id,
+                "prompt": prompt,
+                "best_of": best_of,
+                "echo": echo,
+                "frequency_penalty": frequency_penalty,
+                "logit_bias": logit_bias,
+                "logprobs": logprobs,
+                "max_tokens": max_tokens,
+                "n": n,
+                "presence_penalty": presence_penalty,
+                "seed": seed,
+                "stop": stop,
+                "stream": stream,
+                "stream_options": stream_options,
+                "temperature": temperature,
+                "top_p": top_p,
+                "user": user,
+            }.items()
+            if v is not None
+        }
+        return await self.openai_client.completions.create(**params)  # type: ignore
+
+    async def openai_chat_completion(
+        self,
+        model: str,
+        messages: List[OpenAIMessageParam],
+        frequency_penalty: Optional[float] = None,
+        function_call: Optional[Union[str, Dict[str, Any]]] = None,
+        functions: Optional[List[Dict[str, Any]]] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_completion_tokens: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        parallel_tool_calls: Optional[bool] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[OpenAIResponseFormatParam] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        model_obj = await self._get_model(model)
+        params = {
+            k: v
+            for k, v in {
+                "model": model_obj.provider_resource_id,
+                "messages": messages,
+                "frequency_penalty": frequency_penalty,
+                "function_call": function_call,
+                "functions": functions,
+                "logit_bias": logit_bias,
+                "logprobs": logprobs,
+                "max_completion_tokens": max_completion_tokens,
+                "max_tokens": max_tokens,
+                "n": n,
+                "parallel_tool_calls": parallel_tool_calls,
+                "presence_penalty": presence_penalty,
+                "response_format": response_format,
+                "seed": seed,
+                "stop": stop,
+                "stream": stream,
+                "stream_options": stream_options,
+                "temperature": temperature,
+                "tool_choice": tool_choice,
+                "tools": tools,
+                "top_logprobs": top_logprobs,
+                "top_p": top_p,
+                "user": user,
+            }.items()
+            if v is not None
+        }
+        return await self.openai_client.chat.completions.create(**params)  # type: ignore
+
+    async def batch_completion(
+        self,
+        model_id: str,
+        content_batch: List[InterleavedContent],
+        sampling_params: Optional[SamplingParams] = None,
+        response_format: Optional[ResponseFormat] = None,
+        logprobs: Optional[LogProbConfig] = None,
+    ):
+        raise NotImplementedError("Batch completion is not supported for Ollama")
+
+    async def batch_chat_completion(
+        self,
+        model_id: str,
+        messages_batch: List[List[Message]],
+        sampling_params: Optional[SamplingParams] = None,
+        tools: Optional[List[ToolDefinition]] = None,
+        tool_config: Optional[ToolConfig] = None,
+        response_format: Optional[ResponseFormat] = None,
+        logprobs: Optional[LogProbConfig] = None,
+    ):
+        raise NotImplementedError("Batch chat completion is not supported for Ollama")
+

 async def convert_message_to_openai_dict_for_ollama(message: Message) -> List[dict]:
    async def _convert_content(content) -> dict:
--- a/llama_stack/providers/remote/inference/passthrough/passthrough.py
+++ b/llama_stack/providers/remote/inference/passthrough/passthrough.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, AsyncGenerator, Dict, List, Optional
+from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union

 from llama_stack_client import AsyncLlamaStackClient

@ -26,9 +26,17 @@ from llama_stack.apis.inference import (
    ToolDefinition,
    ToolPromptFormat,
 )
+from llama_stack.apis.inference.inference import (
+    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
+    OpenAICompletion,
+    OpenAIMessageParam,
+    OpenAIResponseFormatParam,
+)
 from llama_stack.apis.models import Model
 from llama_stack.distribution.library_client import convert_pydantic_to_json_value, convert_to_pydantic
 from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
+from llama_stack.providers.utils.inference.openai_compat import prepare_openai_completion_params

 from .config import PassthroughImplConfig

@ -201,6 +209,112 @@ class PassthroughInferenceAdapter(Inference):
            task_type=task_type,
        )

+    async def openai_completion(
+        self,
+        model: str,
+        prompt: Union[str, List[str], List[int], List[List[int]]],
+        best_of: Optional[int] = None,
+        echo: Optional[bool] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+        guided_choice: Optional[List[str]] = None,
+        prompt_logprobs: Optional[int] = None,
+    ) -> OpenAICompletion:
+        client = self._get_client()
+        model_obj = await self.model_store.get_model(model)
+
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id,
+            prompt=prompt,
+            best_of=best_of,
+            echo=echo,
+            frequency_penalty=frequency_penalty,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_tokens=max_tokens,
+            n=n,
+            presence_penalty=presence_penalty,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            top_p=top_p,
+            user=user,
+            guided_choice=guided_choice,
+            prompt_logprobs=prompt_logprobs,
+        )
+
+        return await client.inference.openai_completion(**params)
+
+    async def openai_chat_completion(
+        self,
+        model: str,
+        messages: List[OpenAIMessageParam],
+        frequency_penalty: Optional[float] = None,
+        function_call: Optional[Union[str, Dict[str, Any]]] = None,
+        functions: Optional[List[Dict[str, Any]]] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_completion_tokens: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        parallel_tool_calls: Optional[bool] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[OpenAIResponseFormatParam] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        client = self._get_client()
+        model_obj = await self.model_store.get_model(model)
+
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id,
+            messages=messages,
+            frequency_penalty=frequency_penalty,
+            function_call=function_call,
+            functions=functions,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_completion_tokens=max_completion_tokens,
+            max_tokens=max_tokens,
+            n=n,
+            parallel_tool_calls=parallel_tool_calls,
+            presence_penalty=presence_penalty,
+            response_format=response_format,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            tool_choice=tool_choice,
+            tools=tools,
+            top_logprobs=top_logprobs,
+            top_p=top_p,
+            user=user,
+        )
+
+        return await client.inference.openai_chat_completion(**params)
+
    def cast_value_to_json_dict(self, request_params: Dict[str, Any]) -> Dict[str, Any]:
        json_params = {}
        for key, value in request_params.items():
--- a/llama_stack/providers/remote/inference/runpod/runpod.py
+++ b/llama_stack/providers/remote/inference/runpod/runpod.py
@ -12,6 +12,8 @@ from llama_stack.apis.inference import *  # noqa: F403
 # from llama_stack.providers.datatypes import ModelsProtocolPrivate
 from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
 from llama_stack.providers.utils.inference.openai_compat import (
+    OpenAIChatCompletionToLlamaStackMixin,
+    OpenAICompletionToLlamaStackMixin,
    get_sampling_options,
    process_chat_completion_response,
    process_chat_completion_stream_response,
@ -38,7 +40,12 @@ RUNPOD_SUPPORTED_MODELS = {
 }


-class RunpodInferenceAdapter(ModelRegistryHelper, Inference):
+class RunpodInferenceAdapter(
+    ModelRegistryHelper,
+    Inference,
+    OpenAIChatCompletionToLlamaStackMixin,
+    OpenAICompletionToLlamaStackMixin,
+):
    def __init__(self, config: RunpodImplConfig) -> None:
        ModelRegistryHelper.__init__(self, stack_to_provider_models_map=RUNPOD_SUPPORTED_MODELS)
        self.config = config
--- a/llama_stack/providers/remote/inference/sambanova/models.py
+++ b/llama_stack/providers/remote/inference/sambanova/models.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.models.llama.datatypes import CoreModelId
+from llama_stack.models.llama.sku_types import CoreModelId
 from llama_stack.providers.utils.inference.model_registry import (
    build_hf_repo_model_entry,
 )
@ -46,4 +46,8 @@ MODEL_ENTRIES = [
        "Meta-Llama-Guard-3-8B",
        CoreModelId.llama_guard_3_8b.value,
    ),
+    build_hf_repo_model_entry(
+        "Llama-4-Scout-17B-16E-Instruct",
+        CoreModelId.llama4_scout_17b_16e_instruct.value,
+    ),
 ]
--- a/llama_stack/providers/remote/inference/sambanova/sambanova.py
+++ b/llama_stack/providers/remote/inference/sambanova/sambanova.py
@ -21,6 +21,7 @@ from llama_stack.apis.inference import (
    CompletionMessage,
    EmbeddingsResponse,
    EmbeddingTaskType,
+    GreedySamplingStrategy,
    Inference,
    LogProbConfig,
    Message,
@ -35,15 +36,14 @@ from llama_stack.apis.inference import (
    ToolDefinition,
    ToolPromptFormat,
    ToolResponseMessage,
-    UserMessage,
-)
-from llama_stack.models.llama.datatypes import (
-    GreedySamplingStrategy,
    TopKSamplingStrategy,
    TopPSamplingStrategy,
+    UserMessage,
 )
 from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
 from llama_stack.providers.utils.inference.openai_compat import (
+    OpenAIChatCompletionToLlamaStackMixin,
+    OpenAICompletionToLlamaStackMixin,
    process_chat_completion_stream_response,
 )
 from llama_stack.providers.utils.inference.prompt_adapter import (
@ -54,7 +54,12 @@ from .config import SambaNovaImplConfig
 from .models import MODEL_ENTRIES


-class SambaNovaInferenceAdapter(ModelRegistryHelper, Inference):
+class SambaNovaInferenceAdapter(
+    ModelRegistryHelper,
+    Inference,
+    OpenAIChatCompletionToLlamaStackMixin,
+    OpenAICompletionToLlamaStackMixin,
+):
    def __init__(self, config: SambaNovaImplConfig) -> None:
        ModelRegistryHelper.__init__(self, model_entries=MODEL_ENTRIES)
        self.config = config
--- a/llama_stack/providers/remote/inference/sambanova_openai_compat/__init__.py
+++ b/llama_stack/providers/remote/inference/sambanova_openai_compat/__init__.py
@ -0,0 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.inference import Inference
+
+from .config import SambaNovaCompatConfig
+
+
+async def get_adapter_impl(config: SambaNovaCompatConfig, _deps) -> Inference:
+    # import dynamically so the import is used only when it is needed
+    from .sambanova import SambaNovaCompatInferenceAdapter
+
+    adapter = SambaNovaCompatInferenceAdapter(config)
+    return adapter
--- a/llama_stack/providers/remote/inference/sambanova_openai_compat/config.py
+++ b/llama_stack/providers/remote/inference/sambanova_openai_compat/config.py
@ -0,0 +1,38 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any, Dict, Optional
+
+from pydantic import BaseModel, Field
+
+from llama_stack.schema_utils import json_schema_type
+
+
+class SambaNovaProviderDataValidator(BaseModel):
+    sambanova_api_key: Optional[str] = Field(
+        default=None,
+        description="API key for SambaNova models",
+    )
+
+
+@json_schema_type
+class SambaNovaCompatConfig(BaseModel):
+    api_key: Optional[str] = Field(
+        default=None,
+        description="The SambaNova API key",
+    )
+
+    openai_compat_api_base: str = Field(
+        default="https://api.sambanova.ai/v1",
+        description="The URL for the SambaNova API server",
+    )
+
+    @classmethod
+    def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY}", **kwargs) -> Dict[str, Any]:
+        return {
+            "openai_compat_api_base": "https://api.sambanova.ai/v1",
+            "api_key": api_key,
+        }
--- a/llama_stack/providers/remote/inference/sambanova_openai_compat/sambanova.py
+++ b/llama_stack/providers/remote/inference/sambanova_openai_compat/sambanova.py
@ -0,0 +1,30 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.providers.remote.inference.sambanova_openai_compat.config import SambaNovaCompatConfig
+from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
+
+from ..sambanova.models import MODEL_ENTRIES
+
+
+class SambaNovaCompatInferenceAdapter(LiteLLMOpenAIMixin):
+    _config: SambaNovaCompatConfig
+
+    def __init__(self, config: SambaNovaCompatConfig):
+        LiteLLMOpenAIMixin.__init__(
+            self,
+            model_entries=MODEL_ENTRIES,
+            api_key_from_config=config.api_key,
+            provider_data_api_key_field="sambanova_api_key",
+            openai_compat_api_base=config.openai_compat_api_base,
+        )
+        self.config = config
+
+    async def initialize(self):
+        await super().initialize()
+
+    async def shutdown(self):
+        await super().shutdown()
--- a/llama_stack/providers/remote/inference/tgi/tgi.py
+++ b/llama_stack/providers/remote/inference/tgi/tgi.py
@ -40,8 +40,10 @@ from llama_stack.providers.utils.inference.model_registry import (
    build_hf_repo_model_entry,
 )
 from llama_stack.providers.utils.inference.openai_compat import (
+    OpenAIChatCompletionToLlamaStackMixin,
    OpenAICompatCompletionChoice,
    OpenAICompatCompletionResponse,
+    OpenAICompletionToLlamaStackMixin,
    get_sampling_options,
    process_chat_completion_response,
    process_chat_completion_stream_response,
@ -69,7 +71,12 @@ def build_hf_repo_model_entries():
    ]


-class _HfAdapter(Inference, ModelsProtocolPrivate):
+class _HfAdapter(
+    Inference,
+    OpenAIChatCompletionToLlamaStackMixin,
+    OpenAICompletionToLlamaStackMixin,
+    ModelsProtocolPrivate,
+):
    client: AsyncInferenceClient
    max_tokens: int
    model_id: str
--- a/llama_stack/providers/remote/inference/together/models.py
+++ b/llama_stack/providers/remote/inference/together/models.py
@ -5,7 +5,7 @@
 # the root directory of this source tree.

 from llama_stack.apis.models.models import ModelType
-from llama_stack.models.llama.datatypes import CoreModelId
+from llama_stack.models.llama.sku_types import CoreModelId
 from llama_stack.providers.utils.inference.model_registry import (
    ProviderModelEntry,
    build_hf_repo_model_entry,
@ -64,4 +64,18 @@ MODEL_ENTRIES = [
            "context_length": 32768,
        },
    ),
+    build_hf_repo_model_entry(
+        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+        CoreModelId.llama4_scout_17b_16e_instruct.value,
+        additional_aliases=[
+            "together/meta-llama/Llama-4-Scout-17B-16E-Instruct",
+        ],
+    ),
+    build_hf_repo_model_entry(
+        "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
+        CoreModelId.llama4_maverick_17b_128e_instruct.value,
+        additional_aliases=[
+            "together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
+        ],
+    ),
 ]
--- a/llama_stack/providers/remote/inference/together/together.py
+++ b/llama_stack/providers/remote/inference/together/together.py
@ -4,8 +4,9 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import AsyncGenerator, List, Optional, Union
+from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union

+from openai import AsyncOpenAI
 from together import AsyncTogether

 from llama_stack.apis.common.content_types import (
@ -30,12 +31,20 @@ from llama_stack.apis.inference import (
    ToolDefinition,
    ToolPromptFormat,
 )
+from llama_stack.apis.inference.inference import (
+    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
+    OpenAICompletion,
+    OpenAIMessageParam,
+    OpenAIResponseFormatParam,
+)
 from llama_stack.distribution.request_headers import NeedsRequestProviderData
 from llama_stack.log import get_logger
 from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
 from llama_stack.providers.utils.inference.openai_compat import (
    convert_message_to_openai_dict,
    get_sampling_options,
+    prepare_openai_completion_params,
    process_chat_completion_response,
    process_chat_completion_stream_response,
    process_completion_response,
@ -60,6 +69,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
        ModelRegistryHelper.__init__(self, MODEL_ENTRIES)
        self.config = config
        self._client = None
+        self._openai_client = None

    async def initialize(self) -> None:
        pass
@ -110,6 +120,15 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
            self._client = AsyncTogether(api_key=together_api_key)
        return self._client

+    def _get_openai_client(self) -> AsyncOpenAI:
+        if not self._openai_client:
+            together_client = self._get_client().client
+            self._openai_client = AsyncOpenAI(
+                base_url=together_client.base_url,
+                api_key=together_client.api_key,
+            )
+        return self._openai_client
+
    async def _nonstream_completion(self, request: CompletionRequest) -> ChatCompletionResponse:
        params = await self._get_params(request)
        client = self._get_client()
@ -118,7 +137,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi

    async def _stream_completion(self, request: CompletionRequest) -> AsyncGenerator:
        params = await self._get_params(request)
-        client = await self._get_client()
+        client = self._get_client()
        stream = await client.completions.create(**params)
        async for chunk in process_completion_stream_response(stream):
            yield chunk
@ -243,3 +262,123 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
        )
        embeddings = [item.embedding for item in r.data]
        return EmbeddingsResponse(embeddings=embeddings)
+
+    async def openai_completion(
+        self,
+        model: str,
+        prompt: Union[str, List[str], List[int], List[List[int]]],
+        best_of: Optional[int] = None,
+        echo: Optional[bool] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+        guided_choice: Optional[List[str]] = None,
+        prompt_logprobs: Optional[int] = None,
+    ) -> OpenAICompletion:
+        model_obj = await self.model_store.get_model(model)
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id,
+            prompt=prompt,
+            best_of=best_of,
+            echo=echo,
+            frequency_penalty=frequency_penalty,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_tokens=max_tokens,
+            n=n,
+            presence_penalty=presence_penalty,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            top_p=top_p,
+            user=user,
+        )
+        return await self._get_openai_client().completions.create(**params)  # type: ignore
+
+    async def openai_chat_completion(
+        self,
+        model: str,
+        messages: List[OpenAIMessageParam],
+        frequency_penalty: Optional[float] = None,
+        function_call: Optional[Union[str, Dict[str, Any]]] = None,
+        functions: Optional[List[Dict[str, Any]]] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_completion_tokens: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        parallel_tool_calls: Optional[bool] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[OpenAIResponseFormatParam] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        model_obj = await self.model_store.get_model(model)
+        params = await prepare_openai_completion_params(
+            model=model_obj.provider_resource_id,
+            messages=messages,
+            frequency_penalty=frequency_penalty,
+            function_call=function_call,
+            functions=functions,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_completion_tokens=max_completion_tokens,
+            max_tokens=max_tokens,
+            n=n,
+            parallel_tool_calls=parallel_tool_calls,
+            presence_penalty=presence_penalty,
+            response_format=response_format,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            tool_choice=tool_choice,
+            tools=tools,
+            top_logprobs=top_logprobs,
+            top_p=top_p,
+            user=user,
+        )
+        if params.get("stream", False):  # an omitted `stream` means non-streaming, as in the OpenAI API
+            return self._stream_openai_chat_completion(params)
+        return await self._get_openai_client().chat.completions.create(**params)  # type: ignore
+
+    async def _stream_openai_chat_completion(self, params: dict) -> AsyncGenerator:
+        # together.ai sometimes adds usage data to the stream, even if include_usage is False
+        # This causes an unexpected final chunk with empty choices array to be sent
+        # to clients that may not handle it gracefully.
+        include_usage = False
+        if params.get("stream_options", None):
+            include_usage = params["stream_options"].get("include_usage", False)
+        stream = await self._get_openai_client().chat.completions.create(**params)
+
+        seen_finish_reason = False
+        async for chunk in stream:
+            # Final usage chunk with no choices that the user didn't request, so discard
+            if not include_usage and seen_finish_reason and len(chunk.choices) == 0:
+                break
+            yield chunk
+            for choice in chunk.choices:
+                if choice.finish_reason:
+                    seen_finish_reason = True
+                    break
--- a/llama_stack/providers/remote/inference/together_openai_compat/__init__.py
+++ b/llama_stack/providers/remote/inference/together_openai_compat/__init__.py
@ -0,0 +1,17 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.inference import Inference
+
+from .config import TogetherCompatConfig
+
+
+async def get_adapter_impl(config: TogetherCompatConfig, _deps) -> Inference:
+    """Build the Together OpenAI-compatible inference adapter for ``config``.
+
+    ``_deps`` is accepted but not used by this function.
+    """
+    # Deferred import: the adapter module is loaded only when an adapter is requested.
+    from .together import TogetherCompatInferenceAdapter
+
+    return TogetherCompatInferenceAdapter(config)
--- a/llama_stack/providers/remote/inference/together_openai_compat/config.py
+++ b/llama_stack/providers/remote/inference/together_openai_compat/config.py
@ -0,0 +1,38 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any, Dict, Optional
+
+from pydantic import BaseModel, Field
+
+from llama_stack.schema_utils import json_schema_type
+
+
+# Pydantic model exposing a single optional `together_api_key` field.
+# NOTE(review): presumably the schema for per-request provider data -- confirm
+# against where this validator is registered.
+class TogetherProviderDataValidator(BaseModel):
+    together_api_key: Optional[str] = Field(
+        default=None,
+        description="API key for Together models",
+    )
+
+
+# Configuration for the Together OpenAI-compatible adapter: an optional API key
+# and the base URL of the Together API server (default https://api.together.xyz/v1).
+@json_schema_type
+class TogetherCompatConfig(BaseModel):
+    api_key: Optional[str] = Field(
+        default=None,
+        description="The Together API key",
+    )
+
+    openai_compat_api_base: str = Field(
+        default="https://api.together.xyz/v1",
+        description="The URL for the Together API server",
+    )
+
+    @classmethod
+    def sample_run_config(cls, api_key: str = "${env.TOGETHER_API_KEY}", **kwargs) -> Dict[str, Any]:
+        """Return a sample run-config mapping for this provider.
+
+        Args:
+            api_key: Value placed under ``api_key``; the default is the literal
+                placeholder string ``${env.TOGETHER_API_KEY}``.
+            **kwargs: Accepted but not used.
+
+        Returns:
+            A dict with ``openai_compat_api_base`` set to the Together URL and
+            ``api_key`` set to ``api_key``.
+        """
+        return {
+            "openai_compat_api_base": "https://api.together.xyz/v1",
+            "api_key": api_key,
+        }
--- a/llama_stack/providers/remote/inference/together_openai_compat/together.py
+++ b/llama_stack/providers/remote/inference/together_openai_compat/together.py
@ -0,0 +1,30 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.providers.remote.inference.together_openai_compat.config import TogetherCompatConfig
+from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
+
+from ..together.models import MODEL_ENTRIES
+
+
+class TogetherCompatInferenceAdapter(LiteLLMOpenAIMixin):
+    """Inference adapter for Together built on ``LiteLLMOpenAIMixin``.
+
+    The mixin is configured with the Together model entries, the API key from
+    the config, the ``together_api_key`` provider-data field and the
+    OpenAI-compatible base URL from the config.
+    """
+
+    # Matches the attribute actually assigned in __init__ (was declared as `_config`).
+    config: TogetherCompatConfig
+
+    def __init__(self, config: TogetherCompatConfig):
+        """Configure the LiteLLM mixin from ``config`` and keep the config on ``self.config``."""
+        LiteLLMOpenAIMixin.__init__(
+            self,
+            model_entries=MODEL_ENTRIES,
+            api_key_from_config=config.api_key,
+            provider_data_api_key_field="together_api_key",
+            openai_compat_api_base=config.openai_compat_api_base,
+        )
+        self.config = config
+
+    async def initialize(self):
+        """Delegate to the mixin's ``initialize``."""
+        await super().initialize()
+
+    async def shutdown(self):
+        """Delegate to the mixin's ``shutdown``."""
+        await super().shutdown()
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@ -5,7 +5,7 @@
 # the root directory of this source tree.
 import json
 import logging
-from typing import Any, AsyncGenerator, List, Optional, Union
+from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union

 import httpx
 from openai import AsyncOpenAI
@ -45,6 +45,12 @@ from llama_stack.apis.inference import (
    ToolDefinition,
    ToolPromptFormat,
 )
+from llama_stack.apis.inference.inference import (
+    OpenAIChatCompletion,
+    OpenAICompletion,
+    OpenAIMessageParam,
+    OpenAIResponseFormatParam,
+)
 from llama_stack.apis.models import Model, ModelType
 from llama_stack.models.llama.datatypes import BuiltinTool, StopReason, ToolCall
 from llama_stack.models.llama.sku_list import all_registered_models
@ -58,6 +64,7 @@ from llama_stack.providers.utils.inference.openai_compat import (
    convert_message_to_openai_dict,
    convert_tool_call,
    get_sampling_options,
+    prepare_openai_completion_params,
    process_chat_completion_stream_response,
    process_completion_response,
    process_completion_stream_response,
@ -418,3 +425,131 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):

        embeddings = [data.embedding for data in response.data]
        return EmbeddingsResponse(embeddings=embeddings)
+
+    async def openai_completion(
+        self,
+        model: str,
+        prompt: Union[str, List[str], List[int], List[List[int]]],
+        best_of: Optional[int] = None,
+        echo: Optional[bool] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+        guided_choice: Optional[List[str]] = None,
+        prompt_logprobs: Optional[int] = None,
+    ) -> OpenAICompletion:
+        """Send an OpenAI-style text completion request through ``self.client``.
+
+        ``guided_choice`` and ``prompt_logprobs`` are not passed as top-level
+        arguments; when set they are forwarded inside ``extra_body``. A negative
+        ``prompt_logprobs`` and an empty ``guided_choice`` are left out.
+        """
+        resolved_model = await self._get_model(model)
+
+        # Candidate extra_body entries; None marks "do not send".
+        vllm_extras: Dict[str, Any] = {
+            "prompt_logprobs": (
+                prompt_logprobs if (prompt_logprobs is not None and prompt_logprobs >= 0) else None
+            ),
+            "guided_choice": guided_choice if guided_choice else None,
+        }
+        extra_body: Dict[str, Any] = {key: value for key, value in vllm_extras.items() if value is not None}
+
+        request_kwargs = await prepare_openai_completion_params(
+            model=resolved_model.provider_resource_id,
+            prompt=prompt,
+            best_of=best_of,
+            echo=echo,
+            frequency_penalty=frequency_penalty,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_tokens=max_tokens,
+            n=n,
+            presence_penalty=presence_penalty,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            top_p=top_p,
+            user=user,
+            extra_body=extra_body,
+        )
+        return await self.client.completions.create(**request_kwargs)  # type: ignore
+
+    async def openai_chat_completion(
+        self,
+        model: str,
+        messages: List[OpenAIMessageParam],
+        frequency_penalty: Optional[float] = None,
+        function_call: Optional[Union[str, Dict[str, Any]]] = None,
+        functions: Optional[List[Dict[str, Any]]] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_completion_tokens: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        parallel_tool_calls: Optional[bool] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[OpenAIResponseFormatParam] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        """Send an OpenAI-style chat completion request through ``self.client``.
+
+        The model is resolved with ``self._get_model`` and its
+        ``provider_resource_id`` is used as the request model; every other
+        argument is passed through ``prepare_openai_completion_params``
+        unchanged in name.
+        """
+        resolved_model = await self._get_model(model)
+        request_kwargs = await prepare_openai_completion_params(
+            model=resolved_model.provider_resource_id,
+            messages=messages,
+            frequency_penalty=frequency_penalty,
+            function_call=function_call,
+            functions=functions,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_completion_tokens=max_completion_tokens,
+            max_tokens=max_tokens,
+            n=n,
+            parallel_tool_calls=parallel_tool_calls,
+            presence_penalty=presence_penalty,
+            response_format=response_format,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            tool_choice=tool_choice,
+            tools=tools,
+            top_logprobs=top_logprobs,
+            top_p=top_p,
+            user=user,
+        )
+        return await self.client.chat.completions.create(**request_kwargs)  # type: ignore
+
+    async def batch_completion(
+        self,
+        model_id: str,
+        content_batch: List[InterleavedContent],
+        sampling_params: Optional[SamplingParams] = None,
+        response_format: Optional[ResponseFormat] = None,
+        logprobs: Optional[LogProbConfig] = None,
+    ):
+        """Batch completion entry point; not implemented by this adapter.
+
+        Raises:
+            NotImplementedError: Always.
+        """
+        # The message previously named Ollama, although this is the vLLM adapter.
+        raise NotImplementedError("Batch completion is not supported for vLLM")
+
+    async def batch_chat_completion(
+        self,
+        model_id: str,
+        messages_batch: List[List[Message]],
+        sampling_params: Optional[SamplingParams] = None,
+        tools: Optional[List[ToolDefinition]] = None,
+        tool_config: Optional[ToolConfig] = None,
+        response_format: Optional[ResponseFormat] = None,
+        logprobs: Optional[LogProbConfig] = None,
+    ):
+        """Batch chat completion entry point; not implemented by this adapter.
+
+        Raises:
+            NotImplementedError: Always.
+        """
+        # The message previously named Ollama, although this is the vLLM adapter.
+        raise NotImplementedError("Batch chat completion is not supported for vLLM")
--- a/llama_stack/providers/remote/post_training/nvidia/models.py
+++ b/llama_stack/providers/remote/post_training/nvidia/models.py
@ -6,7 +6,7 @@

 from typing import List

-from llama_stack.models.llama.datatypes import CoreModelId
+from llama_stack.models.llama.sku_types import CoreModelId
 from llama_stack.providers.utils.inference.model_registry import (
    ProviderModelEntry,
    build_hf_repo_model_entry,
--- a/llama_stack/providers/remote/post_training/nvidia/post_training.py
+++ b/llama_stack/providers/remote/post_training/nvidia/post_training.py
@ -209,10 +209,6 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):
        model: str,
        checkpoint_dir: Optional[str],
        algorithm_config: Optional[AlgorithmConfig] = None,
-        extra_json: Optional[Dict[str, Any]] = None,
-        params: Optional[Dict[str, Any]] = None,
-        headers: Optional[Dict[str, Any]] = None,
-        **kwargs,
    ) -> NvidiaPostTrainingJob:
        """
        Fine-tunes a model on a dataset.
--- a/llama_stack/providers/remote/safety/nvidia/nvidia.py
+++ b/llama_stack/providers/remote/safety/nvidia/nvidia.py
@ -109,7 +109,6 @@ class NeMoGuardrails:
        headers = {
            "Accept": "application/json",
        }
-        print(data)
        response = requests.post(url=f"{self.guardrails_service_url}{path}", headers=headers, json=data)
        response.raise_for_status()
        return response.json()
--- a/llama_stack/providers/tests/report.py
+++ b/llama_stack/providers/tests/report.py
@ -12,8 +12,8 @@ import pytest
 from pytest import ExitCode
 from pytest_html.basereport import _process_outcome

-from llama_stack.models.llama.datatypes import CoreModelId
 from llama_stack.models.llama.sku_list import all_registered_models
+from llama_stack.models.llama.sku_types import CoreModelId

 INFERENCE_APIS = ["chat_completion"]
 FUNCTIONALITIES = ["streaming", "structured_output", "tool_calling"]
--- a/llama_stack/providers/utils/inference/init.py
+++ b/llama_stack/providers/utils/inference/init.py
@ -6,8 +6,8 @@

 from typing import List

-from llama_stack.models.llama.datatypes import *  # noqa: F403
 from llama_stack.models.llama.sku_list import all_registered_models
+from llama_stack.models.llama.sku_types import *  # noqa: F403


 def is_supported_safety_model(model: Model) -> bool:
@ -27,7 +27,7 @@ def supported_inference_models() -> List[Model]:
        m
        for m in all_registered_models()
        if (
-            m.model_family in {ModelFamily.llama3_1, ModelFamily.llama3_2, ModelFamily.llama3_3}
+            m.model_family in {ModelFamily.llama3_1, ModelFamily.llama3_2, ModelFamily.llama3_3, ModelFamily.llama4}
            or is_supported_safety_model(m)
        )
    ]
--- a/llama_stack/providers/utils/inference/litellm_openai_mixin.py
+++ b/llama_stack/providers/utils/inference/litellm_openai_mixin.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import AsyncGenerator, AsyncIterator, List, Optional, Union
+from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union

 import litellm

@ -30,18 +30,24 @@ from llama_stack.apis.inference import (
    ToolDefinition,
    ToolPromptFormat,
 )
+from llama_stack.apis.inference.inference import (
+    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
+    OpenAICompletion,
+    OpenAIMessageParam,
+    OpenAIResponseFormatParam,
+)
 from llama_stack.apis.models.models import Model
 from llama_stack.distribution.request_headers import NeedsRequestProviderData
 from llama_stack.log import get_logger
-from llama_stack.providers.utils.inference.model_registry import (
-    ModelRegistryHelper,
-)
+from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
 from llama_stack.providers.utils.inference.openai_compat import (
    convert_message_to_openai_dict_new,
    convert_openai_chat_completion_choice,
    convert_openai_chat_completion_stream,
    convert_tooldef_to_openai_tool,
    get_sampling_options,
+    prepare_openai_completion_params,
 )
 from llama_stack.providers.utils.inference.prompt_adapter import (
    interleaved_content_as_str,
@ -55,10 +61,22 @@ class LiteLLMOpenAIMixin(
    Inference,
    NeedsRequestProviderData,
 ):
-    def __init__(self, model_entries, api_key_from_config: str, provider_data_api_key_field: str):
+    def __init__(
+        self,
+        model_entries,
+        api_key_from_config: Optional[str],
+        provider_data_api_key_field: str,
+        openai_compat_api_base: str | None = None,
+    ):
+        """Register the model entries and remember how to reach the provider.
+
+        Args:
+            model_entries: Passed to ``ModelRegistryHelper.__init__``.
+            api_key_from_config: Stored as ``self.api_key_from_config``.
+            provider_data_api_key_field: Stored as ``self.provider_data_api_key_field``.
+            openai_compat_api_base: Stored as ``self.api_base``; a non-empty
+                value also sets ``self.is_openai_compat`` to True.
+        """
         ModelRegistryHelper.__init__(self, model_entries)
         self.api_key_from_config = api_key_from_config
         self.provider_data_api_key_field = provider_data_api_key_field
+        self.api_base = openai_compat_api_base
+
+        # A configured base URL is what marks this adapter as OpenAI-compatible.
+        self.is_openai_compat = bool(openai_compat_api_base)

    async def initialize(self):
        pass
@ -98,6 +116,7 @@ class LiteLLMOpenAIMixin(
    ) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]:
        if sampling_params is None:
            sampling_params = SamplingParams()
+
        model = await self.model_store.get_model(model_id)
        request = ChatCompletionRequest(
            model=model.provider_resource_id,
@ -111,6 +130,9 @@ class LiteLLMOpenAIMixin(
        )

        params = await self._get_params(request)
+        if self.is_openai_compat:
+            params["model"] = "openai/" + params["model"]
+
        logger.debug(f"params to litellm (openai compat): {params}")
        # unfortunately, we need to use synchronous litellm.completion here because litellm
        # caches various httpx.client objects in a non-eventloop aware manner
@ -208,6 +230,7 @@ class LiteLLMOpenAIMixin(
        return {
            "model": request.model,
            "api_key": api_key,
+            "api_base": self.api_base,
            **input_dict,
            "stream": request.stream,
            **get_sampling_options(request.sampling_params),
@ -230,3 +253,125 @@ class LiteLLMOpenAIMixin(

        embeddings = [data["embedding"] for data in response["data"]]
        return EmbeddingsResponse(embeddings=embeddings)
+
+    async def openai_completion(
+        self,
+        model: str,
+        prompt: Union[str, List[str], List[int], List[List[int]]],
+        best_of: Optional[int] = None,
+        echo: Optional[bool] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+        guided_choice: Optional[List[str]] = None,
+        prompt_logprobs: Optional[int] = None,
+    ) -> OpenAICompletion:
+        """Send an OpenAI-style text completion request through LiteLLM.
+
+        The model is resolved via ``self.model_store``. When the adapter was
+        built with an OpenAI-compatible base URL, the model name gets the
+        ``openai/`` prefix (as ``chat_completion`` does) and the request carries
+        ``api_base``, so it reaches the configured endpoint instead of whatever
+        LiteLLM would infer from the model name.
+        """
+        model_obj = await self.model_store.get_model(model)
+
+        litellm_model = model_obj.provider_resource_id
+        if self.is_openai_compat:
+            litellm_model = "openai/" + litellm_model
+
+        params = await prepare_openai_completion_params(
+            model=litellm_model,
+            prompt=prompt,
+            best_of=best_of,
+            echo=echo,
+            frequency_penalty=frequency_penalty,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_tokens=max_tokens,
+            n=n,
+            presence_penalty=presence_penalty,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            top_p=top_p,
+            user=user,
+            guided_choice=guided_choice,
+            prompt_logprobs=prompt_logprobs,
+            # None values are dropped by prepare_openai_completion_params, so
+            # adapters without a configured key or base URL behave as before.
+            # NOTE(review): only the config key is used here; a per-request
+            # provider-data key is not consulted -- confirm whether it should be.
+            api_key=self.api_key_from_config,
+            api_base=self.api_base,
+        )
+        return await litellm.atext_completion(**params)
+
+    async def openai_chat_completion(
+        self,
+        model: str,
+        messages: List[OpenAIMessageParam],
+        frequency_penalty: Optional[float] = None,
+        function_call: Optional[Union[str, Dict[str, Any]]] = None,
+        functions: Optional[List[Dict[str, Any]]] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_completion_tokens: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        parallel_tool_calls: Optional[bool] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[OpenAIResponseFormatParam] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        """Send an OpenAI-style chat completion request through LiteLLM.
+
+        The model is resolved via ``self.model_store``. When the adapter was
+        built with an OpenAI-compatible base URL, the model name gets the
+        ``openai/`` prefix (as ``chat_completion`` does) and the request carries
+        ``api_base``, so it reaches the configured endpoint instead of whatever
+        LiteLLM would infer from the model name.
+        """
+        model_obj = await self.model_store.get_model(model)
+
+        litellm_model = model_obj.provider_resource_id
+        if self.is_openai_compat:
+            litellm_model = "openai/" + litellm_model
+
+        params = await prepare_openai_completion_params(
+            model=litellm_model,
+            messages=messages,
+            frequency_penalty=frequency_penalty,
+            function_call=function_call,
+            functions=functions,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_completion_tokens=max_completion_tokens,
+            max_tokens=max_tokens,
+            n=n,
+            parallel_tool_calls=parallel_tool_calls,
+            presence_penalty=presence_penalty,
+            response_format=response_format,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            tool_choice=tool_choice,
+            tools=tools,
+            top_logprobs=top_logprobs,
+            top_p=top_p,
+            user=user,
+            # None values are dropped by prepare_openai_completion_params, so
+            # adapters without a configured key or base URL behave as before.
+            # NOTE(review): only the config key is used here; a per-request
+            # provider-data key is not consulted -- confirm whether it should be.
+            api_key=self.api_key_from_config,
+            api_base=self.api_base,
+        )
+        return await litellm.acompletion(**params)
+
+    async def batch_completion(
+        self,
+        model_id: str,
+        content_batch: List[InterleavedContent],
+        sampling_params: Optional[SamplingParams] = None,
+        response_format: Optional[ResponseFormat] = None,
+        logprobs: Optional[LogProbConfig] = None,
+    ):
+        """Batch completion entry point; always raises ``NotImplementedError``."""
+        reason = "Batch completion is not supported for OpenAI Compat"
+        raise NotImplementedError(reason)
+
+    async def batch_chat_completion(
+        self,
+        model_id: str,
+        messages_batch: List[List[Message]],
+        sampling_params: Optional[SamplingParams] = None,
+        tools: Optional[List[ToolDefinition]] = None,
+        tool_config: Optional[ToolConfig] = None,
+        response_format: Optional[ResponseFormat] = None,
+        logprobs: Optional[LogProbConfig] = None,
+    ):
+        """Batch chat completion entry point; always raises ``NotImplementedError``."""
+        reason = "Batch chat completion is not supported for OpenAI Compat"
+        raise NotImplementedError(reason)
--- a/llama_stack/providers/utils/inference/model_registry.py
+++ b/llama_stack/providers/utils/inference/model_registry.py
@ -33,12 +33,17 @@ def get_huggingface_repo(model_descriptor: str) -> Optional[str]:
    return None


-def build_hf_repo_model_entry(provider_model_id: str, model_descriptor: str) -> ProviderModelEntry:
+def build_hf_repo_model_entry(
+    provider_model_id: str, model_descriptor: str, additional_aliases: Optional[List[str]] = None
+) -> ProviderModelEntry:
+    # Builds a ProviderModelEntry whose aliases are the Hugging Face repo for
+    # `model_descriptor` followed by any `additional_aliases`.
+    # NOTE(review): get_huggingface_repo is annotated Optional[str], so the
+    # first alias may be None -- confirm callers tolerate that.
+    aliases = [
+        get_huggingface_repo(model_descriptor),
+    ]
+    if additional_aliases:
+        aliases.extend(additional_aliases)
     return ProviderModelEntry(
         provider_model_id=provider_model_id,
-        aliases=[
-            get_huggingface_repo(model_descriptor),
-        ],
+        aliases=aliases,
         llama_model=model_descriptor,
     )

--- a/llama_stack/providers/utils/inference/openai_compat.py
+++ b/llama_stack/providers/utils/inference/openai_compat.py
@ -5,8 +5,10 @@
 # the root directory of this source tree.
 import json
 import logging
+import time
+import uuid
 import warnings
-from typing import AsyncGenerator, Dict, Iterable, List, Optional, Union
+from typing import Any, AsyncGenerator, AsyncIterator, Awaitable, Dict, Iterable, List, Optional, Union

 from openai import AsyncStream
 from openai.types.chat import (
@ -48,6 +50,18 @@ from openai.types.chat.chat_completion import (
 from openai.types.chat.chat_completion import (
    ChoiceLogprobs as OpenAIChoiceLogprobs,  # same as chat_completion_chunk ChoiceLogprobs
 )
+from openai.types.chat.chat_completion_chunk import (
+    Choice as OpenAIChatCompletionChunkChoice,
+)
+from openai.types.chat.chat_completion_chunk import (
+    ChoiceDelta as OpenAIChoiceDelta,
+)
+from openai.types.chat.chat_completion_chunk import (
+    ChoiceDeltaToolCall as OpenAIChoiceDeltaToolCall,
+)
+from openai.types.chat.chat_completion_chunk import (
+    ChoiceDeltaToolCallFunction as OpenAIChoiceDeltaToolCallFunction,
+)
 from openai.types.chat.chat_completion_content_part_image_param import (
    ImageURL as OpenAIImageURL,
 )
@ -57,6 +71,7 @@ from openai.types.chat.chat_completion_message_tool_call_param import (
 from pydantic import BaseModel

 from llama_stack.apis.common.content_types import (
+    URL,
    ImageContentItem,
    InterleavedContent,
    TextContentItem,
@ -73,21 +88,34 @@ from llama_stack.apis.inference import (
    CompletionMessage,
    CompletionResponse,
    CompletionResponseStreamChunk,
+    GreedySamplingStrategy,
    Message,
+    SamplingParams,
    SystemMessage,
    TokenLogProbs,
    ToolResponseMessage,
+    TopKSamplingStrategy,
+    TopPSamplingStrategy,
    UserMessage,
 )
+from llama_stack.apis.inference.inference import (
+    JsonSchemaResponseFormat,
+    OpenAIChatCompletion,
+    OpenAICompletion,
+    OpenAICompletionChoice,
+    OpenAIMessageParam,
+    OpenAIResponseFormatParam,
+    ToolConfig,
+)
+from llama_stack.apis.inference.inference import (
+    OpenAIChoice as OpenAIChatCompletionChoice,
+)
 from llama_stack.models.llama.datatypes import (
    BuiltinTool,
-    GreedySamplingStrategy,
-    SamplingParams,
    StopReason,
    ToolCall,
    ToolDefinition,
-    TopKSamplingStrategy,
-    TopPSamplingStrategy,
+    ToolParamDefinition,
 )
 from llama_stack.providers.utils.inference.prompt_adapter import (
    convert_image_content_to_url,
@ -573,21 +601,24 @@ async def convert_message_to_openai_dict_new(
            content=await _convert_message_content(message.content),
        )
    elif isinstance(message, CompletionMessage):
+        tool_calls = [
+            OpenAIChatCompletionMessageToolCall(
+                id=tool.call_id,
+                function=OpenAIFunction(
+                    name=(tool.tool_name if not isinstance(tool.tool_name, BuiltinTool) else tool.tool_name.value),
+                    arguments=json.dumps(tool.arguments),
+                ),
+                type="function",
+            )
+            for tool in message.tool_calls
+        ]
+        params = {}
+        if tool_calls:
+            params = {"tool_calls": tool_calls}
        out = OpenAIChatCompletionAssistantMessage(
            role="assistant",
            content=await _convert_message_content(message.content),
-            tool_calls=[
-                OpenAIChatCompletionMessageToolCall(
-                    id=tool.call_id,
-                    function=OpenAIFunction(
-                        name=(tool.tool_name if not isinstance(tool.tool_name, BuiltinTool) else tool.tool_name.value),
-                        arguments=json.dumps(tool.arguments),
-                    ),
-                    type="function",
-                )
-                for tool in message.tool_calls
-            ]
-            or None,
+            **params,
        )
    elif isinstance(message, ToolResponseMessage):
        out = OpenAIChatCompletionToolMessage(
@ -639,6 +670,36 @@ PYTHON_TYPE_TO_LITELLM_TYPE = {
 }


+def to_openai_param_type(param_type: str) -> dict:
+    """
+    Translate a Python type-hint string into an OpenAI parameter type dict.
+
+    Scalar names (``str``, ``int``, ``float``, ``bool``) map to their JSON
+    schema names, and ``list[<scalar>]`` maps to an array of that scalar.
+    Anything else is passed through unchanged as the ``type`` value.
+
+    Examples:
+        'str' -> {'type': 'string'}
+        'int' -> {'type': 'integer'}
+        'list[str]' -> {'type': 'array', 'items': {'type': 'string'}}
+        'list[int]' -> {'type': 'array', 'items': {'type': 'integer'}}
+    """
+    scalar_names = {
+        "str": "string",
+        "int": "integer",
+        "float": "number",
+        "bool": "boolean",
+    }
+
+    scalar = scalar_names.get(param_type)
+    if scalar is not None:
+        return {"type": scalar}
+
+    # list[<scalar>] becomes an array; lists of unknown element types fall through.
+    list_prefix = "list["
+    if param_type.startswith(list_prefix) and param_type.endswith("]"):
+        element = scalar_names.get(param_type[len(list_prefix) : -1])
+        if element is not None:
+            return {"type": "array", "items": {"type": element}}
+
+    return {"type": param_type}
+
+
 def convert_tooldef_to_openai_tool(tool: ToolDefinition) -> dict:
    """
    Convert a ToolDefinition to an OpenAI API-compatible dictionary.
@ -699,7 +760,7 @@ def convert_tooldef_to_openai_tool(tool: ToolDefinition) -> dict:
        properties = parameters["properties"]
        required = []
        for param_name, param in tool.parameters.items():
-            properties[param_name] = {"type": PYTHON_TYPE_TO_LITELLM_TYPE.get(param.param_type, param.param_type)}
+            properties[param_name] = to_openai_param_type(param.param_type)
            if param.description:
                properties[param_name].update(description=param.description)
            if param.default:
@ -715,6 +776,17 @@ def convert_tooldef_to_openai_tool(tool: ToolDefinition) -> dict:
    return out


+def _convert_stop_reason_to_openai_finish_reason(stop_reason: StopReason) -> str:
+    """
+    Map a StopReason to the matching OpenAI chat completion finish_reason.
+
+    Values without an explicit mapping fall back to "stop".
+    """
+    finish_reason_by_stop_reason = {
+        StopReason.end_of_turn: "stop",
+        StopReason.end_of_message: "tool_calls",
+        StopReason.out_of_tokens: "length",
+    }
+    return finish_reason_by_stop_reason.get(stop_reason, "stop")
+
+
 def _convert_openai_finish_reason(finish_reason: str) -> StopReason:
    """
    Convert an OpenAI chat completion finish_reason to a StopReason.
@ -740,6 +812,56 @@ def _convert_openai_finish_reason(finish_reason: str) -> StopReason:
    }.get(finish_reason, StopReason.end_of_turn)


+def _convert_openai_request_tool_config(tool_choice: Optional[Union[str, Dict[str, Any]]] = None) -> ToolConfig:
+    """Build a ToolConfig, setting its ``tool_choice`` only when a truthy one is given."""
+    config = ToolConfig()
+    if not tool_choice:
+        return config
+    config.tool_choice = tool_choice
+    return config
+
+
+def _convert_openai_request_tools(tools: Optional[List[Dict[str, Any]]] = None) -> List[ToolDefinition]:
+    """Translate OpenAI-style tool dicts into ToolDefinition objects.
+
+    For each tool, the name, description and parameter properties are read
+    from its ``function`` entry. Only each property's ``type`` and
+    ``description`` are carried over. A missing or empty ``tools`` yields an
+    empty list.
+    """
+    converted = []
+    for tool in tools or []:
+        function_spec = tool.get("function", {})
+        schema = function_spec.get("parameters", None)
+        properties = {} if schema is None else schema.get("properties", {})
+        param_defs = {
+            name: ToolParamDefinition(
+                param_type=spec.get("type", None),
+                description=spec.get("description", None),
+            )
+            for name, spec in properties.items()
+        }
+        converted.append(
+            ToolDefinition(
+                tool_name=function_spec.get("name", None),
+                description=function_spec.get("description", None),
+                parameters=param_defs,
+            )
+        )
+    return converted
+
+
+def _convert_openai_request_response_format(response_format: OpenAIResponseFormatParam = None):
+    """Convert an OpenAI ``response_format`` into a JsonSchemaResponseFormat.
+
+    Returns None when no format is given or when its ``type`` is anything
+    other than ``json_schema``.
+    """
+    if response_format:
+        # May arrive as a dict or a pydantic model; normalise to a plain dict.
+        fields = dict(response_format)
+        if fields.get("type", "") == "json_schema":
+            return JsonSchemaResponseFormat(
+                type="json_schema",
+                json_schema=fields.get("json_schema", {}).get("schema", ""),
+            )
+    return None
+
+
 def _convert_openai_tool_calls(
    tool_calls: List[OpenAIChatCompletionMessageToolCall],
 ) -> List[ToolCall]:
@ -801,7 +923,7 @@ def _convert_openai_logprobs(
         - token, logprob

    """
-    if not logprobs:
+    if not logprobs or not logprobs.content:
        return None

    return [
@ -810,6 +932,65 @@ def _convert_openai_logprobs(
    ]


+def _convert_openai_sampling_params(
+    max_tokens: Optional[int] = None,
+    temperature: Optional[float] = None,
+    top_p: Optional[float] = None,
+) -> SamplingParams:
+    """Build SamplingParams from OpenAI-style sampling arguments.
+
+    A truthy ``max_tokens`` is copied over. A temperature of exactly 0 selects
+    greedy sampling; otherwise top-p sampling is used, with unset
+    ``temperature`` / ``top_p`` each defaulting to 1.0.
+    """
+    params = SamplingParams()
+    if max_tokens:
+        params.max_tokens = max_tokens
+
+    # Map an explicit temperature of 0 to greedy sampling
+    if temperature == 0:
+        params.strategy = GreedySamplingStrategy()
+        return params
+
+    # OpenAI defaults to 1.0 for temperature and top_p if unset
+    params.strategy = TopPSamplingStrategy(
+        temperature=1.0 if temperature is None else temperature,
+        top_p=1.0 if top_p is None else top_p,
+    )
+    return params
+
+
+def _convert_openai_request_messages(messages: List[OpenAIMessageParam]):
+    """Convert OpenAI-style request messages into Llama Stack message dicts.
+
+    Each message is copied into a plain dict. ``tool_call_id`` is renamed to
+    ``call_id``, and list-valued ``content`` is rebuilt from ``image_url`` and
+    ``text`` parts as ImageContentItem / TextContentItem objects.
+
+    Raises:
+        ValueError: If a content part has a type other than ``image_url`` or
+            ``text``.
+    """
+    # Llama Stack messages and OpenAI messages are similar, but not identical.
+    lls_messages = []
+    for message in messages:
+        lls_message = dict(message)
+
+        #  Llama Stack expects `call_id` but OpenAI uses `tool_call_id`
+        tool_call_id = lls_message.pop("tool_call_id", None)
+        if tool_call_id:
+            lls_message["call_id"] = tool_call_id
+
+        content = lls_message.get("content", None)
+        if isinstance(content, list):
+            lls_content = []
+            for item in content:
+                # items can either be pydantic models or dicts here...
+                item = dict(item)
+                item_type = item.get("type", "")
+                if item_type == "image_url":
+                    lls_item = ImageContentItem(
+                        type="image",
+                        image=URL(uri=item.get("image_url", {}).get("url", "")),
+                    )
+                elif item_type == "text":
+                    lls_item = TextContentItem(
+                        type="text",
+                        text=item.get("text", ""),
+                    )
+                else:
+                    # Without this branch `lls_item` was unbound on the first
+                    # part, or silently repeated the previous part afterwards.
+                    raise ValueError(f"Unsupported message content type: {item_type!r}")
+                lls_content.append(lls_item)
+            lls_message["content"] = lls_content
+        lls_messages.append(lls_message)
+
+    return lls_messages
+
+
 def convert_openai_chat_completion_choice(
    choice: OpenAIChoice,
 ) -> ChatCompletionResponse:
@ -1016,3 +1197,218 @@ async def convert_openai_chat_completion_stream(
            stop_reason=stop_reason,
        )
    )
+
+
+async def prepare_openai_completion_params(**params):
+    """Build the keyword arguments for an OpenAI completion call.
+
+    Keyword arguments whose value is ``None`` are dropped. In the remaining
+    values, lists and dicts are rebuilt recursively and pydantic models are
+    replaced by ``model_dump(exclude_none=True)``; anything else is kept as is.
+    """
+
+    async def _to_plain(obj: Any) -> Any:
+        # Containers are walked element by element so nested models are converted too
+        if isinstance(obj, list):
+            return [await _to_plain(entry) for entry in obj]
+        if isinstance(obj, dict):
+            return {key: await _to_plain(entry) for key, entry in obj.items()}
+        if isinstance(obj, BaseModel):
+            return obj.model_dump(exclude_none=True)
+        return obj
+
+    return {name: await _to_plain(val) for name, val in params.items() if val is not None}
+
+
+class OpenAICompletionToLlamaStackMixin:
+    """Mixin that emulates the OpenAI completions endpoint on top of ``self.completion``."""
+
+    async def openai_completion(
+        self,
+        model: str,
+        prompt: Union[str, List[str], List[int], List[List[int]]],
+        best_of: Optional[int] = None,
+        echo: Optional[bool] = None,
+        frequency_penalty: Optional[float] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        presence_penalty: Optional[float] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+        guided_choice: Optional[List[str]] = None,
+        prompt_logprobs: Optional[int] = None,
+    ) -> OpenAICompletion:
+        """Run ``self.completion`` once per prompt and per requested sample.
+
+        Only ``model``, ``prompt``, ``max_tokens``, ``n``, ``stream``,
+        ``temperature`` and ``top_p`` are used; the other parameters are
+        accepted for signature compatibility and ignored.
+
+        Returns an ``OpenAICompletion`` holding ``n * len(prompts)`` choices,
+        indexed in the order they were generated.
+
+        Raises:
+            ValueError: if ``stream`` is truthy; streaming is not supported.
+        """
+        # Local import: the module's import block is outside this block's control
+        import inspect
+
+        if stream:
+            raise ValueError(f"{self.__class__.__name__} doesn't support streaming openai completions")
+
+        # This is a pretty hacky way to emulate completions -
+        # basically just de-batches them...
+        # NOTE(review): a token-id list (List[int]) is also de-batched element by
+        # element here; presumably only string prompts are expected - confirm.
+        prompts = [prompt] if not isinstance(prompt, list) else prompt
+
+        sampling_params = _convert_openai_sampling_params(
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+        )
+
+        choices = []
+        # "n" is the number of completions to generate per prompt
+        n = n or 1
+        for _ in range(n):
+            # and we may have multiple prompts, if batching was used
+            for single_prompt in prompts:
+                result = self.completion(
+                    model_id=model,
+                    content=single_prompt,
+                    sampling_params=sampling_params,
+                )
+                # `completion` may be a coroutine function; resolve it before
+                # reading attributes off the response
+                if inspect.isawaitable(result):
+                    result = await result
+
+                choices.append(
+                    OpenAICompletionChoice(
+                        index=len(choices),
+                        text=result.content,
+                        finish_reason=_convert_stop_reason_to_openai_finish_reason(result.stop_reason),
+                    )
+                )
+
+        return OpenAICompletion(
+            id=f"cmpl-{uuid.uuid4()}",
+            choices=choices,
+            created=int(time.time()),
+            model=model,
+            object="text_completion",
+        )
+
+
+class OpenAIChatCompletionToLlamaStackMixin:
+    """Mixin that emulates the OpenAI chat completions endpoint on top of ``self.chat_completion``."""
+
+    async def openai_chat_completion(
+        self,
+        model: str,
+        messages: List[OpenAIChatCompletionMessage],
+        frequency_penalty: Optional[float] = None,
+        function_call: Optional[Union[str, Dict[str, Any]]] = None,
+        functions: Optional[List[Dict[str, Any]]] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        logprobs: Optional[bool] = None,
+        max_completion_tokens: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        n: Optional[int] = None,
+        parallel_tool_calls: Optional[bool] = None,
+        presence_penalty: Optional[float] = None,
+        response_format: Optional[OpenAIResponseFormatParam] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, List[str]]] = None,
+        stream: Optional[bool] = None,
+        stream_options: Optional[Dict[str, Any]] = None,
+        temperature: Optional[float] = None,
+        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
+        tools: Optional[List[Dict[str, Any]]] = None,
+        top_logprobs: Optional[int] = None,
+        top_p: Optional[float] = None,
+        user: Optional[str] = None,
+    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        """Translate an OpenAI chat completion request into ``self.chat_completion`` calls.
+
+        Messages, response format, sampling parameters, tool choice and tools
+        are converted to their Llama Stack equivalents, then
+        ``self.chat_completion`` is invoked ``n`` times (``n`` defaults to 1).
+        ``max_tokens`` wins when both token limits are given;
+        ``max_completion_tokens`` is used when ``max_tokens`` is unset. The
+        remaining parameters are accepted for signature compatibility and
+        ignored.
+
+        Returns an async iterator of ``OpenAIChatCompletionChunk`` when
+        ``stream`` is truthy, otherwise a single ``OpenAIChatCompletion``.
+        """
+        messages = _convert_openai_request_messages(messages)
+        response_format = _convert_openai_request_response_format(response_format)
+        sampling_params = _convert_openai_sampling_params(
+            # max_completion_tokens was previously accepted but never forwarded
+            max_tokens=max_tokens if max_tokens is not None else max_completion_tokens,
+            temperature=temperature,
+            top_p=top_p,
+        )
+        tool_config = _convert_openai_request_tool_config(tool_choice)
+        tools = _convert_openai_request_tools(tools)
+
+        # The calls are only started here; the helpers below await them one by one
+        outstanding_responses = []
+        # "n" is the number of completions to generate per prompt
+        n = n or 1
+        for _ in range(n):
+            response = self.chat_completion(
+                model_id=model,
+                messages=messages,
+                sampling_params=sampling_params,
+                response_format=response_format,
+                stream=stream,
+                tool_config=tool_config,
+                tools=tools,
+            )
+            outstanding_responses.append(response)
+
+        # The helpers are called through the class with an explicit `self`, as in the original code
+        if stream:
+            return OpenAIChatCompletionToLlamaStackMixin._process_stream_response(self, model, outstanding_responses)
+
+        return await OpenAIChatCompletionToLlamaStackMixin._process_non_stream_response(
+            self, model, outstanding_responses
+        )
+
+    async def _process_stream_response(
+        self, model: str, outstanding_responses: List[Awaitable[AsyncIterator[ChatCompletionResponseStreamChunk]]]
+    ):
+        """Yield OpenAI chunks for each Llama Stack stream, one response after another.
+
+        All chunks share one completion id. Every chunk of a response carries
+        that response's position in ``outstanding_responses`` as its choice
+        index. Only text deltas and successfully parsed tool-call deltas
+        produce chunks; other events are skipped.
+        """
+        completion_id = f"chatcmpl-{uuid.uuid4()}"
+        # The choice index identifies the response, so it must stay constant across that response's chunks
+        for choice_index, outstanding_response in enumerate(outstanding_responses):
+            response = await outstanding_response
+            async for chunk in response:
+                event = chunk.event
+                finish_reason = _convert_stop_reason_to_openai_finish_reason(event.stop_reason)
+
+                if isinstance(event.delta, TextDelta):
+                    text_delta = event.delta.text
+                    delta = OpenAIChoiceDelta(content=text_delta)
+                    yield OpenAIChatCompletionChunk(
+                        id=completion_id,
+                        choices=[
+                            OpenAIChatCompletionChunkChoice(
+                                index=choice_index, finish_reason=finish_reason, delta=delta
+                            )
+                        ],
+                        created=int(time.time()),
+                        model=model,
+                        object="chat.completion.chunk",
+                    )
+                elif isinstance(event.delta, ToolCallDelta):
+                    if event.delta.parse_status == ToolCallParseStatus.succeeded:
+                        tool_call = event.delta.tool_call
+                        openai_tool_call = OpenAIChoiceDeltaToolCall(
+                            index=0,
+                            id=tool_call.call_id,
+                            function=OpenAIChoiceDeltaToolCallFunction(
+                                name=tool_call.tool_name, arguments=tool_call.arguments_json
+                            ),
+                        )
+                        delta = OpenAIChoiceDelta(tool_calls=[openai_tool_call])
+                        yield OpenAIChatCompletionChunk(
+                            id=completion_id,
+                            choices=[
+                                OpenAIChatCompletionChunkChoice(
+                                    index=choice_index, finish_reason=finish_reason, delta=delta
+                                )
+                            ],
+                            created=int(time.time()),
+                            model=model,
+                            object="chat.completion.chunk",
+                        )
+
+    async def _process_non_stream_response(
+        self, model: str, outstanding_responses: List[Awaitable[ChatCompletionResponse]]
+    ) -> OpenAIChatCompletion:
+        """Await every response in order and wrap them as choices of one ``OpenAIChatCompletion``."""
+        choices = []
+        for outstanding_response in outstanding_responses:
+            response = await outstanding_response
+            completion_message = response.completion_message
+            message = await convert_message_to_openai_dict_new(completion_message)
+            finish_reason = _convert_stop_reason_to_openai_finish_reason(completion_message.stop_reason)
+
+            choice = OpenAIChatCompletionChoice(
+                index=len(choices),
+                message=message,
+                finish_reason=finish_reason,
+            )
+            choices.append(choice)
+
+        return OpenAIChatCompletion(
+            id=f"chatcmpl-{uuid.uuid4()}",
+            choices=choices,
+            created=int(time.time()),
+            model=model,
+            object="chat.completion",
+        )
--- a/llama_stack/providers/utils/inference/prompt_adapter.py
+++ b/llama_stack/providers/utils/inference/prompt_adapter.py
@ -34,7 +34,6 @@ from llama_stack.apis.inference import (
 )
 from llama_stack.log import get_logger
 from llama_stack.models.llama.datatypes import (
-    ModelFamily,
    RawContent,
    RawContentItem,
    RawMediaItem,
@ -43,7 +42,6 @@ from llama_stack.models.llama.datatypes import (
    Role,
    StopReason,
    ToolPromptFormat,
-    is_multimodal,
 )
 from llama_stack.models.llama.llama3.chat_format import ChatFormat
 from llama_stack.models.llama.llama3.prompt_templates import (
@ -55,6 +53,7 @@ from llama_stack.models.llama.llama3.prompt_templates import (
 )
 from llama_stack.models.llama.llama3.tokenizer import Tokenizer
 from llama_stack.models.llama.sku_list import resolve_model
+from llama_stack.models.llama.sku_types import ModelFamily, is_multimodal
 from llama_stack.providers.utils.inference import supported_inference_models

 log = get_logger(name=__name__, category="inference")
@ -224,7 +223,9 @@ async def completion_request_to_prompt(request: CompletionRequest) -> str:
    return formatter.tokenizer.decode(model_input.tokens)


-async def completion_request_to_prompt_model_input_info(request: CompletionRequest) -> Tuple[str, int]:
+async def completion_request_to_prompt_model_input_info(
+    request: CompletionRequest,
+) -> Tuple[str, int]:
    content = augment_content_with_response_format_prompt(request.response_format, request.content)
    request.content = content
    request = await convert_request_to_raw(request)
@ -302,8 +303,12 @@ def chat_completion_request_to_messages(
    ):
        # llama3.1 and llama3.2 multimodal models follow the same tool prompt format
        messages = augment_messages_for_tools_llama_3_1(request)
-    elif model.model_family in (ModelFamily.llama3_2, ModelFamily.llama3_3):
-        # llama3.2 and llama3.3 models follow the same tool prompt format
+    elif model.model_family in (
+        ModelFamily.llama3_2,
+        ModelFamily.llama3_3,
+        ModelFamily.llama4,
+    ):
+        # llama3.2, llama3.3 and llama4 models follow the same tool prompt format
        messages = augment_messages_for_tools_llama_3_2(request)
    else:
        messages = request.messages
@ -471,7 +476,11 @@ def get_default_tool_prompt_format(model: str) -> ToolPromptFormat:
    ):
        # llama3.1 and llama3.2 multimodal models follow the same tool prompt format
        return ToolPromptFormat.json
-    elif llama_model.model_family in (ModelFamily.llama3_2, ModelFamily.llama3_3):
+    elif llama_model.model_family in (
+        ModelFamily.llama3_2,
+        ModelFamily.llama3_3,
+        ModelFamily.llama4,
+    ):
        # llama3.2 and llama3.3 models follow the same tool prompt format
        return ToolPromptFormat.python_list
    else:
--- a/llama_stack/providers/utils/scheduler.py
+++ b/llama_stack/providers/utils/scheduler.py
@ -0,0 +1,265 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import abc
+import asyncio
+import functools
+import threading
+from datetime import datetime, timezone
+from enum import Enum
+from typing import Any, Callable, Coroutine, Dict, Iterable, Tuple, TypeAlias
+
+from pydantic import BaseModel
+
+from llama_stack.log import get_logger
+
+logger = get_logger(name=__name__, category="scheduler")
+
+
+# TODO: revisit the list of possible statuses when defining a more coherent
+# Jobs API for all API flows; e.g. do we need new vs scheduled?
+class JobStatus(Enum):
+    """Lifecycle states recorded in a job's state-transition history."""
+
+    # Initial state recorded when a Job object is created
+    new = "new"
+    # Set by Scheduler.schedule() before the job is handed to the backend
+    scheduled = "scheduled"
+    # Set by the naive backend right before the handler is awaited
+    running = "running"
+    # Terminal state; set by the naive backend when the handler raises
+    failed = "failed"
+    # Terminal state; listed in _COMPLETED_STATUSES together with `failed`
+    completed = "completed"
+
+
+# Key under which the scheduler stores and looks up a job
+JobID: TypeAlias = str
+# Label for the kind of job; Scheduler.get_jobs() filters on it and JobArtifact.type reuses the alias
+JobType: TypeAlias = str
+
+
+class JobArtifact(BaseModel):
+    """Description of an output that a job reports through its artifact callback."""
+
+    # Annotated with the JobType alias (a plain string)
+    type: JobType
+    name: str
+    # TODO: uri should be a reference to /files API; revisit when /files is implemented
+    uri: str | None = None
+    # Required field; free-form key/value details about the artifact
+    metadata: Dict[str, Any]
+
+
+# Coroutine function that performs a job. It receives three callbacks, in this
+# order: report a log message, report a status change, report a collected artifact.
+JobHandler = Callable[
+    [Callable[[str], None], Callable[[JobStatus], None], Callable[[JobArtifact], None]], Coroutine[Any, Any, None]
+]
+
+
+# A log entry: (timestamp, message text)
+LogMessage: TypeAlias = Tuple[datetime, str]
+
+
+# Terminal statuses; the Job.status setter refuses to move from one of these to another
+_COMPLETED_STATUSES = {JobStatus.completed, JobStatus.failed}
+
+
+class Job:
+    """A unit of scheduled work together with its recorded history.
+
+    Holds the handler to run, the artifacts and log messages reported so far,
+    and a timestamped list of status transitions that starts at ``new``.
+    """
+
+    def __init__(self, job_type: JobType, job_id: JobID, handler: JobHandler):
+        super().__init__()
+        self.id = job_id
+        self._type = job_type
+        self._handler = handler
+        self._artifacts: list[JobArtifact] = []
+        self._logs: list[LogMessage] = []
+        created_at = datetime.now(timezone.utc)
+        self._state_transitions: list[Tuple[datetime, JobStatus]] = [(created_at, JobStatus.new)]
+
+    @property
+    def handler(self) -> JobHandler:
+        """The coroutine function that performs this job."""
+        return self._handler
+
+    @property
+    def status(self) -> JobStatus:
+        """The most recently recorded status."""
+        _, latest = self._state_transitions[-1]
+        return latest
+
+    @status.setter
+    def status(self, status: JobStatus):
+        """Record a transition to `status`.
+
+        Setting the current status again is a no-op. Moving from one terminal
+        status to a terminal status (including the same one) raises ValueError.
+        """
+        terminal_requested = status in _COMPLETED_STATUSES
+        if terminal_requested and self.status in _COMPLETED_STATUSES:
+            raise ValueError(f"Job is already in a completed state ({self.status})")
+        if self.status != status:
+            self._state_transitions.append((datetime.now(timezone.utc), status))
+
+    @property
+    def artifacts(self) -> list[JobArtifact]:
+        """The artifacts registered so far (the internal list, not a copy)."""
+        return self._artifacts
+
+    def register_artifact(self, artifact: JobArtifact) -> None:
+        """Add `artifact` to the job's artifact list."""
+        self._artifacts.append(artifact)
+
+    def _find_state_transition_date(self, status: Iterable[JobStatus]) -> datetime | None:
+        """Return the time of the latest transition into any of `status`, or None."""
+        newest_first = reversed(self._state_transitions)
+        return next((when for when, state in newest_first if state in status), None)
+
+    @property
+    def scheduled_at(self) -> datetime | None:
+        """When the job last entered the `scheduled` status, if ever."""
+        return self._find_state_transition_date([JobStatus.scheduled])
+
+    @property
+    def started_at(self) -> datetime | None:
+        """When the job last entered the `running` status, if ever."""
+        return self._find_state_transition_date([JobStatus.running])
+
+    @property
+    def completed_at(self) -> datetime | None:
+        """When the job last entered a terminal status, if ever."""
+        return self._find_state_transition_date(_COMPLETED_STATUSES)
+
+    @property
+    def logs(self) -> list[LogMessage]:
+        """A copy of the log messages recorded so far."""
+        return list(self._logs)
+
+    def append_log(self, message: LogMessage) -> None:
+        """Record one log message."""
+        self._logs.append(message)
+
+    # TODO: implement
+    def cancel(self) -> None:
+        """Not implemented yet; always raises NotImplementedError."""
+        raise NotImplementedError
+
+
+class _SchedulerBackend(abc.ABC):
+    """Interface a scheduler backend must implement.
+
+    The three ``on_*_cb`` hooks are called by ``Scheduler`` after it has
+    applied the corresponding update to the ``Job`` object.
+    """
+
+    @abc.abstractmethod
+    def on_log_message_cb(self, job: Job, message: LogMessage) -> None:
+        """Hook called with the timestamped log entry that was appended to `job`."""
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def on_status_change_cb(self, job: Job, status: JobStatus) -> None:
+        """Hook called with the status that was just set on `job`."""
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def on_artifact_collected_cb(self, job: Job, artifact: JobArtifact) -> None:
+        """Hook called with the artifact that was just registered on `job`."""
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    async def shutdown(self) -> None:
+        """Stop the backend; awaited by `Scheduler.shutdown()`."""
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def schedule(
+        self,
+        job: Job,
+        on_log_message_cb: Callable[[str], None],
+        on_status_change_cb: Callable[[JobStatus], None],
+        on_artifact_collected_cb: Callable[[JobArtifact], None],
+    ) -> None:
+        """Arrange for `job` to run.
+
+        The callbacks are already bound to `job` by the scheduler and have the
+        parameter types `JobHandler` expects, in the same order.
+        """
+        raise NotImplementedError
+
+
+class _NaiveSchedulerBackend(_SchedulerBackend):
+    """Backend that runs job handlers on a private asyncio event loop in a daemon thread."""
+
+    def __init__(self, timeout: int = 5):
+        # NOTE(review): `timeout` is stored but not read anywhere in this class
+        self._timeout = timeout
+        self._loop = asyncio.new_event_loop()
+        # There may be performance implications of using threads due to Python
+        # GIL; may need to measure if it's a real problem though
+        self._thread = threading.Thread(target=self._run_loop, daemon=True)
+        self._thread.start()
+
+    def _run_loop(self) -> None:
+        """Thread target: run the loop until stopped, then drain leftover tasks and close it."""
+        asyncio.set_event_loop(self._loop)
+        self._loop.run_forever()
+
+        # When stopping the loop, give tasks a chance to finish
+        # TODO: should we explicitly inform jobs of pending stoppage?
+        for task in asyncio.all_tasks(self._loop):
+            self._loop.run_until_complete(task)
+        self._loop.close()
+
+    async def shutdown(self) -> None:
+        """Ask the loop to stop and wait for the worker thread to exit."""
+        self._loop.call_soon_threadsafe(self._loop.stop)
+        # Join from a helper thread so the caller's event loop is not blocked
+        # while the worker drains its pending tasks
+        await asyncio.to_thread(self._thread.join)
+
+    # TODO: decouple scheduling and running the job
+    def schedule(
+        self,
+        job: Job,
+        on_log_message_cb: Callable[[str], None],
+        on_status_change_cb: Callable[[JobStatus], None],
+        on_artifact_collected_cb: Callable[[JobArtifact], None],
+    ) -> None:
+        """Submit `job.handler` to the backend loop, marking the job running first.
+
+        If the handler raises, the error is logged, its text is sent through
+        `on_log_message_cb`, and the job is marked failed unless it already
+        holds a terminal status.
+        """
+
+        async def do():
+            try:
+                job.status = JobStatus.running
+                await job.handler(on_log_message_cb, on_status_change_cb, on_artifact_collected_cb)
+            except Exception as e:
+                # Log first so the traceback is recorded even if the bookkeeping below fails
+                logger.exception(f"Job {job.id} failed.")
+                on_log_message_cb(str(e))
+                # The status setter rejects a second terminal status; the handler may
+                # have reported one before raising, so keep what is already recorded
+                if job.status not in _COMPLETED_STATUSES:
+                    job.status = JobStatus.failed
+
+        asyncio.run_coroutine_threadsafe(do(), self._loop)
+
+    def on_log_message_cb(self, job: Job, message: LogMessage) -> None:
+        """No-op: this backend does nothing extra with log messages."""
+        pass
+
+    def on_status_change_cb(self, job: Job, status: JobStatus) -> None:
+        """No-op: this backend does nothing extra with status changes."""
+        pass
+
+    def on_artifact_collected_cb(self, job: Job, artifact: JobArtifact) -> None:
+        """No-op: this backend does nothing extra with artifacts."""
+        pass
+
+
+# Registry mapping a backend name to the class instantiated by _get_backend_impl()
+_BACKENDS = {
+    "naive": _NaiveSchedulerBackend,
+}
+
+
+def _get_backend_impl(backend: str) -> _SchedulerBackend:
+    """Instantiate the scheduler backend registered under `backend`.
+
+    Raises:
+        ValueError: if no backend is registered under that name.
+    """
+    try:
+        backend_cls = _BACKENDS[backend]
+    except KeyError as e:
+        raise ValueError(f"Unknown backend {backend}") from e
+    # Instantiate outside the try so a KeyError raised inside a backend's
+    # constructor is not misreported as an unknown backend name
+    return backend_cls()
+
+
+class Scheduler:
+    """In-memory job registry that delegates execution to a named backend."""
+
+    def __init__(self, backend: str = "naive"):
+        # TODO: if server crashes, job states are lost; we need to persist jobs on disc
+        self._jobs: dict[JobID, Job] = {}
+        self._backend = _get_backend_impl(backend)
+
+    def _on_log_message_cb(self, job: Job, message: str) -> None:
+        """Timestamp `message`, log it, store it on `job` and notify the backend."""
+        entry = (datetime.now(timezone.utc), message)
+        # At least for the time being, until there's a better way to expose
+        # logs to users, log messages on console
+        logger.info(f"Job {job.id}: {message}")
+        job.append_log(entry)
+        self._backend.on_log_message_cb(job, entry)
+
+    def _on_status_change_cb(self, job: Job, status: JobStatus) -> None:
+        """Apply `status` to `job`, then notify the backend."""
+        job.status = status
+        self._backend.on_status_change_cb(job, status)
+
+    def _on_artifact_collected_cb(self, job: Job, artifact: JobArtifact) -> None:
+        """Register `artifact` on `job`, then notify the backend."""
+        job.register_artifact(artifact)
+        self._backend.on_artifact_collected_cb(job, artifact)
+
+    def schedule(self, type_: JobType, job_id: JobID, handler: JobHandler) -> JobID:
+        """Register a new job and hand it to the backend.
+
+        Returns the job id. Raises ValueError if a job with this id already exists.
+        """
+        job = Job(type_, job_id, handler)
+        if job.id in self._jobs:
+            raise ValueError(f"Job {job.id} already exists")
+
+        self._jobs[job.id] = job
+        job.status = JobStatus.scheduled
+
+        # Bind each callback to this job so the handler only passes the payload
+        bound_callbacks = [
+            functools.partial(callback, job)
+            for callback in (
+                self._on_log_message_cb,
+                self._on_status_change_cb,
+                self._on_artifact_collected_cb,
+            )
+        ]
+        self._backend.schedule(job, *bound_callbacks)
+        return job.id
+
+    def cancel(self, job_id: JobID) -> None:
+        """Cancel the job with the given id; raises ValueError if it is unknown."""
+        job = self.get_job(job_id)
+        job.cancel()
+
+    def get_job(self, job_id: JobID) -> Job:
+        """Return the job with the given id; raises ValueError if it is unknown."""
+        try:
+            found = self._jobs[job_id]
+        except KeyError as e:
+            raise ValueError(f"Job {job_id} not found") from e
+        return found
+
+    def get_jobs(self, type_: JobType | None = None) -> list[Job]:
+        """Return all jobs, or only those of `type_` when a non-empty type is given."""
+        if not type_:
+            return list(self._jobs.values())
+        return [job for job in self._jobs.values() if job._type == type_]
+
+    async def shutdown(self):
+        """Shut down the backend."""
+        # TODO: also cancel jobs once implemented
+        await self._backend.shutdown()