Merge branch 'main' into eval_api_final

2025-03-26 12:29:45 -07:00 · 2025-03-26 12:29:45 -07:00 · bc0cd07008
commit bc0cd07008
parent 7f12ea290f cb2a9784ab
79 changed files with 3257 additions and 2358 deletions
--- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
+++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py
@ -6,14 +6,12 @@

 import copy
 import json
-import os
 import re
 import secrets
 import string
 import uuid
 from datetime import datetime, timezone
 from typing import AsyncGenerator, List, Optional, Union
-from urllib.parse import urlparse

 import httpx

@ -60,7 +58,6 @@ from llama_stack.apis.inference import (
 )
 from llama_stack.apis.safety import Safety
 from llama_stack.apis.tools import (
-    RAGDocument,
    ToolGroups,
    ToolInvocationResult,
    ToolRuntime,
@ -453,8 +450,16 @@ class ChatAgent(ShieldRunnerMixin):
        stream: bool = False,
        documents: Optional[List[Document]] = None,
    ) -> AsyncGenerator:
+        # if documents are passed in a turn, we parse the raw text of each document
+        # and attach it as context to the last input message
        if documents:
-            await self.handle_documents(session_id, documents, input_messages)
+            contexts = []
+            for document in documents:
+                raw_document_text = await get_raw_document_text(document)
+                contexts.append(raw_document_text)
+
+            attached_context = "\n".join(contexts)
+            input_messages[-1].context = attached_context

        session_info = await self.storage.get_session_info(session_id)
        # if the session has a memory bank id, let the memory tool use it
@ -829,7 +834,10 @@ class ChatAgent(ShieldRunnerMixin):
                    )
                    tool_name_to_args[tool_def.identifier] = toolgroup_to_args.get(toolgroup_name, {})

-        self.tool_defs, self.tool_name_to_args = list(tool_name_to_def.values()), tool_name_to_args
+        self.tool_defs, self.tool_name_to_args = (
+            list(tool_name_to_def.values()),
+            tool_name_to_args,
+        )

    def _parse_toolgroup_name(self, toolgroup_name_with_maybe_tool_name: str) -> tuple[str, Optional[str]]:
        """Parse a toolgroup name into its components.
@ -880,144 +888,27 @@ class ChatAgent(ShieldRunnerMixin):
        logger.debug(f"tool call {tool_name_str} completed with result: {result}")
        return result

-    async def handle_documents(
-        self,
-        session_id: str,
-        documents: List[Document],
-        input_messages: List[Message],
-    ) -> None:
-        memory_tool = any(tool_def.tool_name == MEMORY_QUERY_TOOL for tool_def in self.tool_defs)
-        code_interpreter_tool = any(tool_def.tool_name == BuiltinTool.code_interpreter for tool_def in self.tool_defs)
-        content_items = []
-        url_items = []
-        pattern = re.compile("^(https?://|file://|data:)")
-        for d in documents:
-            if isinstance(d.content, URL):
-                url_items.append(d.content)
-            elif pattern.match(d.content):
-                url_items.append(URL(uri=d.content))
-            else:
-                content_items.append(d)

-        # Save the contents to a tempdir and use its path as a URL if code interpreter is present
-        if code_interpreter_tool:
-            for c in content_items:
-                temp_file_path = os.path.join(self.tempdir, f"{make_random_string()}.txt")
-                with open(temp_file_path, "w") as temp_file:
-                    temp_file.write(c.content)
-                url_items.append(URL(uri=f"file://{temp_file_path}"))
-
-        if memory_tool and code_interpreter_tool:
-            # if both memory and code_interpreter are available, we download the URLs
-            # and attach the data to the last message.
-            await attachment_message(self.tempdir, url_items, input_messages[-1])
-            # Since memory is present, add all the data to the memory bank
-            await self.add_to_session_vector_db(session_id, documents)
-        elif code_interpreter_tool:
-            # if only code_interpreter is available, we download the URLs to a tempdir
-            # and attach the path to them as a message to inference with the
-            # assumption that the model invokes the code_interpreter tool with the path
-            await attachment_message(self.tempdir, url_items, input_messages[-1])
-        elif memory_tool:
-            # if only memory is available, we load the data from the URLs and content items to the memory bank
-            await self.add_to_session_vector_db(session_id, documents)
-        else:
-            # if no memory or code_interpreter tool is available,
-            # we try to load the data from the URLs and content items as a message to inference
-            # and add it to the last message's context
-            input_messages[-1].context = "\n".join(
-                [doc.content for doc in content_items] + await load_data_from_urls(url_items)
-            )
-
-    async def _ensure_vector_db(self, session_id: str) -> str:
-        session_info = await self.storage.get_session_info(session_id)
-        if session_info is None:
-            raise ValueError(f"Session {session_id} not found")
-
-        if session_info.vector_db_id is None:
-            vector_db_id = f"vector_db_{session_id}"
-
-            # TODO: the semantic for registration is definitely not "creation"
-            # so we need to fix it if we expect the agent to create a new vector db
-            # for each session
-            await self.vector_io_api.register_vector_db(
-                vector_db_id=vector_db_id,
-                embedding_model="all-MiniLM-L6-v2",
-            )
-            await self.storage.add_vector_db_to_session(session_id, vector_db_id)
-        else:
-            vector_db_id = session_info.vector_db_id
-
-        return vector_db_id
-
-    async def add_to_session_vector_db(self, session_id: str, data: List[Document]) -> None:
-        vector_db_id = await self._ensure_vector_db(session_id)
-        documents = [
-            RAGDocument(
-                document_id=str(uuid.uuid4()),
-                content=a.content,
-                mime_type=a.mime_type,
-                metadata={},
-            )
-            for a in data
-        ]
-        await self.tool_runtime_api.rag_tool.insert(
-            documents=documents,
-            vector_db_id=vector_db_id,
-            chunk_size_in_tokens=512,
-        )
+async def load_data_from_url(url: str) -> str:
+    if url.startswith("http"):
+        async with httpx.AsyncClient() as client:
+            r = await client.get(url)
+            resp = r.text
+            return resp
+    raise ValueError(f"Unexpected URL: {type(url)}")


-async def load_data_from_urls(urls: List[URL]) -> List[str]:
-    data = []
-    for url in urls:
-        uri = url.uri
-        if uri.startswith("file://"):
-            filepath = uri[len("file://") :]
-            with open(filepath, "r") as f:
-                data.append(f.read())
-        elif uri.startswith("http"):
-            async with httpx.AsyncClient() as client:
-                r = await client.get(uri)
-                resp = r.text
-                data.append(resp)
-    return data
-
-
-async def attachment_message(tempdir: str, urls: List[URL], message: UserMessage) -> None:
-    contents = []
-
-    for url in urls:
-        uri = url.uri
-        if uri.startswith("file://"):
-            filepath = uri[len("file://") :]
-        elif uri.startswith("http"):
-            path = urlparse(uri).path
-            basename = os.path.basename(path)
-            filepath = f"{tempdir}/{make_random_string() + basename}"
-            logger.info(f"Downloading {url} -> {filepath}")
-
-            async with httpx.AsyncClient() as client:
-                r = await client.get(uri)
-                resp = r.text
-                with open(filepath, "w") as fp:
-                    fp.write(resp)
-        else:
-            raise ValueError(f"Unsupported URL {url}")
-
-        contents.append(
-            TextContentItem(
-                text=f'# User provided a file accessible to you at "{filepath}"\nYou can use code_interpreter to load and inspect it.'
-            )
-        )
-
-    if isinstance(message.content, list):
-        message.content.extend(contents)
+async def get_raw_document_text(document: Document) -> str:
+    if not document.mime_type.startswith("text/"):
+        raise ValueError(f"Unexpected document mime type: {document.mime_type}")
+    if isinstance(document.content, URL):
+        return await load_data_from_url(document.content.uri)
+    elif isinstance(document.content, str):
+        return document.content
+    elif isinstance(document.content, TextContentItem):
+        return document.content.text
    else:
-        if isinstance(message.content, str):
-            message.content = [TextContentItem(text=message.content)] + contents
-        else:
-            message.content = [message.content] + contents
+        raise ValueError(f"Unexpected document content type: {type(document.content)}")


 def _interpret_content_as_attachment(
--- a/llama_stack/providers/inline/telemetry/meta_reference/config.py
+++ b/llama_stack/providers/inline/telemetry/meta_reference/config.py
@ -28,6 +28,11 @@ class TelemetryConfig(BaseModel):
        default="http://localhost:4318/v1/metrics",
        description="The OpenTelemetry collector endpoint URL for metrics",
    )
+    service_name: str = Field(
+        # service name is always the same, use zero-width space to avoid clutter
+        default="",
+        description="The service name to use for telemetry",
+    )
    sinks: List[TelemetrySink] = Field(
        default=[TelemetrySink.CONSOLE, TelemetrySink.SQLITE],
        description="List of telemetry sinks to enable (possible values: otel, sqlite, console)",
@ -47,6 +52,7 @@ class TelemetryConfig(BaseModel):
    @classmethod
    def sample_run_config(cls, __distro_dir__: str, db_name: str = "trace_store.db") -> Dict[str, Any]:
        return {
+            "service_name": "${env.OTEL_SERVICE_NAME:}",
            "sinks": "${env.TELEMETRY_SINKS:console,sqlite}",
            "sqlite_db_path": "${env.SQLITE_DB_PATH:" + __distro_dir__ + "/" + db_name + "}",
        }
--- a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py
+++ b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py
@ -67,8 +67,7 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry):

        resource = Resource.create(
            {
-                # service name is always the same, use zero-width space to avoid clutter
-                ResourceAttributes.SERVICE_NAME: "",
+                ResourceAttributes.SERVICE_NAME: self.config.service_name,
            }
        )

--- a/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py
+++ b/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py
@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+import asyncio
 import hashlib
 import logging
 import sqlite3
@ -29,6 +30,15 @@ def serialize_vector(vector: List[float]) -> bytes:
    return struct.pack(f"{len(vector)}f", *vector)


+def _create_sqlite_connection(db_path):
+    """Create a SQLite connection with sqlite_vec extension loaded."""
+    connection = sqlite3.connect(db_path)
+    connection.enable_load_extension(True)
+    sqlite_vec.load(connection)
+    connection.enable_load_extension(False)
+    return connection
+
+
 class SQLiteVecIndex(EmbeddingIndex):
    """
    An index implementation that stores embeddings in a SQLite virtual table using sqlite-vec.
@ -37,40 +47,56 @@ class SQLiteVecIndex(EmbeddingIndex):
      - A virtual table (vec_chunks_{bank_id}) that holds the serialized vector.
    """

-    def __init__(self, dimension: int, connection: sqlite3.Connection, bank_id: str):
+    def __init__(self, dimension: int, db_path: str, bank_id: str):
        self.dimension = dimension
-        self.connection = connection
+        self.db_path = db_path
        self.bank_id = bank_id
        self.metadata_table = f"chunks_{bank_id}".replace("-", "_")
        self.vector_table = f"vec_chunks_{bank_id}".replace("-", "_")

    @classmethod
-    async def create(cls, dimension: int, connection: sqlite3.Connection, bank_id: str):
-        instance = cls(dimension, connection, bank_id)
+    async def create(cls, dimension: int, db_path: str, bank_id: str):
+        instance = cls(dimension, db_path, bank_id)
        await instance.initialize()
        return instance

    async def initialize(self) -> None:
-        cur = self.connection.cursor()
-        # Create the table to store chunk metadata.
-        cur.execute(f"""
-            CREATE TABLE IF NOT EXISTS {self.metadata_table} (
-                id TEXT PRIMARY KEY,
-                chunk TEXT
-            );
-        """)
-        # Create the virtual table for embeddings.
-        cur.execute(f"""
-            CREATE VIRTUAL TABLE IF NOT EXISTS {self.vector_table}
-            USING vec0(embedding FLOAT[{self.dimension}], id TEXT);
-        """)
-        self.connection.commit()
+        def _init_tables():
+            connection = _create_sqlite_connection(self.db_path)
+            cur = connection.cursor()
+            try:
+                # Create the table to store chunk metadata.
+                cur.execute(f"""
+                    CREATE TABLE IF NOT EXISTS {self.metadata_table} (
+                        id TEXT PRIMARY KEY,
+                        chunk TEXT
+                    );
+                """)
+                # Create the virtual table for embeddings.
+                cur.execute(f"""
+                    CREATE VIRTUAL TABLE IF NOT EXISTS {self.vector_table}
+                    USING vec0(embedding FLOAT[{self.dimension}], id TEXT);
+                """)
+                connection.commit()
+            finally:
+                cur.close()
+                connection.close()

-    async def delete(self):
-        cur = self.connection.cursor()
-        cur.execute(f"DROP TABLE IF EXISTS {self.metadata_table};")
-        cur.execute(f"DROP TABLE IF EXISTS {self.vector_table};")
-        self.connection.commit()
+        await asyncio.to_thread(_init_tables)
+
+    async def delete(self) -> None:
+        def _drop_tables():
+            connection = _create_sqlite_connection(self.db_path)
+            cur = connection.cursor()
+            try:
+                cur.execute(f"DROP TABLE IF EXISTS {self.metadata_table};")
+                cur.execute(f"DROP TABLE IF EXISTS {self.vector_table};")
+                connection.commit()
+            finally:
+                cur.close()
+                connection.close()
+
+        await asyncio.to_thread(_drop_tables)

    async def add_chunks(self, chunks: List[Chunk], embeddings: NDArray, batch_size: int = 500):
        """
@ -81,44 +107,55 @@ class SQLiteVecIndex(EmbeddingIndex):
        """
        assert all(isinstance(chunk.content, str) for chunk in chunks), "SQLiteVecIndex only supports text chunks"

-        cur = self.connection.cursor()
-        try:
-            # Start transaction
-            cur.execute("BEGIN TRANSACTION")
-            for i in range(0, len(chunks), batch_size):
-                batch_chunks = chunks[i : i + batch_size]
-                batch_embeddings = embeddings[i : i + batch_size]
-                # Prepare metadata inserts
-                metadata_data = [
-                    (generate_chunk_id(chunk.metadata["document_id"], chunk.content), chunk.model_dump_json())
-                    for chunk in batch_chunks
-                    if isinstance(chunk.content, str)
-                ]
-                # Insert metadata (ON CONFLICT to avoid duplicates)
-                cur.executemany(
-                    f"""
-                    INSERT INTO {self.metadata_table} (id, chunk)
-                    VALUES (?, ?)
-                    ON CONFLICT(id) DO UPDATE SET chunk = excluded.chunk;
-                    """,
-                    metadata_data,
-                )
-                # Prepare embeddings inserts
-                embedding_data = [
-                    (generate_chunk_id(chunk.metadata["document_id"], chunk.content), serialize_vector(emb.tolist()))
-                    for chunk, emb in zip(batch_chunks, batch_embeddings, strict=True)
-                    if isinstance(chunk.content, str)
-                ]
-                # Insert embeddings in batch
-                cur.executemany(f"INSERT INTO {self.vector_table} (id, embedding) VALUES (?, ?);", embedding_data)
-            self.connection.commit()
+        def _execute_all_batch_inserts():
+            connection = _create_sqlite_connection(self.db_path)
+            cur = connection.cursor()

-        except sqlite3.Error as e:
-            self.connection.rollback()  # Rollback on failure
-            logger.error(f"Error inserting into {self.vector_table}: {e}")
+            try:
+                # Start a single transaction for all batches
+                cur.execute("BEGIN TRANSACTION")
+                for i in range(0, len(chunks), batch_size):
+                    batch_chunks = chunks[i : i + batch_size]
+                    batch_embeddings = embeddings[i : i + batch_size]
+                    # Prepare metadata inserts
+                    metadata_data = [
+                        (generate_chunk_id(chunk.metadata["document_id"], chunk.content), chunk.model_dump_json())
+                        for chunk in batch_chunks
+                        if isinstance(chunk.content, str)
+                    ]
+                    # Insert metadata (ON CONFLICT to avoid duplicates)
+                    cur.executemany(
+                        f"""
+                        INSERT INTO {self.metadata_table} (id, chunk)
+                        VALUES (?, ?)
+                        ON CONFLICT(id) DO UPDATE SET chunk = excluded.chunk;
+                        """,
+                        metadata_data,
+                    )
+                    # Prepare embeddings inserts
+                    embedding_data = [
+                        (
+                            generate_chunk_id(chunk.metadata["document_id"], chunk.content),
+                            serialize_vector(emb.tolist()),
+                        )
+                        for chunk, emb in zip(batch_chunks, batch_embeddings, strict=True)
+                        if isinstance(chunk.content, str)
+                    ]
+                    # Insert embeddings in batch
+                    cur.executemany(f"INSERT INTO {self.vector_table} (id, embedding) VALUES (?, ?);", embedding_data)
+                connection.commit()

-        finally:
-            cur.close()  # Ensure cursor is closed
+            except sqlite3.Error as e:
+                connection.rollback()  # Rollback on failure
+                logger.error(f"Error inserting into {self.vector_table}: {e}")
+                raise
+
+            finally:
+                cur.close()
+                connection.close()
+
+        # Process all batches in a single thread
+        await asyncio.to_thread(_execute_all_batch_inserts)

    async def query(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
        """
@ -127,18 +164,28 @@ class SQLiteVecIndex(EmbeddingIndex):
        """
        emb_list = embedding.tolist() if isinstance(embedding, np.ndarray) else list(embedding)
        emb_blob = serialize_vector(emb_list)
-        cur = self.connection.cursor()
-        query_sql = f"""
-            SELECT m.id, m.chunk, v.distance
-            FROM {self.vector_table} AS v
-            JOIN {self.metadata_table} AS m ON m.id = v.id
-            WHERE v.embedding MATCH ? AND k = ?
-            ORDER BY v.distance;
-        """
-        cur.execute(query_sql, (emb_blob, k))
-        rows = cur.fetchall()
-        chunks = []
-        scores = []
+
+        def _execute_query():
+            connection = _create_sqlite_connection(self.db_path)
+            cur = connection.cursor()
+
+            try:
+                query_sql = f"""
+                    SELECT m.id, m.chunk, v.distance
+                    FROM {self.vector_table} AS v
+                    JOIN {self.metadata_table} AS m ON m.id = v.id
+                    WHERE v.embedding MATCH ? AND k = ?
+                    ORDER BY v.distance;
+                """
+                cur.execute(query_sql, (emb_blob, k))
+                return cur.fetchall()
+            finally:
+                cur.close()
+                connection.close()
+
+        rows = await asyncio.to_thread(_execute_query)
+
+        chunks, scores = [], []
        for _id, chunk_json, distance in rows:
            try:
                chunk = Chunk.model_validate_json(chunk_json)
@ -163,63 +210,81 @@ class SQLiteVecVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
        self.config = config
        self.inference_api = inference_api
        self.cache: Dict[str, VectorDBWithIndex] = {}
-        self.connection: Optional[sqlite3.Connection] = None

    async def initialize(self) -> None:
-        # Open a connection to the SQLite database (the file is specified in the config).
-        self.connection = sqlite3.connect(self.config.db_path)
-        self.connection.enable_load_extension(True)
-        sqlite_vec.load(self.connection)
-        self.connection.enable_load_extension(False)
-        cur = self.connection.cursor()
-        # Create a table to persist vector DB registrations.
-        cur.execute("""
-            CREATE TABLE IF NOT EXISTS vector_dbs (
-                id TEXT PRIMARY KEY,
-                metadata TEXT
-            );
-        """)
-        self.connection.commit()
-        # Load any existing vector DB registrations.
-        cur.execute("SELECT metadata FROM vector_dbs")
-        rows = cur.fetchall()
+        def _setup_connection():
+            # Open a connection to the SQLite database (the file is specified in the config).
+            connection = _create_sqlite_connection(self.config.db_path)
+            cur = connection.cursor()
+            try:
+                # Create a table to persist vector DB registrations.
+                cur.execute("""
+                    CREATE TABLE IF NOT EXISTS vector_dbs (
+                        id TEXT PRIMARY KEY,
+                        metadata TEXT
+                    );
+                """)
+                connection.commit()
+                # Load any existing vector DB registrations.
+                cur.execute("SELECT metadata FROM vector_dbs")
+                rows = cur.fetchall()
+                return rows
+            finally:
+                cur.close()
+                connection.close()
+
+        rows = await asyncio.to_thread(_setup_connection)
        for row in rows:
            vector_db_data = row[0]
            vector_db = VectorDB.model_validate_json(vector_db_data)
-            index = await SQLiteVecIndex.create(vector_db.embedding_dimension, self.connection, vector_db.identifier)
+            index = await SQLiteVecIndex.create(
+                vector_db.embedding_dimension, self.config.db_path, vector_db.identifier
+            )
            self.cache[vector_db.identifier] = VectorDBWithIndex(vector_db, index, self.inference_api)

    async def shutdown(self) -> None:
-        if self.connection:
-            self.connection.close()
-            self.connection = None
+        # nothing to do since we don't maintain a persistent connection
+        pass

    async def register_vector_db(self, vector_db: VectorDB) -> None:
-        if self.connection is None:
-            raise RuntimeError("SQLite connection not initialized")
-        cur = self.connection.cursor()
-        cur.execute(
-            "INSERT OR REPLACE INTO vector_dbs (id, metadata) VALUES (?, ?)",
-            (vector_db.identifier, vector_db.model_dump_json()),
-        )
-        self.connection.commit()
-        index = await SQLiteVecIndex.create(vector_db.embedding_dimension, self.connection, vector_db.identifier)
+        def _register_db():
+            connection = _create_sqlite_connection(self.config.db_path)
+            cur = connection.cursor()
+            try:
+                cur.execute(
+                    "INSERT OR REPLACE INTO vector_dbs (id, metadata) VALUES (?, ?)",
+                    (vector_db.identifier, vector_db.model_dump_json()),
+                )
+                connection.commit()
+            finally:
+                cur.close()
+                connection.close()
+
+        await asyncio.to_thread(_register_db)
+        index = await SQLiteVecIndex.create(vector_db.embedding_dimension, self.config.db_path, vector_db.identifier)
        self.cache[vector_db.identifier] = VectorDBWithIndex(vector_db, index, self.inference_api)

    async def list_vector_dbs(self) -> List[VectorDB]:
        return [v.vector_db for v in self.cache.values()]

    async def unregister_vector_db(self, vector_db_id: str) -> None:
-        if self.connection is None:
-            raise RuntimeError("SQLite connection not initialized")
        if vector_db_id not in self.cache:
            logger.warning(f"Vector DB {vector_db_id} not found")
            return
        await self.cache[vector_db_id].index.delete()
        del self.cache[vector_db_id]
-        cur = self.connection.cursor()
-        cur.execute("DELETE FROM vector_dbs WHERE id = ?", (vector_db_id,))
-        self.connection.commit()
+
+        def _delete_vector_db_from_registry():
+            connection = _create_sqlite_connection(self.config.db_path)
+            cur = connection.cursor()
+            try:
+                cur.execute("DELETE FROM vector_dbs WHERE id = ?", (vector_db_id,))
+                connection.commit()
+            finally:
+                cur.close()
+                connection.close()
+
+        await asyncio.to_thread(_delete_vector_db_from_registry)

    async def insert_chunks(self, vector_db_id: str, chunks: List[Chunk], ttl_seconds: Optional[int] = None) -> None:
        if vector_db_id not in self.cache:
--- a/llama_stack/providers/registry/files.py
+++ b/llama_stack/providers/registry/files.py
@ -0,0 +1,11 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.providers.datatypes import ProviderSpec
+
+
+def available_providers() -> list[ProviderSpec]:
+    return []
--- a/llama_stack/providers/registry/post_training.py
+++ b/llama_stack/providers/registry/post_training.py
@ -6,7 +6,7 @@

 from typing import List

-from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec
+from llama_stack.providers.datatypes import AdapterSpec, Api, InlineProviderSpec, ProviderSpec, remote_provider_spec


 def available_providers() -> List[ProviderSpec]:
@ -22,4 +22,13 @@ def available_providers() -> List[ProviderSpec]:
                Api.datasets,
            ],
        ),
+        remote_provider_spec(
+            api=Api.post_training,
+            adapter=AdapterSpec(
+                adapter_type="nvidia",
+                pip_packages=["requests", "aiohttp"],
+                module="llama_stack.providers.remote.post_training.nvidia",
+                config_class="llama_stack.providers.remote.post_training.nvidia.NvidiaPostTrainingConfig",
+            ),
+        ),
    ]
--- a/llama_stack/providers/remote/inference/nvidia/nvidia.py
+++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py
@ -55,7 +55,7 @@ from .openai_utils import (
    convert_openai_completion_choice,
    convert_openai_completion_stream,
 )
-from .utils import _is_nvidia_hosted, check_health
+from .utils import _is_nvidia_hosted

 logger = logging.getLogger(__name__)

@ -134,7 +134,9 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
        if content_has_media(content):
            raise NotImplementedError("Media is not supported")

-        await check_health(self._config)  # this raises errors
+        # TODO: check the health of NeMo endpoints and re-enable this
+        # health check removed because the NeMo Customizer endpoint health check returns 404
+        # await check_health(self._config)  # this raises errors

        provider_model_id = self.get_provider_model_id(model_id)
        request = convert_completion_request(
@ -236,7 +238,7 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
        if tool_prompt_format:
            warnings.warn("tool_prompt_format is not supported by NVIDIA NIM, ignoring", stacklevel=2)

-        await check_health(self._config)  # this raises errors
+        # await check_health(self._config)  # this raises errors

        provider_model_id = self.get_provider_model_id(model_id)
        request = await convert_chat_completion_request(
--- a/llama_stack/providers/remote/post_training/init.py
+++ b/llama_stack/providers/remote/post_training/init.py
@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
--- a/llama_stack/providers/remote/post_training/nvidia/README.md
+++ b/llama_stack/providers/remote/post_training/nvidia/README.md
@ -0,0 +1,138 @@
+# NVIDIA Post-Training Provider for LlamaStack
+
+This provider enables fine-tuning of LLMs using NVIDIA's NeMo Customizer service.
+
+## Features
+
+- Supervised fine-tuning of Llama models
+- LoRA fine-tuning support
+- Job management and status tracking
+
+## Getting Started
+
+### Prerequisites
+
+- LlamaStack with NVIDIA configuration
+- Access to Hosted NVIDIA NeMo Customizer service
+- Dataset registered in the Hosted NVIDIA NeMo Customizer service
+- Base model downloaded and available in the Hosted NVIDIA NeMo Customizer service
+
+### Setup
+
+Build the NVIDIA environment:
+
+```bash
+llama stack build --template nvidia --image-type conda
+```
+
+### Basic Usage using the LlamaStack Python Client
+
+### Create Customization Job
+
+#### Initialize the client
+
+```python
+import os
+
+os.environ["NVIDIA_API_KEY"] = "your-api-key"
+os.environ["NVIDIA_CUSTOMIZER_URL"] = "http://nemo.test"
+os.environ["NVIDIA_USER_ID"] = "llama-stack-user"
+os.environ["NVIDIA_DATASET_NAMESPACE"] = "default"
+os.environ["NVIDIA_PROJECT_ID"] = "test-project"
+os.environ["NVIDIA_OUTPUT_MODEL_DIR"] = "test-example-model@v1"
+
+from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
+
+client = LlamaStackAsLibraryClient("nvidia")
+client.initialize()
+```
+
+#### Configure fine-tuning parameters
+
+```python
+from llama_stack_client.types.post_training_supervised_fine_tune_params import (
+    TrainingConfig,
+    TrainingConfigDataConfig,
+    TrainingConfigOptimizerConfig,
+)
+from llama_stack_client.types.algorithm_config_param import LoraFinetuningConfig
+```
+
+#### Set up LoRA configuration
+
+```python
+algorithm_config = LoraFinetuningConfig(type="LoRA", adapter_dim=16)
+```
+
+#### Configure training data
+
+```python
+data_config = TrainingConfigDataConfig(
+    dataset_id="your-dataset-id",  # Use client.datasets.list() to see available datasets
+    batch_size=16,
+)
+```
+
+#### Configure optimizer
+
+```python
+optimizer_config = TrainingConfigOptimizerConfig(
+    lr=0.0001,
+)
+```
+
+#### Set up training configuration
+
+```python
+training_config = TrainingConfig(
+    n_epochs=2,
+    data_config=data_config,
+    optimizer_config=optimizer_config,
+)
+```
+
+#### Start fine-tuning job
+
+```python
+training_job = client.post_training.supervised_fine_tune(
+    job_uuid="unique-job-id",
+    model="meta-llama/Llama-3.1-8B-Instruct",
+    checkpoint_dir="",
+    algorithm_config=algorithm_config,
+    training_config=training_config,
+    logger_config={},
+    hyperparam_search_config={},
+)
+```
+
+### List all jobs
+
+```python
+jobs = client.post_training.job.list()
+```
+
+### Check job status
+
+```python
+job_status = client.post_training.job.status(job_uuid="your-job-id")
+```
+
+### Cancel a job
+
+```python
+client.post_training.job.cancel(job_uuid="your-job-id")
+```
+
+### Inference with the fine-tuned model
+
+```python
+response = client.inference.completion(
+    content="Complete the sentence using one word: Roses are red, violets are ",
+    stream=False,
+    model_id="test-example-model@v1",
+    sampling_params={
+        "max_tokens": 50,
+    },
+)
+print(response.content)
+```
--- a/llama_stack/providers/remote/post_training/nvidia/init.py
+++ b/llama_stack/providers/remote/post_training/nvidia/init.py
@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .config import NvidiaPostTrainingConfig
+
+
+async def get_adapter_impl(config: NvidiaPostTrainingConfig, _deps):
+    """Build the NVIDIA post-training adapter for ``config``.
+
+    ``_deps`` is accepted but not used here.
+
+    Raises:
+        RuntimeError: if ``config`` is not a ``NvidiaPostTrainingConfig``.
+    """
+    # Imported inside the function, so the adapter module is only loaded when an adapter is requested.
+    from .post_training import NvidiaPostTrainingAdapter
+
+    if isinstance(config, NvidiaPostTrainingConfig):
+        return NvidiaPostTrainingAdapter(config)
+
+    raise RuntimeError(f"Unexpected config type: {type(config)}")
+
+
+# Only names bound at module scope belong in __all__. NvidiaPostTrainingAdapter is imported
+# inside get_adapter_impl and never bound here, so listing it makes a star-import of this
+# module fail with AttributeError.
+__all__ = ["get_adapter_impl"]
--- a/llama_stack/providers/remote/post_training/nvidia/config.py
+++ b/llama_stack/providers/remote/post_training/nvidia/config.py
@ -0,0 +1,113 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import os
+from typing import Any, Dict, Optional
+
+from pydantic import BaseModel, Field
+
+# TODO: add default values for all fields
+
+
+class NvidiaPostTrainingConfig(BaseModel):
+    """Configuration for NVIDIA Post Training implementation.
+
+    Fields declared with ``default_factory`` fall back to an environment
+    variable (looked up with ``os.getenv``) when no explicit value is given.
+    """
+
+    # Falls back to NVIDIA_API_KEY; stays None when the variable is unset.
+    api_key: Optional[str] = Field(
+        default_factory=lambda: os.getenv("NVIDIA_API_KEY"),
+        description="The NVIDIA API key.",
+    )
+
+    # Falls back to NVIDIA_DATASET_NAMESPACE, then to "default".
+    dataset_namespace: Optional[str] = Field(
+        default_factory=lambda: os.getenv("NVIDIA_DATASET_NAMESPACE", "default"),
+        description="The NVIDIA dataset namespace.",
+    )
+
+    # Falls back to NVIDIA_PROJECT_ID, then to "test-example-model@v1".
+    # NOTE(review): sample_run_config below uses "test-project" as the fallback for the same
+    # variable — confirm which default is intended.
+    project_id: Optional[str] = Field(
+        default_factory=lambda: os.getenv("NVIDIA_PROJECT_ID", "test-example-model@v1"),
+        description="The NVIDIA project ID.",
+    )
+
+    # ToDO: validate this, add default value
+    # Falls back to NVIDIA_CUSTOMIZER_URL; stays None when the variable is unset.
+    customizer_url: Optional[str] = Field(
+        default_factory=lambda: os.getenv("NVIDIA_CUSTOMIZER_URL"),
+        description="Base URL for the NeMo Customizer API",
+    )
+
+    # NOTE(review): unit is not stated here — presumably seconds; confirm against the consumer.
+    timeout: int = Field(
+        default=300,
+        description="Timeout for the NVIDIA Post Training API",
+    )
+
+    max_retries: int = Field(
+        default=3,
+        description="Maximum number of retries for the NVIDIA Post Training API",
+    )
+
+    # ToDo: validate this
+    # Falls back to NVIDIA_OUTPUT_MODEL_DIR, then to "test-example-model@v1".
+    output_model_dir: str = Field(
+        default_factory=lambda: os.getenv("NVIDIA_OUTPUT_MODEL_DIR", "test-example-model@v1"),
+        description="Directory to save the output model",
+    )
+
+    @classmethod
+    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+        """Return a sample run configuration made of ``${env.VAR:default}`` placeholders.
+
+        ``kwargs`` is accepted but not used. Only ``api_key``, ``dataset_namespace``,
+        ``project_id`` and ``customizer_url`` are included; ``timeout``, ``max_retries``
+        and ``output_model_dir`` are left out.
+        """
+        return {
+            "api_key": "${env.NVIDIA_API_KEY:}",
+            "dataset_namespace": "${env.NVIDIA_DATASET_NAMESPACE:default}",
+            "project_id": "${env.NVIDIA_PROJECT_ID:test-project}",
+            "customizer_url": "${env.NVIDIA_CUSTOMIZER_URL:http://nemo.test}",
+        }
+
+
+class SFTLoRADefaultConfig(BaseModel):
+    """NVIDIA-specific training configuration with default values.
+
+    All fields are flat (no nested data/optimizer sub-configs). Fields typed
+    ``Optional`` default to ``None``.
+    """
+
+    # ToDo: split into SFT and LoRA configs??
+
+    # General training parameters
+    n_epochs: int = 50
+
+    # NeMo customizer specific parameters
+    log_every_n_steps: Optional[int] = None
+    val_check_interval: float = 0.25
+    sequence_packing_enabled: bool = False
+    weight_decay: float = 0.01
+    lr: float = 0.0001
+
+    # SFT specific parameters
+    hidden_dropout: Optional[float] = None
+    attention_dropout: Optional[float] = None
+    ffn_dropout: Optional[float] = None
+
+    # LoRA default parameters
+    lora_adapter_dim: int = 8
+    lora_adapter_dropout: Optional[float] = None
+    lora_alpha: int = 16
+
+    # Data config
+    batch_size: int = 8
+
+    @classmethod
+    def sample_config(cls) -> Dict[str, Any]:
+        """Return a sample configuration for NVIDIA training.
+
+        NOTE(review): the returned dict does not mirror the fields above one-to-one:
+        ``batch_size`` and ``lr`` are nested under ``data_config`` / ``optimizer_config``,
+        ``ffn_dropout`` and ``lora_adapter_dropout`` are omitted, and
+        ``log_every_n_steps`` / ``hidden_dropout`` / ``attention_dropout`` carry sample
+        values although their field defaults are ``None``. Confirm this shape is intended.
+        """
+        return {
+            "n_epochs": 50,
+            "log_every_n_steps": 10,
+            "val_check_interval": 0.25,
+            "sequence_packing_enabled": False,
+            "weight_decay": 0.01,
+            "hidden_dropout": 0.1,
+            "attention_dropout": 0.1,
+            "lora_adapter_dim": 8,
+            "lora_alpha": 16,
+            "data_config": {
+                "dataset_id": "default",
+                "batch_size": 8,
+            },
+            "optimizer_config": {
+                "lr": 0.0001,
+            },
+        }
--- a/llama_stack/providers/remote/post_training/nvidia/models.py
+++ b/llama_stack/providers/remote/post_training/nvidia/models.py
@ -0,0 +1,24 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import List
+
+from llama_stack.models.llama.datatypes import CoreModelId
+from llama_stack.providers.utils.inference.model_registry import (
+    ProviderModelEntry,
+    build_hf_repo_model_entry,
+)
+
+# Model entries exposed by this provider. Currently a single entry pairing the NVIDIA name
+# "meta/llama-3.1-8b-instruct" with the Llama 3.1 8B Instruct core model id.
+# NOTE(review): presumably the first argument is the provider-side model id and the second
+# the model descriptor — confirm against build_hf_repo_model_entry.
+_MODEL_ENTRIES = [
+    build_hf_repo_model_entry(
+        "meta/llama-3.1-8b-instruct",
+        CoreModelId.llama3_1_8b_instruct.value,
+    )
+]
+
+
+def get_model_entries() -> List[ProviderModelEntry]:
+    """Return the module-level list of model entries (the same list object, not a copy)."""
+    return _MODEL_ENTRIES
--- a/llama_stack/providers/remote/post_training/nvidia/post_training.py
+++ b/llama_stack/providers/remote/post_training/nvidia/post_training.py
@ -0,0 +1,439 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+import warnings
+from datetime import datetime
+from typing import Any, Dict, List, Literal, Optional
+
+import aiohttp
+from pydantic import BaseModel, ConfigDict
+
+from llama_stack.apis.post_training import (
+    AlgorithmConfig,
+    DPOAlignmentConfig,
+    JobStatus,
+    PostTrainingJob,
+    PostTrainingJobArtifactsResponse,
+    PostTrainingJobStatusResponse,
+    TrainingConfig,
+)
+from llama_stack.providers.remote.post_training.nvidia.config import NvidiaPostTrainingConfig
+from llama_stack.providers.remote.post_training.nvidia.utils import warn_unsupported_params
+from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
+
+from .models import _MODEL_ENTRIES
+
+# Map Customizer API status strings to JobStatus values.
+# Callers in this module lower-case the API status before the lookup and fall back to
+# "unknown" for any status missing from this table.
+STATUS_MAPPING = {
+    "running": "in_progress",
+    "completed": "completed",
+    "failed": "failed",
+    "cancelled": "cancelled",
+    "pending": "scheduled",
+}
+
+
+class NvidiaPostTrainingJob(PostTrainingJob):
+    """Job record parsed from a Customizer API response.
+
+    Inherits job_uuid from PostTrainingJob.
+    Adds the status, created_at and updated_at fields.
+    Passes through all other parameters from the data field in the response.
+    """
+
+    # extra="allow" lets the remaining response keys be kept on the model as-is.
+    model_config = ConfigDict(extra="allow")
+    status: JobStatus
+    created_at: datetime
+    updated_at: datetime
+
+
+class ListNvidiaPostTrainingJobs(BaseModel):
+    """Container holding a list of NvidiaPostTrainingJob records in ``data``."""
+
+    data: List[NvidiaPostTrainingJob]
+
+
+class NvidiaPostTrainingJobStatusResponse(PostTrainingJobStatusResponse):
+    """PostTrainingJobStatusResponse variant configured with ``extra="allow"`` for additional fields."""
+
+    model_config = ConfigDict(extra="allow")
+
+
+class NvidiaPostTrainingAdapter(ModelRegistryHelper):
+    def __init__(self, config: NvidiaPostTrainingConfig):
+        """Set up headers, timeout, model registry and HTTP session from ``config``.
+
+        If ``config.customizer_url`` is empty, a warning is emitted and
+        ``http://nemo.test`` is used instead.
+        """
+        self.config = config
+
+        # Send a bearer token only when an API key is configured.
+        self.headers = {"Authorization": f"Bearer {config.api_key}"} if config.api_key else {}
+        self.timeout = aiohttp.ClientTimeout(total=config.timeout)
+
+        # TODO: filter by available models based on /config endpoint
+        ModelRegistryHelper.__init__(self, model_entries=_MODEL_ENTRIES)
+
+        self.session = aiohttp.ClientSession(headers=self.headers, timeout=self.timeout)
+
+        self.customizer_url = config.customizer_url
+        if self.customizer_url:
+            return
+
+        warnings.warn("Customizer URL is not set, using default value: http://nemo.test", stacklevel=2)
+        self.customizer_url = "http://nemo.test"
+
+    async def _make_request(
+        self,
+        method: str,
+        path: str,
+        headers: Optional[Dict[str, Any]] = None,
+        params: Optional[Dict[str, Any]] = None,
+        json: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        """Helper method to make HTTP requests to the Customizer API.
+
+        Parameters:
+            method: HTTP method name, e.g. "GET" or "POST".
+            path: Path appended verbatim to ``self.customizer_url``.
+            headers: Extra headers merged over the adapter's default headers.
+            params: Query-string parameters.
+            json: JSON-serializable request body.
+            **kwargs: Passed through to ``self.session.request``.
+
+        Returns:
+            The decoded JSON body of the response.
+
+        Raises:
+            Exception: if the API answers with an HTTP status >= 400.
+            aiohttp.ClientConnectorError: if no connection could be established
+                after all attempts.
+        """
+        url = f"{self.customizer_url}{path}"
+        request_headers = self.headers.copy()
+
+        if headers:
+            request_headers.update(headers)
+
+        # Add content-type header for JSON requests
+        if json and "Content-Type" not in request_headers:
+            request_headers["Content-Type"] = "application/json"
+
+        # Always make at least one attempt, even when max_retries is configured as 0 or less.
+        attempts = max(1, self.config.max_retries)
+        for attempt in range(1, attempts + 1):
+            try:
+                # Pass the merged headers explicitly; otherwise per-call headers are never sent.
+                async with self.session.request(
+                    method, url, headers=request_headers, params=params, json=json, **kwargs
+                ) as response:
+                    if response.status >= 400:
+                        try:
+                            error_data = await response.json()
+                        except (aiohttp.ContentTypeError, ValueError):
+                            # Error pages are not always JSON; report the raw text instead of
+                            # masking the HTTP failure with a decoding error.
+                            error_data = await response.text()
+                        raise Exception(f"API request failed: {error_data}")
+                    return await response.json()
+            except aiohttp.ClientConnectorError:
+                # Only retry when the connection could not be established, i.e. the request
+                # was never sent, so repeating a non-idempotent call cannot duplicate it.
+                if attempt == attempts:
+                    raise
+
+    async def get_training_jobs(
+        self,
+        page: Optional[int] = 1,
+        page_size: Optional[int] = 10,
+        sort: Optional[Literal["created_at", "-created_at"]] = "created_at",
+    ) -> ListNvidiaPostTrainingJobs:
+        """Get all customization jobs.
+        Updated the base class return type from ListPostTrainingJobsResponse to ListNvidiaPostTrainingJobs.
+
+        Parameters:
+            page: Page number forwarded to the API as a query parameter.
+            page_size: Page size forwarded to the API as a query parameter.
+            sort: Sort key forwarded to the API ("created_at" or "-created_at").
+
+        Returns a ListNvidiaPostTrainingJobs object with the following fields:
+            - data: List[NvidiaPostTrainingJob] - List of NvidiaPostTrainingJob objects
+
+        Jobs without a "created_at"/"updated_at" value get the current UTC time instead.
+
+        ToDo: Support for schema input for filtering.
+        """
+        # `datetime` is the class in this module (from datetime import datetime), which has no
+        # `timezone` attribute, so import timezone explicitly for the fallback timestamps.
+        from datetime import timezone
+
+        params = {"page": page, "page_size": page_size, "sort": sort}
+
+        response = await self._make_request("GET", "/v1/customization/jobs", params=params)
+
+        jobs = []
+        for job in response.get("data", []):
+            job_id = job.pop("id")
+            job_status = job.pop("status", "unknown").lower()
+            # NOTE(review): unrecognized statuses map to "unknown"; JobStatus(...) below only
+            # accepts that if the enum defines such a member — confirm.
+            mapped_status = STATUS_MAPPING.get(job_status, "unknown")
+
+            # Convert string timestamps to datetime objects
+            created_at = (
+                datetime.fromisoformat(job.pop("created_at")) if "created_at" in job else datetime.now(tz=timezone.utc)
+            )
+            updated_at = (
+                datetime.fromisoformat(job.pop("updated_at")) if "updated_at" in job else datetime.now(tz=timezone.utc)
+            )
+
+            # Create NvidiaPostTrainingJob instance; the remaining keys are passed through as extras
+            jobs.append(
+                NvidiaPostTrainingJob(
+                    job_uuid=job_id,
+                    status=JobStatus(mapped_status),
+                    created_at=created_at,
+                    updated_at=updated_at,
+                    **job,
+                )
+            )
+
+        return ListNvidiaPostTrainingJobs(data=jobs)
+
+    async def get_training_job_status(self, job_uuid: str) -> NvidiaPostTrainingJobStatusResponse:
+        """Get the status of a customization job.
+        Updated the base class return type from PostTrainingJobResponse to
+        NvidiaPostTrainingJobStatusResponse.
+
+        Returns a NvidiaPostTrainingJobStatusResponse object with the following fields:
+            - job_uuid: str - Unique identifier for the job
+            - status: JobStatus - Current status of the job (in_progress, completed, failed, cancelled, scheduled)
+            - started_at: datetime - Parsed from the "created_at" value of the API response
+            - updated_at: datetime - The last time the job status was updated
+
+        Additional fields that may be included:
+            - steps_completed: Optional[int] - Number of training steps completed
+            - epochs_completed: Optional[int] - Number of epochs completed
+            - percentage_done: Optional[float] - Percentage of training completed (0-100)
+            - best_epoch: Optional[int] - The epoch with the best performance
+            - train_loss: Optional[float] - Training loss of the best checkpoint
+            - val_loss: Optional[float] - Validation loss of the best checkpoint
+            - metrics: Optional[Dict] - Additional training metrics
+            - status_logs: Optional[List] - Detailed logs of status changes
+        """
+        payload = await self._make_request(
+            "GET",
+            f"/v1/customization/jobs/{job_uuid}/status",
+            params={"job_id": job_uuid},
+        )
+
+        # Pull the known keys out of the payload; whatever remains is forwarded as extra fields.
+        raw_status = payload.pop("status").lower()
+        job_status = JobStatus(STATUS_MAPPING.get(raw_status, "unknown"))
+        started_at = datetime.fromisoformat(payload.pop("created_at"))
+        updated_at = datetime.fromisoformat(payload.pop("updated_at"))
+
+        return NvidiaPostTrainingJobStatusResponse(
+            job_uuid=job_uuid,
+            status=job_status,
+            started_at=started_at,
+            updated_at=updated_at,
+            **payload,
+        )
+
+    async def cancel_training_job(self, job_uuid: str) -> None:
+        """Send a cancel request for the customization job identified by ``job_uuid``."""
+        cancel_path = f"/v1/customization/jobs/{job_uuid}/cancel"
+        await self._make_request("POST", cancel_path, params={"job_id": job_uuid})
+
+    async def get_training_job_artifacts(self, job_uuid: str) -> PostTrainingJobArtifactsResponse:
+        """Not implemented yet; always raises NotImplementedError."""
+        raise NotImplementedError("Job artifacts are not implemented yet")
+
+    async def get_post_training_artifacts(self, job_uuid: str) -> PostTrainingJobArtifactsResponse:
+        """Not implemented yet; always raises NotImplementedError."""
+        raise NotImplementedError("Job artifacts are not implemented yet")
+
+    async def supervised_fine_tune(
+        self,
+        job_uuid: str,
+        training_config: Dict[str, Any],
+        hyperparam_search_config: Dict[str, Any],
+        logger_config: Dict[str, Any],
+        model: str,
+        checkpoint_dir: Optional[str],
+        algorithm_config: Optional[AlgorithmConfig] = None,
+        extra_json: Optional[Dict[str, Any]] = None,
+        params: Optional[Dict[str, Any]] = None,
+        headers: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ) -> NvidiaPostTrainingJob:
+        """
+        Fine-tunes a model on a dataset.
+        Currently only supports LoRA fine-tuning for a standalone docker container.
+        Assumptions:
+            - nemo microservice is running and endpoint is set in config.customizer_url
+            - dataset is registered separately in nemo datastore
+            - model checkpoint is downloaded as per nemo customizer requirements
+
+        Parameters:
+            training_config: TrainingConfig - Configuration for training (accessed as a dict here)
+            model: str - Model identifier
+            algorithm_config: Optional[AlgorithmConfig] - Algorithm-specific configuration
+            checkpoint_dir: Optional[str] - Directory containing model checkpoints, ignored atm
+            job_uuid: str - Unique identifier for the job, ignored atm
+                (the returned job carries the id from the API response instead)
+            hyperparam_search_config: Dict[str, Any] - Configuration for hyperparameter search, ignored atm
+            logger_config: Dict[str, Any] - Configuration for logging, ignored atm
+            extra_json, params, headers, **kwargs: accepted but not used by this method
+
+        Returns:
+            NvidiaPostTrainingJob built from the API response, with status always set to in_progress.
+
+        Raises:
+            NotImplementedError: if algorithm_config is given and is not a dict with type "LoRA".
+
+        Environment Variables:
+            - NVIDIA_API_KEY: str - API key for the NVIDIA API
+                Default: None
+            - NVIDIA_DATASET_NAMESPACE: str - Namespace of the dataset
+                Default: "default"
+            - NVIDIA_CUSTOMIZER_URL: str - URL of the NeMo Customizer API
+                Default: "http://nemo.test"
+            - NVIDIA_PROJECT_ID: str - ID of the project
+                Default: "test-project"
+                (NOTE(review): NvidiaPostTrainingConfig falls back to "test-example-model@v1" — confirm)
+            - NVIDIA_OUTPUT_MODEL_DIR: str - Directory to save the output model
+                Default: "test-example-model@v1"
+
+        Supported models:
+            - meta/llama-3.1-8b-instruct
+
+        Supported algorithm configs:
+            - LoRA, SFT
+
+        Supported Parameters:
+            NOTE(review): this method only forwards values that are present and not None; it
+            does not fill in the defaults listed below itself — presumably they are applied by
+            the Customizer service; confirm.
+            - TrainingConfig:
+                - n_epochs: int - Number of epochs to train
+                    Default: 50
+                - data_config: DataConfig - Configuration for the dataset
+                - optimizer_config: OptimizerConfig - Configuration for the optimizer
+                - dtype: str - Data type for training
+                    not supported (users are informed via warnings)
+                - efficiency_config: EfficiencyConfig - Configuration for efficiency
+                    not supported
+                - max_steps_per_epoch: int - Maximum number of steps per epoch
+                    Default: 1000
+                    (NOTE(review): not in the supported-parameter set below, so passing it
+                    triggers an unsupported-parameter warning and it is not forwarded)
+                ## NeMo customizer specific parameters
+                - log_every_n_steps: int - Log every n steps
+                    Default: None
+                - val_check_interval: float - Validation check interval
+                    Default: 0.25
+                - sequence_packing_enabled: bool - Sequence packing enabled
+                    Default: False
+                ## NeMo customizer specific SFT parameters
+                - hidden_dropout: float - Hidden dropout
+                    Default: None (0.0-1.0)
+                - attention_dropout: float - Attention dropout
+                    Default: None (0.0-1.0)
+                - ffn_dropout: float - FFN dropout
+                    Default: None (0.0-1.0)
+
+            - DataConfig:
+                - dataset_id: str - Dataset ID
+                - batch_size: int - Batch size
+                    Default: 8
+
+            - OptimizerConfig:
+                - lr: float - Learning rate
+                    Default: 0.0001
+                ## NeMo customizer specific parameter
+                - weight_decay: float - Weight decay
+                    Default: 0.01
+
+            - LoRA config:
+                ## NeMo customizer specific LoRA parameters
+                - adapter_dim: int - Adapter dimension
+                    Default: 8 (supports powers of 2)
+                - adapter_dropout: float - Adapter dropout
+                    Default: None (0.0-1.0)
+                - alpha: int - Scaling factor for the LoRA update
+                    Default: 16
+            Note:
+                - checkpoint_dir, hyperparam_search_config, logger_config are not supported (users are informed via warnings)
+                - Some parameters from TrainingConfig, DataConfig, OptimizerConfig are not supported (users are informed via warnings)
+
+            User is informed about unsupported parameters via warnings.
+        """
+        # Map model to nvidia model name
+        # ToDo: only supports llama-3.1-8b-instruct now, need to update this to support other models
+        nvidia_model = self.get_provider_model_id(model)
+
+        # Check for unsupported method parameters
+        unsupported_method_params = []
+        if checkpoint_dir:
+            unsupported_method_params.append(f"checkpoint_dir={checkpoint_dir}")
+        if hyperparam_search_config:
+            unsupported_method_params.append("hyperparam_search_config")
+        if logger_config:
+            unsupported_method_params.append("logger_config")
+
+        if unsupported_method_params:
+            warnings.warn(
+                f"Parameters: {', '.join(unsupported_method_params)} are not supported and will be ignored",
+                stacklevel=2,
+            )
+
+        # Define all supported parameters
+        supported_params = {
+            "training_config": {
+                "n_epochs",
+                "data_config",
+                "optimizer_config",
+                "log_every_n_steps",
+                "val_check_interval",
+                "sequence_packing_enabled",
+                "hidden_dropout",
+                "attention_dropout",
+                "ffn_dropout",
+            },
+            "data_config": {"dataset_id", "batch_size"},
+            "optimizer_config": {"lr", "weight_decay"},
+            "lora_config": {"type", "adapter_dim", "adapter_dropout", "alpha"},
+        }
+
+        # Validate all parameters at once
+        # (warnings only; "data_config" and "optimizer_config" must be present or a KeyError is raised)
+        warn_unsupported_params(training_config, supported_params["training_config"], "TrainingConfig")
+        warn_unsupported_params(training_config["data_config"], supported_params["data_config"], "DataConfig")
+        warn_unsupported_params(
+            training_config["optimizer_config"], supported_params["optimizer_config"], "OptimizerConfig"
+        )
+
+        output_model = self.config.output_model_dir
+
+        # Prepare base job configuration
+        job_config = {
+            "config": nvidia_model,
+            "dataset": {
+                "name": training_config["data_config"]["dataset_id"],
+                "namespace": self.config.dataset_namespace,
+            },
+            "hyperparameters": {
+                "training_type": "sft",
+                "finetuning_type": "lora",
+                # Only hyperparameters that were actually provided (not None) are sent.
+                **{
+                    k: v
+                    for k, v in {
+                        "epochs": training_config.get("n_epochs"),
+                        "batch_size": training_config["data_config"].get("batch_size"),
+                        "learning_rate": training_config["optimizer_config"].get("lr"),
+                        "weight_decay": training_config["optimizer_config"].get("weight_decay"),
+                        "log_every_n_steps": training_config.get("log_every_n_steps"),
+                        "val_check_interval": training_config.get("val_check_interval"),
+                        "sequence_packing_enabled": training_config.get("sequence_packing_enabled"),
+                    }.items()
+                    if v is not None
+                },
+            },
+            "project": self.config.project_id,
+            # TODO: ignored ownership, add it later
+            # "ownership": {"created_by": self.config.user_id, "access_policies": self.config.access_policies},
+            "output_model": output_model,
+        }
+
+        # Handle SFT-specific optional parameters
+        job_config["hyperparameters"]["sft"] = {
+            k: v
+            for k, v in {
+                "ffn_dropout": training_config.get("ffn_dropout"),
+                "hidden_dropout": training_config.get("hidden_dropout"),
+                "attention_dropout": training_config.get("attention_dropout"),
+            }.items()
+            if v is not None
+        }
+
+        # Remove the sft dictionary if it's empty
+        if not job_config["hyperparameters"]["sft"]:
+            job_config["hyperparameters"].pop("sft")
+
+        # Handle LoRA-specific configuration
+        # Only dict-shaped configs with type "LoRA" are handled; any other truthy value
+        # (including non-dict config objects) raises NotImplementedError.
+        if algorithm_config:
+            if isinstance(algorithm_config, dict) and algorithm_config.get("type") == "LoRA":
+                warn_unsupported_params(algorithm_config, supported_params["lora_config"], "LoRA config")
+                job_config["hyperparameters"]["lora"] = {
+                    k: v
+                    for k, v in {
+                        "adapter_dim": algorithm_config.get("adapter_dim"),
+                        "alpha": algorithm_config.get("alpha"),
+                        "adapter_dropout": algorithm_config.get("adapter_dropout"),
+                    }.items()
+                    if v is not None
+                }
+            else:
+                raise NotImplementedError(f"Unsupported algorithm config: {algorithm_config}")
+
+        # Create the customization job
+        response = await self._make_request(
+            method="POST",
+            path="/v1/customization/jobs",
+            headers={"Accept": "application/json"},
+            json=job_config,
+        )
+
+        # The caller-supplied job_uuid is replaced by the id assigned in the API response.
+        job_uuid = response["id"]
+        # The status reported by the API is dropped; the returned job is always marked in_progress.
+        response.pop("status")
+        created_at = datetime.fromisoformat(response.pop("created_at"))
+        updated_at = datetime.fromisoformat(response.pop("updated_at"))
+
+        # Remaining response keys (including "id") are passed through as extra fields.
+        return NvidiaPostTrainingJob(
+            job_uuid=job_uuid, status=JobStatus.in_progress, created_at=created_at, updated_at=updated_at, **response
+        )
+
+    async def preference_optimize(
+        self,
+        job_uuid: str,
+        finetuned_model: str,
+        algorithm_config: DPOAlignmentConfig,
+        training_config: TrainingConfig,
+        hyperparam_search_config: Dict[str, Any],
+        logger_config: Dict[str, Any],
+    ) -> PostTrainingJob:
+        """Optimize a model based on preference data.
+
+        Not implemented yet; always raises NotImplementedError regardless of the arguments.
+        """
+        raise NotImplementedError("Preference optimization is not implemented yet")
+
+    async def get_training_job_container_logs(self, job_uuid: str) -> PostTrainingJobStatusResponse:
+        """Not implemented yet; always raises NotImplementedError."""
+        raise NotImplementedError("Job logs are not implemented yet")
--- a/llama_stack/providers/remote/post_training/nvidia/utils.py
+++ b/llama_stack/providers/remote/post_training/nvidia/utils.py
@ -0,0 +1,63 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import logging
+import warnings
+from typing import Any, Dict, Set, Tuple
+
+from pydantic import BaseModel
+
+from llama_stack.apis.post_training import TrainingConfig
+from llama_stack.providers.remote.post_training.nvidia.config import SFTLoRADefaultConfig
+
+from .config import NvidiaPostTrainingConfig
+
+logger = logging.getLogger(__name__)
+
+
+def warn_unsupported_params(config_dict: Any, supported_keys: Set[str], config_name: str) -> None:
+    """Warn about keys of ``config_dict`` that are not in ``supported_keys``.
+
+    For a pydantic ``BaseModel`` instance the annotated field names are checked;
+    otherwise ``config_dict.keys()`` is used. Nothing is raised or returned: a
+    single warning naming ``config_name`` is emitted when any key is unsupported.
+    """
+    if isinstance(config_dict, BaseModel):
+        candidate_keys = set(config_dict.__annotations__.keys())
+    else:
+        candidate_keys = config_dict.keys()
+
+    rejected = []
+    for name in candidate_keys:
+        if name not in supported_keys:
+            rejected.append(name)
+
+    if not rejected:
+        return
+
+    warnings.warn(f"Parameters: {rejected} in `{config_name}` not supported and will be ignored.", stacklevel=2)
+
+
+def validate_training_params(
+    training_config: Dict[str, Any], supported_keys: Set[str], config_name: str = "TrainingConfig"
+) -> None:
+    """
+    Validates training parameters against supported keys.
+
+    A key is accepted when it is in ``supported_keys`` or is a field of
+    SFTLoRADefaultConfig. Every other string key is reported in a single
+    warning; nothing is raised and ``training_config`` is not modified.
+
+    Args:
+        training_config: Dictionary containing training configuration parameters
+        supported_keys: Set of supported parameter keys
+        config_name: Name of the configuration for warning messages
+    """
+    sft_lora_fields = set(SFTLoRADefaultConfig.__annotations__.keys())
+    training_config_fields = set(TrainingConfig.__annotations__.keys())
+    accepted_keys = supported_keys.union(sft_lora_fields)
+
+    # Check for not supported parameters:
+    # - not in either of configs
+    # - in TrainingConfig but not in SFTLoRADefaultConfig
+    unsupported_params = []
+    for key in training_config:
+        if not isinstance(key, str) or key in accepted_keys:
+            continue
+        # The previous form, `key in (not sft_lora_fields or training_config_fields)`, collapsed
+        # to a membership test on TrainingConfig fields only, so keys unknown to both configs
+        # were silently skipped. Spell out the two documented categories instead.
+        if key not in sft_lora_fields or key in training_config_fields:
+            unsupported_params.append(key)
+
+    if unsupported_params:
+        warnings.warn(
+            f"Parameters: {unsupported_params} in `{config_name}` are not supported and will be ignored.", stacklevel=2
+        )
+
+
+# ToDo: implement health checks for the customizer service.
+# Stub: the body is `...`, so the coroutine currently does nothing and returns None.
+async def _get_health(url: str) -> Tuple[bool, bool]: ...
+
+
+# Stub: not implemented yet (the body is `...`); awaiting it performs no check.
+async def check_health(config: NvidiaPostTrainingConfig) -> None: ...
--- a/llama_stack/providers/utils/inference/openai_compat.py
+++ b/llama_stack/providers/utils/inference/openai_compat.py
@ -147,6 +147,9 @@ def get_sampling_options(params: SamplingParams) -> dict:
        if params.repetition_penalty is not None and params.repetition_penalty != 1.0:
            options["repeat_penalty"] = params.repetition_penalty

+        if params.stop is not None:
+            options["stop"] = params.stop
+
    return options