From 83b5523e2d09fe4f0d419036e11e2cbf527851fc Mon Sep 17 00:00:00 2001 From: Charlie Doern Date: Tue, 15 Apr 2025 08:17:03 -0400 Subject: [PATCH 1/8] feat: add `--providers` to llama stack build (#1718) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? allow users to specify only the providers they want in the llama stack build command. If a user wants a non-interactive build, but doesn't want to use a template, `--providers` allows someone to specify something like `--providers inference=remote::ollama` for a distro with JUST ollama ## Test Plan `llama stack build --providers inference=remote::ollama --image-type venv` Screenshot 2025-03-20 at 9 34 14 AM `llama stack run --image-type venv /Users/charliedoern/projects/Documents/llama-stack/venv-run.yaml` Screenshot 2025-03-20 at 9 35 19 AM --------- Signed-off-by: Charlie Doern Signed-off-by: Sébastien Han Co-authored-by: Sébastien Han --- .github/workflows/providers-build.yml | 26 +++++++++++++++++++ llama_stack/cli/stack/_build.py | 37 +++++++++++++++++++++++++++ llama_stack/cli/stack/build.py | 6 +++++ 3 files changed, 69 insertions(+) diff --git a/.github/workflows/providers-build.yml b/.github/workflows/providers-build.yml index 010894283..ee532a94a 100644 --- a/.github/workflows/providers-build.yml +++ b/.github/workflows/providers-build.yml @@ -81,3 +81,29 @@ jobs: run: | source test/bin/activate uv pip list + + build-single-provider: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + python-version: "3.10" + + - name: Install LlamaStack + run: | + uv venv + source .venv/bin/activate + uv pip install -e . + + - name: Build a single provider + run: | + USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --image-type venv --image-name test --providers inference=remote::ollama diff --git a/llama_stack/cli/stack/_build.py b/llama_stack/cli/stack/_build.py index ac1933e0e..3251bc632 100644 --- a/llama_stack/cli/stack/_build.py +++ b/llama_stack/cli/stack/_build.py @@ -89,6 +89,43 @@ def run_stack_build_command(args: argparse.Namespace) -> None: color="red", ) sys.exit(1) + elif args.providers: + providers = dict() + for api_provider in args.providers.split(","): + if "=" not in api_provider: + cprint( + "Could not parse `--providers`. Please ensure the list is in the format api1=provider1,api2=provider2", + color="red", + ) + sys.exit(1) + api, provider = api_provider.split("=") + providers_for_api = get_provider_registry().get(Api(api), None) + if providers_for_api is None: + cprint( + f"{api} is not a valid API.", + color="red", + ) + sys.exit(1) + if provider in providers_for_api: + providers.setdefault(api, []).append(provider) + else: + cprint( + f"{provider} is not a valid provider for the {api} API.", + color="red", + ) + sys.exit(1) + distribution_spec = DistributionSpec( + providers=providers, + description=",".join(args.providers), + ) + if not args.image_type: + cprint( + f"Please specify a image-type (container | conda | venv) for {args.template}", + color="red", + ) + sys.exit(1) + + build_config = BuildConfig(image_type=args.image_type, distribution_spec=distribution_spec) elif not args.config and not args.template: name = prompt( "> Enter a name for your Llama Stack (e.g. 
my-local-stack): ", diff --git a/llama_stack/cli/stack/build.py b/llama_stack/cli/stack/build.py index c511a0682..93e7d9b22 100644 --- a/llama_stack/cli/stack/build.py +++ b/llama_stack/cli/stack/build.py @@ -75,6 +75,12 @@ the build. If not specified, currently active environment will be used if found. default=False, help="Run the stack after building using the same image type, name, and other applicable arguments", ) + self.parser.add_argument( + "--providers", + type=str, + default=None, + help="Build a config for a list of providers and only those providers. This list is formatted like: api1=provider1,api2=provider2. Where there can be multiple providers per API.", + ) def _run_stack_build_command(self, args: argparse.Namespace) -> None: # always keep implementation completely silo-ed away from CLI so CLI From 71ed47ea7604afd97b141c49e8a6598375baa246 Mon Sep 17 00:00:00 2001 From: Dmitry Rogozhkin Date: Tue, 15 Apr 2025 07:56:23 -0700 Subject: [PATCH 2/8] docs: add example for intel gpu in vllm remote (#1952) # What does this PR do? PR adds instructions to setup vLLM remote endpoint for vllm-remote llama stack distribution. ## Test Plan * Verified with manual tests of the configured vllm-remote against vllm endpoint running on the system with Intel GPU * Also verified with ci pytests (see cmdline below). Test passes in the same capacity as it does on the A10 Nvidia setup (some tests do fail which seems to be known issues with vllm remote llama stack distribution) ``` pytest -s -v tests/integration/inference/test_text_inference.py \ --stack-config=http://localhost:5001 \ --text-model=meta-llama/Llama-3.2-3B-Instruct ``` CC: @ashwinb Signed-off-by: Dmitry Rogozhkin --- .../self_hosted_distro/remote-vllm.md | 51 ++++++++++++++++++- .../templates/remote-vllm/doc_template.md | 51 ++++++++++++++++++- 2 files changed, 100 insertions(+), 2 deletions(-) diff --git a/docs/source/distributions/self_hosted_distro/remote-vllm.md b/docs/source/distributions/self_hosted_distro/remote-vllm.md index e18b5bf40..efa443778 100644 --- a/docs/source/distributions/self_hosted_distro/remote-vllm.md +++ b/docs/source/distributions/self_hosted_distro/remote-vllm.md @@ -41,7 +41,7 @@ The following environment variables can be configured: ## Setting up vLLM server -In the following sections, we'll use either AMD and NVIDIA GPUs to serve as hardware accelerators for the vLLM +In the following sections, we'll use AMD, NVIDIA or Intel GPUs to serve as hardware accelerators for the vLLM server, which acts as both the LLM inference provider and the safety provider. Note that vLLM also [supports many other hardware accelerators](https://docs.vllm.ai/en/latest/getting_started/installation.html) and that we only use GPUs here for demonstration purposes. @@ -162,6 +162,55 @@ docker run \ --port $SAFETY_PORT ``` +### Setting up vLLM server on Intel GPU + +Refer to [vLLM Documentation for XPU](https://docs.vllm.ai/en/v0.8.2/getting_started/installation/gpu.html?device=xpu) to get a vLLM endpoint. 
In addition to vLLM side setup which guides towards installing vLLM from sources orself-building vLLM Docker container, Intel provides prebuilt vLLM container to use on systems with Intel GPUs supported by PyTorch XPU backend: +- [intel/vllm](https://hub.docker.com/r/intel/vllm) + +Here is a sample script to start a vLLM server locally via Docker using Intel provided container: + +```bash +export INFERENCE_PORT=8000 +export INFERENCE_MODEL=meta-llama/Llama-3.2-1B-Instruct +export ZE_AFFINITY_MASK=0 + +docker run \ + --pull always \ + --device /dev/dri \ + -v /dev/dri/by-path:/dev/dri/by-path \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ + --env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \ + -p $INFERENCE_PORT:$INFERENCE_PORT \ + --ipc=host \ + intel/vllm:xpu \ + --gpu-memory-utilization 0.7 \ + --model $INFERENCE_MODEL \ + --port $INFERENCE_PORT +``` + +If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like: + +```bash +export SAFETY_PORT=8081 +export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B +export ZE_AFFINITY_MASK=1 + +docker run \ + --pull always \ + --device /dev/dri \ + -v /dev/dri/by-path:/dev/dri/by-path \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ + --env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \ + -p $SAFETY_PORT:$SAFETY_PORT \ + --ipc=host \ + intel/vllm:xpu \ + --gpu-memory-utilization 0.7 \ + --model $SAFETY_MODEL \ + --port $SAFETY_PORT +``` + ## Running Llama Stack Now you are ready to run Llama Stack with vLLM as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image. diff --git a/llama_stack/templates/remote-vllm/doc_template.md b/llama_stack/templates/remote-vllm/doc_template.md index efcdb62c6..fe50e9d49 100644 --- a/llama_stack/templates/remote-vllm/doc_template.md +++ b/llama_stack/templates/remote-vllm/doc_template.md @@ -28,7 +28,7 @@ The following environment variables can be configured: ## Setting up vLLM server -In the following sections, we'll use either AMD and NVIDIA GPUs to serve as hardware accelerators for the vLLM +In the following sections, we'll use AMD, NVIDIA or Intel GPUs to serve as hardware accelerators for the vLLM server, which acts as both the LLM inference provider and the safety provider. Note that vLLM also [supports many other hardware accelerators](https://docs.vllm.ai/en/latest/getting_started/installation.html) and that we only use GPUs here for demonstration purposes. @@ -149,6 +149,55 @@ docker run \ --port $SAFETY_PORT ``` +### Setting up vLLM server on Intel GPU + +Refer to [vLLM Documentation for XPU](https://docs.vllm.ai/en/v0.8.2/getting_started/installation/gpu.html?device=xpu) to get a vLLM endpoint. 
In addition to vLLM side setup which guides towards installing vLLM from sources orself-building vLLM Docker container, Intel provides prebuilt vLLM container to use on systems with Intel GPUs supported by PyTorch XPU backend: +- [intel/vllm](https://hub.docker.com/r/intel/vllm) + +Here is a sample script to start a vLLM server locally via Docker using Intel provided container: + +```bash +export INFERENCE_PORT=8000 +export INFERENCE_MODEL=meta-llama/Llama-3.2-1B-Instruct +export ZE_AFFINITY_MASK=0 + +docker run \ + --pull always \ + --device /dev/dri \ + -v /dev/dri/by-path:/dev/dri/by-path \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ + --env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \ + -p $INFERENCE_PORT:$INFERENCE_PORT \ + --ipc=host \ + intel/vllm:xpu \ + --gpu-memory-utilization 0.7 \ + --model $INFERENCE_MODEL \ + --port $INFERENCE_PORT +``` + +If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like: + +```bash +export SAFETY_PORT=8081 +export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B +export ZE_AFFINITY_MASK=1 + +docker run \ + --pull always \ + --device /dev/dri \ + -v /dev/dri/by-path:/dev/dri/by-path \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ + --env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \ + -p $SAFETY_PORT:$SAFETY_PORT \ + --ipc=host \ + intel/vllm:xpu \ + --gpu-memory-utilization 0.7 \ + --model $SAFETY_MODEL \ + --port $SAFETY_PORT +``` + ## Running Llama Stack Now you are ready to run Llama Stack with vLLM as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image. From 093881071a6681a0e3b19eaf8986d5f83a21501d Mon Sep 17 00:00:00 2001 From: Michael Clifford Date: Tue, 15 Apr 2025 12:11:08 -0400 Subject: [PATCH 3/8] fix: add max_tokens slider to playground tools page (#1958) # What does this PR do? This PR adds a `max_tokens` slider to playground tools page. I have found that in some instances the llama stack server throws a 500 error if the max_tokens value is not explicitly set in the agent's `sampling_params`. This PR, uses the same implementation of the `max_tokens` slider from the chat page, and includes it on the tools page. ## Test Plan 1. Attempting to call a tool without these changes results in a `500: Internal server error: An unexpected error occurred`. 2. Attempting to call a tool with these changes results in the expected output. Signed-off-by: Michael Clifford --- .../distribution/ui/page/playground/tools.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/llama_stack/distribution/ui/page/playground/tools.py b/llama_stack/distribution/ui/page/playground/tools.py index e987f617b..bc2e8975f 100644 --- a/llama_stack/distribution/ui/page/playground/tools.py +++ b/llama_stack/distribution/ui/page/playground/tools.py @@ -56,6 +56,17 @@ def tool_chat_page(): st.subheader(f"Active Tools: 🛠 {len(active_tool_list)}") st.json(active_tool_list) + st.subheader("Chat Configurations") + max_tokens = st.slider( + "Max Tokens", + min_value=0, + max_value=4096, + value=512, + step=1, + help="The maximum number of tokens to generate", + on_change=reset_agent, + ) + @st.cache_resource def create_agent(): return Agent( @@ -63,9 +74,7 @@ def tool_chat_page(): model=model, instructions="You are a helpful assistant. 
When you use a tool always respond with a summary of the result.", tools=toolgroup_selection, - sampling_params={ - "strategy": {"type": "greedy"}, - }, + sampling_params={"strategy": {"type": "greedy"}, "max_tokens": max_tokens}, ) agent = create_agent() From fb8ff77ff2db5477ee42649df5f05a172e66a0af Mon Sep 17 00:00:00 2001 From: Chirag Modi <98582575+cmodi-meta@users.noreply.github.com> Date: Tue, 15 Apr 2025 13:26:17 -0700 Subject: [PATCH 4/8] docs: 0.2.2 doc updates (#1961) Add updates to android site readme for 0.2.2 --- .../distributions/ondevice_distro/android_sdk.md | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/docs/source/distributions/ondevice_distro/android_sdk.md b/docs/source/distributions/ondevice_distro/android_sdk.md index 4fa6eaf70..a097a2adf 100644 --- a/docs/source/distributions/ondevice_distro/android_sdk.md +++ b/docs/source/distributions/ondevice_distro/android_sdk.md @@ -24,7 +24,7 @@ The key files in the app are `ExampleLlamaStackLocalInference.kt`, `ExampleLlama Add the following dependency in your `build.gradle.kts` file: ``` dependencies { - implementation("com.llama.llamastack:llama-stack-client-kotlin:0.1.4.2") + implementation("com.llama.llamastack:llama-stack-client-kotlin:0.2.2") } ``` This will download jar files in your gradle cache in a directory like `~/.gradle/caches/modules-2/files-2.1/com.llama.llamastack/` @@ -37,11 +37,7 @@ For local inferencing, it is required to include the ExecuTorch library into you Include the ExecuTorch library by: 1. Download the `download-prebuilt-et-lib.sh` script file from the [llama-stack-client-kotlin-client-local](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release/llama-stack-client-kotlin-client-local/download-prebuilt-et-lib.sh) directory to your local machine. -2. Move the script to the top level of your Android app where the app directory resides: -

- +2. Move the script to the top level of your Android app where the `app` directory resides. 3. Run `sh download-prebuilt-et-lib.sh` to create an `app/libs` directory and download the `executorch.aar` in that path. This generates an ExecuTorch library for the XNNPACK delegate. 4. Add the `executorch.aar` dependency in your `build.gradle.kts` file: ``` @@ -52,6 +48,8 @@ dependencies { } ``` +See other dependencies for the local RAG in Android app [README](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release/examples/android_app#quick-start). + ## Llama Stack APIs in Your Android App Breaking down the demo app, this section will show the core pieces that are used to initialize and run inference with Llama Stack using the Kotlin library. @@ -60,7 +58,7 @@ Start a Llama Stack server on localhost. Here is an example of how you can do th ``` conda create -n stack-fireworks python=3.10 conda activate stack-fireworks -pip install --no-cache llama-stack==0.1.4 +pip install --no-cache llama-stack==0.2.2 llama stack build --template fireworks --image-type conda export FIREWORKS_API_KEY= llama stack run fireworks --port 5050 From b5a9ef4c6d9dd2a6d16383107bb9765da66a3faa Mon Sep 17 00:00:00 2001 From: Daniel Alvarez Sanchez Date: Wed, 16 Apr 2025 02:31:12 +0200 Subject: [PATCH 5/8] fix: Do not send an empty 'tools' list to remote vllm (#1957) Fixes: #1955 Since 0.2.0, the vLLM gets an empty list (vs ``None``in 0.1.9 and before) when there are no tools configured which causes the issue described in #1955 p. This patch avoids sending the 'tools' param to the vLLM altogether instead of an empty list. It also adds a small unit test to avoid regressions. The OpenAI [specification](https://platform.openai.com/docs/api-reference/chat/create) does not explicitly state that the list cannot be empty but I found this out through experimentation and it might depend on the actual remote vllm. In any case, as this parameter is Optional, is best to skip it altogether if there's no tools configured. Signed-off-by: Daniel Alvarez --- .../providers/remote/inference/vllm/vllm.py | 3 ++- .../providers/inference/test_remote_vllm.py | 18 +++++++++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py index 2b9eae1e9..d141afa86 100644 --- a/llama_stack/providers/remote/inference/vllm/vllm.py +++ b/llama_stack/providers/remote/inference/vllm/vllm.py @@ -374,7 +374,8 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): options["max_tokens"] = self.config.max_tokens input_dict: dict[str, Any] = {} - if isinstance(request, ChatCompletionRequest) and request.tools is not None: + # Only include the 'tools' param if there is any. It can break things if an empty list is sent to the vLLM. 
+ if isinstance(request, ChatCompletionRequest) and request.tools: input_dict = {"tools": _convert_to_vllm_tools_in_request(request.tools)} if isinstance(request, ChatCompletionRequest): diff --git a/tests/unit/providers/inference/test_remote_vllm.py b/tests/unit/providers/inference/test_remote_vllm.py index 9c2281d85..88399198d 100644 --- a/tests/unit/providers/inference/test_remote_vllm.py +++ b/tests/unit/providers/inference/test_remote_vllm.py @@ -26,7 +26,12 @@ from openai.types.chat.chat_completion_chunk import ( ) from openai.types.model import Model as OpenAIModel -from llama_stack.apis.inference import ToolChoice, ToolConfig +from llama_stack.apis.inference import ( + ChatCompletionRequest, + ToolChoice, + ToolConfig, + UserMessage, +) from llama_stack.apis.models import Model from llama_stack.models.llama.datatypes import StopReason from llama_stack.providers.remote.inference.vllm.config import VLLMInferenceAdapterConfig @@ -232,3 +237,14 @@ def test_chat_completion_doesnt_block_event_loop(caplog): # above. asyncio_warnings = [record.message for record in caplog.records if record.name == "asyncio"] assert not asyncio_warnings + + +@pytest.mark.asyncio +async def test_get_params_empty_tools(vllm_inference_adapter): + request = ChatCompletionRequest( + tools=[], + model="test_model", + messages=[UserMessage(content="test")], + ) + params = await vllm_inference_adapter._get_params(request) + assert "tools" not in params From 00b232c2826756bbd395c7f0fe0be8e3179f9801 Mon Sep 17 00:00:00 2001 From: Francisco Arceo Date: Wed, 16 Apr 2025 14:58:25 -0600 Subject: [PATCH 6/8] chore: Fix to persist the theme preference across page navigation. (#1974) # What does this PR do? This PR persists the theme preference across page navigation. Currently, if the default theme is detected, it is used. But if a user flips **_the default theme_** and goes to a new page, the theme will switch back to the default. This resolves that issue. ## Test Plan [Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*] [//]: # (## Documentation) Signed-off-by: Francisco Javier Arceo --- docs/_static/js/detect_theme.js | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/docs/_static/js/detect_theme.js b/docs/_static/js/detect_theme.js index 484b2bb8b..712565ef7 100644 --- a/docs/_static/js/detect_theme.js +++ b/docs/_static/js/detect_theme.js @@ -1,9 +1,32 @@ document.addEventListener("DOMContentLoaded", function () { const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches; const htmlElement = document.documentElement; - if (prefersDark) { - htmlElement.setAttribute("data-theme", "dark"); + + // Check if theme is saved in localStorage + const savedTheme = localStorage.getItem("sphinx-rtd-theme"); + + if (savedTheme) { + // Use the saved theme preference + htmlElement.setAttribute("data-theme", savedTheme); + document.body.classList.toggle("dark", savedTheme === "dark"); } else { - htmlElement.setAttribute("data-theme", "light"); + // Fall back to system preference + const theme = prefersDark ? 
"dark" : "light"; + htmlElement.setAttribute("data-theme", theme); + document.body.classList.toggle("dark", theme === "dark"); + // Save initial preference + localStorage.setItem("sphinx-rtd-theme", theme); } + + // Listen for theme changes from the existing toggle + const observer = new MutationObserver(function(mutations) { + mutations.forEach(function(mutation) { + if (mutation.attributeName === "data-theme") { + const currentTheme = htmlElement.getAttribute("data-theme"); + localStorage.setItem("sphinx-rtd-theme", currentTheme); + } + }); + }); + + observer.observe(htmlElement, { attributes: true }); }); From 30fc66923be97a63162d77a6cecfdba3ad2537df Mon Sep 17 00:00:00 2001 From: Jash Gulabrai <37194352+JashG@users.noreply.github.com> Date: Wed, 16 Apr 2025 18:02:08 -0400 Subject: [PATCH 7/8] fix: Add llama-3.2-1b-instruct to NVIDIA fine-tuned model list (#1975) # What does this PR do? Adds `meta/llama-3.2-1b-instruct` to list of models that NeMo Customizer can fine-tune. This is the model our example notebooks typically use for fine-tuning. [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan [Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*] [//]: # (## Documentation) Co-authored-by: Jash Gulabrai --- llama_stack/providers/remote/post_training/nvidia/models.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llama_stack/providers/remote/post_training/nvidia/models.py b/llama_stack/providers/remote/post_training/nvidia/models.py index 7c696ac20..1b31b4dbe 100644 --- a/llama_stack/providers/remote/post_training/nvidia/models.py +++ b/llama_stack/providers/remote/post_training/nvidia/models.py @@ -16,7 +16,11 @@ _MODEL_ENTRIES = [ build_hf_repo_model_entry( "meta/llama-3.1-8b-instruct", CoreModelId.llama3_1_8b_instruct.value, - ) + ), + build_hf_repo_model_entry( + "meta/llama-3.2-1b-instruct", + CoreModelId.llama3_2_1b_instruct.value, + ), ] From b44f84ce186d4c039621e25acd3af78febddaf28 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Wed, 16 Apr 2025 15:33:37 -0700 Subject: [PATCH 8/8] test: disable flaky dataset (#1979) # What does this PR do? ## Test Plan --- tests/integration/datasets/test_datasets.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/datasets/test_datasets.py b/tests/integration/datasets/test_datasets.py index 60db95f30..18b31d39c 100644 --- a/tests/integration/datasets/test_datasets.py +++ b/tests/integration/datasets/test_datasets.py @@ -31,6 +31,7 @@ def data_url_from_file(file_path: str) -> str: return data_url +@pytest.mark.skip(reason="flaky. Couldn't find 'llamastack/simpleqa' on the Hugging Face Hub") @pytest.mark.parametrize( "purpose, source, provider_id, limit", [