diff --git a/.github/workflows/providers-build.yml b/.github/workflows/providers-build.yml
index 010894283..ee532a94a 100644
--- a/.github/workflows/providers-build.yml
+++ b/.github/workflows/providers-build.yml
@@ -81,3 +81,29 @@ jobs:
run: |
source test/bin/activate
uv pip list
+
+ build-single-provider:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.10'
+
+ - name: Install uv
+ uses: astral-sh/setup-uv@v5
+ with:
+ python-version: "3.10"
+
+ - name: Install LlamaStack
+ run: |
+ uv venv
+ source .venv/bin/activate
+ uv pip install -e .
+
+ - name: Build a single provider
+ run: |
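+          # Exercise the new --providers flag: build a venv image from the local checkout (LLAMA_STACK_DIR=.) with a single inference provider.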
+ USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --image-type venv --image-name test --providers inference=remote::ollama
diff --git a/docs/_static/js/detect_theme.js b/docs/_static/js/detect_theme.js
index 484b2bb8b..712565ef7 100644
--- a/docs/_static/js/detect_theme.js
+++ b/docs/_static/js/detect_theme.js
@@ -1,9 +1,32 @@
document.addEventListener("DOMContentLoaded", function () {
const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches;
const htmlElement = document.documentElement;
- if (prefersDark) {
- htmlElement.setAttribute("data-theme", "dark");
+
+ // Check if theme is saved in localStorage
+ const savedTheme = localStorage.getItem("sphinx-rtd-theme");
+
+ if (savedTheme) {
+ // Use the saved theme preference
+ htmlElement.setAttribute("data-theme", savedTheme);
+ document.body.classList.toggle("dark", savedTheme === "dark");
} else {
- htmlElement.setAttribute("data-theme", "light");
+ // Fall back to system preference
+ const theme = prefersDark ? "dark" : "light";
+ htmlElement.setAttribute("data-theme", theme);
+ document.body.classList.toggle("dark", theme === "dark");
+ // Save initial preference
+ localStorage.setItem("sphinx-rtd-theme", theme);
}
+
+ // Listen for theme changes from the existing toggle
+ const observer = new MutationObserver(function(mutations) {
+ mutations.forEach(function(mutation) {
+ if (mutation.attributeName === "data-theme") {
+ const currentTheme = htmlElement.getAttribute("data-theme");
+ localStorage.setItem("sphinx-rtd-theme", currentTheme);
+ }
+ });
+ });
+
+ observer.observe(htmlElement, { attributes: true });
});
diff --git a/docs/source/distributions/ondevice_distro/android_sdk.md b/docs/source/distributions/ondevice_distro/android_sdk.md
index 4fa6eaf70..a097a2adf 100644
--- a/docs/source/distributions/ondevice_distro/android_sdk.md
+++ b/docs/source/distributions/ondevice_distro/android_sdk.md
@@ -24,7 +24,7 @@ The key files in the app are `ExampleLlamaStackLocalInference.kt`, `ExampleLlama
Add the following dependency in your `build.gradle.kts` file:
```
dependencies {
- implementation("com.llama.llamastack:llama-stack-client-kotlin:0.1.4.2")
+ implementation("com.llama.llamastack:llama-stack-client-kotlin:0.2.2")
}
```
This will download jar files in your gradle cache in a directory like `~/.gradle/caches/modules-2/files-2.1/com.llama.llamastack/`
@@ -37,11 +37,7 @@ For local inferencing, it is required to include the ExecuTorch library into you
Include the ExecuTorch library by:
1. Download the `download-prebuilt-et-lib.sh` script file from the [llama-stack-client-kotlin-client-local](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release/llama-stack-client-kotlin-client-local/download-prebuilt-et-lib.sh) directory to your local machine.
-2. Move the script to the top level of your Android app where the app directory resides:
-
-
-
-
+2. Move the script to the top level of your Android app where the `app` directory resides.
3. Run `sh download-prebuilt-et-lib.sh` to create an `app/libs` directory and download the `executorch.aar` in that path. This generates an ExecuTorch library for the XNNPACK delegate.
4. Add the `executorch.aar` dependency in your `build.gradle.kts` file:
```
@@ -52,6 +48,8 @@ dependencies {
}
```
+See the Android app [README](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release/examples/android_app#quick-start) for the other dependencies required for local RAG.
+
## Llama Stack APIs in Your Android App
Breaking down the demo app, this section will show the core pieces that are used to initialize and run inference with Llama Stack using the Kotlin library.
@@ -60,7 +58,7 @@ Start a Llama Stack server on localhost. Here is an example of how you can do th
```
conda create -n stack-fireworks python=3.10
conda activate stack-fireworks
-pip install --no-cache llama-stack==0.1.4
+pip install --no-cache llama-stack==0.2.2
llama stack build --template fireworks --image-type conda
export FIREWORKS_API_KEY=
llama stack run fireworks --port 5050
diff --git a/docs/source/distributions/self_hosted_distro/remote-vllm.md b/docs/source/distributions/self_hosted_distro/remote-vllm.md
index e18b5bf40..efa443778 100644
--- a/docs/source/distributions/self_hosted_distro/remote-vllm.md
+++ b/docs/source/distributions/self_hosted_distro/remote-vllm.md
@@ -41,7 +41,7 @@ The following environment variables can be configured:
## Setting up vLLM server
-In the following sections, we'll use either AMD and NVIDIA GPUs to serve as hardware accelerators for the vLLM
+In the following sections, we'll use AMD, NVIDIA, or Intel GPUs to serve as hardware accelerators for the vLLM
server, which acts as both the LLM inference provider and the safety provider. Note that vLLM also
[supports many other hardware accelerators](https://docs.vllm.ai/en/latest/getting_started/installation.html) and
that we only use GPUs here for demonstration purposes.
@@ -162,6 +162,55 @@ docker run \
--port $SAFETY_PORT
```
+### Setting up vLLM server on Intel GPU
+
+Refer to [vLLM Documentation for XPU](https://docs.vllm.ai/en/v0.8.2/getting_started/installation/gpu.html?device=xpu) to get a vLLM endpoint. In addition to the vLLM-side setup, which walks through installing vLLM from source or building your own vLLM Docker container, Intel provides a prebuilt vLLM container for systems with Intel GPUs supported by the PyTorch XPU backend:
+- [intel/vllm](https://hub.docker.com/r/intel/vllm)
+
+Here is a sample script to start a vLLM server locally via Docker using the Intel-provided container:
+
+```bash
+export INFERENCE_PORT=8000
+export INFERENCE_MODEL=meta-llama/Llama-3.2-1B-Instruct
+export ZE_AFFINITY_MASK=0
+
+docker run \
+ --pull always \
+ --device /dev/dri \
+ -v /dev/dri/by-path:/dev/dri/by-path \
+ -v ~/.cache/huggingface:/root/.cache/huggingface \
+ --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
+ --env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \
+ -p $INFERENCE_PORT:$INFERENCE_PORT \
+ --ipc=host \
+ intel/vllm:xpu \
+ --gpu-memory-utilization 0.7 \
+ --model $INFERENCE_MODEL \
+ --port $INFERENCE_PORT
+```
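+
+As a quick optional sanity check, you can confirm the server is up and serving the model by querying the OpenAI-compatible `/v1/models` route that vLLM exposes (a generic vLLM check, not specific to the Intel container):
+
+```bash
+# List the models served by the vLLM server started above
+curl http://localhost:$INFERENCE_PORT/v1/models
+```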
+
+If you are using Llama Stack Safety / Shield APIs, you will also need to run a second vLLM instance serving a corresponding safety model such as `meta-llama/Llama-Guard-3-1B`, using a script like:
+
+```bash
+export SAFETY_PORT=8081
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+export ZE_AFFINITY_MASK=1
+
+docker run \
+ --pull always \
+ --device /dev/dri \
+ -v /dev/dri/by-path:/dev/dri/by-path \
+ -v ~/.cache/huggingface:/root/.cache/huggingface \
+ --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
+ --env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \
+ -p $SAFETY_PORT:$SAFETY_PORT \
+ --ipc=host \
+ intel/vllm:xpu \
+ --gpu-memory-utilization 0.7 \
+ --model $SAFETY_MODEL \
+ --port $SAFETY_PORT
+```
+
## Running Llama Stack
Now you are ready to run Llama Stack with vLLM as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
diff --git a/llama_stack/cli/stack/_build.py b/llama_stack/cli/stack/_build.py
index ac1933e0e..3251bc632 100644
--- a/llama_stack/cli/stack/_build.py
+++ b/llama_stack/cli/stack/_build.py
@@ -89,6 +89,43 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
color="red",
)
sys.exit(1)
+ elif args.providers:
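+        # Build from an explicitly supplied provider list: parse each api=provider pair and validate it against the provider registry.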
+ providers = dict()
+ for api_provider in args.providers.split(","):
+ if "=" not in api_provider:
+ cprint(
+ "Could not parse `--providers`. Please ensure the list is in the format api1=provider1,api2=provider2",
+ color="red",
+ )
+ sys.exit(1)
+ api, provider = api_provider.split("=")
+            # Api() raises ValueError for unrecognized API names, so treat that case the same as a missing registry entry.
+            try:
+                providers_for_api = get_provider_registry().get(Api(api), None)
+            except ValueError:
+                providers_for_api = None
+            if providers_for_api is None:
+                cprint(
+                    f"{api} is not a valid API.",
+                    color="red",
+                )
+                sys.exit(1)
+ if provider in providers_for_api:
+ providers.setdefault(api, []).append(provider)
+ else:
+ cprint(
+ f"{provider} is not a valid provider for the {api} API.",
+ color="red",
+ )
+ sys.exit(1)
+        distribution_spec = DistributionSpec(
+            providers=providers,
+            description=args.providers,
+        )
+ if not args.image_type:
+ cprint(
+                f"Please specify an image-type (container | conda | venv) for {args.template}",
+ color="red",
+ )
+ sys.exit(1)
+
+ build_config = BuildConfig(image_type=args.image_type, distribution_spec=distribution_spec)
elif not args.config and not args.template:
name = prompt(
"> Enter a name for your Llama Stack (e.g. my-local-stack): ",
diff --git a/llama_stack/cli/stack/build.py b/llama_stack/cli/stack/build.py
index c511a0682..93e7d9b22 100644
--- a/llama_stack/cli/stack/build.py
+++ b/llama_stack/cli/stack/build.py
@@ -75,6 +75,12 @@ the build. If not specified, currently active environment will be used if found.
default=False,
help="Run the stack after building using the same image type, name, and other applicable arguments",
)
+ self.parser.add_argument(
+ "--providers",
+ type=str,
+ default=None,
+            help="Build a config for a list of providers and only those providers. The list is formatted like: api1=provider1,api2=provider2, e.g. inference=remote::ollama. Multiple providers may be specified for the same API.",
+ )
def _run_stack_build_command(self, args: argparse.Namespace) -> None:
# always keep implementation completely silo-ed away from CLI so CLI
diff --git a/llama_stack/distribution/ui/page/playground/tools.py b/llama_stack/distribution/ui/page/playground/tools.py
index e987f617b..bc2e8975f 100644
--- a/llama_stack/distribution/ui/page/playground/tools.py
+++ b/llama_stack/distribution/ui/page/playground/tools.py
@@ -56,6 +56,17 @@ def tool_chat_page():
st.subheader(f"Active Tools: 🛠{len(active_tool_list)}")
st.json(active_tool_list)
+ st.subheader("Chat Configurations")
+ max_tokens = st.slider(
+ "Max Tokens",
+ min_value=0,
+ max_value=4096,
+ value=512,
+ step=1,
+ help="The maximum number of tokens to generate",
+ on_change=reset_agent,
+ )
+
@st.cache_resource
def create_agent():
return Agent(
@@ -63,9 +74,7 @@ def tool_chat_page():
model=model,
instructions="You are a helpful assistant. When you use a tool always respond with a summary of the result.",
tools=toolgroup_selection,
- sampling_params={
- "strategy": {"type": "greedy"},
- },
+ sampling_params={"strategy": {"type": "greedy"}, "max_tokens": max_tokens},
)
agent = create_agent()
diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py
index 2b9eae1e9..d141afa86 100644
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -374,7 +374,8 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
options["max_tokens"] = self.config.max_tokens
input_dict: dict[str, Any] = {}
- if isinstance(request, ChatCompletionRequest) and request.tools is not None:
+        # Only include the 'tools' param if there are any tools; sending an empty list can break vLLM.
+ if isinstance(request, ChatCompletionRequest) and request.tools:
input_dict = {"tools": _convert_to_vllm_tools_in_request(request.tools)}
if isinstance(request, ChatCompletionRequest):
diff --git a/llama_stack/providers/remote/post_training/nvidia/models.py b/llama_stack/providers/remote/post_training/nvidia/models.py
index 7c696ac20..1b31b4dbe 100644
--- a/llama_stack/providers/remote/post_training/nvidia/models.py
+++ b/llama_stack/providers/remote/post_training/nvidia/models.py
@@ -16,7 +16,11 @@ _MODEL_ENTRIES = [
build_hf_repo_model_entry(
"meta/llama-3.1-8b-instruct",
CoreModelId.llama3_1_8b_instruct.value,
- )
+ ),
+ build_hf_repo_model_entry(
+ "meta/llama-3.2-1b-instruct",
+ CoreModelId.llama3_2_1b_instruct.value,
+ ),
]
diff --git a/llama_stack/templates/remote-vllm/doc_template.md b/llama_stack/templates/remote-vllm/doc_template.md
index efcdb62c6..fe50e9d49 100644
--- a/llama_stack/templates/remote-vllm/doc_template.md
+++ b/llama_stack/templates/remote-vllm/doc_template.md
@@ -28,7 +28,7 @@ The following environment variables can be configured:
## Setting up vLLM server
-In the following sections, we'll use either AMD and NVIDIA GPUs to serve as hardware accelerators for the vLLM
+In the following sections, we'll use AMD, NVIDIA, or Intel GPUs to serve as hardware accelerators for the vLLM
server, which acts as both the LLM inference provider and the safety provider. Note that vLLM also
[supports many other hardware accelerators](https://docs.vllm.ai/en/latest/getting_started/installation.html) and
that we only use GPUs here for demonstration purposes.
@@ -149,6 +149,55 @@ docker run \
--port $SAFETY_PORT
```
+### Setting up vLLM server on Intel GPU
+
+Refer to [vLLM Documentation for XPU](https://docs.vllm.ai/en/v0.8.2/getting_started/installation/gpu.html?device=xpu) to get a vLLM endpoint. In addition to the vLLM-side setup, which walks through installing vLLM from source or building your own vLLM Docker container, Intel provides a prebuilt vLLM container for systems with Intel GPUs supported by the PyTorch XPU backend:
+- [intel/vllm](https://hub.docker.com/r/intel/vllm)
+
+Here is a sample script to start a vLLM server locally via Docker using the Intel-provided container:
+
+```bash
+export INFERENCE_PORT=8000
+export INFERENCE_MODEL=meta-llama/Llama-3.2-1B-Instruct
+export ZE_AFFINITY_MASK=0
+
+docker run \
+ --pull always \
+ --device /dev/dri \
+ -v /dev/dri/by-path:/dev/dri/by-path \
+ -v ~/.cache/huggingface:/root/.cache/huggingface \
+ --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
+ --env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \
+ -p $INFERENCE_PORT:$INFERENCE_PORT \
+ --ipc=host \
+ intel/vllm:xpu \
+ --gpu-memory-utilization 0.7 \
+ --model $INFERENCE_MODEL \
+ --port $INFERENCE_PORT
+```
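+
+As a quick optional sanity check, you can confirm the server is up and serving the model by querying the OpenAI-compatible `/v1/models` route that vLLM exposes (a generic vLLM check, not specific to the Intel container):
+
+```bash
+# List the models served by the vLLM server started above
+curl http://localhost:$INFERENCE_PORT/v1/models
+```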
+
+If you are using Llama Stack Safety / Shield APIs, you will also need to run a second vLLM instance serving a corresponding safety model such as `meta-llama/Llama-Guard-3-1B`, using a script like:
+
+```bash
+export SAFETY_PORT=8081
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+export ZE_AFFINITY_MASK=1
+
+docker run \
+ --pull always \
+ --device /dev/dri \
+ -v /dev/dri/by-path:/dev/dri/by-path \
+ -v ~/.cache/huggingface:/root/.cache/huggingface \
+ --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
+ --env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \
+ -p $SAFETY_PORT:$SAFETY_PORT \
+ --ipc=host \
+ intel/vllm:xpu \
+ --gpu-memory-utilization 0.7 \
+ --model $SAFETY_MODEL \
+ --port $SAFETY_PORT
+```
+
## Running Llama Stack
Now you are ready to run Llama Stack with vLLM as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
diff --git a/tests/integration/datasets/test_datasets.py b/tests/integration/datasets/test_datasets.py
index 60db95f30..18b31d39c 100644
--- a/tests/integration/datasets/test_datasets.py
+++ b/tests/integration/datasets/test_datasets.py
@@ -31,6 +31,7 @@ def data_url_from_file(file_path: str) -> str:
return data_url
+@pytest.mark.skip(reason="flaky. Couldn't find 'llamastack/simpleqa' on the Hugging Face Hub")
@pytest.mark.parametrize(
"purpose, source, provider_id, limit",
[
diff --git a/tests/unit/providers/inference/test_remote_vllm.py b/tests/unit/providers/inference/test_remote_vllm.py
index 9c2281d85..88399198d 100644
--- a/tests/unit/providers/inference/test_remote_vllm.py
+++ b/tests/unit/providers/inference/test_remote_vllm.py
@@ -26,7 +26,12 @@ from openai.types.chat.chat_completion_chunk import (
)
from openai.types.model import Model as OpenAIModel
-from llama_stack.apis.inference import ToolChoice, ToolConfig
+from llama_stack.apis.inference import (
+ ChatCompletionRequest,
+ ToolChoice,
+ ToolConfig,
+ UserMessage,
+)
from llama_stack.apis.models import Model
from llama_stack.models.llama.datatypes import StopReason
from llama_stack.providers.remote.inference.vllm.config import VLLMInferenceAdapterConfig
@@ -232,3 +237,14 @@ def test_chat_completion_doesnt_block_event_loop(caplog):
# above.
asyncio_warnings = [record.message for record in caplog.records if record.name == "asyncio"]
assert not asyncio_warnings
+
+
+@pytest.mark.asyncio
+async def test_get_params_empty_tools(vllm_inference_adapter):
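+    """Ensure that a ChatCompletionRequest with an empty tools list does not produce a 'tools' key in the vLLM request params."""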
+ request = ChatCompletionRequest(
+ tools=[],
+ model="test_model",
+ messages=[UserMessage(content="test")],
+ )
+ params = await vllm_inference_adapter._get_params(request)
+ assert "tools" not in params