From af15426ad7e07149f50f7275a495163504055742 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Thu, 6 Feb 2025 17:30:21 -0800 Subject: [PATCH 01/29] doc: getting started notebook (#996) # What does this PR do? Fix link ## Test Plan --- docs/source/building_applications/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/building_applications/index.md b/docs/source/building_applications/index.md index 45dca5a1c..e89a90299 100644 --- a/docs/source/building_applications/index.md +++ b/docs/source/building_applications/index.md @@ -4,7 +4,7 @@ Llama Stack provides all the building blocks needed to create sophisticated AI a The best way to get started is to look at this notebook which walks through the various APIs (from basic inference, to RAG agents) and how to use them. -**Notebook**: [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb) +**Notebook**: [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) Here are some key topics that will help you build effective agents: From d0d568c5ba220a6e34a3a0c48c68bf7ddaf62e43 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Thu, 6 Feb 2025 20:19:38 -0800 Subject: [PATCH 02/29] test: fix flaky agent test (#1002) Summary: Test Plan: LLAMA_STACK_CONFIG=fireworks pytest -s -v tests/client-sdk/ --safety-shield meta-llama/Llama-Guard-3-8 all tests passed --- tests/client-sdk/agents/test_agents.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/client-sdk/agents/test_agents.py b/tests/client-sdk/agents/test_agents.py index 2b1db7df0..302c1c6e7 100644 --- a/tests/client-sdk/agents/test_agents.py +++ b/tests/client-sdk/agents/test_agents.py @@ -269,6 +269,7 @@ def test_override_system_message_behavior(llama_stack_client, agent_config): **agent_config, "instructions": "You are a pirate", "client_tools": [client_tool.get_tool_definition()], + "model": "meta-llama/Llama-3.2-3B-Instruct", } agent = Agent(llama_stack_client, agent_config, client_tools=(client_tool,)) From 840344975db5a3b5e0526bfb0e5f717e452632a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Fri, 7 Feb 2025 17:04:25 +0100 Subject: [PATCH 03/29] test: rm unused exception alias in pytest.raises (#991) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Refactored tests by removing unused exception alias (as exc_info) in pytest.raises, improving code clarity and reducing lint warnings. exc_info was never used. Signed-off-by: Sébastien Han ## Test Plan Please describe: - tests you ran to verify your changes with result summaries. - provide instructions so it can be reproduced. ## Sources Please link relevant resources if necessary. ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. 
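For reference, a minimal sketch of the pattern this PR cleans up (the helper and test below are hypothetical, for illustration only):

```python
import pytest


def parse_port(value: str) -> int:
    # Hypothetical helper, used only to demonstrate the assertion style.
    port = int(value)
    if not (1024 <= port <= 65535):
        raise ValueError(f"port out of range: {port}")
    return port


def test_rejects_privileged_port():
    # Before: `with pytest.raises(Exception) as exc_info:` captured an alias
    # that was never read, which lint flags as an unused variable.
    # After: no alias, and the expected exception is narrowed to ValueError.
    with pytest.raises(ValueError):
        parse_port("80")
```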
Signed-off-by: Sébastien Han --- llama_stack/providers/tests/datasetio/test_datasetio.py | 4 ++-- .../providers/tests/inference/test_model_registration.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llama_stack/providers/tests/datasetio/test_datasetio.py b/llama_stack/providers/tests/datasetio/test_datasetio.py index cf28045a4..fd76bafe0 100644 --- a/llama_stack/providers/tests/datasetio/test_datasetio.py +++ b/llama_stack/providers/tests/datasetio/test_datasetio.py @@ -95,7 +95,7 @@ class TestDatasetIO: assert len(response) == 1 assert response[0].identifier == "test_dataset" - with pytest.raises(Exception) as exc_info: + with pytest.raises(ValueError): # unregister a dataset that does not exist await datasets_impl.unregister_dataset("test_dataset2") @@ -104,7 +104,7 @@ class TestDatasetIO: assert isinstance(response, list) assert len(response) == 0 - with pytest.raises(Exception) as exc_info: + with pytest.raises(ValueError): await datasets_impl.unregister_dataset("test_dataset") @pytest.mark.asyncio diff --git a/llama_stack/providers/tests/inference/test_model_registration.py b/llama_stack/providers/tests/inference/test_model_registration.py index 96a34ec0e..664564d22 100644 --- a/llama_stack/providers/tests/inference/test_model_registration.py +++ b/llama_stack/providers/tests/inference/test_model_registration.py @@ -32,7 +32,7 @@ class TestModelRegistration: ) # Try to register a model that's too large for local inference - with pytest.raises(ValueError) as exc_info: + with pytest.raises(ValueError): await models_impl.register_model( model_id="Llama3.1-70B-Instruct", ) @@ -42,7 +42,7 @@ class TestModelRegistration: _, models_impl = inference_stack # Try to register a non-existent model - with pytest.raises(Exception) as exc_info: + with pytest.raises(ValueError): await models_impl.register_model( model_id="Llama3-NonExistent-Model", ) @@ -59,7 +59,7 @@ class TestModelRegistration: }, ) - with pytest.raises(ValueError) as exc_info: + with pytest.raises(ValueError): await models_impl.register_model( model_id="custom-model-2", metadata={ @@ -88,7 +88,7 @@ class TestModelRegistration: async def test_register_with_invalid_llama_model(self, inference_stack): _, models_impl = inference_stack - with pytest.raises(ValueError) as exc_info: + with pytest.raises(ValueError): await models_impl.register_model( model_id="custom-model-2", metadata={"llama_model": "invalid-llama-model"}, From 3f9764d50ca90789517f16a6f29621f4a893afe9 Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Fri, 7 Feb 2025 12:02:15 -0500 Subject: [PATCH 04/29] fix: List providers command prints out non-existing APIs from registry. Fixes #966 (#969) Fixes #966. Verified that: 1. Correct list of APIs are printed out when running `llama stack list-providers` 2. `llama stack list-providers ` works as expected. 
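For context, a self-contained sketch of why iterating the raw `Api` enum over-reports (the enum members and filter below are illustrative assumptions, not the real registry contents):

```python
from enum import Enum


class Api(Enum):
    inference = "inference"
    safety = "safety"
    providers = "providers"  # internal/routing APIs like this have no providers to list


def providable_apis():
    # Stand-in for llama_stack.distribution.distribution.providable_apis,
    # which returns only the APIs that can actually be backed by a provider.
    return [api for api in Api if api is not Api.providers]


print([a.value for a in Api])                    # old behavior: includes non-providable APIs
print([api.value for api in providable_apis()])  # new behavior: only providable APIs
```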
--------- Signed-off-by: Yuan Tang --- llama_stack/cli/stack/list_providers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_stack/cli/stack/list_providers.py b/llama_stack/cli/stack/list_providers.py index 96e978826..909fea030 100644 --- a/llama_stack/cli/stack/list_providers.py +++ b/llama_stack/cli/stack/list_providers.py @@ -22,9 +22,9 @@ class StackListProviders(Subcommand): self.parser.set_defaults(func=self._run_providers_list_cmd) def _add_arguments(self): - from llama_stack.distribution.datatypes import Api + from llama_stack.distribution.distribution import providable_apis - api_values = [a.value for a in Api] + api_values = [api.value for api in providable_apis()] self.parser.add_argument( "api", type=str, From e6c9f2a4856192d6cb57a038d98d21a253c4319a Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Fri, 7 Feb 2025 09:03:35 -0800 Subject: [PATCH 05/29] Delete CHANGELOG.md We use weekly releases as a way to communicate important improvements. Keeping this information synced across is more overhead than we have bandwidth for right now. We may change this process over time. --- CHANGELOG.md | 44 -------------------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index 04cd09777..000000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,44 +0,0 @@ -# Changelog - -## 0.2.0 - -### Added - -### Changed - -### Removed - - -## 0.0.53 - -### Added -- Resource-oriented design for models, shields, memory banks, datasets and eval tasks -- Persistence for registered objects with distribution -- Ability to persist memory banks created for FAISS -- PostgreSQL KVStore implementation -- Environment variable placeholder support in run.yaml files -- Comprehensive Zero-to-Hero notebooks and quickstart guides -- Support for quantized models in Ollama -- Vision models support for Together, Fireworks, Meta-Reference, and Ollama, and vLLM -- Bedrock distribution with safety shields support -- Evals API with task registration and scoring functions -- MMLU and SimpleQA benchmark scoring functions -- Huggingface dataset provider integration for benchmarks -- Support for custom dataset registration from local paths -- Benchmark evaluation CLI tools with visualization tables -- RAG evaluation scoring functions and metrics -- Local persistence for datasets and eval tasks - -### Changed -- Split safety into distinct providers (llama-guard, prompt-guard, code-scanner) -- Changed provider naming convention (`impls` → `inline`, `adapters` → `remote`) -- Updated API signatures for dataset and eval task registration -- Restructured folder organization for providers -- Enhanced Docker build configuration -- Added version prefixing for REST API routes -- Enhanced evaluation task registration workflow -- Improved benchmark evaluation output formatting -- Restructured evals folder organization for better modularity - -### Removed -- `llama stack configure` command From 657f24b964744a4ebfe7c89db72ef43c3d2a321c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Fri, 7 Feb 2025 18:35:00 +0100 Subject: [PATCH 06/29] chore: add missing ToolConfig import in groq.py (#983) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Imported `ToolConfig` from the `llama_stack.apis.inference` module to resolve missing reference and ensure proper functionality within the `groq.py` file. 
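A minimal repro of the failure mode (the class and method names here are hypothetical):

```python
from typing import Optional

try:
    class Adapter:
        # Signature annotations are evaluated when the class body executes,
        # so a missing import fails at import time -- exactly what pytest hit
        # during collection.
        def configure(self, tool_config: Optional[ToolConfig] = None):  # noqa: F821
            ...
except NameError as e:
    print(e)  # name 'ToolConfig' is not defined
```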
Signed-off-by: Sébastien Han ## Test Plan Without the change, pytest will run with the following error: ``` uv run pytest -v -s -k "ollama" llama_stack/providers/tests/ /Users/leseb/Documents/AI/llama-stack/.venv/lib/python3.13/site-packages/pytest_asyncio/plugin.py:207: PytestDeprecationWarning: The configuration option "asyncio_default_fixture_loop_scope" is unset. The event loop scope for asynchronous fixtures will default to the fixture caching scope. Future versions of pytest-asyncio will default the loop scope for asynchronous fixtures to function scope. Set the default fixture loop scope explicitly in order to avoid unexpected behavior in the future. Valid fixture loop scopes are: "function", "class", "module", "package", "session" warnings.warn(PytestDeprecationWarning(_DEFAULT_FIXTURE_LOOP_SCOPE_UNSET)) ============================================ test session starts ============================================= platform darwin -- Python 3.13.1, pytest-8.3.4, pluggy-1.5.0 -- /Users/leseb/Documents/AI/llama-stack/.venv/bin/python3 cachedir: .pytest_cache metadata: {'Python': '3.13.1', 'Platform': 'macOS-15.3-arm64-arm-64bit-Mach-O', 'Packages': {'pytest': '8.3.4', 'pluggy': '1.5.0'}, 'Plugins': {'html': '4.1.1', 'metadata': '3.1.1', 'asyncio': '0.25.3', 'anyio': '4.8.0', 'nbval': '0.11.0'}} rootdir: /Users/leseb/Documents/AI/llama-stack configfile: pyproject.toml plugins: html-4.1.1, metadata-3.1.1, asyncio-0.25.3, anyio-4.8.0, nbval-0.11.0 asyncio: mode=Mode.STRICT, asyncio_default_fixture_loop_scope=None collected 379 items / 1 error / 349 deselected / 30 selected =================================================== ERRORS =================================================== __________________ ERROR collecting llama_stack/providers/tests/inference/groq/test_init.py __________________ llama_stack/providers/tests/inference/groq/test_init.py:11: in from llama_stack.providers.remote.inference.groq.groq import GroqInferenceAdapter llama_stack/providers/remote/inference/groq/groq.py:72: in class GroqInferenceAdapter(Inference, ModelRegistryHelper, NeedsRequestProviderData): llama_stack/providers/remote/inference/groq/groq.py:102: in GroqInferenceAdapter tool_config: Optional[ToolConfig] = None, E NameError: name 'ToolConfig' is not defined ========================================== short test summary info =========================================== ERROR llama_stack/providers/tests/inference/groq/test_init.py - NameError: name 'ToolConfig' is not defined !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Interrupted: 1 error during collection !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! =============================== 349 deselected, 22 warnings, 1 error in 0.28s ================================ ``` With the change the test continues to run and fails with a different error: ``` uv run pytest -v -s llama_stack/providers/tests/ /Users/leseb/Documents/AI/llama-stack/.venv/lib/python3.13/site-packages/pytest_asyncio/plugin.py:207: PytestDeprecationWarning: The configuration option "asyncio_default_fixture_loop_scope" is unset. The event loop scope for asynchronous fixtures will default to the fixture caching scope. Future versions of pytest-asyncio will default the loop scope for asynchronous fixtures to function scope. Set the default fixture loop scope explicitly in order to avoid unexpected behavior in the future. 
Valid fixture loop scopes are: "function", "class", "module", "package", "session" warnings.warn(PytestDeprecationWarning(_DEFAULT_FIXTURE_LOOP_SCOPE_UNSET)) ============================================ test session starts ============================================= platform darwin -- Python 3.13.1, pytest-8.3.4, pluggy-1.5.0 -- /Users/leseb/Documents/AI/llama-stack/.venv/bin/python3 cachedir: .pytest_cache metadata: {'Python': '3.13.1', 'Platform': 'macOS-15.3-arm64-arm-64bit-Mach-O', 'Packages': {'pytest': '8.3.4', 'pluggy': '1.5.0'}, 'Plugins': {'html': '4.1.1', 'metadata': '3.1.1', 'asyncio': '0.25.3', 'anyio': '4.8.0', 'nbval': '0.11.0'}} rootdir: /Users/leseb/Documents/AI/llama-stack configfile: pyproject.toml plugins: html-4.1.1, metadata-3.1.1, asyncio-0.25.3, anyio-4.8.0, nbval-0.11.0 asyncio: mode=Mode.STRICT, asyncio_default_fixture_loop_scope=None collected 342 items / 1 error =================================================== ERRORS =================================================== ______________ ERROR collecting llama_stack/providers/tests/inference/test_vision_inference.py _______________ llama_stack/providers/tests/inference/test_vision_inference.py:29: in class TestVisionModelInference: llama_stack/providers/tests/inference/test_vision_inference.py:35: in TestVisionModelInference ImageContentItem(image=dict(data=PASTA_IMAGE)), E pydantic_core._pydantic_core.ValidationError: 1 validation error for ImageContentItem E image.data E Input should be a valid string, unable to parse raw data as a unicode string [type=string_unicode, input_value=b'\xff\xd8\xff\xe0\x00\x1...0\xe6\x9f5\xb5?\xff\xd9', input_type=bytes] E For further information visit https://errors.pydantic.dev/2.10/v/string_unicode ========================================== short test summary info =========================================== ERROR llama_stack/providers/tests/inference/test_vision_inference.py - pydantic_core._pydantic_core.ValidationError: 1 validation error for ImageContentItem !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Interrupted: 1 error during collection !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ======================================= 22 warnings, 1 error in 0.25s ======================================== ``` Which is fixed in https://github.com/meta-llama/llama-stack/pull/1003. ## Sources Please link relevant resources if necessary. ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. 
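The gist of that follow-up fix (#1003, shown later in this series) is to base64-encode the raw bytes before constructing the content item; a sketch:

```python
import base64
from pathlib import Path

# Pydantic's ImageContentItem expects a str, not raw bytes, so encode first.
image_bytes = Path("pasta.jpeg").read_bytes()  # assumes the test's sample image is present
PASTA_IMAGE = base64.b64encode(image_bytes).decode("utf-8")  # now a valid unicode string
```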
Signed-off-by: Sébastien Han --- llama_stack/providers/remote/inference/groq/groq.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llama_stack/providers/remote/inference/groq/groq.py b/llama_stack/providers/remote/inference/groq/groq.py index 461b3ee61..4e6cc2d6b 100644 --- a/llama_stack/providers/remote/inference/groq/groq.py +++ b/llama_stack/providers/remote/inference/groq/groq.py @@ -26,6 +26,7 @@ from llama_stack.apis.inference import ( Message, ResponseFormat, ToolChoice, + ToolConfig, ) from llama_stack.distribution.request_headers import NeedsRequestProviderData from llama_stack.providers.remote.inference.groq.config import GroqConfig From a9950ce806e6ffbe902bc3645db644c2dc49c4d8 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Fri, 7 Feb 2025 09:35:38 -0800 Subject: [PATCH 07/29] test: remove flaky agent test (#1006) Summary: Test Plan: --- tests/client-sdk/agents/test_agents.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/client-sdk/agents/test_agents.py b/tests/client-sdk/agents/test_agents.py index 302c1c6e7..85b7af831 100644 --- a/tests/client-sdk/agents/test_agents.py +++ b/tests/client-sdk/agents/test_agents.py @@ -263,7 +263,8 @@ def test_custom_tool(llama_stack_client, agent_config): assert "CustomTool" in logs_str -def test_override_system_message_behavior(llama_stack_client, agent_config): +# TODO: fix this flaky test +def xtest_override_system_message_behavior(llama_stack_client, agent_config): client_tool = TestClientTool() agent_config = { **agent_config, From c97e05f75e295dfa747b22f1010c772c06b8520f Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Fri, 7 Feb 2025 12:35:49 -0500 Subject: [PATCH 08/29] test: Split inference tests to text and vision (#1008) # What does this PR do? This PR splits the inference tests into text and vision to make testing on vLLM provider easier as mentioned in https://github.com/meta-llama/llama-stack/pull/951 since serving multiple models (e.g. Llama-3.2-11B-Vision-Instruct and Llama-3.1-8B-Instruct) on a single port using the OpenAI API is [not supported yet](https://docs.vllm.ai/en/v0.5.5/serving/faq.html) so it's a bit tricky to test both at the same time. ## Test Plan All previously passing tests related to text still pass: `LLAMA_STACK_BASE_URL=http://localhost:5002 pytest -v tests/client-sdk/inference/test_text_inference.py` All vision tests passed via `LLAMA_STACK_BASE_URL=http://localhost:5002 pytest -v tests/client-sdk/inference/test_vision_inference.py`. 
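One note on the split: the new vision file re-declares the session-scoped `inference_provider_type` fixture; if the text suite keeps its own copy, a shared `conftest.py` could host it instead. A sketch of that possible follow-up (not part of this PR):

```python
# tests/client-sdk/inference/conftest.py -- hypothetical follow-up location
import pytest


@pytest.fixture(scope="session")
def inference_provider_type(llama_stack_client):
    providers = llama_stack_client.providers.list()
    inference_providers = [p for p in providers if p.api == "inference"]
    assert len(inference_providers) > 0, "No inference providers found"
    return inference_providers[0].provider_type
```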
Signed-off-by: Yuan Tang --- .github/workflows/tests.yml | 2 +- tests/client-sdk/README.md | 6 +- ...st_inference.py => test_text_inference.py} | 118 ---------------- .../inference/test_vision_inference.py | 133 ++++++++++++++++++ 4 files changed, 137 insertions(+), 122 deletions(-) rename tests/client-sdk/inference/{test_inference.py => test_text_inference.py} (73%) create mode 100644 tests/client-sdk/inference/test_vision_inference.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ff13a4cb0..cfc26000b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -54,7 +54,7 @@ jobs: echo "REPORT_FILE=${REPORT_OUTPUT}" >> "$GITHUB_ENV" export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct - LLAMA_STACK_CONFIG=./llama_stack/templates/${{ matrix.provider }}/run.yaml pytest --md-report --md-report-verbose=1 ./tests/client-sdk/inference/test_inference.py --md-report-output "$REPORT_OUTPUT" + LLAMA_STACK_CONFIG=./llama_stack/templates/${{ matrix.provider }}/run.yaml pytest --md-report --md-report-verbose=1 ./tests/client-sdk/inference/ --md-report-output "$REPORT_OUTPUT" - name: Output reports to the job summary if: always() diff --git a/tests/client-sdk/README.md b/tests/client-sdk/README.md index 13142d46f..d4d439d96 100644 --- a/tests/client-sdk/README.md +++ b/tests/client-sdk/README.md @@ -4,18 +4,18 @@ You can run llama stack integration tests on either a Llama Stack Library or a L To test on a Llama Stack library with certain configuration, run ```bash LLAMA_STACK_CONFIG=./llama_stack/templates/cerebras/run.yaml -pytest -s -v tests/client-sdk/inference/test_inference.py +pytest -s -v tests/client-sdk/inference/ ``` or just the template name ```bash LLAMA_STACK_CONFIG=together -pytest -s -v tests/client-sdk/inference/test_inference.py +pytest -s -v tests/client-sdk/inference/ ``` To test on a Llama Stack endpoint, run ```bash LLAMA_STACK_BASE_URL=http//localhost:8089 -pytest -s -v tests/client-sdk/inference/test_inference.py +pytest -s -v tests/client-sdk/inference ``` ## Report Generation diff --git a/tests/client-sdk/inference/test_inference.py b/tests/client-sdk/inference/test_text_inference.py similarity index 73% rename from tests/client-sdk/inference/test_inference.py rename to tests/client-sdk/inference/test_text_inference.py index 9bbd1061a..4b24f1d38 100644 --- a/tests/client-sdk/inference/test_inference.py +++ b/tests/client-sdk/inference/test_text_inference.py @@ -4,9 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-import base64 -import pathlib - import pytest from pydantic import BaseModel @@ -56,23 +53,6 @@ def get_weather_tool_definition(): } -@pytest.fixture -def image_path(): - return pathlib.Path(__file__).parent / "dog.png" - - -@pytest.fixture -def base64_image_data(image_path): - # Convert the image to base64 - return base64.b64encode(image_path.read_bytes()).decode("utf-8") - - -@pytest.fixture -def base64_image_url(base64_image_data, image_path): - # suffix includes the ., so we remove it - return f"data:image/{image_path.suffix[1:]};base64,{base64_image_data}" - - def test_text_completion_non_streaming(llama_stack_client, text_model_id): response = llama_stack_client.inference.completion( content="Complete the sentence using one word: Roses are red, violets are ", @@ -299,101 +279,3 @@ def test_text_chat_completion_structured_output(llama_stack_client, text_model_i assert answer.last_name == "Jordan" assert answer.year_of_birth == 1963 assert answer.num_seasons_in_nba == 15 - - -def test_image_chat_completion_non_streaming(llama_stack_client, vision_model_id): - message = { - "role": "user", - "content": [ - { - "type": "image", - "image": { - "url": { - # TODO: Replace with Github based URI to resources/sample1.jpg - "uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg" - }, - }, - }, - { - "type": "text", - "text": "Describe what is in this image.", - }, - ], - } - response = llama_stack_client.inference.chat_completion( - model_id=vision_model_id, - messages=[message], - stream=False, - ) - message_content = response.completion_message.content.lower().strip() - assert len(message_content) > 0 - assert any(expected in message_content for expected in {"dog", "puppy", "pup"}) - - -def test_image_chat_completion_streaming(llama_stack_client, vision_model_id): - message = { - "role": "user", - "content": [ - { - "type": "image", - "image": { - "url": { - # TODO: Replace with Github based URI to resources/sample1.jpg - "uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg" - }, - }, - }, - { - "type": "text", - "text": "Describe what is in this image.", - }, - ], - } - response = llama_stack_client.inference.chat_completion( - model_id=vision_model_id, - messages=[message], - stream=True, - ) - streamed_content = "" - for chunk in response: - streamed_content += chunk.event.delta.text.lower() - assert len(streamed_content) > 0 - assert any(expected in streamed_content for expected in {"dog", "puppy", "pup"}) - - -@pytest.mark.parametrize("type_", ["url", "data"]) -def test_image_chat_completion_base64(llama_stack_client, vision_model_id, base64_image_data, base64_image_url, type_): - image_spec = { - "url": { - "type": "image", - "image": { - "url": { - "uri": base64_image_url, - }, - }, - }, - "data": { - "type": "image", - "image": { - "data": base64_image_data, - }, - }, - }[type_] - - message = { - "role": "user", - "content": [ - image_spec, - { - "type": "text", - "text": "Describe what is in this image.", - }, - ], - } - response = llama_stack_client.inference.chat_completion( - model_id=vision_model_id, - messages=[message], - stream=False, - ) - message_content = response.completion_message.content.lower().strip() - assert len(message_content) > 0 diff --git a/tests/client-sdk/inference/test_vision_inference.py b/tests/client-sdk/inference/test_vision_inference.py new file mode 100644 index 000000000..df4b9d933 --- /dev/null +++ 
b/tests/client-sdk/inference/test_vision_inference.py @@ -0,0 +1,133 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import base64 +import pathlib + +import pytest + + +@pytest.fixture(scope="session") +def inference_provider_type(llama_stack_client): + providers = llama_stack_client.providers.list() + inference_providers = [p for p in providers if p.api == "inference"] + assert len(inference_providers) > 0, "No inference providers found" + return inference_providers[0].provider_type + + +@pytest.fixture +def image_path(): + return pathlib.Path(__file__).parent / "dog.png" + + +@pytest.fixture +def base64_image_data(image_path): + # Convert the image to base64 + return base64.b64encode(image_path.read_bytes()).decode("utf-8") + + +@pytest.fixture +def base64_image_url(base64_image_data, image_path): + # suffix includes the ., so we remove it + return f"data:image/{image_path.suffix[1:]};base64,{base64_image_data}" + + +def test_image_chat_completion_non_streaming(llama_stack_client, vision_model_id): + message = { + "role": "user", + "content": [ + { + "type": "image", + "image": { + "url": { + # TODO: Replace with Github based URI to resources/sample1.jpg + "uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg" + }, + }, + }, + { + "type": "text", + "text": "Describe what is in this image.", + }, + ], + } + response = llama_stack_client.inference.chat_completion( + model_id=vision_model_id, + messages=[message], + stream=False, + ) + message_content = response.completion_message.content.lower().strip() + assert len(message_content) > 0 + assert any(expected in message_content for expected in {"dog", "puppy", "pup"}) + + +def test_image_chat_completion_streaming(llama_stack_client, vision_model_id): + message = { + "role": "user", + "content": [ + { + "type": "image", + "image": { + "url": { + # TODO: Replace with Github based URI to resources/sample1.jpg + "uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg" + }, + }, + }, + { + "type": "text", + "text": "Describe what is in this image.", + }, + ], + } + response = llama_stack_client.inference.chat_completion( + model_id=vision_model_id, + messages=[message], + stream=True, + ) + streamed_content = "" + for chunk in response: + streamed_content += chunk.event.delta.text.lower() + assert len(streamed_content) > 0 + assert any(expected in streamed_content for expected in {"dog", "puppy", "pup"}) + + +@pytest.mark.parametrize("type_", ["url", "data"]) +def test_image_chat_completion_base64(llama_stack_client, vision_model_id, base64_image_data, base64_image_url, type_): + image_spec = { + "url": { + "type": "image", + "image": { + "url": { + "uri": base64_image_url, + }, + }, + }, + "data": { + "type": "image", + "image": { + "data": base64_image_data, + }, + }, + }[type_] + + message = { + "role": "user", + "content": [ + image_spec, + { + "type": "text", + "text": "Describe what is in this image.", + }, + ], + } + response = llama_stack_client.inference.chat_completion( + model_id=vision_model_id, + messages=[message], + stream=False, + ) + message_content = response.completion_message.content.lower().strip() + assert len(message_content) > 0 From f8f2f7f9bb439b765539e9c566aef58ab6dc420b Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Fri, 7 Feb 2025 09:39:08 -0800 
Subject: [PATCH 09/29] feat: Add HTTPS serving option (#1000) # What does this PR do? Enables HTTPS option for Llama Stack. While doing so, introduces a `ServerConfig` sub-structure to house all server related configuration (port, ssl, etc.) Also simplified the `start_container.sh` entrypoint to simply be `python` instead of a complex bash command line. ## Test Plan Conda: Run: ```bash $ llama stack build --template together $ llama stack run --port 8322 # ensure server starts $ llama-stack-client configure --endpoint http://localhost:8322 $ llama-stack-client models list ``` Create a self-signed SSL key / cert pair. Then, using a local checkout of `llama-stack-client-python`, change https://github.com/meta-llama/llama-stack-client-python/blob/main/src/llama_stack_client/_base_client.py#L759 to add `kwargs.setdefault("verify", False)` so SSL verification is disabled. Then: ```bash $ llama stack run --port 8322 --tls-keyfile --tls-certfile $ llama-stack-client configure --endpoint https://localhost:8322 # notice the `https` $ llama-stack-client models list ``` Also tested with containers (but of course one needs to make sure the cert and key files are appropriately provided to the container.) --- llama_stack/cli/stack/run.py | 13 +++++++ llama_stack/distribution/datatypes.py | 22 +++++++++++ llama_stack/distribution/server/server.py | 42 +++++++++++++++++++-- llama_stack/distribution/start_conda_env.sh | 5 ++- llama_stack/distribution/start_container.sh | 12 +++++- 5 files changed, 88 insertions(+), 6 deletions(-) diff --git a/llama_stack/cli/stack/run.py b/llama_stack/cli/stack/run.py index f84def184..e7d6df292 100644 --- a/llama_stack/cli/stack/run.py +++ b/llama_stack/cli/stack/run.py @@ -55,6 +55,16 @@ class StackRun(Subcommand): default=[], metavar="KEY=VALUE", ) + self.parser.add_argument( + "--tls-keyfile", + type=str, + help="Path to TLS key file for HTTPS", + ) + self.parser.add_argument( + "--tls-certfile", + type=str, + help="Path to TLS certificate file for HTTPS", + ) def _run_stack_run_cmd(self, args: argparse.Namespace) -> None: import importlib.resources @@ -178,4 +188,7 @@ class StackRun(Subcommand): return run_args.extend(["--env", f"{key}={value}"]) + if args.tls_keyfile and args.tls_certfile: + run_args.extend(["--tls-keyfile", args.tls_keyfile, "--tls-certfile", args.tls_certfile]) + run_with_pty(run_args) diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index 8b579b636..97706f22a 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -117,6 +117,23 @@ class Provider(BaseModel): config: Dict[str, Any] +class ServerConfig(BaseModel): + port: int = Field( + default=8321, + description="Port to listen on", + ge=1024, + le=65535, + ) + tls_certfile: Optional[str] = Field( + default=None, + description="Path to TLS certificate file for HTTPS", + ) + tls_keyfile: Optional[str] = Field( + default=None, + description="Path to TLS key file for HTTPS", + ) + + class StackRunConfig(BaseModel): version: str = LLAMA_STACK_RUN_CONFIG_VERSION @@ -159,6 +176,11 @@ a default SQLite store will be used.""", eval_tasks: List[EvalTaskInput] = Field(default_factory=list) tool_groups: List[ToolGroupInput] = Field(default_factory=list) + server: ServerConfig = Field( + default_factory=ServerConfig, + description="Configuration for the HTTP(S) server", + ) + class BuildConfig(BaseModel): version: str = LLAMA_STACK_BUILD_CONFIG_VERSION diff --git a/llama_stack/distribution/server/server.py 
b/llama_stack/distribution/server/server.py index fcd0e3cad..d2c32de11 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -282,8 +282,19 @@ def main(): action="append", help="Environment variables in KEY=value format. Can be specified multiple times.", ) + parser.add_argument( + "--tls-keyfile", + help="Path to TLS key file for HTTPS", + required="--tls-certfile" in sys.argv, + ) + parser.add_argument( + "--tls-certfile", + help="Path to TLS certificate file for HTTPS", + required="--tls-keyfile" in sys.argv, + ) args = parser.parse_args() + if args.env: for env_pair in args.env: try: @@ -381,11 +392,36 @@ def main(): import uvicorn - # FYI this does not do hot-reloads + # Configure SSL if certificates are provided + port = args.port or config.server.port + + ssl_config = None + if args.tls_keyfile: + keyfile = args.tls_keyfile + certfile = args.tls_certfile + else: + keyfile = config.server.tls_keyfile + certfile = config.server.tls_certfile + + if keyfile and certfile: + ssl_config = { + "ssl_keyfile": keyfile, + "ssl_certfile": certfile, + } + print(f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}") listen_host = ["::", "0.0.0.0"] if not args.disable_ipv6 else "0.0.0.0" - print(f"Listening on {listen_host}:{args.port}") - uvicorn.run(app, host=listen_host, port=args.port) + print(f"Listening on {listen_host}:{port}") + + uvicorn_config = { + "app": app, + "host": listen_host, + "port": port, + } + if ssl_config: + uvicorn_config.update(ssl_config) + + uvicorn.run(**uvicorn_config) def extract_path_params(route: str) -> List[str]: diff --git a/llama_stack/distribution/start_conda_env.sh b/llama_stack/distribution/start_conda_env.sh index c37f30ef0..fe830059f 100755 --- a/llama_stack/distribution/start_conda_env.sh +++ b/llama_stack/distribution/start_conda_env.sh @@ -34,6 +34,7 @@ shift # Process environment variables from --env arguments env_vars="" +other_args="" while [[ $# -gt 0 ]]; do case "$1" in --env) @@ -48,6 +49,7 @@ while [[ $# -gt 0 ]]; do fi ;; *) + other_args="$other_args $1" shift ;; esac @@ -61,4 +63,5 @@ $CONDA_PREFIX/bin/python \ -m llama_stack.distribution.server.server \ --yaml-config "$yaml_config" \ --port "$port" \ - $env_vars + $env_vars \ + $other_args diff --git a/llama_stack/distribution/start_container.sh b/llama_stack/distribution/start_container.sh index 2c5d65d09..a5f543fb4 100755 --- a/llama_stack/distribution/start_container.sh +++ b/llama_stack/distribution/start_container.sh @@ -40,8 +40,12 @@ shift port="$1" shift +# Initialize other_args +other_args="" + # Process environment variables from --env arguments env_vars="" + while [[ $# -gt 0 ]]; do case "$1" in --env) @@ -55,6 +59,7 @@ while [[ $# -gt 0 ]]; do fi ;; *) + other_args="$other_args $1" shift ;; esac @@ -93,5 +98,8 @@ $CONTAINER_BINARY run $CONTAINER_OPTS -it \ -v "$yaml_config:/app/config.yaml" \ $mounts \ --env LLAMA_STACK_PORT=$port \ - --entrypoint='["python", "-m", "llama_stack.distribution.server.server", "--yaml-config", "/app/config.yaml"]' \ - $container_image:$version_tag + --entrypoint python \ + $container_image:$version_tag \ + -m llama_stack.distribution.server.server \ + --yaml-config /app/config.yaml \ + $other_args From 0b7098493afdd7c2414462877a742ada49d0077c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Fri, 7 Feb 2025 18:44:16 +0100 Subject: [PATCH 10/29] test: encode image data as base64 (#1003) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit # What does this PR do? Previously, the test was failing due to a pydantic validation error caused by passing raw binary image data instead of a valid Unicode string. This fix encodes the image data as base64, ensuring it is a valid string format compatible with `ImageContentItem`. Error: ``` ______________ ERROR collecting llama_stack/providers/tests/inference/test_vision_inference.py _______________ llama_stack/providers/tests/inference/test_vision_inference.py:31: in class TestVisionModelInference: llama_stack/providers/tests/inference/test_vision_inference.py:37: in TestVisionModelInference ImageContentItem(image=dict(data=PASTA_IMAGE)), E pydantic_core._pydantic_core.ValidationError: 1 validation error for ImageContentItem E image.data E Input should be a valid string, unable to parse raw data as a unicode string [type=string_unicode, input_value=b'\xff\xd8\xff\xe0\x00\x1...0\xe6\x9f5\xb5?\xff\xd9', input_type=bytes] E For further information visit https://errors.pydantic.dev/2.10/v/string_unicode ``` Signed-off-by: Sébastien Han [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan Execute the following: ``` ollama run llama3.2-vision --keepalive 2m & uv run pytest -v -s -k "ollama" --inference-model=llama3.2-vision:latest llama_stack/providers/tests/inference/test_vision_inference.py llama_stack/providers/tests/inference/test_vision_inference.py::TestVisionModelInference::test_vision_chat_completion_non_streaming[-ollama-image0-expected_strings0] PASSED llama_stack/providers/tests/inference/test_vision_inference.py::TestVisionModelInference::test_vision_chat_completion_non_streaming[-ollama-image1-expected_strings1] FAILED llama_stack/providers/tests/inference/test_vision_inference.py::TestVisionModelInference::test_vision_chat_completion_streaming[-ollama] FAILED ``` The last two tests are failing because Cloudflare blocked me from accessing https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg but this has no impact on the current fix. [//]: # (## Documentation) [//]: # (- [ ] Added a Changelog entry if the change is significant) Signed-off-by: Sébastien Han --- .../providers/tests/inference/test_vision_inference.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_stack/providers/tests/inference/test_vision_inference.py b/llama_stack/providers/tests/inference/test_vision_inference.py index 964f70901..a2434ac41 100644 --- a/llama_stack/providers/tests/inference/test_vision_inference.py +++ b/llama_stack/providers/tests/inference/test_vision_inference.py @@ -4,12 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+import base64 from pathlib import Path import pytest -from llama_stack.apis.common.content_types import ImageContentItem, TextContentItem, URL - +from llama_stack.apis.common.content_types import URL, ImageContentItem, TextContentItem from llama_stack.apis.inference import ( ChatCompletionResponse, ChatCompletionResponseEventType, @@ -23,7 +23,7 @@ from .utils import group_chunks THIS_DIR = Path(__file__).parent with open(THIS_DIR / "pasta.jpeg", "rb") as f: - PASTA_IMAGE = f.read() + PASTA_IMAGE = base64.b64encode(f.read()).decode("utf-8") class TestVisionModelInference: From 2a4a612373d521d064f0b2ca0e7814748815da53 Mon Sep 17 00:00:00 2001 From: Charlie Doern Date: Fri, 7 Feb 2025 12:47:02 -0500 Subject: [PATCH 11/29] fix: Ensure a better error stack trace when llama-stack is not built (#950) # What does this PR do? currently this is the output when you run a distribution locally without running `llama stack build`: ``` Traceback (most recent call last): File "/Users/charliedoern/Documents/llama-sdk.py", line 25, in models = client.models.list() ^^^^^^^^^^^^^^^^^^^^ File "/Users/charliedoern/Documents/llama-stack-client-python/src/llama_stack_client/resources/models.py", line 107, in list raise exc File "/Users/charliedoern/Documents/llama-stack-client-python/src/llama_stack_client/resources/models.py", line 95, in list return self._get( ^^^^^^^^^^ File "/Users/charliedoern/Documents/llama-stack-client-python/src/llama_stack_client/_base_client.py", line 1212, in get return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/Users/charliedoern/Documents/llama-stack/llama_stack/distribution/library_client.py", line 168, in request return asyncio.run(self.async_client.request(*args, **kwargs)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/homebrew/Cellar/python@3.11/3.11.10/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/runners.py", line 190, in run return runner.run(main) ^^^^^^^^^^^^^^^^ File "/opt/homebrew/Cellar/python@3.11/3.11.10/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/runners.py", line 118, in run return self._loop.run_until_complete(task) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/homebrew/Cellar/python@3.11/3.11.10/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/base_events.py", line 654, in run_until_complete return future.result() ^^^^^^^^^^^^^^^ File "/Users/charliedoern/Documents/llama-stack/llama_stack/distribution/library_client.py", line 258, in request if not self.endpoint_impls: ^^^^^^^^^^^^^^^^^^^ AttributeError: 'AsyncLlamaStackAsLibraryClient' object has no attribute 'endpoint_impls' ``` the intended exception is never raised, add an except for an AttributeError so users can catch when they call things like `models.list()` and so that a more useful error telling them that the client is not properly initialized is printed. ## Test Plan Please describe: - I ran the script found here: https://llama-stack.readthedocs.io/en/latest/getting_started/index.html#run-inference-with-python-sdk locally with the changes in this PR and the exception was caught successfully. ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? 
- [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. --------- Signed-off-by: Charlie Doern Co-authored-by: Ashwin Bharambe --- llama_stack/distribution/library_client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py index 13aa67956..d4a7cde7e 100644 --- a/llama_stack/distribution/library_client.py +++ b/llama_stack/distribution/library_client.py @@ -198,6 +198,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): async def initialize(self) -> bool: try: + self.endpoint_impls = None self.impls = await construct_stack(self.config, self.custom_provider_registry) except ModuleNotFoundError as _e: cprint(_e.msg, "red") From 316c43fdafab04ee68a3391a43f7d08f94133ba5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Fri, 7 Feb 2025 18:52:16 +0100 Subject: [PATCH 12/29] refactor(ollama): model availability check (#986) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Moved model availability check logic into a dedicated check_model_availability function. Eliminated redundant code by reusing the helper function in both embedding and non-embedding model registration. Signed-off-by: Sébastien Han ## Test Plan Run Ollama and serve 2 models to get most the unit test pass: ``` ollama run llama3.2:3b-instruct-fp16 --keepalive 2m & ollama run llama3.1:8b --keepalive 2m & ``` Run the unit test: ``` uv run pytest -v -k "ollama" --inference-model=llama3.2:3b-instruct-fp16 llama_stack/providers/tests/inference/test_model_registration.py /Users/leseb/Documents/AI/llama-stack/.venv/lib/python3.13/site-packages/pytest_asyncio/plugin.py:207: PytestDeprecationWarning: The configuration option "asyncio_default_fixture_loop_scope" is unset. The event loop scope for asynchronous fixtures will default to the fixture caching scope. Future versions of pytest-asyncio will default the loop scope for asynchronous fixtures to function scope. Set the default fixture loop scope explicitly in order to avoid unexpected behavior in the future. 
Valid fixture loop scopes are: "function", "class", "module", "package", "session" warnings.warn(PytestDeprecationWarning(_DEFAULT_FIXTURE_LOOP_SCOPE_UNSET)) ============================================ test session starts ============================================= platform darwin -- Python 3.13.1, pytest-8.3.4, pluggy-1.5.0 -- /Users/leseb/Documents/AI/llama-stack/.venv/bin/python3 cachedir: .pytest_cache metadata: {'Python': '3.13.1', 'Platform': 'macOS-15.3-arm64-arm-64bit-Mach-O', 'Packages': {'pytest': '8.3.4', 'pluggy': '1.5.0'}, 'Plugins': {'html': '4.1.1', 'metadata': '3.1.1', 'asyncio': '0.25.3', 'anyio': '4.8.0', 'nbval': '0.11.0'}} rootdir: /Users/leseb/Documents/AI/llama-stack configfile: pyproject.toml plugins: html-4.1.1, metadata-3.1.1, asyncio-0.25.3, anyio-4.8.0, nbval-0.11.0 asyncio: mode=Mode.STRICT, asyncio_default_fixture_loop_scope=None collected 65 items / 60 deselected / 5 selected llama_stack/providers/tests/inference/test_model_registration.py::TestModelRegistration::test_register_unsupported_model[-ollama] PASSED [ 20%] llama_stack/providers/tests/inference/test_model_registration.py::TestModelRegistration::test_register_nonexistent_model[-ollama] PASSED [ 40%] llama_stack/providers/tests/inference/test_model_registration.py::TestModelRegistration::test_register_with_llama_model[-ollama] FAILED [ 60%] llama_stack/providers/tests/inference/test_model_registration.py::TestModelRegistration::test_initialize_model_during_registering[-ollama] FAILED [ 80%] llama_stack/providers/tests/inference/test_model_registration.py::TestModelRegistration::test_register_with_invalid_llama_model[-ollama] PASSED [100%] ================================================== FAILURES ================================================== _______________________ TestModelRegistration.test_register_with_llama_model[-ollama] ________________________ llama_stack/providers/tests/inference/test_model_registration.py:54: in test_register_with_llama_model _ = await models_impl.register_model( llama_stack/providers/utils/telemetry/trace_protocol.py:91: in async_wrapper result = await method(self, *args, **kwargs) llama_stack/distribution/routers/routing_tables.py:245: in register_model registered_model = await self.register_object(model) llama_stack/distribution/routers/routing_tables.py:192: in register_object registered_obj = await register_object_with_provider(obj, p) llama_stack/distribution/routers/routing_tables.py:53: in register_object_with_provider return await p.register_model(obj) llama_stack/providers/utils/telemetry/trace_protocol.py:91: in async_wrapper result = await method(self, *args, **kwargs) llama_stack/providers/remote/inference/ollama/ollama.py:368: in register_model await check_model_availability(model.provider_resource_id) llama_stack/providers/remote/inference/ollama/ollama.py:359: in check_model_availability raise ValueError( E ValueError: Model 'custom-model' is not available in Ollama. Available models: llama3.1:8b, llama3.2:3b-instruct-fp16 __________________ TestModelRegistration.test_initialize_model_during_registering[-ollama] ___________________ llama_stack/providers/tests/inference/test_model_registration.py:85: in test_initialize_model_during_registering mock_load_model.assert_called_once() /opt/homebrew/Cellar/python@3.13/3.13.1/Frameworks/Python.framework/Versions/3.13/lib/python3.13/unittest/mock.py:956: in assert_called_once raise AssertionError(msg) E AssertionError: Expected 'load_model' to have been called once. Called 0 times. 
-------------------------------------------- Captured stderr call -------------------------------------------- W0207 11:55:26.777000 90854 .venv/lib/python3.13/site-packages/torch/distributed/elastic/multiprocessing/redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs. ========================================== short test summary info =========================================== FAILED llama_stack/providers/tests/inference/test_model_registration.py::TestModelRegistration::test_register_with_llama_model[-ollama] - ValueError: Model 'custom-model' is not available in Ollama. Available models: llama3.1:8b, llama3.2:3b-i... FAILED llama_stack/providers/tests/inference/test_model_registration.py::TestModelRegistration::test_initialize_model_during_registering[-ollama] - AssertionError: Expected 'load_model' to have been called once. Called 0 times. =========================== 2 failed, 3 passed, 60 deselected, 2 warnings in 1.84s =========================== ``` We only "care" about the `test_register_nonexistent_model` for this code. ## Sources Please link relevant resources if necessary. ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. Signed-off-by: Sébastien Han --- .../remote/inference/ollama/ollama.py | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py index cff8aa742..ecd195854 100644 --- a/llama_stack/providers/remote/inference/ollama/ollama.py +++ b/llama_stack/providers/remote/inference/ollama/ollama.py @@ -352,24 +352,20 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate): return EmbeddingsResponse(embeddings=embeddings) async def register_model(self, model: Model) -> Model: - # ollama does not have embedding models running. Check if the model is in list of available models. - if model.model_type == ModelType.embedding: - response = await self.client.list() + async def check_model_availability(model_id: str): + response = await self.client.ps() available_models = [m["model"] for m in response["models"]] - if model.provider_resource_id not in available_models: + if model_id not in available_models: raise ValueError( - f"Model '{model.provider_resource_id}' is not available in Ollama. " - f"Available models: {', '.join(available_models)}" + f"Model '{model_id}' is not available in Ollama. Available models: {', '.join(available_models)}" ) + + if model.model_type == ModelType.embedding: + await check_model_availability(model.provider_resource_id) return model + model = await self.register_helper.register_model(model) - models = await self.client.ps() - available_models = [m["model"] for m in models["models"]] - if model.provider_resource_id not in available_models: - raise ValueError( - f"Model '{model.provider_resource_id}' is not available in Ollama. 
" - f"Available models: {', '.join(available_models)}" - ) + await check_model_availability(model.provider_resource_id) return model From 10bda65b947db1faa7572792e134088e0b3eb0f3 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Fri, 7 Feb 2025 09:55:48 -0800 Subject: [PATCH 13/29] Nuke use_proxy from code execution --- .../inline/tool_runtime/code_interpreter/code_execution.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py index b48f92d36..6f4b25b9d 100644 --- a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py +++ b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py @@ -67,7 +67,6 @@ def generate_bwrap_command(bind_dirs: List[str]) -> str: @dataclass class CodeExecutionContext: matplotlib_dump_dir: str - use_proxy: bool = False @dataclass From a8820597eed304427f7c70e05b5e05f0a3318ab3 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Fri, 7 Feb 2025 11:36:29 -0800 Subject: [PATCH 14/29] Minor clean up of notebook --- docs/getting_started.ipynb | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/getting_started.ipynb b/docs/getting_started.ipynb index 4e4893158..96e39eb82 100644 --- a/docs/getting_started.ipynb +++ b/docs/getting_started.ipynb @@ -86,7 +86,6 @@ "# NBVAL_SKIP\n", "\n", "!apt-get install -y bubblewrap\n", - "# install a branch of llama stack\n", "import os\n", "os.environ[\"UV_SYSTEM_PYTHON\"] = \"1\"\n", "!pip install uv\n", From 62e5461da79b3ed46ec117fef9da5d6032af3acc Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Fri, 7 Feb 2025 11:56:22 -0800 Subject: [PATCH 15/29] No spaces in ipynb tests --- docs/conftest.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 docs/conftest.py diff --git a/docs/conftest.py b/docs/conftest.py new file mode 100644 index 000000000..bec535f77 --- /dev/null +++ b/docs/conftest.py @@ -0,0 +1,9 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +def pytest_collection_modifyitems(items): + for item in items: + item.name = item.name.replace(' ', '_') From c335ed8765865f7258f6ba51cf6cec840f07930b Mon Sep 17 00:00:00 2001 From: Hardik Shah Date: Fri, 7 Feb 2025 12:24:07 -0800 Subject: [PATCH 16/29] raise when client initialize fails --- llama_stack/distribution/library_client.py | 24 +++++++++++----------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py index d4a7cde7e..2c0f73974 100644 --- a/llama_stack/distribution/library_client.py +++ b/llama_stack/distribution/library_client.py @@ -17,17 +17,6 @@ from typing import Any, get_args, get_origin, Optional, TypeVar import httpx import yaml -from llama_stack_client import ( - APIResponse, - AsyncAPIResponse, - AsyncLlamaStackClient, - AsyncStream, - LlamaStackClient, - NOT_GIVEN, -) -from pydantic import BaseModel, TypeAdapter -from rich.console import Console -from termcolor import cprint from llama_stack.distribution.build import print_pip_install_help from llama_stack.distribution.configure import parse_and_maybe_upgrade_config @@ -46,6 +35,17 @@ from llama_stack.providers.utils.telemetry.tracing import ( setup_logger, start_trace, ) +from llama_stack_client import ( + APIResponse, + AsyncAPIResponse, + AsyncLlamaStackClient, + AsyncStream, + LlamaStackClient, + NOT_GIVEN, +) +from pydantic import BaseModel, TypeAdapter +from rich.console import Console +from termcolor import cprint T = TypeVar("T") @@ -214,7 +214,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): f"Please run:\n\n{prefix}llama stack build --template {self.config_path_or_template_name} --image-type venv\n\n", "yellow", ) - return False + raise _e if Api.telemetry in self.impls: setup_logger(self.impls[Api.telemetry]) From ddd06105a4a90c175d83b1b1cb1e58da14de67cf Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 7 Feb 2025 21:52:50 +0000 Subject: [PATCH 17/29] Bump version to 0.1.2 --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 402024772..5e9cb75e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "llama_stack" -version = "0.1.1" +version = "0.1.2" authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }] description = "Llama Stack" readme = "README.md" @@ -25,8 +25,8 @@ dependencies = [ "fire", "httpx", "huggingface-hub", - "llama-models>=0.1.1", - "llama-stack-client>=0.1.1", + "llama-models>=0.1.2", + "llama-stack-client>=0.1.2", "prompt-toolkit", "python-dotenv", "pydantic>=2", From a229de6d1eaff027e7970202d71c1ee7e63ae3a2 Mon Sep 17 00:00:00 2001 From: Jeff Tang Date: Fri, 7 Feb 2025 15:36:15 -0800 Subject: [PATCH 18/29] Getting started notebook update (#936) # What does this PR do? Added examples (Section 4) of using Llama Stack 0.1 distro on together and Llama 3.2 to answer questions about an image with LS Chat and Agent APIs. 
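One detail worth flagging in the example code: the notebook's `encode_image` helper hardcodes a `data:image/png` prefix while the sample file is a JPEG. A suffix-aware variant (a sketch; whether the prefix matters to the server is an assumption):

```python
import base64
import mimetypes


def encode_image(image_path):
    # Derive the MIME type from the file extension instead of hardcoding image/png.
    mime_type = mimetypes.guess_type(image_path)[0] or "image/png"
    with open(image_path, "rb") as image_file:
        base64_string = base64.b64encode(image_file.read()).decode("utf-8")
    return f"data:{mime_type};base64,{base64_string}"
```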
--- docs/getting_started.ipynb | 225 +++++++++++++++++++++++++++++++++++++ 1 file changed, 225 insertions(+) diff --git a/docs/getting_started.ipynb b/docs/getting_started.ipynb index 96e39eb82..abe537c8e 100644 --- a/docs/getting_started.ipynb +++ b/docs/getting_started.ipynb @@ -3396,6 +3396,231 @@ "response = client.scoring.score(input_rows=rows, scoring_functions=scoring_params)\n", "pprint(response)\n" ] + }, + { + "cell_type": "markdown", + "id": "ad077440", + "metadata": {}, + "source": [ + "## 4. Image Understanding with Llama 3.2\n", + "\n", + "Below is a complete example of using Together's Llama Stack 0.1 server at https://llama-stack.together.ai to ask Llama 3.2 questions about an image." + ] + }, + { + "cell_type": "markdown", + "id": "82e381ec", + "metadata": {}, + "source": [ + "### 4.1 Setup and helpers\n", + "\n", + "Below we install the Llama Stack client 0.1, download the example image, define two image helpers, and set Llama Stack Together server URL and Llama 3.2 model name.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "865fc5a8", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install llama-stack-client==0.1.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44e05e16", + "metadata": {}, + "outputs": [], + "source": [ + "!wget https://raw.githubusercontent.com/meta-llama/llama-models/refs/heads/main/Llama_Repo.jpeg" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "469750f7", + "metadata": {}, + "outputs": [], + "source": [ + "from PIL import Image\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def display_image(path):\n", + " img = Image.open(path)\n", + " plt.imshow(img)\n", + " plt.axis('off')\n", + " plt.show()\n", + "\n", + "display_image(\"Llama_Repo.jpeg\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2c1e1c2", + "metadata": {}, + "outputs": [], + "source": [ + "import base64\n", + "\n", + "def encode_image(image_path):\n", + " with open(image_path, \"rb\") as image_file:\n", + " base64_string = base64.b64encode(image_file.read()).decode(\"utf-8\")\n", + " base64_url = f\"data:image/png;base64,{base64_string}\"\n", + " return base64_url" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c565f99e", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_stack_client import LlamaStackClient\n", + "\n", + "LLAMA_STACK_API_TOGETHER_URL=\"https://llama-stack.together.ai\"\n", + "LLAMA32_11B_INSTRUCT = \"meta-llama/Llama-3.2-11B-Vision-Instruct\"" + ] + }, + { + "cell_type": "markdown", + "id": "7737cd41", + "metadata": {}, + "source": [ + "### 4.2 Using Llama Stack Chat API\n", + "\n", + "The code below uses the Llama Stack 0.1's chat API to interact with Llama 3.2:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7914894", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_stack_client.lib.inference.event_logger import EventLogger\n", + "\n", + "async def run_main(image_path: str, prompt):\n", + " client = LlamaStackClient(\n", + " base_url=LLAMA_STACK_API_TOGETHER_URL,\n", + " )\n", + "\n", + " message = {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"type\": \"image\",\n", + " \"image\": {\n", + " \"url\": {\n", + " \"uri\": encode_image(image_path)\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"type\": \"text\",\n", + " \"text\": prompt,\n", + " }\n", + " ]\n", + " }\n", + "\n", + " response = client.inference.chat_completion(\n", + " 
messages=[message],\n", + " model_id=LLAMA32_11B_INSTRUCT,\n", + " stream=False,\n", + " )\n", + "\n", + " print(response.completion_message.content.lower().strip())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ee09b97", + "metadata": {}, + "outputs": [], + "source": [ + "await run_main(\"Llama_Repo.jpeg\",\n", + " \"How many different colors are those llamas?\\\n", + " What are those colors?\")" + ] + }, + { + "cell_type": "markdown", + "id": "e741d7b9", + "metadata": {}, + "source": [ + "### 4.3 Using Llama Stack Agent API\n", + "\n", + "The code below uses the Llama Stack 0.1's Agent API to interact with Llama 3.2:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9a83275", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_stack_client.lib.agents.agent import Agent\n", + "from llama_stack_client.lib.agents.event_logger import EventLogger\n", + "from llama_stack_client.types.agent_create_params import AgentConfig\n", + "\n", + "async def run_main(image_path, prompt):\n", + " base64_image = encode_image(image_path)\n", + "\n", + " client = LlamaStackClient(\n", + " base_url=LLAMA_STACK_API_TOGETHER_URL,\n", + " )\n", + "\n", + " agent_config = AgentConfig(\n", + " model=LLAMA32_11B_INSTRUCT,\n", + " instructions=\"You are a helpful assistant\",\n", + " enable_session_persistence=False,\n", + " )\n", + "\n", + " agent = Agent(client, agent_config)\n", + " session_id = agent.create_session(\"test-session\")\n", + "\n", + " response = agent.create_turn(\n", + " messages=[{\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"type\": \"image\",\n", + " \"image\": {\n", + " \"url\": {\n", + " \"uri\": encode_image(image_path)\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"type\": \"text\",\n", + " \"text\": prompt,\n", + " }\n", + " ]\n", + " }],\n", + " session_id=session_id,\n", + " )\n", + "\n", + " for log in EventLogger().log(response):\n", + " log.print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15d0098b", + "metadata": {}, + "outputs": [], + "source": [ + "await run_main(\"Llama_Repo.jpeg\",\n", + " \"How many different colors are those llamas?\\\n", + " What are those colors?\")" + ] } ], "metadata": { From 7766e68e92e9f883e637269f0dcb765036c775d5 Mon Sep 17 00:00:00 2001 From: raghotham Date: Fri, 7 Feb 2025 15:36:20 -0800 Subject: [PATCH 19/29] docs: update index.md for 0.1.2 (#1013) # What does this PR do? [Provide a short summary of what this PR does and why. Link to relevant issues if applicable.] [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan [Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*] [//]: # (## Documentation) [//]: # (- [ ] Added a Changelog entry if the change is significant) --- docs/source/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/index.md b/docs/source/index.md index 095f50885..2834f5641 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -2,7 +2,7 @@ ```{admonition} News :class: tip -Llama Stack 0.1.1 is now available! See the [release notes](https://github.com/meta-llama/llama-stack/releases/tag/v0.1.1) for more details. +Llama Stack 0.1.2 is now available! See the [release notes](https://github.com/meta-llama/llama-stack/releases/tag/v0.1.2) for more details. 
 ```
 
 # Llama Stack

From 413099ef6a7b1106086d4cdcb11093b62b7c72b5 Mon Sep 17 00:00:00 2001
From: Yuan Tang
Date: Sat, 8 Feb 2025 14:49:46 -0500
Subject: [PATCH 20/29] test: Make text-based chat completion tests run 10x
 faster (#1016)

# What does this PR do?

This significantly shortens the test time (about 10x faster) since most of
the time is spent on outputting the tokens "there are several planets in our
solar system that have...". We want an answer more quickly, especially when
testing even larger models.

## Test Plan

```
LLAMA_STACK_BASE_URL=http://localhost:5002 pytest -v tests/client-sdk/inference/test_text_inference.py -k "test_text_chat_completion_non_streaming or test_text_chat_completion_streaming"
================================================================== test session starts ===================================================================
platform linux -- Python 3.10.16, pytest-8.3.4, pluggy-1.5.0 -- /home/yutang/.conda/envs/myenv/bin/python3.10
cachedir: .pytest_cache
rootdir: /home/yutang/repos/llama-stack
configfile: pyproject.toml
plugins: anyio-4.7.0
collected 12 items / 8 deselected / 4 selected

tests/client-sdk/inference/test_text_inference.py::test_text_chat_completion_non_streaming[meta-llama/Llama-3.1-8B-Instruct-Which planet do humans live on?-Earth] PASSED [ 25%]
tests/client-sdk/inference/test_text_inference.py::test_text_chat_completion_non_streaming[meta-llama/Llama-3.1-8B-Instruct-Which planet has rings around it with a name starting with letter S?-Saturn] PASSED [ 50%]
tests/client-sdk/inference/test_text_inference.py::test_text_chat_completion_streaming[meta-llama/Llama-3.1-8B-Instruct-What's the name of the Sun in latin?-Sol] PASSED [ 75%]
tests/client-sdk/inference/test_text_inference.py::test_text_chat_completion_streaming[meta-llama/Llama-3.1-8B-Instruct-What is the name of the US captial?-Washington] PASSED [100%]
```

---------

Signed-off-by: Yuan Tang
---
 tests/client-sdk/inference/test_text_inference.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/client-sdk/inference/test_text_inference.py b/tests/client-sdk/inference/test_text_inference.py
index 4b24f1d38..aa0e510dd 100644
--- a/tests/client-sdk/inference/test_text_inference.py
+++ b/tests/client-sdk/inference/test_text_inference.py
@@ -156,8 +156,8 @@ def test_text_completion_structured_output(llama_stack_client, text_model_id, in
 @pytest.mark.parametrize(
     "question,expected",
     [
-        ("What are the names of planets in our solar system?", "Earth"),
-        ("What are the names of the planets that have rings around them?", "Saturn"),
+        ("Which planet do humans live on?", "Earth"),
+        ("Which planet has rings around it with a name starting with letter S?", "Saturn"),
     ],
 )
 def test_text_chat_completion_non_streaming(llama_stack_client, text_model_id, question, expected):

From 80ba9deab112b0c04b5e7a26f6d2c7406f1395ca Mon Sep 17 00:00:00 2001
From: Sarthak Deshpande <60317842+cheesecake100201@users.noreply.github.com>
Date: Sun, 9 Feb 2025 01:20:35 +0530
Subject: [PATCH 21/29] chore: Updated requirements.txt (#1017)

# What does this PR do?

[Provide a short summary of what this PR does and why. Link to relevant issues if applicable.]
Updated requirements.txt

[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])

## Test Plan

[Describe the tests you ran to verify your changes with result summaries.
*Provide clear instructions so the plan can be easily re-executed.*] [//]: # (## Documentation) [//]: # (- [ ] Added a Changelog entry if the change is significant) --------- Co-authored-by: sarthakdeshpande --- requirements.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 157c68820..497feb764 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ annotated-types==0.7.0 anyio==4.8.0 blobfile==3.0.0 certifi==2025.1.31 +chardet==5.2.0 charset-normalizer==3.4.1 click==8.1.8 colorama==0.4.6 ; sys_platform == 'win32' @@ -18,8 +19,8 @@ httpx==0.28.1 huggingface-hub==0.28.1 idna==3.10 jinja2==3.1.5 -llama-models==0.1.1 -llama-stack-client==0.1.1 +llama-models==0.1.2 +llama-stack-client==0.1.2 lxml==5.3.0 markdown-it-py==3.0.0 markupsafe==3.0.2 @@ -34,6 +35,7 @@ pycryptodomex==3.21.0 pydantic==2.10.6 pydantic-core==2.27.2 pygments==2.19.1 +pypdf==5.2.0 python-dateutil==2.9.0.post0 python-dotenv==1.0.1 pytz==2025.1 From b981b49bfa08697603092e5a19320d6e02d6c81f Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Sat, 8 Feb 2025 23:42:57 -0500 Subject: [PATCH 22/29] test: Use JSON tool prompt format for remote::vllm provider (#1019) # What does this PR do? This PR removes the warnings when running tests for `remote::vllm` provider: ``` Detected the chat template content format to be 'openai'. You can set `--chat-template-content-format` to override this. ``` ## Test Plan All tests passed without the warning messages shown above. Signed-off-by: Yuan Tang --- tests/client-sdk/inference/test_text_inference.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/client-sdk/inference/test_text_inference.py b/tests/client-sdk/inference/test_text_inference.py index aa0e510dd..81b476218 100644 --- a/tests/client-sdk/inference/test_text_inference.py +++ b/tests/client-sdk/inference/test_text_inference.py @@ -11,6 +11,7 @@ PROVIDER_TOOL_PROMPT_FORMAT = { "remote::ollama": "json", "remote::together": "json", "remote::fireworks": "json", + "remote::vllm": "json", } PROVIDER_LOGPROBS_TOP_K = set( From 8186c880218034e669dbd1c1dc111d8814776119 Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Sun, 9 Feb 2025 22:26:36 -0500 Subject: [PATCH 23/29] docs: Render check marks correctly on PyPI (#1024) # What does this PR do? The table on the project's PyPI page does not render check marks. This PR switches to use the unicode symbol directly that can be rendered correctly on PyPI. 
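
Mechanically, the change is a literal substitution over the table cells (plus re-aligned column widths); a hypothetical one-off snippet, not part of this PR:

```
# Swap the GitHub-only emoji shortcode for the literal unicode character,
# which PyPI's markdown renderer displays correctly.
with open("README.md", encoding="utf-8") as f:
    readme = f.read()

readme = readme.replace(":heavy_check_mark:", "✅")

with open("README.md", "w", encoding="utf-8") as f:
    f.write(readme)
```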
Before: ![image](https://github.com/user-attachments/assets/6d01d440-8722-4c37-8b0a-9ba8c0cdb48d) After: ![image](https://github.com/user-attachments/assets/3a7153f2-9468-40f6-97a2-17f903de4287) Signed-off-by: Yuan Tang --- README.md | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index cdf98dc12..a5e5b217d 100644 --- a/README.md +++ b/README.md @@ -34,22 +34,22 @@ By reducing friction and complexity, Llama Stack empowers developers to focus on ### API Providers Here is a list of the various API providers and available distributions to developers started easily, -| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** | -|:------------------------------------------------------------------------------------------:|:----------------------:|:------------------:|:------------------:|:------------------:|:------------------:|:------------------:| -| Meta Reference | Single Node | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | -| SambaNova | Hosted | | :heavy_check_mark: | | | | -| Cerebras | Hosted | | :heavy_check_mark: | | | | -| Fireworks | Hosted | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | | -| AWS Bedrock | Hosted | | :heavy_check_mark: | | :heavy_check_mark: | | -| Together | Hosted | :heavy_check_mark: | :heavy_check_mark: | | :heavy_check_mark: | | -| Groq | Hosted | | :heavy_check_mark: | | | | -| Ollama | Single Node | | :heavy_check_mark: | | | | -| TGI | Hosted and Single Node | | :heavy_check_mark: | | | | -| NVIDIA NIM | Hosted and Single Node | | :heavy_check_mark: | | | | -| Chroma | Single Node | | | :heavy_check_mark: | | | -| PG Vector | Single Node | | | :heavy_check_mark: | | | -| PyTorch ExecuTorch | On-device iOS | :heavy_check_mark: | :heavy_check_mark: | | | | -| vLLM | Hosted and Single Node | | :heavy_check_mark: | | | | +| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** | +|:------------------------:|:----------------------:|:----------:|:-------------:|:----------:|:----------:|:-------------:| +| Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ | +| SambaNova | Hosted | | ✅ | | | | +| Cerebras | Hosted | | ✅ | | | | +| Fireworks | Hosted | ✅ | ✅ | ✅ | | | +| AWS Bedrock | Hosted | | ✅ | | ✅ | | +| Together | Hosted | ✅ | ✅ | | ✅ | | +| Groq | Hosted | | ✅ | | | | +| Ollama | Single Node | | ✅ | | | | +| TGI | Hosted and Single Node | | ✅ | | | | +| NVIDIA NIM | Hosted and Single Node | | ✅ | | | | +| Chroma | Single Node | | | ✅ | | | +| PG Vector | Single Node | | | ✅ | | | +| PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | | +| vLLM | Hosted and Single Node | | ✅ | | | | ### Distributions From 076213165c1005a5348eb96aed0bab3e9d3935f4 Mon Sep 17 00:00:00 2001 From: Michael Clifford Date: Mon, 10 Feb 2025 09:25:30 -0500 Subject: [PATCH 24/29] docs: update rag.md example code to prevent errors (#1009) --- docs/source/building_applications/rag.md | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/docs/source/building_applications/rag.md b/docs/source/building_applications/rag.md index 6b7a354b7..5287a2367 100644 --- a/docs/source/building_applications/rag.md +++ b/docs/source/building_applications/rag.md @@ -36,13 +36,12 @@ chunks = [ "content": "Your document text here", "mime_type": "text/plain", }, - ..., ] 
-client.vector_io.insert(vector_db_id, chunks) +client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks) # You can then query for these chunks chunks_response = client.vector_io.query( - vector_db_id, query="What do you know about..." + vector_db_id=vector_db_id, query="What do you know about..." ) ``` @@ -72,8 +71,8 @@ client.tool_runtime.rag_tool.insert( # Query documents results = client.tool_runtime.rag_tool.query( - vector_db_id=vector_db_id, - query="What do you know about...", + vector_db_ids=[vector_db_id], + content="What do you know about...", ) ``` @@ -82,10 +81,14 @@ results = client.tool_runtime.rag_tool.query( One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example: ```python +from llama_stack_client.types.agent_create_params import AgentConfig +from llama_stack_client.lib.agents.agent import Agent + # Configure agent with memory agent_config = AgentConfig( - model="Llama3.2-3B-Instruct", + model="meta-llama/Llama-3.2-3B-Instruct", instructions="You are a helpful assistant", + enable_session_persistence=False, toolgroups=[ { "name": "builtin::rag", @@ -105,10 +108,10 @@ response = agent.create_turn( {"role": "user", "content": "I am providing some documents for reference."} ], documents=[ - dict( - content="https://raw.githubusercontent.com/example/doc.rst", - mime_type="text/plain", - ) + { + "content": "https://raw.githubusercontent.com/example/doc.rst", + "mime_type": "text/plain", + } ], session_id=session_id, ) From 371f11a569e7ad314d208681ff20a405fb514840 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Mon, 10 Feb 2025 17:42:30 +0100 Subject: [PATCH 25/29] build: update uv lock to sync package versions (#1026) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? [Provide a short summary of what this PR does and why. Link to relevant issues if applicable.] Updated `uv.lock` to reflect the latest versions of `llama-models`, `llama-stack`, and `llama-stack-client` (bumped to 0.1.2). This ensures dependency consistency and avoids potential issues with outdated package references. Added `uv-sync` hook from `uv-pre-commit` repository to ensure synchronization of dependencies. Signed-off-by: Sébastien Han [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan [Describe the tests you ran to verify your changes with result summaries. 
*Provide clear instructions so the plan can be easily re-executed.*] [//]: # (## Documentation) [//]: # (- [ ] Added a Changelog entry if the change is significant) Signed-off-by: Sébastien Han --- .github/workflows/pre-commit.yml | 4 ++++ .pre-commit-config.yaml | 1 + uv.lock | 18 +++++++++--------- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index faa2eda31..046387ab9 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -23,3 +23,7 @@ jobs: .pre-commit-config.yaml - uses: pre-commit/action@v3.0.1 + + - name: Verify if there are any diff files after pre-commit + run: | + git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index adafccf64..bca91081f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -48,6 +48,7 @@ repos: hooks: - id: uv-export args: ["--frozen", "--no-hashes", "--no-emit-project"] + - id: uv-sync # - repo: https://github.com/pre-commit/mirrors-mypy # rev: v1.14.0 diff --git a/uv.lock b/uv.lock index f492872bc..087396eea 100644 --- a/uv.lock +++ b/uv.lock @@ -687,7 +687,7 @@ wheels = [ [[package]] name = "llama-models" -version = "0.1.1" +version = "0.1.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jinja2" }, @@ -696,14 +696,14 @@ dependencies = [ { name = "pyyaml" }, { name = "tiktoken" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/df/80/4a4595cf5e55f71c0e15b85ff2f4c04b0742bf664ede062a09c9d383bf7b/llama_models-0.1.1.tar.gz", hash = "sha256:7cb5a9fe38485b47aff4c93e183d6d390a676a7619f3355502576b652f17733a", size = 1608412 } +sdist = { url = "https://files.pythonhosted.org/packages/b5/f2/ed8310d4677cd38ab45ffba45aea2a4e9882b640045ad9c3198ac69e5a85/llama_models-0.1.2.tar.gz", hash = "sha256:1266eaec7a8db336e4ed034d2b494189ccb7fd6d6b7aefe874eee749a4340b9b", size = 1608069 } wheels = [ - { url = "https://files.pythonhosted.org/packages/d9/93/d49dd0f0cd37df1a7a7fb25444d010f626cdf42b21eea11d839b0f6a808a/llama_models-0.1.1-py3-none-any.whl", hash = "sha256:7e4f15dc4f6f011852ea2c42f9770b75140f5eca670b32cc67fc0a4605c55f89", size = 1650981 }, + { url = "https://files.pythonhosted.org/packages/55/a7/34b9e88ef4109759c8881f43b8006139e3d13d54c440b8c571b253655f54/llama_models-0.1.2-py3-none-any.whl", hash = "sha256:8aa5287d1c6325698991ff677e71148cac347e07493bb5b3ab891e614b89e1f8", size = 1651273 }, ] [[package]] name = "llama-stack" -version = "0.1.1" +version = "0.1.2" source = { editable = "." 
} dependencies = [ { name = "blobfile" }, @@ -751,8 +751,8 @@ requires-dist = [ { name = "fire" }, { name = "httpx" }, { name = "huggingface-hub" }, - { name = "llama-models", specifier = ">=0.1.1" }, - { name = "llama-stack-client", specifier = ">=0.1.1" }, + { name = "llama-models", specifier = ">=0.1.2" }, + { name = "llama-stack-client", specifier = ">=0.1.2" }, { name = "myst-parser", marker = "extra == 'docs'" }, { name = "nbval", marker = "extra == 'dev'" }, { name = "pre-commit", marker = "extra == 'dev'" }, @@ -780,7 +780,7 @@ requires-dist = [ [[package]] name = "llama-stack-client" -version = "0.1.1" +version = "0.1.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -797,9 +797,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/07/42/7004958ac1a6da9a8060decf0d9120fdeb3b2775de090a0a473f2ee4a27d/llama_stack_client-0.1.1.tar.gz", hash = "sha256:3e549a848ade959d342fa52ec49b1913b7bb615a77b5b8dcaefe6ff94409049e", size = 179729 } +sdist = { url = "https://files.pythonhosted.org/packages/9e/75/8b41a3026c871a8650cd8d2cfda9f891a9163458813574f36518bb40afe4/llama_stack_client-0.1.2.tar.gz", hash = "sha256:94277ddae52be557d771dcdc15d85af9012b5aa87439dd69ec1dc0ff486b0c8e", size = 188023 } wheels = [ - { url = "https://files.pythonhosted.org/packages/80/66/5255c09dc001ff437fd6fe6fad27142035b60073df243f7df0494095f605/llama_stack_client-0.1.1-py3-none-any.whl", hash = "sha256:e07d58fdcc1eaa370dd00b94c2dd1a8169c0ac60c37f6f2772cbc2c5b63f2e62", size = 348665 }, + { url = "https://files.pythonhosted.org/packages/c4/32/3a3a97eecff1f1e3a1dc90e9b00681abea11ec4f43a7ca549981261e18b6/llama_stack_client-0.1.2-py3-none-any.whl", hash = "sha256:85ff0fb57a62d7d0470cfaa2b07a595c9fb3483297944d5e5a066db850d38ccd", size = 359415 }, ] [[package]] From ab9516c7899d2411f1096d6b63e6366c228d9460 Mon Sep 17 00:00:00 2001 From: Ellis Tarn Date: Mon, 10 Feb 2025 13:24:15 -0800 Subject: [PATCH 26/29] fix: Gaps in doc codegen (#1035) # What does this PR do? Catches docs up to source with: ``` python llama_stack/scripts/distro_codegen.py ``` [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan [Describe the tests you ran to verify your changes with result summaries. 
*Provide clear instructions so the plan can be easily re-executed.*] Manually checked ``` sphinx-autobuild docs/source build/html ``` [//]: # (## Documentation) [//]: # (- [ ] Added a Changelog entry if the change is significant) --- distributions/dependencies.json | 192 +++++++++--------- .../self_hosted_distro/ollama.md | 4 +- llama_stack/scripts/distro_codegen.py | 2 +- llama_stack/templates/bedrock/run.yaml | 2 + llama_stack/templates/cerebras/run.yaml | 2 + .../templates/dell/run-with-safety.yaml | 2 + llama_stack/templates/dell/run.yaml | 2 + .../templates/fireworks/run-with-safety.yaml | 2 + llama_stack/templates/fireworks/run.yaml | 2 + .../hf-endpoint/run-with-safety.yaml | 2 + llama_stack/templates/hf-endpoint/run.yaml | 2 + .../hf-serverless/run-with-safety.yaml | 2 + llama_stack/templates/hf-serverless/run.yaml | 2 + .../meta-reference-gpu/run-with-safety.yaml | 2 + .../templates/meta-reference-gpu/run.yaml | 2 + .../meta-reference-quantized-gpu/run.yaml | 2 + llama_stack/templates/nvidia/run.yaml | 2 + .../templates/ollama/run-with-safety.yaml | 2 + llama_stack/templates/ollama/run.yaml | 2 + .../remote-vllm/run-with-safety.yaml | 2 + llama_stack/templates/remote-vllm/run.yaml | 2 + llama_stack/templates/sambanova/run.yaml | 2 + .../templates/tgi/run-with-safety.yaml | 2 + llama_stack/templates/tgi/run.yaml | 2 + .../templates/together/run-with-safety.yaml | 2 + llama_stack/templates/together/run.yaml | 2 + llama_stack/templates/vllm-gpu/run.yaml | 2 + 27 files changed, 146 insertions(+), 100 deletions(-) diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 6babf3440..c1450d97e 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -66,6 +66,40 @@ "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], + "dell": [ + "aiohttp", + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "chromadb-client", + "datasets", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "huggingface_hub", + "matplotlib", + "nltk", + "numpy", + "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "requests", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch torchvision --index-url https://download.pytorch.org/whl/cpu" + ], "fireworks": [ "aiosqlite", "autoevals", @@ -252,6 +286,38 @@ "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], + "nvidia": [ + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "datasets", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "mcp", + "nltk", + "numpy", + "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "requests", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch torchvision --index-url https://download.pytorch.org/whl/cpu" + ], "ollama": [ "aiohttp", "aiosqlite", @@ -319,6 +385,36 @@ "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], + "sambanova": [ + "aiosqlite", + "blobfile", + "chardet", + "chromadb-client", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "nltk", + "numpy", + "openai", + "opentelemetry-exporter-otlp-proto-http", + 
"opentelemetry-sdk", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "requests", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch torchvision --index-url https://download.pytorch.org/whl/cpu" + ], "tgi": [ "aiohttp", "aiosqlite", @@ -421,101 +517,5 @@ "vllm", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" - ], - "nvidia": [ - "aiosqlite", - "autoevals", - "blobfile", - "chardet", - "datasets", - "faiss-cpu", - "fastapi", - "fire", - "httpx", - "matplotlib", - "mcp", - "nltk", - "numpy", - "openai", - "opentelemetry-exporter-otlp-proto-http", - "opentelemetry-sdk", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "requests", - "scikit-learn", - "scipy", - "sentencepiece", - "tqdm", - "transformers", - "uvicorn", - "sentence-transformers --no-deps", - "torch torchvision --index-url https://download.pytorch.org/whl/cpu" - ], - "sambanova": [ - "aiosqlite", - "blobfile", - "chardet", - "chromadb-client", - "faiss-cpu", - "fastapi", - "fire", - "httpx", - "matplotlib", - "nltk", - "numpy", - "openai", - "opentelemetry-exporter-otlp-proto-http", - "opentelemetry-sdk", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "requests", - "scikit-learn", - "scipy", - "sentencepiece", - "tqdm", - "transformers", - "uvicorn", - "sentence-transformers --no-deps", - "torch torchvision --index-url https://download.pytorch.org/whl/cpu" - ], - "dell": [ - "aiohttp", - "aiosqlite", - "autoevals", - "blobfile", - "chardet", - "chromadb-client", - "datasets", - "faiss-cpu", - "fastapi", - "fire", - "httpx", - "huggingface_hub", - "matplotlib", - "nltk", - "numpy", - "openai", - "opentelemetry-exporter-otlp-proto-http", - "opentelemetry-sdk", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "requests", - "scikit-learn", - "scipy", - "sentencepiece", - "tqdm", - "transformers", - "uvicorn", - "sentence-transformers --no-deps", - "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ] } diff --git a/docs/source/distributions/self_hosted_distro/ollama.md b/docs/source/distributions/self_hosted_distro/ollama.md index 54f6b8fdf..73a609421 100644 --- a/docs/source/distributions/self_hosted_distro/ollama.md +++ b/docs/source/distributions/self_hosted_distro/ollama.md @@ -26,9 +26,7 @@ The `llamastack/distribution-ollama` distribution consists of the following prov | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | -You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration. - -### Environment Variables +You should use this distribution if you have a regular desktop machine without very powerful GPUs. 
Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.### Environment Variables The following environment variables can be configured: diff --git a/llama_stack/scripts/distro_codegen.py b/llama_stack/scripts/distro_codegen.py index 7064d3104..c73c15d41 100644 --- a/llama_stack/scripts/distro_codegen.py +++ b/llama_stack/scripts/distro_codegen.py @@ -29,7 +29,7 @@ def find_template_dirs(templates_dir: Path) -> Iterator[Path]: if not templates_dir.exists(): raise FileNotFoundError(f"Templates directory not found: {templates_dir}") - return (d for d in templates_dir.iterdir() if d.is_dir() and d.name != "__pycache__") + return sorted(d for d in templates_dir.iterdir() if d.is_dir() and d.name != "__pycache__") def process_template(template_dir: Path, progress) -> None: diff --git a/llama_stack/templates/bedrock/run.yaml b/llama_stack/templates/bedrock/run.yaml index 39408c1bd..be6c9a928 100644 --- a/llama_stack/templates/bedrock/run.yaml +++ b/llama_stack/templates/bedrock/run.yaml @@ -115,3 +115,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/cerebras/run.yaml b/llama_stack/templates/cerebras/run.yaml index 5a70890a8..05d3f4525 100644 --- a/llama_stack/templates/cerebras/run.yaml +++ b/llama_stack/templates/cerebras/run.yaml @@ -117,3 +117,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/dell/run-with-safety.yaml b/llama_stack/templates/dell/run-with-safety.yaml index bdc82d03a..04c5957d4 100644 --- a/llama_stack/templates/dell/run-with-safety.yaml +++ b/llama_stack/templates/dell/run-with-safety.yaml @@ -116,3 +116,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/dell/run.yaml b/llama_stack/templates/dell/run.yaml index 2ba62a782..706444eb1 100644 --- a/llama_stack/templates/dell/run.yaml +++ b/llama_stack/templates/dell/run.yaml @@ -107,3 +107,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/fireworks/run-with-safety.yaml b/llama_stack/templates/fireworks/run-with-safety.yaml index a4b425436..0fbe14a5a 100644 --- a/llama_stack/templates/fireworks/run-with-safety.yaml +++ b/llama_stack/templates/fireworks/run-with-safety.yaml @@ -172,3 +172,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index a497317bd..ccf67dcbb 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -161,3 +161,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/hf-endpoint/run-with-safety.yaml b/llama_stack/templates/hf-endpoint/run-with-safety.yaml index 0329f580b..f520a2fda 100644 --- a/llama_stack/templates/hf-endpoint/run-with-safety.yaml +++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml @@ -124,3 +124,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: 
code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/hf-endpoint/run.yaml b/llama_stack/templates/hf-endpoint/run.yaml index 8163fe28e..708cb1bcc 100644 --- a/llama_stack/templates/hf-endpoint/run.yaml +++ b/llama_stack/templates/hf-endpoint/run.yaml @@ -114,3 +114,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/hf-serverless/run-with-safety.yaml b/llama_stack/templates/hf-serverless/run-with-safety.yaml index 9cee920a5..7f0abf5be 100644 --- a/llama_stack/templates/hf-serverless/run-with-safety.yaml +++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml @@ -124,3 +124,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/hf-serverless/run.yaml b/llama_stack/templates/hf-serverless/run.yaml index c8ad0d38d..c0b7a4c60 100644 --- a/llama_stack/templates/hf-serverless/run.yaml +++ b/llama_stack/templates/hf-serverless/run.yaml @@ -114,3 +114,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml index 0faaabb15..c5286fc6b 100644 --- a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml +++ b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml @@ -126,3 +126,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/meta-reference-gpu/run.yaml b/llama_stack/templates/meta-reference-gpu/run.yaml index 6ffe1fa36..310585f23 100644 --- a/llama_stack/templates/meta-reference-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-gpu/run.yaml @@ -115,3 +115,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml index 5ff87a901..d43cf3917 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml @@ -117,3 +117,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml index 6dc325e9d..c8ae362f5 100644 --- a/llama_stack/templates/nvidia/run.yaml +++ b/llama_stack/templates/nvidia/run.yaml @@ -147,3 +147,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml index 5b5c9c253..ac5dab755 100644 --- a/llama_stack/templates/ollama/run-with-safety.yaml +++ b/llama_stack/templates/ollama/run-with-safety.yaml @@ -121,3 +121,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml index 3cc1cb2ac..485223675 100644 --- a/llama_stack/templates/ollama/run.yaml +++ 
b/llama_stack/templates/ollama/run.yaml @@ -110,3 +110,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml index 4a0fa9a85..1fe998a1f 100644 --- a/llama_stack/templates/remote-vllm/run-with-safety.yaml +++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml @@ -126,3 +126,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/remote-vllm/run.yaml b/llama_stack/templates/remote-vllm/run.yaml index 9631f94a2..9d3db8a31 100644 --- a/llama_stack/templates/remote-vllm/run.yaml +++ b/llama_stack/templates/remote-vllm/run.yaml @@ -115,3 +115,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/sambanova/run.yaml b/llama_stack/templates/sambanova/run.yaml index 6cec51824..39b0f3c4e 100644 --- a/llama_stack/templates/sambanova/run.yaml +++ b/llama_stack/templates/sambanova/run.yaml @@ -126,3 +126,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml index 503505c32..ed6c9ef6f 100644 --- a/llama_stack/templates/tgi/run-with-safety.yaml +++ b/llama_stack/templates/tgi/run-with-safety.yaml @@ -114,3 +114,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml index f1953c513..8bf76f37b 100644 --- a/llama_stack/templates/tgi/run.yaml +++ b/llama_stack/templates/tgi/run.yaml @@ -113,3 +113,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/together/run-with-safety.yaml b/llama_stack/templates/together/run-with-safety.yaml index ec351108e..298926630 100644 --- a/llama_stack/templates/together/run-with-safety.yaml +++ b/llama_stack/templates/together/run-with-safety.yaml @@ -167,3 +167,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml index c2afd98e9..920003759 100644 --- a/llama_stack/templates/together/run.yaml +++ b/llama_stack/templates/together/run.yaml @@ -156,3 +156,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml index 165e4d51d..41a545e1a 100644 --- a/llama_stack/templates/vllm-gpu/run.yaml +++ b/llama_stack/templates/vllm-gpu/run.yaml @@ -117,3 +117,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 From afca9d92f93b6384d1344be0726ea1d4c0b2c5e9 Mon Sep 17 00:00:00 2001 From: Ellis Tarn Date: Mon, 10 Feb 2025 13:35:16 -0800 Subject: [PATCH 27/29] fix: Readthedocs cannot parse comments, resulting in docs bugs (#1033) --- 
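
For context on the fix: MyST front matter (the `---` / `orphan: true` / `---` block) must be the very first thing in a page, so the auto-generated comment (whose text does not survive in this extract) is now spliced in after the front matter instead of being prepended. A sketch of the ordering logic, mirroring the `template.py` hunk at the end of this patch:

```
ORPHAN_TEXT = "---\norphan: true\n---\n"


def place_generated_comment(template: str, comment: str) -> str:
    """Keep MyST front matter first; put the generated comment right after it."""
    if template.startswith(ORPHAN_TEXT):
        return template.replace(ORPHAN_TEXT, ORPHAN_TEXT + comment)
    # No front matter: the comment can safely lead the file.
    return comment + template
```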
docs/source/distributions/self_hosted_distro/dell.md | 2 +- .../distributions/self_hosted_distro/fireworks.md | 2 +- .../self_hosted_distro/meta-reference-gpu.md | 2 +- .../meta-reference-quantized-gpu.md | 2 +- .../source/distributions/self_hosted_distro/ollama.md | 2 +- .../distributions/self_hosted_distro/remote-vllm.md | 2 +- .../distributions/self_hosted_distro/sambanova.md | 2 +- docs/source/distributions/self_hosted_distro/tgi.md | 2 +- .../distributions/self_hosted_distro/together.md | 2 +- llama_stack/templates/template.py | 11 +++++++++-- 10 files changed, 18 insertions(+), 11 deletions(-) diff --git a/docs/source/distributions/self_hosted_distro/dell.md b/docs/source/distributions/self_hosted_distro/dell.md index be326ffa5..aef3ecf58 100644 --- a/docs/source/distributions/self_hosted_distro/dell.md +++ b/docs/source/distributions/self_hosted_distro/dell.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # Dell Distribution of Llama Stack diff --git a/docs/source/distributions/self_hosted_distro/fireworks.md b/docs/source/distributions/self_hosted_distro/fireworks.md index 9afeb4894..f77d9f656 100644 --- a/docs/source/distributions/self_hosted_distro/fireworks.md +++ b/docs/source/distributions/self_hosted_distro/fireworks.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # Fireworks Distribution ```{toctree} diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md index d00d8177f..b183757db 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # Meta Reference Distribution ```{toctree} diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md index e46c2d112..9aeb7a88b 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # Meta Reference Quantized Distribution ```{toctree} diff --git a/docs/source/distributions/self_hosted_distro/ollama.md b/docs/source/distributions/self_hosted_distro/ollama.md index 73a609421..c015b9610 100644 --- a/docs/source/distributions/self_hosted_distro/ollama.md +++ b/docs/source/distributions/self_hosted_distro/ollama.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # Ollama Distribution ```{toctree} diff --git a/docs/source/distributions/self_hosted_distro/remote-vllm.md b/docs/source/distributions/self_hosted_distro/remote-vllm.md index ff626d40d..6c3bbd1d0 100644 --- a/docs/source/distributions/self_hosted_distro/remote-vllm.md +++ b/docs/source/distributions/self_hosted_distro/remote-vllm.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # Remote vLLM Distribution ```{toctree} :maxdepth: 2 diff --git a/docs/source/distributions/self_hosted_distro/sambanova.md b/docs/source/distributions/self_hosted_distro/sambanova.md index 86ef4ac58..e6ac616be 100644 --- a/docs/source/distributions/self_hosted_distro/sambanova.md +++ b/docs/source/distributions/self_hosted_distro/sambanova.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # SambaNova Distribution ```{toctree} diff --git a/docs/source/distributions/self_hosted_distro/tgi.md b/docs/source/distributions/self_hosted_distro/tgi.md index b970ab9fe..f4eecf2cd 100644 --- a/docs/source/distributions/self_hosted_distro/tgi.md +++ 
b/docs/source/distributions/self_hosted_distro/tgi.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # TGI Distribution diff --git a/docs/source/distributions/self_hosted_distro/together.md b/docs/source/distributions/self_hosted_distro/together.md index 45ae462d5..8e36c1eb0 100644 --- a/docs/source/distributions/self_hosted_distro/together.md +++ b/docs/source/distributions/self_hosted_distro/together.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # Together Distribution ```{toctree} diff --git a/llama_stack/templates/template.py b/llama_stack/templates/template.py index 09efd2038..04a09741c 100644 --- a/llama_stack/templates/template.py +++ b/llama_stack/templates/template.py @@ -131,8 +131,15 @@ class DistributionTemplate(BaseModel): providers_str = ", ".join(f"`{p}`" for p in providers) providers_table += f"| {api} | {providers_str} |\n" - template = "\n" - template += self.template_path.read_text() + template = self.template_path.read_text() + comment = "\n" + orphantext = "---\norphan: true\n---\n" + + if template.startswith(orphantext): + template = template.replace(orphantext, orphantext + comment) + else: + template = comment + template + # Render template with rich-generated table env = jinja2.Environment( trim_blocks=True, From 36d35406a77df5bc4dad341a4ac64f2d8e3b8a5b Mon Sep 17 00:00:00 2001 From: Ellis Tarn Date: Mon, 10 Feb 2025 14:27:17 -0800 Subject: [PATCH 28/29] fix: a bad newline in ollama docs (#1036) # What does this PR do? Catches a bug in the previous codegen which was removing newlines. [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan ``` python llama_stack/scripts/distro_codegen.py ``` [//]: # (## Documentation) [//]: # (- [ ] Added a Changelog entry if the change is significant) --- docs/source/distributions/self_hosted_distro/ollama.md | 4 +++- llama_stack/templates/ollama/doc_template.md | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/source/distributions/self_hosted_distro/ollama.md b/docs/source/distributions/self_hosted_distro/ollama.md index c015b9610..a3a45f9a8 100644 --- a/docs/source/distributions/self_hosted_distro/ollama.md +++ b/docs/source/distributions/self_hosted_distro/ollama.md @@ -26,7 +26,9 @@ The `llamastack/distribution-ollama` distribution consists of the following prov | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | -You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.### Environment Variables +You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration. + +### Environment Variables The following environment variables can be configured: diff --git a/llama_stack/templates/ollama/doc_template.md b/llama_stack/templates/ollama/doc_template.md index eb4aadd29..29efe39c3 100644 --- a/llama_stack/templates/ollama/doc_template.md +++ b/llama_stack/templates/ollama/doc_template.md @@ -16,7 +16,7 @@ The `llamastack/distribution-{{ name }}` distribution consists of the following You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration. 
-{%- if run_config_env_vars %}
+{% if run_config_env_vars %}
 ### Environment Variables
 
 The following environment variables can be configured:

From 3856927ee8ac4c3d8c6b1cc36cbaf0be29cf5076 Mon Sep 17 00:00:00 2001
From: Bill Murdock
Date: Mon, 10 Feb 2025 18:08:33 -0500
Subject: [PATCH 29/29] fix: Update Qdrant support post-refactor (#1022)

# What does this PR do?

I tried running the Qdrant provider and found some bugs. See #1021 for details. @terrytangyuan wrote there:

> Please feel free to submit your changes in a PR. I fixed similar issues for pgvector provider. This might be an issue introduced from a refactoring.

So I am submitting this PR.

Closes #1021

## Test Plan

Here are the highlights of what I did to test this:

References:
- https://llama-stack.readthedocs.io/en/latest/getting_started/index.html
- https://github.com/meta-llama/llama-stack-apps/blob/main/examples/agents/rag_with_vector_db.py
- https://github.com/meta-llama/llama-stack/blob/main/docs/zero_to_hero_guide/README.md#build-configure-and-run-llama-stack

Install and run the Qdrant server:
```
podman pull qdrant/qdrant
mkdir qdrant-data
podman run -p 6333:6333 -v $(pwd)/qdrant-data:/qdrant/storage qdrant/qdrant
```

Install and run Llama Stack from the venv-support PR (mainly because I didn't want to install conda):
```
brew install cmake # Should just need this once
git clone https://github.com/meta-llama/llama-models.git
gh repo clone cdoern/llama-stack
cd llama-stack
gh pr checkout 1018 # This is the checkout that introduces venv support for build/run. Otherwise you have to use conda. Eventually this will be part of main, hopefully.
uv sync --extra dev
uv pip install -e .
source .venv/bin/activate
uv pip install qdrant_client
LLAMA_STACK_DIR=$(pwd) LLAMA_MODELS_DIR=../llama-models llama stack build --template ollama --image-type venv
```
Edit the run config:
```
edit llama_stack/templates/ollama/run.yaml
```
In that editor, under:
```
vector_io:
```
add:
```
- provider_id: qdrant
  provider_type: remote::qdrant
  config: {}
```
See https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/vector_io/qdrant/config.py#L14 for config options (but I didn't need any).
```
LLAMA_STACK_DIR=$(pwd) LLAMA_MODELS_DIR=../llama-models llama stack run ollama --image-type venv \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env SAFETY_MODEL=$SAFETY_MODEL \
   --env OLLAMA_URL=$OLLAMA_URL
```
Then I tested it out in a notebook. Key highlights included:
```
qdrant_provider = None
for provider in client.providers.list():
    if provider.api == "vector_io" and provider.provider_id == "qdrant":
        qdrant_provider = provider
qdrant_provider
assert qdrant_provider is not None, "QDrant is not a provider. You need to edit the run yaml file you use in your `llama stack run` call"

vector_db_id = f"test-vector-db-{uuid.uuid4().hex}"
client.vector_dbs.register(
    vector_db_id=vector_db_id,
    embedding_model="all-MiniLM-L6-v2",
    embedding_dimension=384,
    provider_id=qdrant_provider.provider_id,
)
```
Other than that, I just followed what was in https://llama-stack.readthedocs.io/en/latest/getting_started/index.html

It would be good to have automated tests for this in the future, but that would be a big undertaking.
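
As a rough end-to-end check of the fixed write/read path, something along these lines should work (a sketch, not from the PR; the chunk shape mirrors the `vector_io` example in the rag.md fix earlier in this series, with `document_id` duplicated into `metadata` since the fixed `QdrantIndex` reads `chunk.metadata['document_id']`):
```
chunks = [
    {
        "document_id": "doc1",
        "content": "Llama Stack can use a remote Qdrant server for vector storage.",
        "mime_type": "text/plain",
        "metadata": {"document_id": "doc1"},  # the fixed QdrantIndex derives chunk IDs from this
    },
]
client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks)

# Query the chunk back through the same Qdrant-backed vector DB
chunks_response = client.vector_io.query(
    vector_db_id=vector_db_id, query="What can Llama Stack use for vector storage?"
)
print(chunks_response.chunks, chunks_response.scores)
```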
Signed-off-by: Bill Murdock --- llama_stack/providers/remote/vector_io/qdrant/__init__.py | 4 ++-- llama_stack/providers/remote/vector_io/qdrant/qdrant.py | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/llama_stack/providers/remote/vector_io/qdrant/__init__.py b/llama_stack/providers/remote/vector_io/qdrant/__init__.py index 54605fcf9..c584e29ef 100644 --- a/llama_stack/providers/remote/vector_io/qdrant/__init__.py +++ b/llama_stack/providers/remote/vector_io/qdrant/__init__.py @@ -12,8 +12,8 @@ from .config import QdrantConfig async def get_adapter_impl(config: QdrantConfig, deps: Dict[Api, ProviderSpec]): - from .qdrant import QdrantVectorMemoryAdapter + from .qdrant import QdrantVectorDBAdapter - impl = QdrantVectorMemoryAdapter(config, deps[Api.inference]) + impl = QdrantVectorDBAdapter(config, deps[Api.inference]) await impl.initialize() return impl diff --git a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py index 719070528..e7ad136eb 100644 --- a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py +++ b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py @@ -55,7 +55,7 @@ class QdrantIndex(EmbeddingIndex): points = [] for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)): - chunk_id = f"{chunk.document_id}:chunk-{i}" + chunk_id = f"{chunk.metadata['document_id']}:chunk-{i}" points.append( PointStruct( id=convert_id(chunk_id), @@ -93,6 +93,9 @@ class QdrantIndex(EmbeddingIndex): return QueryChunksResponse(chunks=chunks, scores=scores) + async def delete(self): + await self.client.delete_collection(collection_name=self.collection_name) + class QdrantVectorDBAdapter(VectorIO, VectorDBsProtocolPrivate): def __init__(self, config: QdrantConfig, inference_api: Api.inference) -> None: