From 5e4ac1b7c1feee0e770a4149bafa9c6bb7ac812f Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Tue, 19 Nov 2024 09:15:05 -0800 Subject: [PATCH 01/22] Make sure server code uses version prefixed routes --- docs/resources/llama-stack-spec.html | 2 +- docs/resources/llama-stack-spec.yaml | 2 +- llama_stack/distribution/server/endpoints.py | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html index 838633a4f..cf4bf5125 100644 --- a/docs/resources/llama-stack-spec.html +++ b/docs/resources/llama-stack-spec.html @@ -21,7 +21,7 @@ "info": { "title": "[DRAFT] Llama Stack Specification", "version": "alpha", - "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-11-18 23:37:24.867143" + "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-11-19 09:14:01.145131" }, "servers": [ { diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml index 994e3aac4..e84f11bdd 100644 --- a/docs/resources/llama-stack-spec.yaml +++ b/docs/resources/llama-stack-spec.yaml @@ -3400,7 +3400,7 @@ info: description: "This is the specification of the llama stack that provides\n \ \ a set of endpoints and their corresponding interfaces that are tailored\ \ to\n best leverage Llama Models. The specification is still in\ - \ draft and subject to change.\n Generated at 2024-11-18 23:37:24.867143" + \ draft and subject to change.\n Generated at 2024-11-19 09:14:01.145131" title: '[DRAFT] Llama Stack Specification' version: alpha jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema diff --git a/llama_stack/distribution/server/endpoints.py b/llama_stack/distribution/server/endpoints.py index 93432abe1..af429e020 100644 --- a/llama_stack/distribution/server/endpoints.py +++ b/llama_stack/distribution/server/endpoints.py @@ -9,6 +9,8 @@ from typing import Dict, List from pydantic import BaseModel +from llama_stack.apis.version import LLAMA_STACK_API_VERSION + from llama_stack.distribution.resolver import api_protocol_map from llama_stack.providers.datatypes import Api @@ -33,7 +35,7 @@ def get_all_api_endpoints() -> Dict[Api, List[ApiEndpoint]]: continue webmethod = method.__webmethod__ - route = webmethod.route + route = f"/{LLAMA_STACK_API_VERSION}/{webmethod.route.lstrip('/')}" if webmethod.method == "GET": method = "get" From 1619d37cc653cb1d9cbddcbc5627cd818b11b3e6 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Tue, 19 Nov 2024 09:54:30 -0800 Subject: [PATCH 02/22] codegen per-distro dependencies; not hooked into setup.py yet --- MANIFEST.in | 1 + distributions/dependencies.json | 177 ++++++++++++++++++++++++++ llama_stack/scripts/distro_codegen.py | 38 ++++++ 3 files changed, 216 insertions(+) create mode 100644 distributions/dependencies.json diff --git a/MANIFEST.in b/MANIFEST.in index 27cb775f7..4d1843051 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ include requirements.txt +include distributions/dependencies.json include llama_stack/distribution/*.sh include llama_stack/cli/scripts/*.sh include llama_stack/templates/*/*.yaml diff --git 
a/distributions/dependencies.json b/distributions/dependencies.json new file mode 100644 index 000000000..6827af1f1 --- /dev/null +++ b/distributions/dependencies.json @@ -0,0 +1,177 @@ +{ + "together": [ + "scipy", + "scikit-learn", + "nltk", + "chardet", + "chromadb-client", + "psycopg2-binary", + "sentencepiece", + "faiss-cpu", + "blobfile", + "pandas", + "pillow", + "together", + "pypdf", + "matplotlib", + "aiosqlite", + "redis", + "transformers", + "numpy", + "tqdm", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu", + "aiosqlite", + "fastapi", + "fire", + "httpx", + "uvicorn" + ], + "remote-vllm": [ + "scipy", + "scikit-learn", + "nltk", + "chardet", + "chromadb-client", + "psycopg2-binary", + "sentencepiece", + "faiss-cpu", + "blobfile", + "pandas", + "pillow", + "pypdf", + "matplotlib", + "openai", + "aiosqlite", + "redis", + "transformers", + "numpy", + "tqdm", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu", + "aiosqlite", + "fastapi", + "fire", + "httpx", + "uvicorn" + ], + "fireworks": [ + "scipy", + "scikit-learn", + "nltk", + "chardet", + "chromadb-client", + "psycopg2-binary", + "sentencepiece", + "faiss-cpu", + "blobfile", + "pandas", + "pillow", + "pypdf", + "matplotlib", + "aiosqlite", + "redis", + "transformers", + "fireworks-ai", + "numpy", + "tqdm", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu", + "aiosqlite", + "fastapi", + "fire", + "httpx", + "uvicorn" + ], + "tgi": [ + "scipy", + "scikit-learn", + "nltk", + "aiohttp", + "chardet", + "chromadb-client", + "psycopg2-binary", + "huggingface_hub", + "sentencepiece", + "faiss-cpu", + "blobfile", + "pandas", + "pillow", + "pypdf", + "matplotlib", + "aiosqlite", + "transformers", + "redis", + "numpy", + "tqdm", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu", + "aiosqlite", + "fastapi", + "fire", + "httpx", + "uvicorn" + ], + "meta-reference-gpu": [ + "lm-format-enforcer", + "scipy", + "scikit-learn", + "nltk", + "accelerate", + "chardet", + "chromadb-client", + "psycopg2-binary", + "sentencepiece", + "zmq", + "faiss-cpu", + "torchvision", + "blobfile", + "fairscale", + "pandas", + "pillow", + "pypdf", + "matplotlib", + "transformers", + "torch", + "aiosqlite", + "redis", + "numpy", + "tqdm", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu", + "aiosqlite", + "fastapi", + "fire", + "httpx", + "uvicorn" + ], + "ollama": [ + "scipy", + "scikit-learn", + "nltk", + "aiohttp", + "ollama", + "chardet", + "chromadb-client", + "psycopg2-binary", + "sentencepiece", + "faiss-cpu", + "blobfile", + "pandas", + "pillow", + "pypdf", + "matplotlib", + "aiosqlite", + "transformers", + "redis", + "numpy", + "tqdm", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu", + "aiosqlite", + "fastapi", + "fire", + "httpx", + "uvicorn" + ] +} diff --git a/llama_stack/scripts/distro_codegen.py b/llama_stack/scripts/distro_codegen.py index f0d3bb4b9..8bcf97374 100644 --- a/llama_stack/scripts/distro_codegen.py +++ b/llama_stack/scripts/distro_codegen.py @@ -6,6 +6,7 @@ import concurrent.futures import importlib +import json import subprocess import sys from functools import partial @@ -14,6 +15,11 @@ from typing import Iterator from rich.progress import Progress, SpinnerColumn, TextColumn +from llama_stack.distribution.build import ( + get_provider_dependencies, + 
SERVER_DEPENDENCIES, +) + REPO_ROOT = Path(__file__).parent.parent.parent @@ -67,6 +73,36 @@ def check_for_changes() -> bool: return result.returncode != 0 +def collect_template_dependencies(template_dir: Path) -> tuple[str, list[str]]: + try: + module_name = f"llama_stack.templates.{template_dir.name}" + module = importlib.import_module(module_name) + + if template_func := getattr(module, "get_distribution_template", None): + template = template_func() + normal_deps, special_deps = get_provider_dependencies(template.providers) + # Combine all dependencies in order: normal deps, special deps, server deps + all_deps = normal_deps + special_deps + SERVER_DEPENDENCIES + return template.name, all_deps + except Exception: + return None, [] + return None, [] + + +def generate_dependencies_file(): + templates_dir = REPO_ROOT / "llama_stack" / "templates" + distribution_deps = {} + + for template_dir in find_template_dirs(templates_dir): + name, deps = collect_template_dependencies(template_dir) + if name: + distribution_deps[name] = deps + + deps_file = REPO_ROOT / "distributions" / "dependencies.json" + with open(deps_file, "w") as f: + json.dump(distribution_deps, f, indent=2) + + def main(): templates_dir = REPO_ROOT / "llama_stack" / "templates" @@ -88,6 +124,8 @@ def main(): list(executor.map(process_func, template_dirs)) progress.update(task, advance=len(template_dirs)) + generate_dependencies_file() + if check_for_changes(): print( "Distribution template changes detected. Please commit the changes.", From 1b0f5fff5ae36f765d24bfaab24bc305ede5ebe3 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 19 Nov 2024 10:26:05 -0800 Subject: [PATCH 03/22] fix curl endpoint --- docs/source/getting_started/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/getting_started/index.md b/docs/source/getting_started/index.md index eb95db7cc..189bd6cb5 100644 --- a/docs/source/getting_started/index.md +++ b/docs/source/getting_started/index.md @@ -535,10 +535,10 @@ $ llama-stack-client models list Once the server is set up, we can test it with a client to verify it's working correctly. The following command will send a chat completion request to the server's `/inference/chat_completion` API: ```bash -$ curl http://localhost:5000/inference/chat_completion \ +$ curl http://localhost:5000/alpha/inference/chat-completion \ -H "Content-Type: application/json" \ -d '{ - "model_id": "Llama3.1-8B-Instruct", + "model_id": "meta-llama/Llama-3.1-8B-Instruct", "messages": [ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Write me a 2 sentence poem about the moon"} From 39e99b39fe60b0064f91cacd52911b9863da54c9 Mon Sep 17 00:00:00 2001 From: Henry Tai Date: Wed, 20 Nov 2024 02:32:19 +0800 Subject: [PATCH 04/22] update quick start to have the working instruction (#467) # What does this PR do? Fix the instruction in quickstart readme so the new developers/users can run it without issues. ## Test Plan None ## Sources Please link relevant resources if necessary. ## Before submitting - [X] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [X] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [X] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. 
Co-authored-by: Henry Tai --- docs/zero_to_hero_guide/quickstart.md | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/docs/zero_to_hero_guide/quickstart.md b/docs/zero_to_hero_guide/quickstart.md index 54a01e219..df8e9abc4 100644 --- a/docs/zero_to_hero_guide/quickstart.md +++ b/docs/zero_to_hero_guide/quickstart.md @@ -22,14 +22,22 @@ If you're looking for more specific topics like tool calling or agent setup, we - Download and unzip `Ollama-darwin.zip`. - Run the `Ollama` application. -2. **Download the Ollama CLI**: +1. **Download the Ollama CLI**: - Ensure you have the `ollama` command line tool by downloading and installing it from the same website. -3. **Verify Installation**: +1. **Start ollama server**: + - Open the terminal and run: + ``` + ollama serve + ``` + +1. **Run the model**: - Open the terminal and run: ```bash - ollama run llama3.2:1b + ollama run llama3.2:3b-instruct-fp16 ``` + **Note**: The supported models for llama stack for now is listed in [here](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L43) + --- @@ -84,6 +92,8 @@ If you're looking for more specific topics like tool calling or agent setup, we ```bash llama stack run /path/to/your/distro/llamastack-ollama/ollama-run.yaml --port 5050 ``` + Note: + 1. Everytime you run a new model with `ollama run`, you will need to restart the llama stack. Otherwise it won't see the new model The server will start and listen on `http://localhost:5050`. @@ -97,7 +107,7 @@ After setting up the server, open a new terminal window and verify it's working curl http://localhost:5050/inference/chat_completion \ -H "Content-Type: application/json" \ -d '{ - "model": "llama3.2:1b", + "model": "Llama3.2-3B-Instruct", "messages": [ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Write me a 2-sentence poem about the moon"} @@ -106,6 +116,8 @@ curl http://localhost:5050/inference/chat_completion \ }' ``` +You can check the available models with the command `llama-stack-client models list`. 
+ **Expected Output:** ```json { From c46b462c229c933ed4d5006fcb5951573abd17c6 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Tue, 19 Nov 2024 11:36:53 -0800 Subject: [PATCH 05/22] Updates to docker build script --- llama_stack/distribution/build_container.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/llama_stack/distribution/build_container.sh b/llama_stack/distribution/build_container.sh index 139883618..b56c76ebd 100755 --- a/llama_stack/distribution/build_container.sh +++ b/llama_stack/distribution/build_container.sh @@ -9,6 +9,7 @@ LLAMA_MODELS_DIR=${LLAMA_MODELS_DIR:-} LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-} TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-} +BUILD_PLATFORM=${BUILD_PLATFORM:-} if [ "$#" -lt 4 ]; then echo "Usage: $0 []" >&2 @@ -77,6 +78,10 @@ if [ -n "$special_pip_deps" ]; then done fi +# This has been added to simplify UI development, but we likely need +# to add this as a dependency to `llama-stack` itself +add_to_docker "RUN pip install llama-stack-client" + stack_mount="/app/llama-stack-source" models_mount="/app/llama-models-source" @@ -116,7 +121,6 @@ RUN pip install --no-cache $models_mount EOF fi - add_to_docker < Date: Tue, 19 Nov 2024 11:44:35 -0800 Subject: [PATCH 06/22] Add llama-stack-client as a legitimate dependency for llama-stack --- llama_stack/distribution/build_container.sh | 4 ---- requirements.txt | 1 + 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/llama_stack/distribution/build_container.sh b/llama_stack/distribution/build_container.sh index b56c76ebd..230ca34ac 100755 --- a/llama_stack/distribution/build_container.sh +++ b/llama_stack/distribution/build_container.sh @@ -78,10 +78,6 @@ if [ -n "$special_pip_deps" ]; then done fi -# This has been added to simplify UI development, but we likely need -# to add this as a dependency to `llama-stack` itself -add_to_docker "RUN pip install llama-stack-client" - stack_mount="/app/llama-stack-source" models_mount="/app/llama-models-source" diff --git a/requirements.txt b/requirements.txt index da8b8e638..dcb30d605 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ fire httpx huggingface-hub llama-models>=0.0.50 +llama-stack-client>=0.0.50 prompt-toolkit python-dotenv pydantic>=2 From 05d1ead02f8ee2c3ff34be9fb89d9a5e6bf91e7a Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Tue, 19 Nov 2024 13:25:36 -0800 Subject: [PATCH 07/22] Update condition in tests to handle llama-3.1 vs llama3.1 (HF names) --- .../providers/tests/inference/test_text_inference.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/llama_stack/providers/tests/inference/test_text_inference.py b/llama_stack/providers/tests/inference/test_text_inference.py index 7b7aca5bd..6e263432a 100644 --- a/llama_stack/providers/tests/inference/test_text_inference.py +++ b/llama_stack/providers/tests/inference/test_text_inference.py @@ -25,7 +25,11 @@ from .utils import group_chunks def get_expected_stop_reason(model: str): - return StopReason.end_of_message if "Llama3.1" in model else StopReason.end_of_turn + return ( + StopReason.end_of_message + if ("Llama3.1" in model or "Llama-3.1" in model) + else StopReason.end_of_turn + ) @pytest.fixture @@ -34,7 +38,7 @@ def common_params(inference_model): "tool_choice": ToolChoice.auto, "tool_prompt_format": ( ToolPromptFormat.json - if "Llama3.1" in inference_model + if ("Llama3.1" in inference_model or "Llama-3.1" in inference_model) else ToolPromptFormat.python_list ), } From 38ba3b9f0ce33fe546ac82b94834590064175e4d 
Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Tue, 19 Nov 2024 13:36:14 -0800 Subject: [PATCH 08/22] Fix fireworks stream completion --- .../providers/remote/inference/fireworks/fireworks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py index 3ff50d378..02d4b82ef 100644 --- a/llama_stack/providers/remote/inference/fireworks/fireworks.py +++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py @@ -214,10 +214,10 @@ class FireworksInferenceAdapter( async def _to_async_generator(): if "messages" in params: - stream = await self._get_client().chat.completions.acreate(**params) + stream = self._get_client().chat.completions.acreate(**params) else: - stream = self._get_client().completion.create(**params) - for chunk in stream: + stream = self._get_client().completion.acreate(**params) + async for chunk in stream: yield chunk stream = _to_async_generator() From 185df4b568bf2faac2671bf0c046cf584670c812 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 19 Nov 2024 14:09:00 -0800 Subject: [PATCH 09/22] fix fireworks registration --- llama_stack/providers/remote/inference/fireworks/fireworks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py index 02d4b82ef..d8cbca5f9 100644 --- a/llama_stack/providers/remote/inference/fireworks/fireworks.py +++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py @@ -54,7 +54,7 @@ MODEL_ALIASES = [ ), build_model_alias( "fireworks/llama-v3p2-3b-instruct", - CoreModelId.llama3_2_11b_vision_instruct.value, + CoreModelId.llama3_2_3b_instruct.value, ), build_model_alias( "fireworks/llama-v3p2-11b-vision-instruct", From 189df6358af28dc7588b2035207180027818ddab Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 19 Nov 2024 14:16:00 -0800 Subject: [PATCH 10/22] codegen docs --- distributions/dependencies.json | 164 +++++++++--------- .../self_hosted_distro/fireworks.md | 2 +- llama_stack/templates/fireworks/run.yaml | 2 +- 3 files changed, 84 insertions(+), 84 deletions(-) diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 6827af1f1..469b6f14e 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -1,24 +1,24 @@ { "together": [ "scipy", + "blobfile", + "together", + "tqdm", + "sentencepiece", + "matplotlib", + "pandas", + "pypdf", "scikit-learn", "nltk", - "chardet", - "chromadb-client", - "psycopg2-binary", - "sentencepiece", "faiss-cpu", - "blobfile", - "pandas", - "pillow", - "together", - "pypdf", - "matplotlib", + "chardet", + "numpy", + "psycopg2-binary", "aiosqlite", + "pillow", "redis", "transformers", - "numpy", - "tqdm", + "chromadb-client", "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu", "aiosqlite", @@ -29,24 +29,24 @@ ], "remote-vllm": [ "scipy", + "blobfile", + "tqdm", + "sentencepiece", + "matplotlib", + "pandas", + "pypdf", "scikit-learn", "nltk", - "chardet", - "chromadb-client", - "psycopg2-binary", - "sentencepiece", "faiss-cpu", - "blobfile", - "pandas", - "pillow", - "pypdf", - "matplotlib", + "chardet", "openai", + "numpy", + "psycopg2-binary", "aiosqlite", + "pillow", "redis", "transformers", - "numpy", - "tqdm", + "chromadb-client", "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu", 
"aiosqlite", @@ -57,24 +57,24 @@ ], "fireworks": [ "scipy", + "blobfile", + "tqdm", + "sentencepiece", + "fireworks-ai", + "matplotlib", + "pandas", + "pypdf", "scikit-learn", "nltk", - "chardet", - "chromadb-client", - "psycopg2-binary", - "sentencepiece", "faiss-cpu", - "blobfile", - "pandas", - "pillow", - "pypdf", - "matplotlib", + "chardet", + "numpy", + "psycopg2-binary", "aiosqlite", + "pillow", "redis", "transformers", - "fireworks-ai", - "numpy", - "tqdm", + "chromadb-client", "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu", "aiosqlite", @@ -85,25 +85,25 @@ ], "tgi": [ "scipy", - "scikit-learn", - "nltk", - "aiohttp", - "chardet", - "chromadb-client", - "psycopg2-binary", + "blobfile", + "tqdm", "huggingface_hub", "sentencepiece", - "faiss-cpu", - "blobfile", - "pandas", - "pillow", - "pypdf", "matplotlib", - "aiosqlite", - "transformers", - "redis", + "pandas", + "pypdf", + "scikit-learn", + "nltk", + "faiss-cpu", + "chardet", "numpy", - "tqdm", + "psycopg2-binary", + "aiosqlite", + "pillow", + "redis", + "transformers", + "chromadb-client", + "aiohttp", "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu", "aiosqlite", @@ -113,30 +113,30 @@ "uvicorn" ], "meta-reference-gpu": [ - "lm-format-enforcer", "scipy", - "scikit-learn", - "nltk", - "accelerate", - "chardet", - "chromadb-client", - "psycopg2-binary", + "blobfile", + "tqdm", + "torchvision", "sentencepiece", "zmq", - "faiss-cpu", - "torchvision", - "blobfile", - "fairscale", - "pandas", - "pillow", - "pypdf", "matplotlib", - "transformers", + "pandas", + "pypdf", + "scikit-learn", + "accelerate", + "nltk", + "faiss-cpu", "torch", - "aiosqlite", - "redis", + "chardet", "numpy", - "tqdm", + "psycopg2-binary", + "aiosqlite", + "pillow", + "redis", + "fairscale", + "lm-format-enforcer", + "transformers", + "chromadb-client", "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu", "aiosqlite", @@ -147,25 +147,25 @@ ], "ollama": [ "scipy", + "blobfile", + "tqdm", + "sentencepiece", + "matplotlib", + "pandas", + "pypdf", "scikit-learn", "nltk", - "aiohttp", "ollama", - "chardet", - "chromadb-client", - "psycopg2-binary", - "sentencepiece", "faiss-cpu", - "blobfile", - "pandas", - "pillow", - "pypdf", - "matplotlib", - "aiosqlite", - "transformers", - "redis", + "chardet", "numpy", - "tqdm", + "psycopg2-binary", + "aiosqlite", + "pillow", + "redis", + "transformers", + "chromadb-client", + "aiohttp", "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu", "aiosqlite", diff --git a/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md b/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md index f940e6de2..66a150f50 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md +++ b/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md @@ -26,7 +26,7 @@ The following models are available by default: - `meta-llama/Llama-3.1-70B-Instruct (fireworks/llama-v3p1-70b-instruct)` - `meta-llama/Llama-3.1-405B-Instruct-FP8 (fireworks/llama-v3p1-405b-instruct)` - `meta-llama/Llama-3.2-3B-Instruct (fireworks/llama-v3p2-1b-instruct)` -- `meta-llama/Llama-3.2-11B-Vision-Instruct (fireworks/llama-v3p2-3b-instruct)` +- `meta-llama/Llama-3.2-3B-Instruct (fireworks/llama-v3p2-3b-instruct)` - `meta-llama/Llama-3.2-11B-Vision-Instruct (fireworks/llama-v3p2-11b-vision-instruct)` - 
`meta-llama/Llama-3.2-90B-Vision-Instruct (fireworks/llama-v3p2-90b-vision-instruct)` - `meta-llama/Llama-Guard-3-8B (fireworks/llama-guard-3-8b)` diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index c9c05a8e0..aa44f0f84 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -61,7 +61,7 @@ models: provider_id: null provider_model_id: fireworks/llama-v3p2-1b-instruct - metadata: {} - model_id: meta-llama/Llama-3.2-11B-Vision-Instruct + model_id: meta-llama/Llama-3.2-3B-Instruct provider_id: null provider_model_id: fireworks/llama-v3p2-3b-instruct - metadata: {} From 2da93c883533d49dd070f58b8f3ab5bc019c136c Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 19 Nov 2024 14:20:07 -0800 Subject: [PATCH 11/22] fix 3.2-1b fireworks --- distributions/dependencies.json | 204 +++++++++--------- .../self_hosted_distro/fireworks.md | 2 +- .../remote/inference/fireworks/fireworks.py | 2 +- llama_stack/templates/fireworks/run.yaml | 2 +- 4 files changed, 105 insertions(+), 105 deletions(-) diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 469b6f14e..0f85b70c6 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -1,26 +1,26 @@ { "together": [ - "scipy", - "blobfile", - "together", - "tqdm", - "sentencepiece", - "matplotlib", - "pandas", "pypdf", - "scikit-learn", - "nltk", - "faiss-cpu", - "chardet", - "numpy", - "psycopg2-binary", - "aiosqlite", - "pillow", + "sentencepiece", + "pandas", "redis", - "transformers", + "nltk", + "psycopg2-binary", + "scikit-learn", + "chardet", + "matplotlib", + "pillow", + "tqdm", "chromadb-client", - "sentence-transformers --no-deps", + "transformers", + "blobfile", + "aiosqlite", + "together", + "faiss-cpu", + "scipy", + "numpy", "torch --index-url https://download.pytorch.org/whl/cpu", + "sentence-transformers --no-deps", "aiosqlite", "fastapi", "fire", @@ -28,27 +28,27 @@ "uvicorn" ], "remote-vllm": [ - "scipy", - "blobfile", - "tqdm", - "sentencepiece", - "matplotlib", - "pandas", "pypdf", - "scikit-learn", - "nltk", - "faiss-cpu", - "chardet", - "openai", - "numpy", - "psycopg2-binary", - "aiosqlite", - "pillow", + "sentencepiece", + "pandas", "redis", - "transformers", + "nltk", + "psycopg2-binary", + "scikit-learn", + "chardet", + "matplotlib", + "pillow", + "tqdm", "chromadb-client", - "sentence-transformers --no-deps", + "transformers", + "openai", + "blobfile", + "aiosqlite", + "faiss-cpu", + "scipy", + "numpy", "torch --index-url https://download.pytorch.org/whl/cpu", + "sentence-transformers --no-deps", "aiosqlite", "fastapi", "fire", @@ -56,27 +56,27 @@ "uvicorn" ], "fireworks": [ - "scipy", - "blobfile", - "tqdm", + "pypdf", "sentencepiece", + "pandas", + "redis", + "nltk", + "psycopg2-binary", + "scikit-learn", + "chardet", "fireworks-ai", "matplotlib", - "pandas", - "pypdf", - "scikit-learn", - "nltk", - "faiss-cpu", - "chardet", - "numpy", - "psycopg2-binary", - "aiosqlite", "pillow", - "redis", - "transformers", + "tqdm", "chromadb-client", - "sentence-transformers --no-deps", + "transformers", + "blobfile", + "aiosqlite", + "faiss-cpu", + "scipy", + "numpy", "torch --index-url https://download.pytorch.org/whl/cpu", + "sentence-transformers --no-deps", "aiosqlite", "fastapi", "fire", @@ -84,28 +84,28 @@ "uvicorn" ], "tgi": [ - "scipy", - "blobfile", + "pypdf", + "sentencepiece", + "pandas", + "redis", + "nltk", + "psycopg2-binary", + "scikit-learn", + "chardet", + "matplotlib", + "pillow", 
"tqdm", "huggingface_hub", - "sentencepiece", - "matplotlib", - "pandas", - "pypdf", - "scikit-learn", - "nltk", - "faiss-cpu", - "chardet", - "numpy", - "psycopg2-binary", - "aiosqlite", - "pillow", - "redis", - "transformers", "chromadb-client", "aiohttp", - "sentence-transformers --no-deps", + "transformers", + "blobfile", + "aiosqlite", + "faiss-cpu", + "scipy", + "numpy", "torch --index-url https://download.pytorch.org/whl/cpu", + "sentence-transformers --no-deps", "aiosqlite", "fastapi", "fire", @@ -113,32 +113,32 @@ "uvicorn" ], "meta-reference-gpu": [ - "scipy", - "blobfile", - "tqdm", - "torchvision", - "sentencepiece", - "zmq", - "matplotlib", - "pandas", "pypdf", - "scikit-learn", - "accelerate", - "nltk", - "faiss-cpu", + "sentencepiece", "torch", - "chardet", - "numpy", - "psycopg2-binary", - "aiosqlite", - "pillow", + "pandas", "redis", + "nltk", + "psycopg2-binary", + "scikit-learn", + "chardet", + "accelerate", + "matplotlib", + "pillow", "fairscale", + "tqdm", "lm-format-enforcer", - "transformers", "chromadb-client", - "sentence-transformers --no-deps", + "transformers", + "blobfile", + "aiosqlite", + "torchvision", + "faiss-cpu", + "zmq", + "scipy", + "numpy", "torch --index-url https://download.pytorch.org/whl/cpu", + "sentence-transformers --no-deps", "aiosqlite", "fastapi", "fire", @@ -146,28 +146,28 @@ "uvicorn" ], "ollama": [ - "scipy", - "blobfile", - "tqdm", - "sentencepiece", - "matplotlib", - "pandas", - "pypdf", - "scikit-learn", - "nltk", "ollama", - "faiss-cpu", - "chardet", - "numpy", - "psycopg2-binary", - "aiosqlite", - "pillow", + "pypdf", + "sentencepiece", + "pandas", "redis", - "transformers", + "nltk", + "psycopg2-binary", + "scikit-learn", + "chardet", + "matplotlib", + "pillow", + "tqdm", "chromadb-client", "aiohttp", - "sentence-transformers --no-deps", + "transformers", + "blobfile", + "aiosqlite", + "faiss-cpu", + "scipy", + "numpy", "torch --index-url https://download.pytorch.org/whl/cpu", + "sentence-transformers --no-deps", "aiosqlite", "fastapi", "fire", diff --git a/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md b/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md index 66a150f50..cca1155e1 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md +++ b/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md @@ -25,7 +25,7 @@ The following models are available by default: - `meta-llama/Llama-3.1-8B-Instruct (fireworks/llama-v3p1-8b-instruct)` - `meta-llama/Llama-3.1-70B-Instruct (fireworks/llama-v3p1-70b-instruct)` - `meta-llama/Llama-3.1-405B-Instruct-FP8 (fireworks/llama-v3p1-405b-instruct)` -- `meta-llama/Llama-3.2-3B-Instruct (fireworks/llama-v3p2-1b-instruct)` +- `meta-llama/Llama-3.2-1B-Instruct (fireworks/llama-v3p2-1b-instruct)` - `meta-llama/Llama-3.2-3B-Instruct (fireworks/llama-v3p2-3b-instruct)` - `meta-llama/Llama-3.2-11B-Vision-Instruct (fireworks/llama-v3p2-11b-vision-instruct)` - `meta-llama/Llama-3.2-90B-Vision-Instruct (fireworks/llama-v3p2-90b-vision-instruct)` diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py index d8cbca5f9..c3e634155 100644 --- a/llama_stack/providers/remote/inference/fireworks/fireworks.py +++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py @@ -50,7 +50,7 @@ MODEL_ALIASES = [ ), build_model_alias( "fireworks/llama-v3p2-1b-instruct", - CoreModelId.llama3_2_3b_instruct.value, + 
CoreModelId.llama3_2_1b_instruct.value, ), build_model_alias( "fireworks/llama-v3p2-3b-instruct", diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index aa44f0f84..6add39c3a 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -57,7 +57,7 @@ models: provider_id: null provider_model_id: fireworks/llama-v3p1-405b-instruct - metadata: {} - model_id: meta-llama/Llama-3.2-3B-Instruct + model_id: meta-llama/Llama-3.2-1B-Instruct provider_id: null provider_model_id: fireworks/llama-v3p2-1b-instruct - metadata: {} From 887ccc2143ed922f529eab87cd7bf1e4718e4915 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Tue, 19 Nov 2024 15:20:51 -0800 Subject: [PATCH 12/22] Ensure llama-stack-client is installed in the container with TEST_PYPI --- llama_stack/distribution/build_container.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_stack/distribution/build_container.sh b/llama_stack/distribution/build_container.sh index 230ca34ac..2730ae174 100755 --- a/llama_stack/distribution/build_container.sh +++ b/llama_stack/distribution/build_container.sh @@ -97,7 +97,7 @@ else add_to_docker "RUN pip install fastapi libcst" add_to_docker < Date: Tue, 19 Nov 2024 15:50:26 -0800 Subject: [PATCH 13/22] Add logs (prints :/) to dump out what URL vllm / tgi is connecting to --- llama_stack/providers/remote/inference/tgi/tgi.py | 1 + llama_stack/providers/remote/inference/vllm/vllm.py | 1 + 2 files changed, 2 insertions(+) diff --git a/llama_stack/providers/remote/inference/tgi/tgi.py b/llama_stack/providers/remote/inference/tgi/tgi.py index 30745cb10..92492e3da 100644 --- a/llama_stack/providers/remote/inference/tgi/tgi.py +++ b/llama_stack/providers/remote/inference/tgi/tgi.py @@ -264,6 +264,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate): class TGIAdapter(_HfAdapter): async def initialize(self, config: TGIImplConfig) -> None: + print(f"Initializing TGI client with url={config.url}") self.client = AsyncInferenceClient(model=config.url, token=config.api_token) endpoint_info = await self.client.get_endpoint_info() self.max_tokens = endpoint_info["max_total_tokens"] diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py index 788f6cac4..3c877639c 100644 --- a/llama_stack/providers/remote/inference/vllm/vllm.py +++ b/llama_stack/providers/remote/inference/vllm/vllm.py @@ -53,6 +53,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): self.client = None async def initialize(self) -> None: + print(f"Initializing VLLM client with base_url={self.config.url}") self.client = OpenAI(base_url=self.config.url, api_key=self.config.api_token) async def shutdown(self) -> None: From e605d57fb78285828530b2603d21aaa8593df75d Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Tue, 19 Nov 2024 15:59:47 -0800 Subject: [PATCH 14/22] use API version in "remote" stack client --- llama_stack/distribution/client.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_stack/distribution/client.py b/llama_stack/distribution/client.py index b36ef94e4..e1243cb7a 100644 --- a/llama_stack/distribution/client.py +++ b/llama_stack/distribution/client.py @@ -15,6 +15,8 @@ import httpx from pydantic import BaseModel, parse_obj_as from termcolor import cprint +from llama_stack.apis.version import LLAMA_STACK_API_VERSION + from llama_stack.providers.datatypes import RemoteProviderConfig _CLIENT_CLASSES = {} @@ -117,7 +119,7 
@@ def create_api_client_class(protocol) -> Type: break kwargs[param.name] = args[i] - url = f"{self.base_url}{webmethod.route}" + url = f"{self.base_url}/{LLAMA_STACK_API_VERSION}/{webmethod.route.lstrip('/')}" def convert(value): if isinstance(value, list): From f78200b1898e1de19e6ee270bdf7e873ef52fa76 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 19 Nov 2024 16:37:30 -0800 Subject: [PATCH 15/22] docs --- .../distributions/self_hosted_distro/index.md | 1 + docs/source/getting_started/index.md | 405 +----------------- 2 files changed, 9 insertions(+), 397 deletions(-) diff --git a/docs/source/getting_started/distributions/self_hosted_distro/index.md b/docs/source/getting_started/distributions/self_hosted_distro/index.md index ed6ab5d7f..502b95cb4 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/index.md +++ b/docs/source/getting_started/distributions/self_hosted_distro/index.md @@ -23,5 +23,6 @@ tgi dell-tgi together fireworks +remote-vllm bedrock ``` diff --git a/docs/source/getting_started/index.md b/docs/source/getting_started/index.md index 189bd6cb5..6400fb285 100644 --- a/docs/source/getting_started/index.md +++ b/docs/source/getting_started/index.md @@ -53,9 +53,9 @@ Please see our pages in detail for the types of distributions we offer: 3. [On-device Distribution](./distributions/ondevice_distro/index.md): If you want to run Llama Stack inference on your iOS / Android device. -### Quick Start Commands +### Table of Contents -Once you have decided on the inference provider and distribution to use, use the following quick start commands to get started. +Once you have decided on the inference provider and distribution to use, use the following guides to get started. ##### 1.0 Prerequisite @@ -109,421 +109,32 @@ Access to Single-Node CPU with Fireworks hosted endpoint via API_KEY from [firew ##### 1.1. Start the distribution -**(Option 1) Via Docker** -::::{tab-set} - :::{tab-item} meta-reference-gpu -``` -$ cd llama-stack/distributions/meta-reference-gpu && docker compose up -``` - -This will download and start running a pre-built Docker container. Alternatively, you may use the following commands: - -``` -docker run -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./run.yaml:/root/my-run.yaml --gpus=all distribution-meta-reference-gpu --yaml_config /root/my-run.yaml -``` +[Start Meta Reference GPU Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-gpu.html) ::: :::{tab-item} vLLM -``` -$ cd llama-stack/distributions/remote-vllm && docker compose up -``` - -The script will first start up vLLM server on port 8000, then start up Llama Stack distribution server hooking up to it for inference. You should see the following outputs -- -``` - -``` - -To kill the server -``` -docker compose down -``` +[Start vLLM Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/remote-vllm.html) ::: :::{tab-item} tgi -``` -$ cd llama-stack/distributions/tgi && docker compose up -``` - -The script will first start up TGI server, then start up Llama Stack distribution server hooking up to the remote TGI provider for inference. 
You should see the following outputs -- -``` -[text-generation-inference] | 2024-10-15T18:56:33.810397Z INFO text_generation_router::server: router/src/server.rs:1813: Using config Some(Llama) -[text-generation-inference] | 2024-10-15T18:56:33.810448Z WARN text_generation_router::server: router/src/server.rs:1960: Invalid hostname, defaulting to 0.0.0.0 -[text-generation-inference] | 2024-10-15T18:56:33.864143Z INFO text_generation_router::server: router/src/server.rs:2353: Connected -INFO: Started server process [1] -INFO: Waiting for application startup. -INFO: Application startup complete. -INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit) -``` - -To kill the server -``` -docker compose down -``` -::: - - -:::{tab-item} ollama -``` -$ cd llama-stack/distributions/ollama && docker compose up - -# OR - -$ cd llama-stack/distributions/ollama-gpu && docker compose up -``` - -You will see outputs similar to following --- -``` -[ollama] | [GIN] 2024/10/18 - 21:19:41 | 200 | 226.841µs | ::1 | GET "/api/ps" -[ollama] | [GIN] 2024/10/18 - 21:19:42 | 200 | 60.908µs | ::1 | GET "/api/ps" -INFO: Started server process [1] -INFO: Waiting for application startup. -INFO: Application startup complete. -INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit) -[llamastack] | Resolved 12 providers -[llamastack] | inner-inference => ollama0 -[llamastack] | models => __routing_table__ -[llamastack] | inference => __autorouted__ -``` - -To kill the server -``` -docker compose down -``` -::: - -:::{tab-item} fireworks -``` -$ cd llama-stack/distributions/fireworks && docker compose up -``` - -Make sure your `run.yaml` file has the inference provider pointing to the correct Fireworks URL server endpoint. E.g. -``` -inference: - - provider_id: fireworks - provider_type: remote::fireworks - config: - url: https://api.fireworks.ai/inference - api_key: -``` -::: - -:::{tab-item} together -``` -$ cd distributions/together && docker compose up -``` - -Make sure your `run.yaml` file has the inference provider pointing to the correct Together URL server endpoint. E.g. -``` -inference: - - provider_id: together - provider_type: remote::together - config: - url: https://api.together.xyz/v1 - api_key: -``` -::: - - -:::: - -**(Option 2) Via Conda** - -::::{tab-set} - -:::{tab-item} meta-reference-gpu -1. Install the `llama` CLI. See [CLI Reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html) - -2. Build the `meta-reference-gpu` distribution - -``` -$ llama stack build --template meta-reference-gpu --image-type conda -``` - -3. Start running distribution -``` -$ llama stack run ~/.llama/distributions/llamastack-meta-reference-gpu/meta-reference-gpu-run.yaml -``` - -Note: If you wish to use pgvector or chromadb as memory provider. You may need to update generated `run.yaml` file to point to the desired memory provider. See [Memory Providers](https://llama-stack.readthedocs.io/en/latest/api_providers/memory_api.html) for more details. Or comment out the pgvector or chromadb memory provider in `run.yaml` file to use the default inline memory provider, keeping only the following section: -``` -memory: - - provider_id: faiss-0 - provider_type: faiss - config: - kvstore: - namespace: null - type: sqlite - db_path: ~/.llama/runtime/faiss_store.db -``` - -::: - -:::{tab-item} tgi -1. Install the `llama` CLI. See [CLI Reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html) - -2. 
Build the `tgi` distribution - -```bash -llama stack build --template tgi --image-type conda -``` - -3. Start a TGI server endpoint - -4. Make sure in your `run.yaml` file, your `conda_env` is pointing to the conda environment and inference provider is pointing to the correct TGI server endpoint. E.g. -``` -conda_env: llamastack-tgi -... -inference: - - provider_id: tgi0 - provider_type: remote::tgi - config: - url: http://127.0.0.1:5009 -``` - -5. Start Llama Stack server -```bash -$ llama stack run ~/.llama/distributions/llamastack-tgi/tgi-run.yaml -``` - -Note: If you wish to use pgvector or chromadb as memory provider. You may need to update generated `run.yaml` file to point to the desired memory provider. See [Memory Providers](https://llama-stack.readthedocs.io/en/latest/api_providers/memory_api.html) for more details. Or comment out the pgvector or chromadb memory provider in `run.yaml` file to use the default inline memory provider, keeping only the following section: -``` -memory: - - provider_id: faiss-0 - provider_type: faiss - config: - kvstore: - namespace: null - type: sqlite - db_path: ~/.llama/runtime/faiss_store.db -``` +[Start TGI Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/tgi.html) ::: :::{tab-item} ollama - -If you wish to separately spin up a Ollama server, and connect with Llama Stack, you may use the following commands. - -#### Start Ollama server. -- Please check the [Ollama Documentations](https://github.com/ollama/ollama) for more details. - -**Via Docker** -``` -docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama -``` - -**Via CLI** -``` -ollama run -``` - -#### Start Llama Stack server pointing to Ollama server - -Make sure your `run.yaml` file has the inference provider pointing to the correct Ollama endpoint. E.g. -``` -conda_env: llamastack-ollama -... -inference: - - provider_id: ollama0 - provider_type: remote::ollama - config: - url: http://127.0.0.1:11434 -``` - -``` -llama stack build --template ollama --image-type conda -llama stack run ~/.llama/distributions/llamastack-ollama/ollama-run.yaml -``` - -Note: If you wish to use pgvector or chromadb as memory provider. You may need to update generated `run.yaml` file to point to the desired memory provider. See [Memory Providers](https://llama-stack.readthedocs.io/en/latest/api_providers/memory_api.html) for more details. Or comment out the pgvector or chromadb memory provider in `run.yaml` file to use the default inline memory provider, keeping only the following section: -``` -memory: - - provider_id: faiss-0 - provider_type: faiss - config: - kvstore: - namespace: null - type: sqlite - db_path: ~/.llama/runtime/faiss_store.db -``` - -::: - -:::{tab-item} fireworks - -```bash -llama stack build --template fireworks --image-type conda -# -- modify run.yaml to a valid Fireworks server endpoint -llama stack run ./run.yaml -``` - -Make sure your `run.yaml` file has the inference provider pointing to the correct Fireworks URL server endpoint. E.g. -``` -conda_env: llamastack-fireworks -... 
-inference: - - provider_id: fireworks - provider_type: remote::fireworks - config: - url: https://api.fireworks.ai/inference - api_key: -``` +[Start Ollama Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/ollama.html) ::: :::{tab-item} together - -```bash -llama stack build --template together --image-type conda -# -- modify run.yaml to a valid Together server endpoint -llama stack run ~/.llama/distributions/llamastack-together/together-run.yaml -``` - -Make sure your `run.yaml` file has the inference provider pointing to the correct Together URL server endpoint. E.g. -``` -conda_env: llamastack-together -... -inference: - - provider_id: together - provider_type: remote::together - config: - url: https://api.together.xyz/v1 - api_key: -``` -::: - -:::: - -##### 1.2 (Optional) Update Model Serving Configuration -::::{tab-set} - -:::{tab-item} meta-reference-gpu -You may change the `config.model` in `run.yaml` to update the model currently being served by the distribution. Make sure you have the model checkpoint downloaded in your `~/.llama`. -``` -inference: - - provider_id: meta0 - provider_type: inline::meta-reference - config: - model: Llama3.2-11B-Vision-Instruct - quantization: null - torch_seed: null - max_seq_len: 4096 - max_batch_size: 1 -``` - -Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. -::: - -:::{tab-item} tgi -To serve a new model with `tgi`, change the docker command flag `--model-id `. - -This can be done by edit the `command` args in `compose.yaml`. E.g. Replace "Llama-3.2-1B-Instruct" with the model you want to serve. - -``` -command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.2-1B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"] -``` - -or by changing the docker run command's `--model-id` flag -``` -docker run --rm -it -v $HOME/.cache/huggingface:/data -p 5009:5009 --gpus all ghcr.io/huggingface/text-generation-inference:latest --dtype bfloat16 --usage-stats on --sharded false --model-id meta-llama/Llama-3.2-1B-Instruct --port 5009 -``` - -Make sure your `run.yaml` file has the inference provider pointing to the TGI server endpoint serving your model. -``` -inference: - - provider_id: tgi0 - provider_type: remote::tgi - config: - url: http://127.0.0.1:5009 -``` -``` - -Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. -::: - -:::{tab-item} ollama -You can use ollama for managing model downloads. - -``` -ollama pull llama3.1:8b-instruct-fp16 -ollama pull llama3.1:70b-instruct-fp16 -``` - -> Please check the [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers.remote/inference/ollama/ollama.py) for the supported Ollama models. - - -To serve a new model with `ollama` -``` -ollama run -``` - -To make sure that the model is being served correctly, run `ollama ps` to get a list of models being served by ollama. 
-``` -$ ollama ps - -NAME ID SIZE PROCESSOR UNTIL -llama3.1:8b-instruct-fp16 4aacac419454 17 GB 100% GPU 4 minutes from now -``` - -To verify that the model served by ollama is correctly connected to Llama Stack server -``` -$ llama-stack-client models list -+----------------------+----------------------+---------------+-----------------------------------------------+ -| identifier | llama_model | provider_id | metadata | -+======================+======================+===============+===============================================+ -| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | ollama0 | {'ollama_model': 'llama3.1:8b-instruct-fp16'} | -+----------------------+----------------------+---------------+-----------------------------------------------+ -``` -::: - -:::{tab-item} together -Use `llama-stack-client models list` to check the available models served by together. - -``` -$ llama-stack-client models list -+------------------------------+------------------------------+---------------+------------+ -| identifier | llama_model | provider_id | metadata | -+==============================+==============================+===============+============+ -| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | together0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.1-70B-Instruct | Llama3.1-70B-Instruct | together0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.1-405B-Instruct | Llama3.1-405B-Instruct | together0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-3B-Instruct | Llama3.2-3B-Instruct | together0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | together0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | together0 | {} | -+------------------------------+------------------------------+---------------+------------+ -``` +[Start Together Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/together.html) ::: :::{tab-item} fireworks -Use `llama-stack-client models list` to check the available models served by Fireworks. 
-``` -$ llama-stack-client models list -+------------------------------+------------------------------+---------------+------------+ -| identifier | llama_model | provider_id | metadata | -+==============================+==============================+===============+============+ -| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.1-70B-Instruct | Llama3.1-70B-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.1-405B-Instruct | Llama3.1-405B-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-1B-Instruct | Llama3.2-1B-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-3B-Instruct | Llama3.2-3B-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -``` +[Start Fireworks Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/fireworks.html) ::: :::: - ##### Troubleshooting - If you encounter any issues, search through our [GitHub Issues](https://github.com/meta-llama/llama-stack/issues), or file an new issue. - Use `--port ` flag to use a different port number. For docker run, update the `-p :` flag. From c49acc5226b50f51b3756fe66315ab3dd2e847f9 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 19 Nov 2024 16:39:40 -0800 Subject: [PATCH 16/22] docs --- docs/source/getting_started/index.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/getting_started/index.md b/docs/source/getting_started/index.md index 6400fb285..bc0258376 100644 --- a/docs/source/getting_started/index.md +++ b/docs/source/getting_started/index.md @@ -109,12 +109,13 @@ Access to Single-Node CPU with Fireworks hosted endpoint via API_KEY from [firew ##### 1.1. 
Start the distribution +::::{tab-set} :::{tab-item} meta-reference-gpu -[Start Meta Reference GPU Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-gpu.html) +- [Start Meta Reference GPU Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-gpu.html) ::: :::{tab-item} vLLM -[Start vLLM Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/remote-vllm.html) +- [Start vLLM Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/remote-vllm.html) ::: :::{tab-item} tgi From b0fdf7552ac5ba5cc3398b4a74b10f53af3677bc Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 19 Nov 2024 16:41:45 -0800 Subject: [PATCH 17/22] docs --- docs/source/getting_started/index.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/getting_started/index.md b/docs/source/getting_started/index.md index bc0258376..5fc2c5ed8 100644 --- a/docs/source/getting_started/index.md +++ b/docs/source/getting_started/index.md @@ -119,19 +119,19 @@ Access to Single-Node CPU with Fireworks hosted endpoint via API_KEY from [firew ::: :::{tab-item} tgi -[Start TGI Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/tgi.html) +- [Start TGI Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/tgi.html) ::: :::{tab-item} ollama -[Start Ollama Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/ollama.html) +- [Start Ollama Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/ollama.html) ::: :::{tab-item} together -[Start Together Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/together.html) +- [Start Together Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/together.html) ::: :::{tab-item} fireworks -[Start Fireworks Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/fireworks.html) +- [Start Fireworks Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/fireworks.html) ::: :::: From dd5466e17d5b384c42f6ed5a2a570fe24a8da71f Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Tue, 19 Nov 2024 16:44:15 -0800 Subject: [PATCH 18/22] Bump version to 0.0.53 --- requirements.txt | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index dcb30d605..fddf51880 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,8 +2,8 @@ blobfile fire httpx huggingface-hub -llama-models>=0.0.50 -llama-stack-client>=0.0.50 +llama-models>=0.0.53 +llama-stack-client>=0.0.53 prompt-toolkit python-dotenv pydantic>=2 diff --git a/setup.py b/setup.py index 3145506f9..13f389a11 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ def read_requirements(): setup( name="llama_stack", - version="0.0.50", + version="0.0.53", author="Meta Llama", author_email="llama-oss@meta.com", description="Llama Stack", From e670f99ef7d3e0b3ff1041e4785ad7c7a5db2a99 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Tue, 19 Nov 2024 17:36:08 -0800 Subject: [PATCH 19/22] add changelog 
(#487) --- CHANGELOG.md | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 000000000..b081678c4 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,35 @@ +# Changelog + +## 0.0.53 + +### Added +- Resource-oriented design for models, shields, memory banks, datasets and eval tasks +- Persistence for registered objects with distribution +- Ability to persist memory banks created for FAISS +- PostgreSQL KVStore implementation +- Environment variable placeholder support in run.yaml files +- Comprehensive Zero-to-Hero notebooks and quickstart guides +- Support for quantized models in Ollama +- Vision models support for Together, Fireworks, Meta-Reference, and Ollama, and vLLM +- Bedrock distribution with safety shields support +- Evals API with task registration and scoring functions +- MMLU and SimpleQA benchmark scoring functions +- Huggingface dataset provider integration for benchmarks +- Support for custom dataset registration from local paths +- Benchmark evaluation CLI tools with visualization tables +- RAG evaluation scoring functions and metrics +- Local persistence for datasets and eval tasks + +### Changed +- Split safety into distinct providers (llama-guard, prompt-guard, code-scanner) +- Changed provider naming convention (`impls` → `inline`, `adapters` → `remote`) +- Updated API signatures for dataset and eval task registration +- Restructured folder organization for providers +- Enhanced Docker build configuration +- Added version prefixing for REST API routes +- Enhanced evaluation task registration workflow +- Improved benchmark evaluation output formatting +- Restructured evals folder organization for better modularity + +### Removed +- `llama stack configure` command From 08be0232907d37cf36522df2dd7a0be80ba2d711 Mon Sep 17 00:00:00 2001 From: varunfb Date: Tue, 19 Nov 2024 17:42:43 -0800 Subject: [PATCH 20/22] Added optional md5 validate command once download is completed (#486) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Adds description at the end of successful download the optionally run the verify md5 checksums command. ## Test Plan Screenshot 2024-11-19 at 12 11 37 PM ## Before submitting - [x] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [x] Ran pre-commit to handle lint / formatting issues. - [x] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [x] Updated relevant documentation. - [x] Wrote necessary unit or integration tests. 
--------- Co-authored-by: varunfb --- llama_stack/cli/download.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/llama_stack/cli/download.py b/llama_stack/cli/download.py index bb57186e5..c2f8ac855 100644 --- a/llama_stack/cli/download.py +++ b/llama_stack/cli/download.py @@ -380,6 +380,7 @@ def _hf_download( def _meta_download( model: "Model", + model_id: str, meta_url: str, info: "LlamaDownloadInfo", max_concurrent_downloads: int, @@ -405,8 +406,15 @@ def _meta_download( downloader = ParallelDownloader(max_concurrent_downloads=max_concurrent_downloads) asyncio.run(downloader.download_all(tasks)) - print(f"\nSuccessfully downloaded model to {output_dir}") - cprint(f"\nMD5 Checksums are at: {output_dir / 'checklist.chk'}", "white") + cprint(f"\nSuccessfully downloaded model to {output_dir}", "green") + cprint( + f"\nView MD5 checksum files at: {output_dir / 'checklist.chk'}", + "white", + ) + cprint( + f"\n[Optionally] To run MD5 checksums, use the following command: llama model verify-download --model-id {model_id}", + "yellow", + ) class ModelEntry(BaseModel): @@ -512,7 +520,7 @@ def run_download_cmd(args: argparse.Namespace, parser: argparse.ArgumentParser): ) if "llamameta.net" not in meta_url: parser.error("Invalid Meta URL provided") - _meta_download(model, meta_url, info, args.max_parallel) + _meta_download(model, model_id, meta_url, info, args.max_parallel) except Exception as e: parser.error(f"Download failed: {str(e)}") From 1086b500f94828fbe21772619ed022d586fc62fb Mon Sep 17 00:00:00 2001 From: Mengtao Yuan Date: Tue, 19 Nov 2024 20:59:02 -0800 Subject: [PATCH 21/22] Support Tavily as built-in search tool. (#485) # What does this PR do? Add Tavily as a built-in search tool, in addition to Brave and Bing (see the configuration sketch after the checklist below). ## Test Plan Tested using the ollama remote distribution, showing parity with the Brave search tool. - Install and run ollama with `ollama run llama3.1:8b-instruct-fp16` - Build the ollama distribution: `llama stack build --template ollama --image-type conda` - Run the stack: `llama stack run /$USER/.llama/distributions/llamastack-ollama/ollama-run.yaml --port 5001` - Client test command: `python -m agents.test_agents.TestAgents.test_create_agent_turn_with_tavily_search`, with environment variables: MASTER_ADDR=0.0.0.0;MASTER_PORT=5001;RANK=0;REMOTE_STACK_HOST=0.0.0.0;REMOTE_STACK_PORT=5001;TAVILY_SEARCH_API_KEY=tvly-;WORLD_SIZE=1 The test passes for this specific case (ollama remote). Server output: ``` Listening on ['::', '0.0.0.0']:5001 INFO: Started server process [7220] INFO: Waiting for application startup. INFO: Application startup complete. INFO: Uvicorn running on http://['::', '0.0.0.0']:5001 (Press CTRL+C to quit) INFO: 127.0.0.1:65209 - "POST /agents/create HTTP/1.1" 200 OK INFO: 127.0.0.1:65210 - "POST /agents/session/create HTTP/1.1" 200 OK INFO: 127.0.0.1:65211 - "POST /agents/turn/create HTTP/1.1" 200 OK role='user' content='What are the latest developments in quantum computing?'
context=None role='assistant' content='' stop_reason= tool_calls=[ToolCall(call_id='fc92ccb8-1039-4ce8-ba5e-8f2b0147661c', tool_name=, arguments={'query': 'latest developments in quantum computing'})] role='ipython' call_id='fc92ccb8-1039-4ce8-ba5e-8f2b0147661c' tool_name= content='{"query": "latest developments in quantum computing", "top_k": [{"title": "IBM Unveils 400 Qubit-Plus Quantum Processor and Next-Generation IBM ...", "url": "https://newsroom.ibm.com/2022-11-09-IBM-Unveils-400-Qubit-Plus-Quantum-Processor-and-Next-Generation-IBM-Quantum-System-Two", "content": "This system is targeted to be online by the end of 2023 and will be a building b......onnect large-scale ...", "url": "https://news.mit.edu/2023/quantum-interconnects-photon-emission-0105", "content": "Quantum computers hold the promise of performing certain tasks that are intractable even on the world\'s most powerful supercomputers. In the future, scientists anticipate using quantum computing to emulate materials systems, simulate quantum chemistry, and optimize hard tasks, with impacts potentially spanning finance to pharmaceuticals.", "score": 0.71721, "raw_content": null}]}' Assistant: The latest developments in quantum computing include: * IBM unveiling its 400 qubit-plus quantum processor and next-generation IBM Quantum System Two, which will be a building block of quantum-centric supercomputing. * The development of utility-scale quantum computing, which can serve as a scientific tool to explore utility-scale classes of problems in chemistry, physics, and materials beyond brute force classical simulation of quantum mechanics. * The introduction of advanced hardware across IBM's global fleet of 100+ qubit systems, as well as easy-to-use software that users and computational scientists can now obtain reliable results from quantum systems as they map increasingly larger and more complex problems to quantum circuits. * Research on quantum repeaters, which use defects in diamond to interconnect quantum systems and could provide the foundation for scalable quantum networking. * The development of a new source of quantum light, which could be used to improve the efficiency of quantum computers. * The creation of a new mathematical "blueprint" that is accelerating fusion device development using Dyson maps. * Research on canceling noise to improve quantum devices, with MIT researchers developing a protocol to extend the life of quantum coherence. ``` Verified with tool response. The final model response is updated with the search requests. ## Sources ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [x] Ran pre-commit to handle lint / formatting issues. - [x] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [x] Updated relevant documentation. - [x] Wrote necessary unit or integration tests. 
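As referenced above, a minimal configuration sketch for the new Tavily engine follows. It mirrors how the test added in this patch builds its `SearchToolDefinition`; the import path and the commented `AgentConfig` line are assumptions for illustration, not part of the diff.

```python
# Sketch only: mirrors how the new test selects the Tavily engine.
# The import path below is an assumption for illustration purposes.
import os

from llama_stack.apis.agents import (
    AgentTool,
    SearchEngineType,
    SearchToolDefinition,
)

tavily_search = SearchToolDefinition(
    type=AgentTool.brave_search.value,  # placeholder tool type, as in the test
    engine=SearchEngineType.tavily,
    api_key=os.environ["TAVILY_SEARCH_API_KEY"],
)

# The definition is then passed through the agent configuration, e.g.:
# agent_config = AgentConfig(**{**common_params, "tools": [tavily_search]})
```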
Co-authored-by: Martin Yuan --- llama_stack/apis/agents/agents.py | 1 + .../agents/meta_reference/tools/builtin.py | 18 +++ .../providers/tests/agents/test_agents.py | 136 +++++++++++------- 3 files changed, 106 insertions(+), 49 deletions(-) diff --git a/llama_stack/apis/agents/agents.py b/llama_stack/apis/agents/agents.py index f2602ddde..25de35497 100644 --- a/llama_stack/apis/agents/agents.py +++ b/llama_stack/apis/agents/agents.py @@ -54,6 +54,7 @@ class ToolDefinitionCommon(BaseModel): class SearchEngineType(Enum): bing = "bing" brave = "brave" + tavily = "tavily" @json_schema_type diff --git a/llama_stack/providers/inline/agents/meta_reference/tools/builtin.py b/llama_stack/providers/inline/agents/meta_reference/tools/builtin.py index 4c9cdfcd2..a1e7d08f5 100644 --- a/llama_stack/providers/inline/agents/meta_reference/tools/builtin.py +++ b/llama_stack/providers/inline/agents/meta_reference/tools/builtin.py @@ -86,10 +86,13 @@ class PhotogenTool(SingleMessageBuiltinTool): class SearchTool(SingleMessageBuiltinTool): def __init__(self, engine: SearchEngineType, api_key: str, **kwargs) -> None: self.api_key = api_key + self.engine_type = engine if engine == SearchEngineType.bing: self.engine = BingSearch(api_key, **kwargs) elif engine == SearchEngineType.brave: self.engine = BraveSearch(api_key, **kwargs) + elif engine == SearchEngineType.tavily: + self.engine = TavilySearch(api_key, **kwargs) else: raise ValueError(f"Unknown search engine: {engine}") @@ -257,6 +260,21 @@ class BraveSearch: return {"query": query, "top_k": clean_response} +class TavilySearch: + def __init__(self, api_key: str) -> None: + self.api_key = api_key + + async def search(self, query: str) -> str: + response = requests.post( + "https://api.tavily.com/search", + json={"api_key": self.api_key, "query": query}, + ) + return json.dumps(self._clean_tavily_response(response.json())) + + def _clean_tavily_response(self, search_response, top_k=3): + return {"query": search_response["query"], "top_k": search_response["results"]} + + class WolframAlphaTool(SingleMessageBuiltinTool): def __init__(self, api_key: str) -> None: self.api_key = api_key diff --git a/llama_stack/providers/tests/agents/test_agents.py b/llama_stack/providers/tests/agents/test_agents.py index 60c047058..ee2f3d29f 100644 --- a/llama_stack/providers/tests/agents/test_agents.py +++ b/llama_stack/providers/tests/agents/test_agents.py @@ -68,6 +68,73 @@ def query_attachment_messages(): ] +async def create_agent_turn_with_search_tool( + agents_stack: Dict[str, object], + search_query_messages: List[object], + common_params: Dict[str, str], + search_tool_definition: SearchToolDefinition, +) -> None: + """ + Create an agent turn with a search tool. + + Args: + agents_stack (Dict[str, object]): The agents stack. + search_query_messages (List[object]): The search query messages. + common_params (Dict[str, str]): The common parameters. + search_tool_definition (SearchToolDefinition): The search tool definition. 
+ """ + + # Create an agent with the search tool + agent_config = AgentConfig( + **{ + **common_params, + "tools": [search_tool_definition], + } + ) + + agent_id, session_id = await create_agent_session( + agents_stack.impls[Api.agents], agent_config + ) + turn_request = dict( + agent_id=agent_id, + session_id=session_id, + messages=search_query_messages, + stream=True, + ) + + turn_response = [ + chunk + async for chunk in await agents_stack.impls[Api.agents].create_agent_turn( + **turn_request + ) + ] + + assert len(turn_response) > 0 + assert all( + isinstance(chunk, AgentTurnResponseStreamChunk) for chunk in turn_response + ) + + check_event_types(turn_response) + + # Check for tool execution events + tool_execution_events = [ + chunk + for chunk in turn_response + if isinstance(chunk.event.payload, AgentTurnResponseStepCompletePayload) + and chunk.event.payload.step_details.step_type == StepType.tool_execution.value + ] + assert len(tool_execution_events) > 0, "No tool execution events found" + + # Check the tool execution details + tool_execution = tool_execution_events[0].event.payload.step_details + assert isinstance(tool_execution, ToolExecutionStep) + assert len(tool_execution.tool_calls) > 0 + assert tool_execution.tool_calls[0].tool_name == BuiltinTool.brave_search + assert len(tool_execution.tool_responses) > 0 + + check_turn_complete_event(turn_response, session_id, search_query_messages) + + class TestAgents: @pytest.mark.asyncio async def test_agent_turns_with_safety( @@ -215,63 +282,34 @@ class TestAgents: async def test_create_agent_turn_with_brave_search( self, agents_stack, search_query_messages, common_params ): - agents_impl = agents_stack.impls[Api.agents] - if "BRAVE_SEARCH_API_KEY" not in os.environ: pytest.skip("BRAVE_SEARCH_API_KEY not set, skipping test") - # Create an agent with Brave search tool - agent_config = AgentConfig( - **{ - **common_params, - "tools": [ - SearchToolDefinition( - type=AgentTool.brave_search.value, - api_key=os.environ["BRAVE_SEARCH_API_KEY"], - engine=SearchEngineType.brave, - ) - ], - } + search_tool_definition = SearchToolDefinition( + type=AgentTool.brave_search.value, + api_key=os.environ["BRAVE_SEARCH_API_KEY"], + engine=SearchEngineType.brave, + ) + await create_agent_turn_with_search_tool( + agents_stack, search_query_messages, common_params, search_tool_definition ) - agent_id, session_id = await create_agent_session(agents_impl, agent_config) - turn_request = dict( - agent_id=agent_id, - session_id=session_id, - messages=search_query_messages, - stream=True, + @pytest.mark.asyncio + async def test_create_agent_turn_with_tavily_search( + self, agents_stack, search_query_messages, common_params + ): + if "TAVILY_SEARCH_API_KEY" not in os.environ: + pytest.skip("TAVILY_SEARCH_API_KEY not set, skipping test") + + search_tool_definition = SearchToolDefinition( + type=AgentTool.brave_search.value, # place holder only + api_key=os.environ["TAVILY_SEARCH_API_KEY"], + engine=SearchEngineType.tavily, ) - - turn_response = [ - chunk async for chunk in await agents_impl.create_agent_turn(**turn_request) - ] - - assert len(turn_response) > 0 - assert all( - isinstance(chunk, AgentTurnResponseStreamChunk) for chunk in turn_response + await create_agent_turn_with_search_tool( + agents_stack, search_query_messages, common_params, search_tool_definition ) - check_event_types(turn_response) - - # Check for tool execution events - tool_execution_events = [ - chunk - for chunk in turn_response - if isinstance(chunk.event.payload, 
AgentTurnResponseStepCompletePayload) - and chunk.event.payload.step_details.step_type - == StepType.tool_execution.value - ] - assert len(tool_execution_events) > 0, "No tool execution events found" - - # Check the tool execution details - tool_execution = tool_execution_events[0].event.payload.step_details - assert isinstance(tool_execution, ToolExecutionStep) - assert len(tool_execution.tool_calls) > 0 - assert tool_execution.tool_calls[0].tool_name == BuiltinTool.brave_search - assert len(tool_execution.tool_responses) > 0 - - check_turn_complete_event(turn_response, session_id, search_query_messages) - def check_event_types(turn_response): event_types = [chunk.event.payload.event_type for chunk in turn_response] From 89f5093dfcb9acf53ef2507f51137e1e05202952 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Tue, 19 Nov 2024 21:05:59 -0800 Subject: [PATCH 22/22] Fix tgi doc --- distributions/dependencies.json | 254 +++++++++--------- llama_stack/scripts/distro_codegen.py | 5 +- llama_stack/templates/tgi/build.yaml | 2 +- .../templates/tgi/run-with-safety.yaml | 2 +- llama_stack/templates/tgi/run.yaml | 2 +- llama_stack/templates/tgi/tgi.py | 2 +- 6 files changed, 132 insertions(+), 135 deletions(-) diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 0f85b70c6..92ebd1105 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -1,177 +1,171 @@ { "together": [ - "pypdf", - "sentencepiece", - "pandas", - "redis", - "nltk", - "psycopg2-binary", - "scikit-learn", - "chardet", - "matplotlib", - "pillow", - "tqdm", - "chromadb-client", - "transformers", + "aiosqlite", "blobfile", - "aiosqlite", - "together", + "chardet", + "chromadb-client", "faiss-cpu", - "scipy", - "numpy", - "torch --index-url https://download.pytorch.org/whl/cpu", - "sentence-transformers --no-deps", - "aiosqlite", "fastapi", "fire", "httpx", - "uvicorn" + "matplotlib", + "nltk", + "numpy", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "together", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" ], "remote-vllm": [ - "pypdf", - "sentencepiece", - "pandas", - "redis", - "nltk", - "psycopg2-binary", - "scikit-learn", - "chardet", - "matplotlib", - "pillow", - "tqdm", - "chromadb-client", - "transformers", - "openai", + "aiosqlite", "blobfile", - "aiosqlite", + "chardet", + "chromadb-client", "faiss-cpu", - "scipy", - "numpy", - "torch --index-url https://download.pytorch.org/whl/cpu", - "sentence-transformers --no-deps", - "aiosqlite", "fastapi", "fire", "httpx", - "uvicorn" + "matplotlib", + "nltk", + "numpy", + "openai", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" ], "fireworks": [ - "pypdf", - "sentencepiece", - "pandas", - "redis", - "nltk", - "psycopg2-binary", - "scikit-learn", - "chardet", - "fireworks-ai", - "matplotlib", - "pillow", - "tqdm", - "chromadb-client", - "transformers", + "aiosqlite", "blobfile", - "aiosqlite", + "chardet", + "chromadb-client", "faiss-cpu", - "scipy", - "numpy", - "torch --index-url https://download.pytorch.org/whl/cpu", - "sentence-transformers --no-deps", - "aiosqlite", "fastapi", "fire", + "fireworks-ai", "httpx", - "uvicorn" + "matplotlib", + "nltk", + "numpy", 
+ "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" ], "tgi": [ - "pypdf", - "sentencepiece", - "pandas", - "redis", - "nltk", - "psycopg2-binary", - "scikit-learn", - "chardet", - "matplotlib", - "pillow", - "tqdm", - "huggingface_hub", - "chromadb-client", "aiohttp", - "transformers", + "aiosqlite", "blobfile", - "aiosqlite", + "chardet", + "chromadb-client", "faiss-cpu", - "scipy", - "numpy", - "torch --index-url https://download.pytorch.org/whl/cpu", - "sentence-transformers --no-deps", - "aiosqlite", "fastapi", "fire", "httpx", - "uvicorn" + "huggingface_hub", + "matplotlib", + "nltk", + "numpy", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" ], "meta-reference-gpu": [ + "accelerate", + "aiosqlite", + "blobfile", + "chardet", + "chromadb-client", + "fairscale", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "lm-format-enforcer", + "matplotlib", + "nltk", + "numpy", + "pandas", + "pillow", + "psycopg2-binary", "pypdf", + "redis", + "scikit-learn", + "scipy", "sentencepiece", "torch", - "pandas", - "redis", - "nltk", - "psycopg2-binary", - "scikit-learn", - "chardet", - "accelerate", - "matplotlib", - "pillow", - "fairscale", - "tqdm", - "lm-format-enforcer", - "chromadb-client", - "transformers", - "blobfile", - "aiosqlite", "torchvision", - "faiss-cpu", + "tqdm", + "transformers", + "uvicorn", "zmq", - "scipy", - "numpy", - "torch --index-url https://download.pytorch.org/whl/cpu", "sentence-transformers --no-deps", - "aiosqlite", - "fastapi", - "fire", - "httpx", - "uvicorn" + "torch --index-url https://download.pytorch.org/whl/cpu" ], "ollama": [ - "ollama", - "pypdf", - "sentencepiece", - "pandas", - "redis", - "nltk", - "psycopg2-binary", - "scikit-learn", - "chardet", - "matplotlib", - "pillow", - "tqdm", - "chromadb-client", "aiohttp", - "transformers", + "aiosqlite", "blobfile", - "aiosqlite", + "chardet", + "chromadb-client", "faiss-cpu", - "scipy", - "numpy", - "torch --index-url https://download.pytorch.org/whl/cpu", - "sentence-transformers --no-deps", - "aiosqlite", "fastapi", "fire", "httpx", - "uvicorn" + "matplotlib", + "nltk", + "numpy", + "ollama", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" ] } diff --git a/llama_stack/scripts/distro_codegen.py b/llama_stack/scripts/distro_codegen.py index 8bcf97374..b82319bd5 100644 --- a/llama_stack/scripts/distro_codegen.py +++ b/llama_stack/scripts/distro_codegen.py @@ -82,7 +82,10 @@ def collect_template_dependencies(template_dir: Path) -> tuple[str, list[str]]: template = template_func() normal_deps, special_deps = get_provider_dependencies(template.providers) # Combine all dependencies in order: normal deps, special deps, server deps - all_deps = normal_deps + special_deps + SERVER_DEPENDENCIES + all_deps = sorted(list(set(normal_deps + SERVER_DEPENDENCIES))) + sorted( + list(set(special_deps)) + ) + return template.name, all_deps except Exception: return None, [] diff --git a/llama_stack/templates/tgi/build.yaml 
b/llama_stack/templates/tgi/build.yaml index 5f44c2d86..0f7602e2f 100644 --- a/llama_stack/templates/tgi/build.yaml +++ b/llama_stack/templates/tgi/build.yaml @@ -2,7 +2,7 @@ version: '2' name: tgi distribution_spec: description: Use (an external) TGI server for running LLM inference - docker_image: llamastack/distribution-tgi:test-0.0.52rc3 + docker_image: null providers: inference: - remote::tgi diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml index b988c28e1..ebf082cd6 100644 --- a/llama_stack/templates/tgi/run-with-safety.yaml +++ b/llama_stack/templates/tgi/run-with-safety.yaml @@ -1,6 +1,6 @@ version: '2' image_name: tgi -docker_image: llamastack/distribution-tgi:test-0.0.52rc3 +docker_image: null conda_env: tgi apis: - agents diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml index 485c02ad8..352afabb5 100644 --- a/llama_stack/templates/tgi/run.yaml +++ b/llama_stack/templates/tgi/run.yaml @@ -1,6 +1,6 @@ version: '2' image_name: tgi -docker_image: llamastack/distribution-tgi:test-0.0.52rc3 +docker_image: null conda_env: tgi apis: - agents diff --git a/llama_stack/templates/tgi/tgi.py b/llama_stack/templates/tgi/tgi.py index 79f2ad395..caa341df3 100644 --- a/llama_stack/templates/tgi/tgi.py +++ b/llama_stack/templates/tgi/tgi.py @@ -41,7 +41,7 @@ def get_distribution_template() -> DistributionTemplate: name="tgi", distro_type="self_hosted", description="Use (an external) TGI server for running LLM inference", - docker_image="llamastack/distribution-tgi:test-0.0.52rc3", + docker_image=None, template_path=Path(__file__).parent / "doc_template.md", providers=providers, default_models=[inference_model, safety_model],
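A closing note on the `distro_codegen.py` change above: the new expression deduplicates and alphabetizes the normal and server dependencies, then appends the special (flag-carrying) dependencies as their own sorted group, which is what produces the large but purely mechanical reordering in `distributions/dependencies.json`. A toy illustration with abridged inputs:

```python
# Toy illustration of the ordering rule introduced in distro_codegen.py above.
# The input lists are abridged; a real template carries many more dependencies.
normal_deps = ["redis", "numpy", "fastapi", "numpy"]  # duplicates are possible
SERVER_DEPENDENCIES = ["fastapi", "fire", "httpx", "uvicorn"]
special_deps = ["torch --index-url https://download.pytorch.org/whl/cpu"]

all_deps = sorted(list(set(normal_deps + SERVER_DEPENDENCIES))) + sorted(
    list(set(special_deps))
)
print(all_deps)
# ['fastapi', 'fire', 'httpx', 'numpy', 'redis', 'uvicorn',
#  'torch --index-url https://download.pytorch.org/whl/cpu']
```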