diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index faa2eda31..046387ab9 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -23,3 +23,7 @@ jobs: .pre-commit-config.yaml - uses: pre-commit/action@v3.0.1 + + - name: Verify if there are any diff files after pre-commit + run: | + git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ff13a4cb0..cfc26000b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -54,7 +54,7 @@ jobs: echo "REPORT_FILE=${REPORT_OUTPUT}" >> "$GITHUB_ENV" export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct - LLAMA_STACK_CONFIG=./llama_stack/templates/${{ matrix.provider }}/run.yaml pytest --md-report --md-report-verbose=1 ./tests/client-sdk/inference/test_inference.py --md-report-output "$REPORT_OUTPUT" + LLAMA_STACK_CONFIG=./llama_stack/templates/${{ matrix.provider }}/run.yaml pytest --md-report --md-report-verbose=1 ./tests/client-sdk/inference/ --md-report-output "$REPORT_OUTPUT" - name: Output reports to the job summary if: always() diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index adafccf64..bca91081f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -48,6 +48,7 @@ repos: hooks: - id: uv-export args: ["--frozen", "--no-hashes", "--no-emit-project"] + - id: uv-sync # - repo: https://github.com/pre-commit/mirrors-mypy # rev: v1.14.0 diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index 04cd09777..000000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,44 +0,0 @@ -# Changelog - -## 0.2.0 - -### Added - -### Changed - -### Removed - - -## 0.0.53 - -### Added -- Resource-oriented design for models, shields, memory banks, datasets and eval tasks -- Persistence for registered objects with distribution -- Ability to persist memory banks created for FAISS -- PostgreSQL KVStore implementation -- Environment variable placeholder support in run.yaml files -- Comprehensive Zero-to-Hero notebooks and quickstart guides -- Support for quantized models in Ollama -- Vision models support for Together, Fireworks, Meta-Reference, and Ollama, and vLLM -- Bedrock distribution with safety shields support -- Evals API with task registration and scoring functions -- MMLU and SimpleQA benchmark scoring functions -- Huggingface dataset provider integration for benchmarks -- Support for custom dataset registration from local paths -- Benchmark evaluation CLI tools with visualization tables -- RAG evaluation scoring functions and metrics -- Local persistence for datasets and eval tasks - -### Changed -- Split safety into distinct providers (llama-guard, prompt-guard, code-scanner) -- Changed provider naming convention (`impls` → `inline`, `adapters` → `remote`) -- Updated API signatures for dataset and eval task registration -- Restructured folder organization for providers -- Enhanced Docker build configuration -- Added version prefixing for REST API routes -- Enhanced evaluation task registration workflow -- Improved benchmark evaluation output formatting -- Restructured evals folder organization for better modularity - -### Removed -- `llama stack configure` command diff --git a/README.md b/README.md index cdf98dc12..a5e5b217d 100644 --- a/README.md +++ b/README.md @@ -34,22 +34,22 @@ By reducing friction and complexity, Llama Stack empowers developers to focus on ### API Providers Here is a list of the 
various API providers and available distributions to developers started easily, -| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** | -|:------------------------------------------------------------------------------------------:|:----------------------:|:------------------:|:------------------:|:------------------:|:------------------:|:------------------:| -| Meta Reference | Single Node | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | -| SambaNova | Hosted | | :heavy_check_mark: | | | | -| Cerebras | Hosted | | :heavy_check_mark: | | | | -| Fireworks | Hosted | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | | -| AWS Bedrock | Hosted | | :heavy_check_mark: | | :heavy_check_mark: | | -| Together | Hosted | :heavy_check_mark: | :heavy_check_mark: | | :heavy_check_mark: | | -| Groq | Hosted | | :heavy_check_mark: | | | | -| Ollama | Single Node | | :heavy_check_mark: | | | | -| TGI | Hosted and Single Node | | :heavy_check_mark: | | | | -| NVIDIA NIM | Hosted and Single Node | | :heavy_check_mark: | | | | -| Chroma | Single Node | | | :heavy_check_mark: | | | -| PG Vector | Single Node | | | :heavy_check_mark: | | | -| PyTorch ExecuTorch | On-device iOS | :heavy_check_mark: | :heavy_check_mark: | | | | -| vLLM | Hosted and Single Node | | :heavy_check_mark: | | | | +| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** | +|:------------------------:|:----------------------:|:----------:|:-------------:|:----------:|:----------:|:-------------:| +| Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ | +| SambaNova | Hosted | | ✅ | | | | +| Cerebras | Hosted | | ✅ | | | | +| Fireworks | Hosted | ✅ | ✅ | ✅ | | | +| AWS Bedrock | Hosted | | ✅ | | ✅ | | +| Together | Hosted | ✅ | ✅ | | ✅ | | +| Groq | Hosted | | ✅ | | | | +| Ollama | Single Node | | ✅ | | | | +| TGI | Hosted and Single Node | | ✅ | | | | +| NVIDIA NIM | Hosted and Single Node | | ✅ | | | | +| Chroma | Single Node | | | ✅ | | | +| PG Vector | Single Node | | | ✅ | | | +| PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | | +| vLLM | Hosted and Single Node | | ✅ | | | | ### Distributions diff --git a/distributions/dependencies.json b/distributions/dependencies.json index badb728e3..3d1de48b9 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -69,6 +69,40 @@ "fiddlecube": [ "httpx" ], + "dell": [ + "aiohttp", + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "chromadb-client", + "datasets", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "huggingface_hub", + "matplotlib", + "nltk", + "numpy", + "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "requests", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch torchvision --index-url https://download.pytorch.org/whl/cpu" + ], "fireworks": [ "aiosqlite", "autoevals", @@ -255,6 +289,38 @@ "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], + "nvidia": [ + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "datasets", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "mcp", + "nltk", + "numpy", + "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", + "pandas", + "pillow", 
+ "psycopg2-binary", + "pypdf", + "redis", + "requests", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch torchvision --index-url https://download.pytorch.org/whl/cpu" + ], "ollama": [ "aiohttp", "aiosqlite", @@ -322,6 +388,36 @@ "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], + "sambanova": [ + "aiosqlite", + "blobfile", + "chardet", + "chromadb-client", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "nltk", + "numpy", + "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "requests", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch torchvision --index-url https://download.pytorch.org/whl/cpu" + ], "tgi": [ "aiohttp", "aiosqlite", @@ -424,101 +520,5 @@ "vllm", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" - ], - "nvidia": [ - "aiosqlite", - "autoevals", - "blobfile", - "chardet", - "datasets", - "faiss-cpu", - "fastapi", - "fire", - "httpx", - "matplotlib", - "mcp", - "nltk", - "numpy", - "openai", - "opentelemetry-exporter-otlp-proto-http", - "opentelemetry-sdk", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "requests", - "scikit-learn", - "scipy", - "sentencepiece", - "tqdm", - "transformers", - "uvicorn", - "sentence-transformers --no-deps", - "torch torchvision --index-url https://download.pytorch.org/whl/cpu" - ], - "sambanova": [ - "aiosqlite", - "blobfile", - "chardet", - "chromadb-client", - "faiss-cpu", - "fastapi", - "fire", - "httpx", - "matplotlib", - "nltk", - "numpy", - "openai", - "opentelemetry-exporter-otlp-proto-http", - "opentelemetry-sdk", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "requests", - "scikit-learn", - "scipy", - "sentencepiece", - "tqdm", - "transformers", - "uvicorn", - "sentence-transformers --no-deps", - "torch torchvision --index-url https://download.pytorch.org/whl/cpu" - ], - "dell": [ - "aiohttp", - "aiosqlite", - "autoevals", - "blobfile", - "chardet", - "chromadb-client", - "datasets", - "faiss-cpu", - "fastapi", - "fire", - "httpx", - "huggingface_hub", - "matplotlib", - "nltk", - "numpy", - "openai", - "opentelemetry-exporter-otlp-proto-http", - "opentelemetry-sdk", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "requests", - "scikit-learn", - "scipy", - "sentencepiece", - "tqdm", - "transformers", - "uvicorn", - "sentence-transformers --no-deps", - "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ] } diff --git a/docs/conftest.py b/docs/conftest.py new file mode 100644 index 000000000..bec535f77 --- /dev/null +++ b/docs/conftest.py @@ -0,0 +1,9 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +def pytest_collection_modifyitems(items): + for item in items: + item.name = item.name.replace(' ', '_') diff --git a/docs/getting_started.ipynb b/docs/getting_started.ipynb index 4e4893158..abe537c8e 100644 --- a/docs/getting_started.ipynb +++ b/docs/getting_started.ipynb @@ -86,7 +86,6 @@ "# NBVAL_SKIP\n", "\n", "!apt-get install -y bubblewrap\n", - "# install a branch of llama stack\n", "import os\n", "os.environ[\"UV_SYSTEM_PYTHON\"] = \"1\"\n", "!pip install uv\n", @@ -3397,6 +3396,231 @@ "response = client.scoring.score(input_rows=rows, scoring_functions=scoring_params)\n", "pprint(response)\n" ] + }, + { + "cell_type": "markdown", + "id": "ad077440", + "metadata": {}, + "source": [ + "## 4. Image Understanding with Llama 3.2\n", + "\n", + "Below is a complete example of using Together's Llama Stack 0.1 server at https://llama-stack.together.ai to ask Llama 3.2 questions about an image." + ] + }, + { + "cell_type": "markdown", + "id": "82e381ec", + "metadata": {}, + "source": [ + "### 4.1 Setup and helpers\n", + "\n", + "Below we install the Llama Stack client 0.1, download the example image, define two image helpers, and set Llama Stack Together server URL and Llama 3.2 model name.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "865fc5a8", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install llama-stack-client==0.1.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44e05e16", + "metadata": {}, + "outputs": [], + "source": [ + "!wget https://raw.githubusercontent.com/meta-llama/llama-models/refs/heads/main/Llama_Repo.jpeg" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "469750f7", + "metadata": {}, + "outputs": [], + "source": [ + "from PIL import Image\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def display_image(path):\n", + " img = Image.open(path)\n", + " plt.imshow(img)\n", + " plt.axis('off')\n", + " plt.show()\n", + "\n", + "display_image(\"Llama_Repo.jpeg\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2c1e1c2", + "metadata": {}, + "outputs": [], + "source": [ + "import base64\n", + "\n", + "def encode_image(image_path):\n", + " with open(image_path, \"rb\") as image_file:\n", + " base64_string = base64.b64encode(image_file.read()).decode(\"utf-8\")\n", + " base64_url = f\"data:image/png;base64,{base64_string}\"\n", + " return base64_url" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c565f99e", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_stack_client import LlamaStackClient\n", + "\n", + "LLAMA_STACK_API_TOGETHER_URL=\"https://llama-stack.together.ai\"\n", + "LLAMA32_11B_INSTRUCT = \"meta-llama/Llama-3.2-11B-Vision-Instruct\"" + ] + }, + { + "cell_type": "markdown", + "id": "7737cd41", + "metadata": {}, + "source": [ + "### 4.2 Using Llama Stack Chat API\n", + "\n", + "The code below uses the Llama Stack 0.1's chat API to interact with Llama 3.2:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7914894", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_stack_client.lib.inference.event_logger import EventLogger\n", + "\n", + "async def run_main(image_path: str, prompt):\n", + " client = LlamaStackClient(\n", + " base_url=LLAMA_STACK_API_TOGETHER_URL,\n", + " )\n", + "\n", + " message = {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"type\": \"image\",\n", + " \"image\": {\n", + " \"url\": {\n", + " \"uri\": encode_image(image_path)\n", + " 
}\n", + " }\n", + " },\n", + " {\n", + " \"type\": \"text\",\n", + " \"text\": prompt,\n", + " }\n", + " ]\n", + " }\n", + "\n", + " response = client.inference.chat_completion(\n", + " messages=[message],\n", + " model_id=LLAMA32_11B_INSTRUCT,\n", + " stream=False,\n", + " )\n", + "\n", + " print(response.completion_message.content.lower().strip())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ee09b97", + "metadata": {}, + "outputs": [], + "source": [ + "await run_main(\"Llama_Repo.jpeg\",\n", + " \"How many different colors are those llamas?\\\n", + " What are those colors?\")" + ] + }, + { + "cell_type": "markdown", + "id": "e741d7b9", + "metadata": {}, + "source": [ + "### 4.3 Using Llama Stack Agent API\n", + "\n", + "The code below uses the Llama Stack 0.1's Agent API to interact with Llama 3.2:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9a83275", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_stack_client.lib.agents.agent import Agent\n", + "from llama_stack_client.lib.agents.event_logger import EventLogger\n", + "from llama_stack_client.types.agent_create_params import AgentConfig\n", + "\n", + "async def run_main(image_path, prompt):\n", + " base64_image = encode_image(image_path)\n", + "\n", + " client = LlamaStackClient(\n", + " base_url=LLAMA_STACK_API_TOGETHER_URL,\n", + " )\n", + "\n", + " agent_config = AgentConfig(\n", + " model=LLAMA32_11B_INSTRUCT,\n", + " instructions=\"You are a helpful assistant\",\n", + " enable_session_persistence=False,\n", + " )\n", + "\n", + " agent = Agent(client, agent_config)\n", + " session_id = agent.create_session(\"test-session\")\n", + "\n", + " response = agent.create_turn(\n", + " messages=[{\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"type\": \"image\",\n", + " \"image\": {\n", + " \"url\": {\n", + " \"uri\": encode_image(image_path)\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"type\": \"text\",\n", + " \"text\": prompt,\n", + " }\n", + " ]\n", + " }],\n", + " session_id=session_id,\n", + " )\n", + "\n", + " for log in EventLogger().log(response):\n", + " log.print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15d0098b", + "metadata": {}, + "outputs": [], + "source": [ + "await run_main(\"Llama_Repo.jpeg\",\n", + " \"How many different colors are those llamas?\\\n", + " What are those colors?\")" + ] } ], "metadata": { diff --git a/docs/source/building_applications/index.md b/docs/source/building_applications/index.md index 45dca5a1c..e89a90299 100644 --- a/docs/source/building_applications/index.md +++ b/docs/source/building_applications/index.md @@ -4,7 +4,7 @@ Llama Stack provides all the building blocks needed to create sophisticated AI a The best way to get started is to look at this notebook which walks through the various APIs (from basic inference, to RAG agents) and how to use them. 
-**Notebook**: [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb) +**Notebook**: [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) Here are some key topics that will help you build effective agents: diff --git a/docs/source/building_applications/rag.md b/docs/source/building_applications/rag.md index 6b7a354b7..5287a2367 100644 --- a/docs/source/building_applications/rag.md +++ b/docs/source/building_applications/rag.md @@ -36,13 +36,12 @@ chunks = [ "content": "Your document text here", "mime_type": "text/plain", }, - ..., ] -client.vector_io.insert(vector_db_id, chunks) +client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks) # You can then query for these chunks chunks_response = client.vector_io.query( - vector_db_id, query="What do you know about..." + vector_db_id=vector_db_id, query="What do you know about..." ) ``` @@ -72,8 +71,8 @@ client.tool_runtime.rag_tool.insert( # Query documents results = client.tool_runtime.rag_tool.query( - vector_db_id=vector_db_id, - query="What do you know about...", + vector_db_ids=[vector_db_id], + content="What do you know about...", ) ``` @@ -82,10 +81,14 @@ results = client.tool_runtime.rag_tool.query( One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example: ```python +from llama_stack_client.types.agent_create_params import AgentConfig +from llama_stack_client.lib.agents.agent import Agent + # Configure agent with memory agent_config = AgentConfig( - model="Llama3.2-3B-Instruct", + model="meta-llama/Llama-3.2-3B-Instruct", instructions="You are a helpful assistant", + enable_session_persistence=False, toolgroups=[ { "name": "builtin::rag", @@ -105,10 +108,10 @@ response = agent.create_turn( {"role": "user", "content": "I am providing some documents for reference."} ], documents=[ - dict( - content="https://raw.githubusercontent.com/example/doc.rst", - mime_type="text/plain", - ) + { + "content": "https://raw.githubusercontent.com/example/doc.rst", + "mime_type": "text/plain", + } ], session_id=session_id, ) diff --git a/docs/source/distributions/self_hosted_distro/dell.md b/docs/source/distributions/self_hosted_distro/dell.md index be326ffa5..aef3ecf58 100644 --- a/docs/source/distributions/self_hosted_distro/dell.md +++ b/docs/source/distributions/self_hosted_distro/dell.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # Dell Distribution of Llama Stack diff --git a/docs/source/distributions/self_hosted_distro/fireworks.md b/docs/source/distributions/self_hosted_distro/fireworks.md index 9afeb4894..f77d9f656 100644 --- a/docs/source/distributions/self_hosted_distro/fireworks.md +++ b/docs/source/distributions/self_hosted_distro/fireworks.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # Fireworks Distribution ```{toctree} diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md index d00d8177f..b183757db 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # Meta Reference Distribution ```{toctree} diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md index e46c2d112..9aeb7a88b 100644 --- 
a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # Meta Reference Quantized Distribution ```{toctree} diff --git a/docs/source/distributions/self_hosted_distro/ollama.md b/docs/source/distributions/self_hosted_distro/ollama.md index 54f6b8fdf..a3a45f9a8 100644 --- a/docs/source/distributions/self_hosted_distro/ollama.md +++ b/docs/source/distributions/self_hosted_distro/ollama.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # Ollama Distribution ```{toctree} diff --git a/docs/source/distributions/self_hosted_distro/remote-vllm.md b/docs/source/distributions/self_hosted_distro/remote-vllm.md index ff626d40d..6c3bbd1d0 100644 --- a/docs/source/distributions/self_hosted_distro/remote-vllm.md +++ b/docs/source/distributions/self_hosted_distro/remote-vllm.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # Remote vLLM Distribution ```{toctree} :maxdepth: 2 diff --git a/docs/source/distributions/self_hosted_distro/sambanova.md b/docs/source/distributions/self_hosted_distro/sambanova.md index 86ef4ac58..e6ac616be 100644 --- a/docs/source/distributions/self_hosted_distro/sambanova.md +++ b/docs/source/distributions/self_hosted_distro/sambanova.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # SambaNova Distribution ```{toctree} diff --git a/docs/source/distributions/self_hosted_distro/tgi.md b/docs/source/distributions/self_hosted_distro/tgi.md index b970ab9fe..f4eecf2cd 100644 --- a/docs/source/distributions/self_hosted_distro/tgi.md +++ b/docs/source/distributions/self_hosted_distro/tgi.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # TGI Distribution diff --git a/docs/source/distributions/self_hosted_distro/together.md b/docs/source/distributions/self_hosted_distro/together.md index 45ae462d5..8e36c1eb0 100644 --- a/docs/source/distributions/self_hosted_distro/together.md +++ b/docs/source/distributions/self_hosted_distro/together.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # Together Distribution ```{toctree} diff --git a/docs/source/index.md b/docs/source/index.md index 095f50885..2834f5641 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -2,7 +2,7 @@ ```{admonition} News :class: tip -Llama Stack 0.1.1 is now available! See the [release notes](https://github.com/meta-llama/llama-stack/releases/tag/v0.1.1) for more details. +Llama Stack 0.1.2 is now available! See the [release notes](https://github.com/meta-llama/llama-stack/releases/tag/v0.1.2) for more details. 
``` # Llama Stack diff --git a/llama_stack/cli/stack/list_providers.py b/llama_stack/cli/stack/list_providers.py index 96e978826..909fea030 100644 --- a/llama_stack/cli/stack/list_providers.py +++ b/llama_stack/cli/stack/list_providers.py @@ -22,9 +22,9 @@ class StackListProviders(Subcommand): self.parser.set_defaults(func=self._run_providers_list_cmd) def _add_arguments(self): - from llama_stack.distribution.datatypes import Api + from llama_stack.distribution.distribution import providable_apis - api_values = [a.value for a in Api] + api_values = [api.value for api in providable_apis()] self.parser.add_argument( "api", type=str, diff --git a/llama_stack/cli/stack/run.py b/llama_stack/cli/stack/run.py index f84def184..e7d6df292 100644 --- a/llama_stack/cli/stack/run.py +++ b/llama_stack/cli/stack/run.py @@ -55,6 +55,16 @@ class StackRun(Subcommand): default=[], metavar="KEY=VALUE", ) + self.parser.add_argument( + "--tls-keyfile", + type=str, + help="Path to TLS key file for HTTPS", + ) + self.parser.add_argument( + "--tls-certfile", + type=str, + help="Path to TLS certificate file for HTTPS", + ) def _run_stack_run_cmd(self, args: argparse.Namespace) -> None: import importlib.resources @@ -178,4 +188,7 @@ class StackRun(Subcommand): return run_args.extend(["--env", f"{key}={value}"]) + if args.tls_keyfile and args.tls_certfile: + run_args.extend(["--tls-keyfile", args.tls_keyfile, "--tls-certfile", args.tls_certfile]) + run_with_pty(run_args) diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index 8b579b636..97706f22a 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -117,6 +117,23 @@ class Provider(BaseModel): config: Dict[str, Any] +class ServerConfig(BaseModel): + port: int = Field( + default=8321, + description="Port to listen on", + ge=1024, + le=65535, + ) + tls_certfile: Optional[str] = Field( + default=None, + description="Path to TLS certificate file for HTTPS", + ) + tls_keyfile: Optional[str] = Field( + default=None, + description="Path to TLS key file for HTTPS", + ) + + class StackRunConfig(BaseModel): version: str = LLAMA_STACK_RUN_CONFIG_VERSION @@ -159,6 +176,11 @@ a default SQLite store will be used.""", eval_tasks: List[EvalTaskInput] = Field(default_factory=list) tool_groups: List[ToolGroupInput] = Field(default_factory=list) + server: ServerConfig = Field( + default_factory=ServerConfig, + description="Configuration for the HTTP(S) server", + ) + class BuildConfig(BaseModel): version: str = LLAMA_STACK_BUILD_CONFIG_VERSION diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py index 13aa67956..2c0f73974 100644 --- a/llama_stack/distribution/library_client.py +++ b/llama_stack/distribution/library_client.py @@ -17,17 +17,6 @@ from typing import Any, get_args, get_origin, Optional, TypeVar import httpx import yaml -from llama_stack_client import ( - APIResponse, - AsyncAPIResponse, - AsyncLlamaStackClient, - AsyncStream, - LlamaStackClient, - NOT_GIVEN, -) -from pydantic import BaseModel, TypeAdapter -from rich.console import Console -from termcolor import cprint from llama_stack.distribution.build import print_pip_install_help from llama_stack.distribution.configure import parse_and_maybe_upgrade_config @@ -46,6 +35,17 @@ from llama_stack.providers.utils.telemetry.tracing import ( setup_logger, start_trace, ) +from llama_stack_client import ( + APIResponse, + AsyncAPIResponse, + AsyncLlamaStackClient, + AsyncStream, + LlamaStackClient, + 
NOT_GIVEN, +) +from pydantic import BaseModel, TypeAdapter +from rich.console import Console +from termcolor import cprint T = TypeVar("T") @@ -198,6 +198,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): async def initialize(self) -> bool: try: + self.endpoint_impls = None self.impls = await construct_stack(self.config, self.custom_provider_registry) except ModuleNotFoundError as _e: cprint(_e.msg, "red") @@ -213,7 +214,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): f"Please run:\n\n{prefix}llama stack build --template {self.config_path_or_template_name} --image-type venv\n\n", "yellow", ) - return False + raise _e if Api.telemetry in self.impls: setup_logger(self.impls[Api.telemetry]) diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index fcd0e3cad..d2c32de11 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -282,8 +282,19 @@ def main(): action="append", help="Environment variables in KEY=value format. Can be specified multiple times.", ) + parser.add_argument( + "--tls-keyfile", + help="Path to TLS key file for HTTPS", + required="--tls-certfile" in sys.argv, + ) + parser.add_argument( + "--tls-certfile", + help="Path to TLS certificate file for HTTPS", + required="--tls-keyfile" in sys.argv, + ) args = parser.parse_args() + if args.env: for env_pair in args.env: try: @@ -381,11 +392,36 @@ def main(): import uvicorn - # FYI this does not do hot-reloads + # Configure SSL if certificates are provided + port = args.port or config.server.port + + ssl_config = None + if args.tls_keyfile: + keyfile = args.tls_keyfile + certfile = args.tls_certfile + else: + keyfile = config.server.tls_keyfile + certfile = config.server.tls_certfile + + if keyfile and certfile: + ssl_config = { + "ssl_keyfile": keyfile, + "ssl_certfile": certfile, + } + print(f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}") listen_host = ["::", "0.0.0.0"] if not args.disable_ipv6 else "0.0.0.0" - print(f"Listening on {listen_host}:{args.port}") - uvicorn.run(app, host=listen_host, port=args.port) + print(f"Listening on {listen_host}:{port}") + + uvicorn_config = { + "app": app, + "host": listen_host, + "port": port, + } + if ssl_config: + uvicorn_config.update(ssl_config) + + uvicorn.run(**uvicorn_config) def extract_path_params(route: str) -> List[str]: diff --git a/llama_stack/distribution/start_conda_env.sh b/llama_stack/distribution/start_conda_env.sh index c37f30ef0..fe830059f 100755 --- a/llama_stack/distribution/start_conda_env.sh +++ b/llama_stack/distribution/start_conda_env.sh @@ -34,6 +34,7 @@ shift # Process environment variables from --env arguments env_vars="" +other_args="" while [[ $# -gt 0 ]]; do case "$1" in --env) @@ -48,6 +49,7 @@ while [[ $# -gt 0 ]]; do fi ;; *) + other_args="$other_args $1" shift ;; esac @@ -61,4 +63,5 @@ $CONDA_PREFIX/bin/python \ -m llama_stack.distribution.server.server \ --yaml-config "$yaml_config" \ --port "$port" \ - $env_vars + $env_vars \ + $other_args diff --git a/llama_stack/distribution/start_container.sh b/llama_stack/distribution/start_container.sh index 2c5d65d09..a5f543fb4 100755 --- a/llama_stack/distribution/start_container.sh +++ b/llama_stack/distribution/start_container.sh @@ -40,8 +40,12 @@ shift port="$1" shift +# Initialize other_args +other_args="" + # Process environment variables from --env arguments env_vars="" + while [[ $# -gt 0 ]]; do case "$1" in --env) @@ -55,6 +59,7 @@ while [[ $# -gt 0 ]]; 
do fi ;; *) + other_args="$other_args $1" shift ;; esac @@ -93,5 +98,8 @@ $CONTAINER_BINARY run $CONTAINER_OPTS -it \ -v "$yaml_config:/app/config.yaml" \ $mounts \ --env LLAMA_STACK_PORT=$port \ - --entrypoint='["python", "-m", "llama_stack.distribution.server.server", "--yaml-config", "/app/config.yaml"]' \ - $container_image:$version_tag + --entrypoint python \ + $container_image:$version_tag \ + -m llama_stack.distribution.server.server \ + --yaml-config /app/config.yaml \ + $other_args diff --git a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py index b48f92d36..6f4b25b9d 100644 --- a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py +++ b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py @@ -67,7 +67,6 @@ def generate_bwrap_command(bind_dirs: List[str]) -> str: @dataclass class CodeExecutionContext: matplotlib_dump_dir: str - use_proxy: bool = False @dataclass diff --git a/llama_stack/providers/remote/inference/groq/groq.py b/llama_stack/providers/remote/inference/groq/groq.py index 461b3ee61..4e6cc2d6b 100644 --- a/llama_stack/providers/remote/inference/groq/groq.py +++ b/llama_stack/providers/remote/inference/groq/groq.py @@ -26,6 +26,7 @@ from llama_stack.apis.inference import ( Message, ResponseFormat, ToolChoice, + ToolConfig, ) from llama_stack.distribution.request_headers import NeedsRequestProviderData from llama_stack.providers.remote.inference.groq.config import GroqConfig diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py index cff8aa742..ecd195854 100644 --- a/llama_stack/providers/remote/inference/ollama/ollama.py +++ b/llama_stack/providers/remote/inference/ollama/ollama.py @@ -352,24 +352,20 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate): return EmbeddingsResponse(embeddings=embeddings) async def register_model(self, model: Model) -> Model: - # ollama does not have embedding models running. Check if the model is in list of available models. - if model.model_type == ModelType.embedding: - response = await self.client.list() + async def check_model_availability(model_id: str): + response = await self.client.ps() available_models = [m["model"] for m in response["models"]] - if model.provider_resource_id not in available_models: + if model_id not in available_models: raise ValueError( - f"Model '{model.provider_resource_id}' is not available in Ollama. " - f"Available models: {', '.join(available_models)}" + f"Model '{model_id}' is not available in Ollama. Available models: {', '.join(available_models)}" ) + + if model.model_type == ModelType.embedding: + await check_model_availability(model.provider_resource_id) return model + model = await self.register_helper.register_model(model) - models = await self.client.ps() - available_models = [m["model"] for m in models["models"]] - if model.provider_resource_id not in available_models: - raise ValueError( - f"Model '{model.provider_resource_id}' is not available in Ollama. 
" - f"Available models: {', '.join(available_models)}" - ) + await check_model_availability(model.provider_resource_id) return model diff --git a/llama_stack/providers/remote/vector_io/qdrant/__init__.py b/llama_stack/providers/remote/vector_io/qdrant/__init__.py index 54605fcf9..c584e29ef 100644 --- a/llama_stack/providers/remote/vector_io/qdrant/__init__.py +++ b/llama_stack/providers/remote/vector_io/qdrant/__init__.py @@ -12,8 +12,8 @@ from .config import QdrantConfig async def get_adapter_impl(config: QdrantConfig, deps: Dict[Api, ProviderSpec]): - from .qdrant import QdrantVectorMemoryAdapter + from .qdrant import QdrantVectorDBAdapter - impl = QdrantVectorMemoryAdapter(config, deps[Api.inference]) + impl = QdrantVectorDBAdapter(config, deps[Api.inference]) await impl.initialize() return impl diff --git a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py index 719070528..e7ad136eb 100644 --- a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py +++ b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py @@ -55,7 +55,7 @@ class QdrantIndex(EmbeddingIndex): points = [] for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)): - chunk_id = f"{chunk.document_id}:chunk-{i}" + chunk_id = f"{chunk.metadata['document_id']}:chunk-{i}" points.append( PointStruct( id=convert_id(chunk_id), @@ -93,6 +93,9 @@ class QdrantIndex(EmbeddingIndex): return QueryChunksResponse(chunks=chunks, scores=scores) + async def delete(self): + await self.client.delete_collection(collection_name=self.collection_name) + class QdrantVectorDBAdapter(VectorIO, VectorDBsProtocolPrivate): def __init__(self, config: QdrantConfig, inference_api: Api.inference) -> None: diff --git a/llama_stack/providers/tests/datasetio/test_datasetio.py b/llama_stack/providers/tests/datasetio/test_datasetio.py index cf28045a4..fd76bafe0 100644 --- a/llama_stack/providers/tests/datasetio/test_datasetio.py +++ b/llama_stack/providers/tests/datasetio/test_datasetio.py @@ -95,7 +95,7 @@ class TestDatasetIO: assert len(response) == 1 assert response[0].identifier == "test_dataset" - with pytest.raises(Exception) as exc_info: + with pytest.raises(ValueError): # unregister a dataset that does not exist await datasets_impl.unregister_dataset("test_dataset2") @@ -104,7 +104,7 @@ class TestDatasetIO: assert isinstance(response, list) assert len(response) == 0 - with pytest.raises(Exception) as exc_info: + with pytest.raises(ValueError): await datasets_impl.unregister_dataset("test_dataset") @pytest.mark.asyncio diff --git a/llama_stack/providers/tests/inference/test_model_registration.py b/llama_stack/providers/tests/inference/test_model_registration.py index 96a34ec0e..664564d22 100644 --- a/llama_stack/providers/tests/inference/test_model_registration.py +++ b/llama_stack/providers/tests/inference/test_model_registration.py @@ -32,7 +32,7 @@ class TestModelRegistration: ) # Try to register a model that's too large for local inference - with pytest.raises(ValueError) as exc_info: + with pytest.raises(ValueError): await models_impl.register_model( model_id="Llama3.1-70B-Instruct", ) @@ -42,7 +42,7 @@ class TestModelRegistration: _, models_impl = inference_stack # Try to register a non-existent model - with pytest.raises(Exception) as exc_info: + with pytest.raises(ValueError): await models_impl.register_model( model_id="Llama3-NonExistent-Model", ) @@ -59,7 +59,7 @@ class TestModelRegistration: }, ) - with pytest.raises(ValueError) as exc_info: + with 
pytest.raises(ValueError): await models_impl.register_model( model_id="custom-model-2", metadata={ @@ -88,7 +88,7 @@ class TestModelRegistration: async def test_register_with_invalid_llama_model(self, inference_stack): _, models_impl = inference_stack - with pytest.raises(ValueError) as exc_info: + with pytest.raises(ValueError): await models_impl.register_model( model_id="custom-model-2", metadata={"llama_model": "invalid-llama-model"}, diff --git a/llama_stack/providers/tests/inference/test_vision_inference.py b/llama_stack/providers/tests/inference/test_vision_inference.py index 964f70901..a2434ac41 100644 --- a/llama_stack/providers/tests/inference/test_vision_inference.py +++ b/llama_stack/providers/tests/inference/test_vision_inference.py @@ -4,12 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +import base64 from pathlib import Path import pytest -from llama_stack.apis.common.content_types import ImageContentItem, TextContentItem, URL - +from llama_stack.apis.common.content_types import URL, ImageContentItem, TextContentItem from llama_stack.apis.inference import ( ChatCompletionResponse, ChatCompletionResponseEventType, @@ -23,7 +23,7 @@ from .utils import group_chunks THIS_DIR = Path(__file__).parent with open(THIS_DIR / "pasta.jpeg", "rb") as f: - PASTA_IMAGE = f.read() + PASTA_IMAGE = base64.b64encode(f.read()).decode("utf-8") class TestVisionModelInference: diff --git a/llama_stack/scripts/distro_codegen.py b/llama_stack/scripts/distro_codegen.py index 7064d3104..c73c15d41 100644 --- a/llama_stack/scripts/distro_codegen.py +++ b/llama_stack/scripts/distro_codegen.py @@ -29,7 +29,7 @@ def find_template_dirs(templates_dir: Path) -> Iterator[Path]: if not templates_dir.exists(): raise FileNotFoundError(f"Templates directory not found: {templates_dir}") - return (d for d in templates_dir.iterdir() if d.is_dir() and d.name != "__pycache__") + return sorted(d for d in templates_dir.iterdir() if d.is_dir() and d.name != "__pycache__") def process_template(template_dir: Path, progress) -> None: diff --git a/llama_stack/templates/bedrock/run.yaml b/llama_stack/templates/bedrock/run.yaml index 39408c1bd..be6c9a928 100644 --- a/llama_stack/templates/bedrock/run.yaml +++ b/llama_stack/templates/bedrock/run.yaml @@ -115,3 +115,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/cerebras/run.yaml b/llama_stack/templates/cerebras/run.yaml index 5a70890a8..05d3f4525 100644 --- a/llama_stack/templates/cerebras/run.yaml +++ b/llama_stack/templates/cerebras/run.yaml @@ -117,3 +117,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/dell/run-with-safety.yaml b/llama_stack/templates/dell/run-with-safety.yaml index bdc82d03a..04c5957d4 100644 --- a/llama_stack/templates/dell/run-with-safety.yaml +++ b/llama_stack/templates/dell/run-with-safety.yaml @@ -116,3 +116,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/dell/run.yaml b/llama_stack/templates/dell/run.yaml index 2ba62a782..706444eb1 100644 --- a/llama_stack/templates/dell/run.yaml +++ b/llama_stack/templates/dell/run.yaml @@ -107,3 +107,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: 
builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/fireworks/run-with-safety.yaml b/llama_stack/templates/fireworks/run-with-safety.yaml index a4b425436..0fbe14a5a 100644 --- a/llama_stack/templates/fireworks/run-with-safety.yaml +++ b/llama_stack/templates/fireworks/run-with-safety.yaml @@ -172,3 +172,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index a497317bd..ccf67dcbb 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -161,3 +161,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/hf-endpoint/run-with-safety.yaml b/llama_stack/templates/hf-endpoint/run-with-safety.yaml index 0329f580b..f520a2fda 100644 --- a/llama_stack/templates/hf-endpoint/run-with-safety.yaml +++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml @@ -124,3 +124,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/hf-endpoint/run.yaml b/llama_stack/templates/hf-endpoint/run.yaml index 8163fe28e..708cb1bcc 100644 --- a/llama_stack/templates/hf-endpoint/run.yaml +++ b/llama_stack/templates/hf-endpoint/run.yaml @@ -114,3 +114,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/hf-serverless/run-with-safety.yaml b/llama_stack/templates/hf-serverless/run-with-safety.yaml index 9cee920a5..7f0abf5be 100644 --- a/llama_stack/templates/hf-serverless/run-with-safety.yaml +++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml @@ -124,3 +124,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/hf-serverless/run.yaml b/llama_stack/templates/hf-serverless/run.yaml index c8ad0d38d..c0b7a4c60 100644 --- a/llama_stack/templates/hf-serverless/run.yaml +++ b/llama_stack/templates/hf-serverless/run.yaml @@ -114,3 +114,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml index 0faaabb15..c5286fc6b 100644 --- a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml +++ b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml @@ -126,3 +126,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/meta-reference-gpu/run.yaml b/llama_stack/templates/meta-reference-gpu/run.yaml index 6ffe1fa36..310585f23 100644 --- a/llama_stack/templates/meta-reference-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-gpu/run.yaml @@ -115,3 +115,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml index 
5ff87a901..d43cf3917 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml @@ -117,3 +117,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml index 6dc325e9d..c8ae362f5 100644 --- a/llama_stack/templates/nvidia/run.yaml +++ b/llama_stack/templates/nvidia/run.yaml @@ -147,3 +147,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/ollama/doc_template.md b/llama_stack/templates/ollama/doc_template.md index eb4aadd29..29efe39c3 100644 --- a/llama_stack/templates/ollama/doc_template.md +++ b/llama_stack/templates/ollama/doc_template.md @@ -16,7 +16,7 @@ The `llamastack/distribution-{{ name }}` distribution consists of the following You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration. -{%- if run_config_env_vars %} +{% if run_config_env_vars %} ### Environment Variables The following environment variables can be configured: diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml index 5b5c9c253..ac5dab755 100644 --- a/llama_stack/templates/ollama/run-with-safety.yaml +++ b/llama_stack/templates/ollama/run-with-safety.yaml @@ -121,3 +121,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml index 3cc1cb2ac..485223675 100644 --- a/llama_stack/templates/ollama/run.yaml +++ b/llama_stack/templates/ollama/run.yaml @@ -110,3 +110,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml index 4a0fa9a85..1fe998a1f 100644 --- a/llama_stack/templates/remote-vllm/run-with-safety.yaml +++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml @@ -126,3 +126,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/remote-vllm/run.yaml b/llama_stack/templates/remote-vllm/run.yaml index 9631f94a2..9d3db8a31 100644 --- a/llama_stack/templates/remote-vllm/run.yaml +++ b/llama_stack/templates/remote-vllm/run.yaml @@ -115,3 +115,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/sambanova/run.yaml b/llama_stack/templates/sambanova/run.yaml index 6cec51824..39b0f3c4e 100644 --- a/llama_stack/templates/sambanova/run.yaml +++ b/llama_stack/templates/sambanova/run.yaml @@ -126,3 +126,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/template.py b/llama_stack/templates/template.py index 09efd2038..04a09741c 100644 --- a/llama_stack/templates/template.py +++ b/llama_stack/templates/template.py @@ 
-131,8 +131,15 @@ class DistributionTemplate(BaseModel): providers_str = ", ".join(f"`{p}`" for p in providers) providers_table += f"| {api} | {providers_str} |\n" - template = "\n" - template += self.template_path.read_text() + template = self.template_path.read_text() + comment = "\n" + orphantext = "---\norphan: true\n---\n" + + if template.startswith(orphantext): + template = template.replace(orphantext, orphantext + comment) + else: + template = comment + template + # Render template with rich-generated table env = jinja2.Environment( trim_blocks=True, diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml index 503505c32..ed6c9ef6f 100644 --- a/llama_stack/templates/tgi/run-with-safety.yaml +++ b/llama_stack/templates/tgi/run-with-safety.yaml @@ -114,3 +114,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml index f1953c513..8bf76f37b 100644 --- a/llama_stack/templates/tgi/run.yaml +++ b/llama_stack/templates/tgi/run.yaml @@ -113,3 +113,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/together/run-with-safety.yaml b/llama_stack/templates/together/run-with-safety.yaml index ec351108e..298926630 100644 --- a/llama_stack/templates/together/run-with-safety.yaml +++ b/llama_stack/templates/together/run-with-safety.yaml @@ -167,3 +167,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml index c2afd98e9..920003759 100644 --- a/llama_stack/templates/together/run.yaml +++ b/llama_stack/templates/together/run.yaml @@ -156,3 +156,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml index 165e4d51d..41a545e1a 100644 --- a/llama_stack/templates/vllm-gpu/run.yaml +++ b/llama_stack/templates/vllm-gpu/run.yaml @@ -117,3 +117,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/pyproject.toml b/pyproject.toml index 402024772..5e9cb75e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "llama_stack" -version = "0.1.1" +version = "0.1.2" authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }] description = "Llama Stack" readme = "README.md" @@ -25,8 +25,8 @@ dependencies = [ "fire", "httpx", "huggingface-hub", - "llama-models>=0.1.1", - "llama-stack-client>=0.1.1", + "llama-models>=0.1.2", + "llama-stack-client>=0.1.2", "prompt-toolkit", "python-dotenv", "pydantic>=2", diff --git a/requirements.txt b/requirements.txt index 157c68820..497feb764 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ annotated-types==0.7.0 anyio==4.8.0 blobfile==3.0.0 certifi==2025.1.31 +chardet==5.2.0 charset-normalizer==3.4.1 click==8.1.8 colorama==0.4.6 ; sys_platform == 'win32' @@ -18,8 +19,8 @@ httpx==0.28.1 huggingface-hub==0.28.1 idna==3.10 jinja2==3.1.5 -llama-models==0.1.1 -llama-stack-client==0.1.1 +llama-models==0.1.2 
+llama-stack-client==0.1.2 lxml==5.3.0 markdown-it-py==3.0.0 markupsafe==3.0.2 @@ -34,6 +35,7 @@ pycryptodomex==3.21.0 pydantic==2.10.6 pydantic-core==2.27.2 pygments==2.19.1 +pypdf==5.2.0 python-dateutil==2.9.0.post0 python-dotenv==1.0.1 pytz==2025.1 diff --git a/tests/client-sdk/README.md b/tests/client-sdk/README.md index 13142d46f..d4d439d96 100644 --- a/tests/client-sdk/README.md +++ b/tests/client-sdk/README.md @@ -4,18 +4,18 @@ You can run llama stack integration tests on either a Llama Stack Library or a L To test on a Llama Stack library with certain configuration, run ```bash LLAMA_STACK_CONFIG=./llama_stack/templates/cerebras/run.yaml -pytest -s -v tests/client-sdk/inference/test_inference.py +pytest -s -v tests/client-sdk/inference/ ``` or just the template name ```bash LLAMA_STACK_CONFIG=together -pytest -s -v tests/client-sdk/inference/test_inference.py +pytest -s -v tests/client-sdk/inference/ ``` To test on a Llama Stack endpoint, run ```bash LLAMA_STACK_BASE_URL=http//localhost:8089 -pytest -s -v tests/client-sdk/inference/test_inference.py +pytest -s -v tests/client-sdk/inference ``` ## Report Generation diff --git a/tests/client-sdk/agents/test_agents.py b/tests/client-sdk/agents/test_agents.py index 2b1db7df0..85b7af831 100644 --- a/tests/client-sdk/agents/test_agents.py +++ b/tests/client-sdk/agents/test_agents.py @@ -263,12 +263,14 @@ def test_custom_tool(llama_stack_client, agent_config): assert "CustomTool" in logs_str -def test_override_system_message_behavior(llama_stack_client, agent_config): +# TODO: fix this flaky test +def xtest_override_system_message_behavior(llama_stack_client, agent_config): client_tool = TestClientTool() agent_config = { **agent_config, "instructions": "You are a pirate", "client_tools": [client_tool.get_tool_definition()], + "model": "meta-llama/Llama-3.2-3B-Instruct", } agent = Agent(llama_stack_client, agent_config, client_tools=(client_tool,)) diff --git a/tests/client-sdk/inference/test_inference.py b/tests/client-sdk/inference/test_text_inference.py similarity index 72% rename from tests/client-sdk/inference/test_inference.py rename to tests/client-sdk/inference/test_text_inference.py index 9bbd1061a..81b476218 100644 --- a/tests/client-sdk/inference/test_inference.py +++ b/tests/client-sdk/inference/test_text_inference.py @@ -4,9 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-import base64 -import pathlib - import pytest from pydantic import BaseModel @@ -14,6 +11,7 @@ PROVIDER_TOOL_PROMPT_FORMAT = { "remote::ollama": "json", "remote::together": "json", "remote::fireworks": "json", + "remote::vllm": "json", } PROVIDER_LOGPROBS_TOP_K = set( @@ -56,23 +54,6 @@ def get_weather_tool_definition(): } -@pytest.fixture -def image_path(): - return pathlib.Path(__file__).parent / "dog.png" - - -@pytest.fixture -def base64_image_data(image_path): - # Convert the image to base64 - return base64.b64encode(image_path.read_bytes()).decode("utf-8") - - -@pytest.fixture -def base64_image_url(base64_image_data, image_path): - # suffix includes the ., so we remove it - return f"data:image/{image_path.suffix[1:]};base64,{base64_image_data}" - - def test_text_completion_non_streaming(llama_stack_client, text_model_id): response = llama_stack_client.inference.completion( content="Complete the sentence using one word: Roses are red, violets are ", @@ -176,8 +157,8 @@ def test_text_completion_structured_output(llama_stack_client, text_model_id, in @pytest.mark.parametrize( "question,expected", [ - ("What are the names of planets in our solar system?", "Earth"), - ("What are the names of the planets that have rings around them?", "Saturn"), + ("Which planet do humans live on?", "Earth"), + ("Which planet has rings around it with a name starting with letter S?", "Saturn"), ], ) def test_text_chat_completion_non_streaming(llama_stack_client, text_model_id, question, expected): @@ -299,101 +280,3 @@ def test_text_chat_completion_structured_output(llama_stack_client, text_model_i assert answer.last_name == "Jordan" assert answer.year_of_birth == 1963 assert answer.num_seasons_in_nba == 15 - - -def test_image_chat_completion_non_streaming(llama_stack_client, vision_model_id): - message = { - "role": "user", - "content": [ - { - "type": "image", - "image": { - "url": { - # TODO: Replace with Github based URI to resources/sample1.jpg - "uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg" - }, - }, - }, - { - "type": "text", - "text": "Describe what is in this image.", - }, - ], - } - response = llama_stack_client.inference.chat_completion( - model_id=vision_model_id, - messages=[message], - stream=False, - ) - message_content = response.completion_message.content.lower().strip() - assert len(message_content) > 0 - assert any(expected in message_content for expected in {"dog", "puppy", "pup"}) - - -def test_image_chat_completion_streaming(llama_stack_client, vision_model_id): - message = { - "role": "user", - "content": [ - { - "type": "image", - "image": { - "url": { - # TODO: Replace with Github based URI to resources/sample1.jpg - "uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg" - }, - }, - }, - { - "type": "text", - "text": "Describe what is in this image.", - }, - ], - } - response = llama_stack_client.inference.chat_completion( - model_id=vision_model_id, - messages=[message], - stream=True, - ) - streamed_content = "" - for chunk in response: - streamed_content += chunk.event.delta.text.lower() - assert len(streamed_content) > 0 - assert any(expected in streamed_content for expected in {"dog", "puppy", "pup"}) - - -@pytest.mark.parametrize("type_", ["url", "data"]) -def test_image_chat_completion_base64(llama_stack_client, vision_model_id, base64_image_data, base64_image_url, type_): - image_spec = { - "url": { - "type": "image", - "image": { - "url": { - "uri": 
base64_image_url, - }, - }, - }, - "data": { - "type": "image", - "image": { - "data": base64_image_data, - }, - }, - }[type_] - - message = { - "role": "user", - "content": [ - image_spec, - { - "type": "text", - "text": "Describe what is in this image.", - }, - ], - } - response = llama_stack_client.inference.chat_completion( - model_id=vision_model_id, - messages=[message], - stream=False, - ) - message_content = response.completion_message.content.lower().strip() - assert len(message_content) > 0 diff --git a/tests/client-sdk/inference/test_vision_inference.py b/tests/client-sdk/inference/test_vision_inference.py new file mode 100644 index 000000000..df4b9d933 --- /dev/null +++ b/tests/client-sdk/inference/test_vision_inference.py @@ -0,0 +1,133 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import base64 +import pathlib + +import pytest + + +@pytest.fixture(scope="session") +def inference_provider_type(llama_stack_client): + providers = llama_stack_client.providers.list() + inference_providers = [p for p in providers if p.api == "inference"] + assert len(inference_providers) > 0, "No inference providers found" + return inference_providers[0].provider_type + + +@pytest.fixture +def image_path(): + return pathlib.Path(__file__).parent / "dog.png" + + +@pytest.fixture +def base64_image_data(image_path): + # Convert the image to base64 + return base64.b64encode(image_path.read_bytes()).decode("utf-8") + + +@pytest.fixture +def base64_image_url(base64_image_data, image_path): + # suffix includes the ., so we remove it + return f"data:image/{image_path.suffix[1:]};base64,{base64_image_data}" + + +def test_image_chat_completion_non_streaming(llama_stack_client, vision_model_id): + message = { + "role": "user", + "content": [ + { + "type": "image", + "image": { + "url": { + # TODO: Replace with Github based URI to resources/sample1.jpg + "uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg" + }, + }, + }, + { + "type": "text", + "text": "Describe what is in this image.", + }, + ], + } + response = llama_stack_client.inference.chat_completion( + model_id=vision_model_id, + messages=[message], + stream=False, + ) + message_content = response.completion_message.content.lower().strip() + assert len(message_content) > 0 + assert any(expected in message_content for expected in {"dog", "puppy", "pup"}) + + +def test_image_chat_completion_streaming(llama_stack_client, vision_model_id): + message = { + "role": "user", + "content": [ + { + "type": "image", + "image": { + "url": { + # TODO: Replace with Github based URI to resources/sample1.jpg + "uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg" + }, + }, + }, + { + "type": "text", + "text": "Describe what is in this image.", + }, + ], + } + response = llama_stack_client.inference.chat_completion( + model_id=vision_model_id, + messages=[message], + stream=True, + ) + streamed_content = "" + for chunk in response: + streamed_content += chunk.event.delta.text.lower() + assert len(streamed_content) > 0 + assert any(expected in streamed_content for expected in {"dog", "puppy", "pup"}) + + +@pytest.mark.parametrize("type_", ["url", "data"]) +def test_image_chat_completion_base64(llama_stack_client, vision_model_id, base64_image_data, base64_image_url, type_): + image_spec = { + 
"url": { + "type": "image", + "image": { + "url": { + "uri": base64_image_url, + }, + }, + }, + "data": { + "type": "image", + "image": { + "data": base64_image_data, + }, + }, + }[type_] + + message = { + "role": "user", + "content": [ + image_spec, + { + "type": "text", + "text": "Describe what is in this image.", + }, + ], + } + response = llama_stack_client.inference.chat_completion( + model_id=vision_model_id, + messages=[message], + stream=False, + ) + message_content = response.completion_message.content.lower().strip() + assert len(message_content) > 0 diff --git a/uv.lock b/uv.lock index f492872bc..087396eea 100644 --- a/uv.lock +++ b/uv.lock @@ -687,7 +687,7 @@ wheels = [ [[package]] name = "llama-models" -version = "0.1.1" +version = "0.1.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jinja2" }, @@ -696,14 +696,14 @@ dependencies = [ { name = "pyyaml" }, { name = "tiktoken" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/df/80/4a4595cf5e55f71c0e15b85ff2f4c04b0742bf664ede062a09c9d383bf7b/llama_models-0.1.1.tar.gz", hash = "sha256:7cb5a9fe38485b47aff4c93e183d6d390a676a7619f3355502576b652f17733a", size = 1608412 } +sdist = { url = "https://files.pythonhosted.org/packages/b5/f2/ed8310d4677cd38ab45ffba45aea2a4e9882b640045ad9c3198ac69e5a85/llama_models-0.1.2.tar.gz", hash = "sha256:1266eaec7a8db336e4ed034d2b494189ccb7fd6d6b7aefe874eee749a4340b9b", size = 1608069 } wheels = [ - { url = "https://files.pythonhosted.org/packages/d9/93/d49dd0f0cd37df1a7a7fb25444d010f626cdf42b21eea11d839b0f6a808a/llama_models-0.1.1-py3-none-any.whl", hash = "sha256:7e4f15dc4f6f011852ea2c42f9770b75140f5eca670b32cc67fc0a4605c55f89", size = 1650981 }, + { url = "https://files.pythonhosted.org/packages/55/a7/34b9e88ef4109759c8881f43b8006139e3d13d54c440b8c571b253655f54/llama_models-0.1.2-py3-none-any.whl", hash = "sha256:8aa5287d1c6325698991ff677e71148cac347e07493bb5b3ab891e614b89e1f8", size = 1651273 }, ] [[package]] name = "llama-stack" -version = "0.1.1" +version = "0.1.2" source = { editable = "." 
} dependencies = [ { name = "blobfile" }, @@ -751,8 +751,8 @@ requires-dist = [ { name = "fire" }, { name = "httpx" }, { name = "huggingface-hub" }, - { name = "llama-models", specifier = ">=0.1.1" }, - { name = "llama-stack-client", specifier = ">=0.1.1" }, + { name = "llama-models", specifier = ">=0.1.2" }, + { name = "llama-stack-client", specifier = ">=0.1.2" }, { name = "myst-parser", marker = "extra == 'docs'" }, { name = "nbval", marker = "extra == 'dev'" }, { name = "pre-commit", marker = "extra == 'dev'" }, @@ -780,7 +780,7 @@ requires-dist = [ [[package]] name = "llama-stack-client" -version = "0.1.1" +version = "0.1.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -797,9 +797,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/07/42/7004958ac1a6da9a8060decf0d9120fdeb3b2775de090a0a473f2ee4a27d/llama_stack_client-0.1.1.tar.gz", hash = "sha256:3e549a848ade959d342fa52ec49b1913b7bb615a77b5b8dcaefe6ff94409049e", size = 179729 } +sdist = { url = "https://files.pythonhosted.org/packages/9e/75/8b41a3026c871a8650cd8d2cfda9f891a9163458813574f36518bb40afe4/llama_stack_client-0.1.2.tar.gz", hash = "sha256:94277ddae52be557d771dcdc15d85af9012b5aa87439dd69ec1dc0ff486b0c8e", size = 188023 } wheels = [ - { url = "https://files.pythonhosted.org/packages/80/66/5255c09dc001ff437fd6fe6fad27142035b60073df243f7df0494095f605/llama_stack_client-0.1.1-py3-none-any.whl", hash = "sha256:e07d58fdcc1eaa370dd00b94c2dd1a8169c0ac60c37f6f2772cbc2c5b63f2e62", size = 348665 }, + { url = "https://files.pythonhosted.org/packages/c4/32/3a3a97eecff1f1e3a1dc90e9b00681abea11ec4f43a7ca549981261e18b6/llama_stack_client-0.1.2-py3-none-any.whl", hash = "sha256:85ff0fb57a62d7d0470cfaa2b07a595c9fb3483297944d5e5a066db850d38ccd", size = 359415 }, ] [[package]]
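
For reference, a minimal standalone sketch of the image-message payload that the relocated vision tests exercise, assuming a locally running Llama Stack server. The base_url, the local image path, and the vision model id below are assumptions for illustration, not values taken from this patch; the message shape, data-URL construction, and chat_completion call mirror the added test_vision_inference.py.

# Hypothetical usage sketch; server address and model id are assumed.
import base64
import pathlib

from llama_stack_client import LlamaStackClient

# Assumed local server endpoint.
client = LlamaStackClient(base_url="http://localhost:8321")

# Any local image; the tests use a bundled dog.png next to the test file.
image_path = pathlib.Path("dog.png")
b64_data = base64.b64encode(image_path.read_bytes()).decode("utf-8")
# Same data-URL shape as the base64_image_url fixture (suffix includes the ".", so it is dropped).
data_url = f"data:image/{image_path.suffix[1:]};base64,{b64_data}"

message = {
    "role": "user",
    "content": [
        {"type": "image", "image": {"url": {"uri": data_url}}},
        {"type": "text", "text": "Describe what is in this image."},
    ],
}

response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.2-11B-Vision-Instruct",  # assumed vision-capable model id
    messages=[message],
    stream=False,
)
print(response.completion_message.content)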