diff --git a/.cursor/rules/general.mdc b/.cursor/rules/general.mdc deleted file mode 100644 index 24daef2ba..000000000 --- a/.cursor/rules/general.mdc +++ /dev/null @@ -1,9 +0,0 @@ ---- -description: General rules always applicable across the project -globs: -alwaysApply: true ---- -# Style - -- Comments must add value to code. Don't write filler comments explaining what you are doing next; they just add noise. -- Add a comment to clarify surprising behavior which would not be obvious. Good variable naming and clear code organization is more important. diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 8097d5f7c..54c01c80d 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -2,4 +2,4 @@ # These owners will be the default owners for everything in # the repo. Unless a later match takes precedence, -* @ashwinb @yanxi0830 @hardikjshah @dltn @raghotham @dineshyv @vladimirivic @sixianyi0721 @ehhuang @terrytangyuan @SLR722 +* @ashwinb @yanxi0830 @hardikjshah @dltn @raghotham @dineshyv @vladimirivic @sixianyi0721 @ehhuang @terrytangyuan @SLR722 @leseb diff --git a/.github/TRIAGERS.md b/.github/TRIAGERS.md new file mode 100644 index 000000000..d4ef6d1ac --- /dev/null +++ b/.github/TRIAGERS.md @@ -0,0 +1,2 @@ +# This file documents Triage members in the Llama Stack community +@franciscojavierarceo @leseb diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 4aba604dd..d68af5615 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -5,4 +5,19 @@ updates: - package-ecosystem: "github-actions" directory: "/" # Will use the default workflow location of `.github/workflows` schedule: - interval: "daily" + interval: "weekly" + day: "saturday" + commit-message: + prefix: chore(github-deps) + - package-ecosystem: "uv" + directory: "/" + schedule: + interval: "weekly" + day: "saturday" + # ignore all non-security updates: https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file#open-pull-requests-limit + open-pull-requests-limit: 0 + labels: + - type/dependencies + - python + commit-message: + prefix: chore(python-deps) diff --git a/.github/workflows/changelog.yml b/.github/workflows/changelog.yml new file mode 100644 index 000000000..5b63e231c --- /dev/null +++ b/.github/workflows/changelog.yml @@ -0,0 +1,29 @@ +name: Update Changelog + +on: + release: + types: [published, unpublished, created, edited, deleted, released] + +permissions: + contents: read + +jobs: + generate_changelog: + name: Generate changelog + permissions: + contents: write # for peter-evans/create-pull-request to create branch + pull-requests: write # for peter-evans/create-pull-request to create a PR + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + ref: main + fetch-depth: 0 + - run: | + python ./scripts/gen-changelog.py + - uses: peter-evans/create-pull-request@v7 + with: + title: 'docs: update CHANGELOG.md for ${{ github.ref_name }}' + commit-message: 'docs: update CHANGELOG.md for ${{ github.ref_name }}' + branch: create-pull-request/changelog + signoff: true diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml new file mode 100644 index 000000000..e03c7401c --- /dev/null +++ b/.github/workflows/integration-tests.yml @@ -0,0 +1,114 @@ +name: Integration Tests + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + paths: + - 'distributions/**' + - 'llama_stack/**' + - 'tests/integration/**' + - 'uv.lock' + - 'pyproject.toml' + - 
'requirements.txt' + - '.github/workflows/integration-tests.yml' # This workflow + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + test-matrix: + runs-on: ubuntu-latest + strategy: + matrix: + # Listing tests manually since some of them currently fail + # TODO: generate matrix list from tests/integration when fixed + test-type: [agents, inference, datasets, inspect, scoring, post_training, providers] + client-type: [library, http] + fail-fast: false # we want to run all tests regardless of failure + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + python-version: "3.10" + + - name: Install Ollama + run: | + curl -fsSL https://ollama.com/install.sh | sh + + - name: Pull Ollama image + run: | + ollama pull llama3.2:3b-instruct-fp16 + + - name: Start Ollama in background + run: | + nohup ollama run llama3.2:3b-instruct-fp16 > ollama.log 2>&1 & + + - name: Set Up Environment and Install Dependencies + run: | + uv sync --extra dev --extra test + uv pip install ollama faiss-cpu + # always test against the latest version of the client + # TODO: this is not necessarily a good idea. we need to test against both published and latest + # to find out backwards compatibility issues. + uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main + uv pip install -e . + llama stack build --template ollama --image-type venv + + - name: Wait for Ollama to start + run: | + echo "Waiting for Ollama..." + for i in {1..30}; do + if curl -s http://localhost:11434 | grep -q "Ollama is running"; then + echo "Ollama is running!" + exit 0 + fi + sleep 1 + done + echo "Ollama failed to start" + ollama ps + cat ollama.log + exit 1 + + - name: Start Llama Stack server in background + if: matrix.client-type == 'http' + env: + INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" + run: | + source .venv/bin/activate + nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv > server.log 2>&1 & + + - name: Wait for Llama Stack server to be ready + if: matrix.client-type == 'http' + run: | + echo "Waiting for Llama Stack server..." + for i in {1..30}; do + if curl -s http://localhost:8321/v1/health | grep -q "OK"; then + echo "Llama Stack server is up!" 
+ exit 0 + fi + sleep 1 + done + echo "Llama Stack server failed to start" + cat server.log + exit 1 + + - name: Run Integration Tests + env: + INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" + run: | + if [ "${{ matrix.client-type }}" == "library" ]; then + stack_config="ollama" + else + stack_config="http://localhost:8321" + fi + uv run pytest -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \ + -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \ + --text-model="meta-llama/Llama-3.2-3B-Instruct" \ + --embedding-model=all-MiniLM-L6-v2 diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 046387ab9..f36453933 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -5,6 +5,10 @@ on: push: branches: [main] +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: pre-commit: runs-on: ubuntu-latest diff --git a/.github/workflows/providers-build.yml b/.github/workflows/providers-build.yml new file mode 100644 index 000000000..18894a768 --- /dev/null +++ b/.github/workflows/providers-build.yml @@ -0,0 +1,83 @@ +name: Test Llama Stack Build + +on: + push: + branches: + - main + paths: + - 'llama_stack/cli/stack/build.py' + - 'llama_stack/cli/stack/_build.py' + - 'llama_stack/distribution/build.*' + - 'llama_stack/distribution/*.sh' + - '.github/workflows/providers-build.yml' + pull_request: + paths: + - 'llama_stack/cli/stack/build.py' + - 'llama_stack/cli/stack/_build.py' + - 'llama_stack/distribution/build.*' + - 'llama_stack/distribution/*.sh' + - '.github/workflows/providers-build.yml' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + generate-matrix: + runs-on: ubuntu-latest + outputs: + templates: ${{ steps.set-matrix.outputs.templates }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Generate Template List + id: set-matrix + run: | + templates=$(ls llama_stack/templates/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]') + echo "templates=$templates" >> "$GITHUB_OUTPUT" + + build: + needs: generate-matrix + runs-on: ubuntu-latest + strategy: + matrix: + template: ${{ fromJson(needs.generate-matrix.outputs.templates) }} + image-type: [venv, container] + fail-fast: false # We want to run all jobs even if some fail + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + python-version: "3.10" + + - name: Install LlamaStack + run: | + uv venv + source .venv/bin/activate + uv pip install -e . + + - name: Print build dependencies + run: | + uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only + + - name: Run Llama Stack Build + run: | + # USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead + # LLAMA_STACK_DIR is set to the current directory so we are building from the source + USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. 
uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test + + - name: Print dependencies in the image + if: matrix.image-type == 'venv' + run: | + source test/bin/activate + uv pip list diff --git a/.github/workflows/semantic-pr.yml b/.github/workflows/semantic-pr.yml index 460acf237..ac75f9064 100644 --- a/.github/workflows/semantic-pr.yml +++ b/.github/workflows/semantic-pr.yml @@ -8,6 +8,10 @@ on: - reopened - synchronize +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + permissions: contents: read diff --git a/.github/workflows/stale_bot.yml b/.github/workflows/stale_bot.yml new file mode 100644 index 000000000..2039fcbb4 --- /dev/null +++ b/.github/workflows/stale_bot.yml @@ -0,0 +1,45 @@ +name: Close stale issues and PRs + +on: + schedule: + - cron: '0 0 * * *' # every day at midnight + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + stale: + permissions: + issues: write + pull-requests: write + runs-on: ubuntu-latest + steps: + - name: Stale Action + uses: actions/stale@v9 + with: + stale-issue-label: 'stale' + stale-issue-message: > + This issue has been automatically marked as stale because it has not had activity within 60 days. + It will be automatically closed if no further activity occurs within 30 days. + close-issue-message: > + This issue has been automatically closed due to inactivity. + Please feel free to reopen if you feel it is still relevant! + days-before-issue-stale: 60 + days-before-issue-close: 30 + stale-pr-label: 'stale' + stale-pr-message: > + This pull request has been automatically marked as stale because it has not had activity within 60 days. + It will be automatically closed if no further activity occurs within 30 days. + close-pr-message: > + This pull request has been automatically closed due to inactivity. + Please feel free to reopen if you intend to continue working on it! + days-before-pr-stale: 60 + days-before-pr-close: 30 + operations-per-run: 300 diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 48658047f..49aafca79 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -1,36 +1,59 @@ name: Unit Tests on: + push: + branches: [ main ] pull_request: branches: [ main ] + paths: + - 'distributions/**' + - 'llama_stack/**' + - 'tests/unit/**' + - 'uv.lock' + - 'pyproject.toml' + - 'requirements.txt' + - '.github/workflows/unit-tests.yml' # This workflow workflow_dispatch: +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: unit-tests: runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python: + - "3.10" + - "3.11" + - "3.12" + - "3.13" steps: - uses: actions/checkout@v4 - - name: Set up Python + - name: Set up Python ${{ matrix.python }} uses: actions/setup-python@v5 with: - python-version: '3.10.16' + python-version: ${{ matrix.python }} - uses: astral-sh/setup-uv@v5 with: - python-version: '3.10.16' + python-version: ${{ matrix.python }} enable-cache: false - name: Run unit tests run: | - uv run -p 3.10.16 --with-editable . 
--with-editable ".[dev]" --with-editable ".[unit]" pytest --cov=llama_stack -s -v tests/unit/ --junitxml=pytest-report.xml + PYTHON_VERSION=${{ matrix.python }} ./scripts/unit-tests.sh --cov=llama_stack --junitxml=pytest-report-${{ matrix.python }}.xml --cov-report=html:htmlcov-${{ matrix.python }} - name: Upload test results if: always() uses: actions/upload-artifact@v4 with: - name: test-results + name: test-results-${{ matrix.python }} path: | .pytest_cache/ - pytest-report.xml + pytest-report-${{ matrix.python }}.xml + htmlcov-${{ matrix.python }}/ retention-days: 7 diff --git a/.github/workflows/update-readthedocs.yml b/.github/workflows/update-readthedocs.yml index e8f14dbba..561a001ef 100644 --- a/.github/workflows/update-readthedocs.yml +++ b/.github/workflows/update-readthedocs.yml @@ -22,6 +22,10 @@ on: - 'pyproject.toml' - '.github/workflows/update-readthedocs.yml' +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: update-readthedocs: runs-on: ubuntu-latest diff --git a/.gitignore b/.gitignore index 1b15107f3..0ef25cdf1 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,4 @@ pyrightconfig.json venv/ pytest-report.xml .coverage +.python-version diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 926ae21cc..ff3bc1250 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,6 +8,7 @@ repos: rev: v5.0.0 # Latest stable version hooks: - id: check-merge-conflict + args: ['--assume-in-merge'] - id: trailing-whitespace exclude: '\.py$' # Exclude Python files as Ruff already handles them - id: check-added-large-files @@ -76,12 +77,24 @@ repos: name: Distribution Template Codegen additional_dependencies: - uv==0.6.0 - entry: uv run --extra codegen python -m llama_stack.scripts.distro_codegen + entry: uv run --extra codegen ./scripts/distro_codegen.py language: python pass_filenames: false require_serial: true files: ^llama_stack/templates/.*$|^llama_stack/providers/.*/inference/.*/models\.py$ +- repo: local + hooks: + - id: openapi-codegen + name: API Spec Codegen + additional_dependencies: + - uv==0.6.2 + entry: sh -c 'uv run --with ".[dev]" ./docs/openapi_generator/run_openapi_generator.sh > /dev/null' + language: python + pass_filenames: false + require_serial: true + files: ^llama_stack/apis/|^docs/openapi_generator/ + ci: autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks autoupdate_commit_msg: ⬆ [pre-commit.ci] pre-commit autoupdate diff --git a/.python-version b/.python-version deleted file mode 100644 index c8cfe3959..000000000 --- a/.python-version +++ /dev/null @@ -1 +0,0 @@ -3.10 diff --git a/CHANGELOG.md b/CHANGELOG.md index 62862ebdc..953d04def 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,76 @@ # Changelog +# v0.1.8 +Published on: 2025-03-24T01:28:50Z + +# v0.1.8 Release Notes + +### Build and Test Agents +* Safety: Integrated NVIDIA as a safety provider. +* VectorDB: Added Qdrant as an inline provider. +* Agents: Added support for multiple tool groups in agents. +* Agents: Simplified imports for Agents in client package + + +### Agent Evals and Model Customization +* Introduced DocVQA and IfEval benchmarks. + +### Deploying and Monitoring Agents +* Introduced a Containerfile and image workflow for the Playground. +* Implemented support for Bearer (API Key) authentication. +* Added attribute-based access control for resources. 
+* Fixes on docker deployments: use --pull always and standardized the default port to 8321 +* Deprecated: /v1/inspect/providers use /v1/providers/ instead + +### Better Engineering +* Consolidated scripts under the ./scripts directory. +* Addressed mypy violations in various modules. +* Added Dependabot scans for Python dependencies. +* Implemented a scheduled workflow to update the changelog automatically. +* Enforced concurrency to reduce CI loads. + + +### New Contributors +* @cmodi-meta made their first contribution in https://github.com/meta-llama/llama-stack/pull/1650 +* @jeffmaury made their first contribution in https://github.com/meta-llama/llama-stack/pull/1671 +* @derekhiggins made their first contribution in https://github.com/meta-llama/llama-stack/pull/1698 +* @Bobbins228 made their first contribution in https://github.com/meta-llama/llama-stack/pull/1745 + +**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.7...v0.1.8 + +--- + +# v0.1.7 +Published on: 2025-03-14T22:30:51Z + +## 0.1.7 Release Notes + +### Build and Test Agents +* Inference: ImageType is now refactored to LlamaStackImageType +* Inference: Added tests to measure TTFT +* Inference: Bring back usage metrics +* Agents: Added endpoint for get agent, list agents and list sessions +* Agents: Automated conversion of type hints in client tool for lite llm format +* Agents: Deprecated ToolResponseMessage in agent.resume API +* Added Provider API for listing and inspecting provider info + +### Agent Evals and Model Customization +* Eval: Added new eval benchmarks Math 500 and BFCL v3 +* Deploy and Monitoring of Agents +* Telemetry: Fix tracing to work across coroutines + +### Better Engineering +* Display code coverage for unit tests +* Updated call sites (inference, tool calls, agents) to move to async non blocking calls +* Unit tests also run on Python 3.11, 3.12, and 3.13 +* Added ollama inference to Integration tests CI +* Improved documentation across examples, testing, CLI, updated providers table ) + + + + +--- + # v0.1.6 Published on: 2025-03-08T04:35:08Z diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e639328f0..5828250d0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -61,6 +61,7 @@ outlined on that page and do not file a public issue. We use [uv](https://github.com/astral-sh/uv) to manage python dependencies and virtual environments. You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting-started/installation/). + You can install the dependencies by running: ```bash @@ -70,17 +71,24 @@ uv pip install -e . source .venv/bin/activate ``` +> [!NOTE] +> You can pin a specific version of Python to use for `uv` by adding a `.python-version` file in the root project directory. +> Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`. +> For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/). 
+ Note that you can create a dotenv file `.env` that includes necessary environment variables: ``` LLAMA_STACK_BASE_URL=http://localhost:8321 LLAMA_STACK_CLIENT_LOG=debug LLAMA_STACK_PORT=8321 -LLAMA_STACK_CONFIG= +LLAMA_STACK_CONFIG= +TAVILY_SEARCH_API_KEY= +BRAVE_SEARCH_API_KEY= ``` And then use this dotenv file when running client SDK tests via the following: ```bash -uv run --env-file .env -- pytest -v tests/api/inference/test_text_inference.py +uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py --text-model=meta-llama/Llama-3.1-8B-Instruct ``` ## Pre-commit Hooks @@ -102,6 +110,26 @@ uv run pre-commit run --all-files > [!CAUTION] > Before pushing your changes, make sure that the pre-commit hooks have passed successfully. +## Running unit tests + +You can run the unit tests by running: + +```bash +source .venv/bin/activate +./scripts/unit-tests.sh +``` + +If you'd like to run against a non-default version of Python (currently 3.10), pass the `PYTHON_VERSION` variable as follows: + +``` +source .venv/bin/activate +PYTHON_VERSION=3.13 ./scripts/unit-tests.sh +``` + +## Running integration tests + +You can run integration tests following the instructions [here](tests/integration/README.md). + ## Adding a new dependency to the project To add a new dependency to the project, you can use the `uv` command. For example, to add `foo` to the project, you can run: @@ -113,9 +141,11 @@ uv sync ## Coding Style +* Comments should provide meaningful insights into the code. Avoid filler comments that simply describe the next step, as they create unnecessary clutter; the same goes for docstrings. +* Prefer comments to clarify surprising behavior and/or relationships between parts of the code rather than explain what the next line of code does. +* When catching exceptions, prefer using a specific exception type rather than a broad catch-all like `Exception`. +* Error messages should be prefixed with "Failed to ..." * 4 spaces for indentation rather than tabs -* 80 character line length -* ... ## Common Tasks @@ -137,14 +167,14 @@ LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama ### Updating Provider Configurations -If you have made changes to a provider's configuration in any form (introducing a new config key, or changing models, etc.), you should run `python llama_stack/scripts/distro_codegen.py` to re-generate various YAML files as well as the documentation. You should not change `docs/source/.../distributions/` files manually as they are auto-generated. +If you have made changes to a provider's configuration in any form (introducing a new config key, or changing models, etc.), you should run `./scripts/distro_codegen.py` to re-generate various YAML files as well as the documentation. You should not change `docs/source/.../distributions/` files manually as they are auto-generated. ### Building the Documentation If you are making changes to the documentation at [https://llama-stack.readthedocs.io/en/latest/](https://llama-stack.readthedocs.io/en/latest/), you can use the following command to build the documentation and preview your changes. You will need [Sphinx](https://www.sphinx-doc.org/en/master/) and the readthedocs theme. ```bash -cd llama-stack/docs +cd docs uv sync --extra docs # This rebuilds the documentation pages. @@ -159,8 +189,7 @@ uv run sphinx-autobuild source build/html --write-all If you modify or add new API endpoints, update the API documentation accordingly. 
You can do this by running the following command: ```bash -uv sync --extra dev -uv run ./docs/openapi_generator/run_openapi_generator.sh +uv run --with ".[dev]" ./docs/openapi_generator/run_openapi_generator.sh ``` The generated API documentation will be available in `docs/_static/`. Make sure to review the changes before committing. diff --git a/MANIFEST.in b/MANIFEST.in index 572a9ac0a..5aa699e65 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,5 @@ include pyproject.toml -include distributions/dependencies.json +include llama_stack/templates/dependencies.json include llama_stack/models/llama/llama3/tokenizer.model include llama_stack/distribution/*.sh include llama_stack/cli/scripts/*.sh diff --git a/README.md b/README.md index b24e69514..918433d51 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ [![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-stack)](https://pypi.org/project/llama-stack/) [![License](https://img.shields.io/pypi/l/llama_stack.svg)](https://github.com/meta-llama/llama-stack/blob/main/LICENSE) [![Discord](https://img.shields.io/discord/1257833999603335178)](https://discord.gg/llama-stack) +[![Unit Tests](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain) +[![Integration Tests](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain) [**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) @@ -50,6 +52,10 @@ Here is a list of the various API providers and available distributions that can | PG Vector | Single Node | | | ✅ | | | | PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | | | vLLM | Hosted and Single Node | | ✅ | | | | +| OpenAI | Hosted | | ✅ | | | | +| Anthropic | Hosted | | ✅ | | | | +| Gemini | Hosted | | ✅ | | | | + ### Distributions @@ -67,26 +73,6 @@ A Llama Stack Distribution (or "distro") is a pre-configured bundle of provider | Fireworks | [llamastack/distribution-fireworks](https://hub.docker.com/repository/docker/llamastack/distribution-fireworks/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/fireworks.html) | | vLLM | [llamastack/distribution-remote-vllm](https://hub.docker.com/repository/docker/llamastack/distribution-remote-vllm/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/remote-vllm.html) | -### Installation - -You have two ways to install this repository: - -* **Install as a package**: - You can install the repository directly from [PyPI](https://pypi.org/project/llama-stack/) by running the following command: - ```bash - pip install llama-stack - ``` - -* **Install from source**: - If you prefer to install from the source code, we recommend using [uv](https://github.com/astral-sh/uv). - Then, run the following commands: - ```bash - git clone git@github.com:meta-llama/llama-stack.git - cd llama-stack - - uv sync - uv pip install -e . 
- ``` ### Documentation diff --git a/distributions/bedrock/build.yaml b/distributions/bedrock/build.yaml deleted file mode 120000 index 72402ef8d..000000000 --- a/distributions/bedrock/build.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/bedrock/build.yaml \ No newline at end of file diff --git a/distributions/bedrock/compose.yaml b/distributions/bedrock/compose.yaml deleted file mode 100644 index 055b92c67..000000000 --- a/distributions/bedrock/compose.yaml +++ /dev/null @@ -1,15 +0,0 @@ -services: - llamastack: - image: distribution-bedrock - volumes: - - ~/.llama:/root/.llama - - ./run.yaml:/root/llamastack-run-bedrock.yaml - ports: - - "8321:8321" - entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-bedrock.yaml" - deploy: - restart_policy: - condition: on-failure - delay: 3s - max_attempts: 5 - window: 60s diff --git a/distributions/bedrock/run.yaml b/distributions/bedrock/run.yaml deleted file mode 120000 index f38abfc4e..000000000 --- a/distributions/bedrock/run.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/bedrock/run.yaml \ No newline at end of file diff --git a/distributions/cerebras/build.yaml b/distributions/cerebras/build.yaml deleted file mode 120000 index bccbbcf60..000000000 --- a/distributions/cerebras/build.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/cerebras/build.yaml \ No newline at end of file diff --git a/distributions/cerebras/compose.yaml b/distributions/cerebras/compose.yaml deleted file mode 100644 index 8dc09a865..000000000 --- a/distributions/cerebras/compose.yaml +++ /dev/null @@ -1,16 +0,0 @@ -services: - llamastack: - image: llamastack/distribution-cerebras - network_mode: "host" - volumes: - - ~/.llama:/root/.llama - - ./run.yaml:/root/llamastack-run-cerebras.yaml - ports: - - "8321:8321" - entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-cerebras.yaml" - deploy: - restart_policy: - condition: on-failure - delay: 3s - max_attempts: 5 - window: 60s diff --git a/distributions/cerebras/run.yaml b/distributions/cerebras/run.yaml deleted file mode 120000 index 9f9d20b4b..000000000 --- a/distributions/cerebras/run.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/cerebras/run.yaml \ No newline at end of file diff --git a/distributions/dell-tgi/compose.yaml b/distributions/dell-tgi/compose.yaml deleted file mode 100644 index d26636cbd..000000000 --- a/distributions/dell-tgi/compose.yaml +++ /dev/null @@ -1,50 +0,0 @@ -services: - text-generation-inference: - image: registry.dell.huggingface.co/enterprise-dell-inference-meta-llama-meta-llama-3.1-8b-instruct - network_mode: "host" - volumes: - - $HOME/.cache/huggingface:/data - ports: - - "5009:5009" - devices: - - nvidia.com/gpu=all - environment: - - CUDA_VISIBLE_DEVICES=0,1,2,3,4 - - NUM_SHARD=4 - - MAX_BATCH_PREFILL_TOKENS=32768 - - MAX_INPUT_TOKENS=8000 - - MAX_TOTAL_TOKENS=8192 - command: [] - deploy: - resources: - reservations: - devices: - - driver: nvidia - # that's the closest analogue to --gpus; provide - # an integer amount of devices or 'all' - count: all - # Devices are reserved using a list of capabilities, making - # capabilities the only required field. A device MUST - # satisfy all the requested capabilities for a successful - # reservation. 
- capabilities: [gpu] - runtime: nvidia - llamastack: - depends_on: - text-generation-inference: - condition: service_healthy - image: llamastack/distribution-tgi - network_mode: "host" - volumes: - - ~/.llama:/root/.llama - # Link to TGI run.yaml file - - ./run.yaml:/root/my-run.yaml - ports: - - "8321:8321" - # Hack: wait for TGI server to start before starting docker - entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml" - restart_policy: - condition: on-failure - delay: 3s - max_attempts: 5 - window: 60s diff --git a/distributions/dell-tgi/run.yaml b/distributions/dell-tgi/run.yaml deleted file mode 100644 index cd6ddcfdf..000000000 --- a/distributions/dell-tgi/run.yaml +++ /dev/null @@ -1,44 +0,0 @@ -version: '2' -image_name: local -container_image: null -conda_env: local -apis: -- shields -- agents -- models -- memory -- memory_banks -- inference -- safety -providers: - inference: - - provider_id: tgi0 - provider_type: remote::tgi - config: - url: http://127.0.0.1:80 - safety: - - provider_id: meta0 - provider_type: inline::llama-guard - config: - model: Llama-Guard-3-1B - excluded_categories: [] - - provider_id: meta1 - provider_type: inline::prompt-guard - config: - model: Prompt-Guard-86M - memory: - - provider_id: meta0 - provider_type: inline::faiss - config: {} - agents: - - provider_id: meta0 - provider_type: inline::meta-reference - config: - persistence_store: - namespace: null - type: sqlite - db_path: ~/.llama/runtime/kvstore.db - telemetry: - - provider_id: meta0 - provider_type: inline::meta-reference - config: {} diff --git a/distributions/fireworks/build.yaml b/distributions/fireworks/build.yaml deleted file mode 120000 index 32a5bd869..000000000 --- a/distributions/fireworks/build.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/fireworks/build.yaml \ No newline at end of file diff --git a/distributions/fireworks/compose.yaml b/distributions/fireworks/compose.yaml deleted file mode 100644 index 84b8491e4..000000000 --- a/distributions/fireworks/compose.yaml +++ /dev/null @@ -1,14 +0,0 @@ -services: - llamastack: - image: llamastack/distribution-fireworks - ports: - - "8321:8321" - environment: - - FIREWORKS_API_KEY=${FIREWORKS_API_KEY} - entrypoint: bash -c "python -m llama_stack.distribution.server.server --template fireworks" - deploy: - restart_policy: - condition: on-failure - delay: 3s - max_attempts: 5 - window: 60s diff --git a/distributions/fireworks/run.yaml b/distributions/fireworks/run.yaml deleted file mode 120000 index 532e0e2a8..000000000 --- a/distributions/fireworks/run.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/fireworks/run.yaml \ No newline at end of file diff --git a/distributions/meta-reference-gpu/build.yaml b/distributions/meta-reference-gpu/build.yaml deleted file mode 120000 index 4418195eb..000000000 --- a/distributions/meta-reference-gpu/build.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/meta-reference-gpu/build.yaml \ No newline at end of file diff --git a/distributions/meta-reference-gpu/compose.yaml b/distributions/meta-reference-gpu/compose.yaml deleted file mode 100644 index d977e92ea..000000000 --- a/distributions/meta-reference-gpu/compose.yaml +++ /dev/null @@ -1,34 +0,0 @@ -services: - llamastack: - image: llamastack/distribution-meta-reference-gpu - network_mode: "host" - volumes: - - ~/.llama:/root/.llama - - ./run.yaml:/root/my-run.yaml - ports: - - "8321:8321" - devices: - - nvidia.com/gpu=all - environment: - - 
CUDA_VISIBLE_DEVICES=0 - command: [] - deploy: - resources: - reservations: - devices: - - driver: nvidia - # that's the closest analogue to --gpus; provide - # an integer amount of devices or 'all' - count: 1 - # Devices are reserved using a list of capabilities, making - # capabilities the only required field. A device MUST - # satisfy all the requested capabilities for a successful - # reservation. - capabilities: [gpu] - restart_policy: - condition: on-failure - delay: 3s - max_attempts: 5 - window: 60s - runtime: nvidia - entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml" diff --git a/distributions/meta-reference-gpu/run-with-safety.yaml b/distributions/meta-reference-gpu/run-with-safety.yaml deleted file mode 120000 index 4c5483425..000000000 --- a/distributions/meta-reference-gpu/run-with-safety.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/meta-reference-gpu/run-with-safety.yaml \ No newline at end of file diff --git a/distributions/meta-reference-gpu/run.yaml b/distributions/meta-reference-gpu/run.yaml deleted file mode 120000 index d680186ab..000000000 --- a/distributions/meta-reference-gpu/run.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/meta-reference-gpu/run.yaml \ No newline at end of file diff --git a/distributions/meta-reference-quantized-gpu/build.yaml b/distributions/meta-reference-quantized-gpu/build.yaml deleted file mode 120000 index f3dbe996f..000000000 --- a/distributions/meta-reference-quantized-gpu/build.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/meta-reference-quantized-gpu/build.yaml \ No newline at end of file diff --git a/distributions/meta-reference-quantized-gpu/compose.yaml b/distributions/meta-reference-quantized-gpu/compose.yaml deleted file mode 100644 index 98e943dce..000000000 --- a/distributions/meta-reference-quantized-gpu/compose.yaml +++ /dev/null @@ -1,35 +0,0 @@ -services: - llamastack: - image: llamastack/distribution-meta-reference-quantized-gpu - network_mode: "host" - volumes: - - ~/.llama:/root/.llama - - ./run.yaml:/root/my-run.yaml - ports: - - "8321:8321" - devices: - - nvidia.com/gpu=all - environment: - - CUDA_VISIBLE_DEVICES=0 - command: [] - deploy: - resources: - reservations: - devices: - - driver: nvidia - # that's the closest analogue to --gpus; provide - # an integer amount of devices or 'all' - count: 1 - # Devices are reserved using a list of capabilities, making - # capabilities the only required field. A device MUST - # satisfy all the requested capabilities for a successful - # reservation. 
- capabilities: [gpu] - runtime: nvidia - entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml" - deploy: - restart_policy: - condition: on-failure - delay: 3s - max_attempts: 5 - window: 60s diff --git a/distributions/meta-reference-quantized-gpu/run.yaml b/distributions/meta-reference-quantized-gpu/run.yaml deleted file mode 100644 index eb631adaa..000000000 --- a/distributions/meta-reference-quantized-gpu/run.yaml +++ /dev/null @@ -1,58 +0,0 @@ -version: '2' -image_name: local -container_image: null -conda_env: local -apis: -- shields -- agents -- models -- memory -- memory_banks -- inference -- safety -providers: - inference: - - provider_id: meta0 - provider_type: inline::meta-reference-quantized - config: - model: Llama3.2-3B-Instruct:int4-qlora-eo8 - quantization: - type: int4 - torch_seed: null - max_seq_len: 2048 - max_batch_size: 1 - - provider_id: meta1 - provider_type: inline::meta-reference-quantized - config: - # not a quantized model ! - model: Llama-Guard-3-1B - quantization: null - torch_seed: null - max_seq_len: 2048 - max_batch_size: 1 - safety: - - provider_id: meta0 - provider_type: inline::llama-guard - config: - model: Llama-Guard-3-1B - excluded_categories: [] - - provider_id: meta1 - provider_type: inline::prompt-guard - config: - model: Prompt-Guard-86M - memory: - - provider_id: meta0 - provider_type: inline::meta-reference - config: {} - agents: - - provider_id: meta0 - provider_type: inline::meta-reference - config: - persistence_store: - namespace: null - type: sqlite - db_path: ~/.llama/runtime/kvstore.db - telemetry: - - provider_id: meta0 - provider_type: inline::meta-reference - config: {} diff --git a/distributions/ollama/build.yaml b/distributions/ollama/build.yaml deleted file mode 120000 index 8772548e0..000000000 --- a/distributions/ollama/build.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/ollama/build.yaml \ No newline at end of file diff --git a/distributions/ollama/compose.yaml b/distributions/ollama/compose.yaml deleted file mode 100644 index 176f19d6b..000000000 --- a/distributions/ollama/compose.yaml +++ /dev/null @@ -1,71 +0,0 @@ -services: - ollama: - image: ollama/ollama:latest - network_mode: ${NETWORK_MODE:-bridge} - volumes: - - ~/.ollama:/root/.ollama - ports: - - "11434:11434" - environment: - OLLAMA_DEBUG: 1 - command: [] - deploy: - resources: - limits: - memory: 8G # Set maximum memory - reservations: - memory: 8G # Set minimum memory reservation - # healthcheck: - # # ugh, no CURL in ollama image - # test: ["CMD", "curl", "-f", "http://ollama:11434"] - # interval: 10s - # timeout: 5s - # retries: 5 - - ollama-init: - image: ollama/ollama:latest - depends_on: - - ollama - # condition: service_healthy - network_mode: ${NETWORK_MODE:-bridge} - environment: - - OLLAMA_HOST=ollama - - INFERENCE_MODEL=${INFERENCE_MODEL} - - SAFETY_MODEL=${SAFETY_MODEL:-} - volumes: - - ~/.ollama:/root/.ollama - - ./pull-models.sh:/pull-models.sh - entrypoint: ["/pull-models.sh"] - - llamastack: - depends_on: - ollama: - condition: service_started - ollama-init: - condition: service_started - image: ${LLAMA_STACK_IMAGE:-llamastack/distribution-ollama} - network_mode: ${NETWORK_MODE:-bridge} - volumes: - - ~/.llama:/root/.llama - # Link to ollama run.yaml file - - ~/local/llama-stack/:/app/llama-stack-source - - ./run${SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml - ports: - - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}" - environment: - - INFERENCE_MODEL=${INFERENCE_MODEL} - - 
SAFETY_MODEL=${SAFETY_MODEL:-} - - OLLAMA_URL=http://ollama:11434 - entrypoint: > - python -m llama_stack.distribution.server.server /root/my-run.yaml \ - --port ${LLAMA_STACK_PORT:-5001} - deploy: - restart_policy: - condition: on-failure - delay: 10s - max_attempts: 3 - window: 60s -volumes: - ollama: - ollama-init: - llamastack: diff --git a/distributions/ollama/pull-models.sh b/distributions/ollama/pull-models.sh deleted file mode 100755 index fb5bf8a4a..000000000 --- a/distributions/ollama/pull-models.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/sh - -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -echo "Preloading (${INFERENCE_MODEL}, ${SAFETY_MODEL})..." -for model in ${INFERENCE_MODEL} ${SAFETY_MODEL}; do - echo "Preloading $model..." - if ! ollama run "$model"; then - echo "Failed to pull and run $model" - exit 1 - fi -done - -echo "All models pulled successfully" diff --git a/distributions/ollama/run-with-safety.yaml b/distributions/ollama/run-with-safety.yaml deleted file mode 120000 index 5695b49e7..000000000 --- a/distributions/ollama/run-with-safety.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/ollama/run-with-safety.yaml \ No newline at end of file diff --git a/distributions/ollama/run.yaml b/distributions/ollama/run.yaml deleted file mode 120000 index b008b1bf4..000000000 --- a/distributions/ollama/run.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/ollama/run.yaml \ No newline at end of file diff --git a/distributions/remote-nvidia/build.yaml b/distributions/remote-nvidia/build.yaml deleted file mode 120000 index 8903d2e57..000000000 --- a/distributions/remote-nvidia/build.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/nvidia/build.yaml \ No newline at end of file diff --git a/distributions/remote-nvidia/compose.yaml b/distributions/remote-nvidia/compose.yaml deleted file mode 100644 index ab8b4ce25..000000000 --- a/distributions/remote-nvidia/compose.yaml +++ /dev/null @@ -1,19 +0,0 @@ -services: - llamastack: - image: distribution-nvidia:dev - network_mode: "host" - volumes: - - ~/.llama:/root/.llama - - ./run.yaml:/root/llamastack-run-nvidia.yaml - ports: - - "8321:8321" - environment: - - INFERENCE_MODEL=${INFERENCE_MODEL:-Llama3.1-8B-Instruct} - - NVIDIA_API_KEY=${NVIDIA_API_KEY:-} - entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml-config /root/llamastack-run-nvidia.yaml" - deploy: - restart_policy: - condition: on-failure - delay: 3s - max_attempts: 5 - window: 60s diff --git a/distributions/remote-nvidia/run.yaml b/distributions/remote-nvidia/run.yaml deleted file mode 120000 index 85da3e26b..000000000 --- a/distributions/remote-nvidia/run.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/nvidia/run.yaml \ No newline at end of file diff --git a/distributions/remote-vllm/build.yaml b/distributions/remote-vllm/build.yaml deleted file mode 120000 index 52e5d0f2d..000000000 --- a/distributions/remote-vllm/build.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/remote-vllm/build.yaml \ No newline at end of file diff --git a/distributions/remote-vllm/compose.yaml b/distributions/remote-vllm/compose.yaml deleted file mode 100644 index c387e1049..000000000 --- a/distributions/remote-vllm/compose.yaml +++ /dev/null @@ -1,100 +0,0 @@ -services: - vllm-inference: - image: vllm/vllm-openai:latest - volumes: - - 
$HOME/.cache/huggingface:/root/.cache/huggingface - network_mode: ${NETWORK_MODE:-bridged} - ports: - - "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}" - devices: - - nvidia.com/gpu=all - environment: - - CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0} - - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN - command: > - --gpu-memory-utilization 0.75 - --model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct} - --enforce-eager - --max-model-len 8192 - --max-num-seqs 16 - --port ${VLLM_INFERENCE_PORT:-5100} - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health"] - interval: 30s - timeout: 10s - retries: 5 - deploy: - resources: - reservations: - devices: - - driver: nvidia - capabilities: [gpu] - runtime: nvidia - - # A little trick: - # if VLLM_SAFETY_MODEL is set, we will create a service for the safety model - # otherwise, the entry will end in a hyphen which gets ignored by docker compose - vllm-${VLLM_SAFETY_MODEL:+safety}: - image: vllm/vllm-openai:latest - volumes: - - $HOME/.cache/huggingface:/root/.cache/huggingface - network_mode: ${NETWORK_MODE:-bridged} - ports: - - "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}" - devices: - - nvidia.com/gpu=all - environment: - - CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1} - - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN - command: > - --gpu-memory-utilization 0.75 - --model ${VLLM_SAFETY_MODEL} - --enforce-eager - --max-model-len 8192 - --max-num-seqs 16 - --port ${VLLM_SAFETY_PORT:-5101} - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/v1/health"] - interval: 30s - timeout: 10s - retries: 5 - deploy: - resources: - reservations: - devices: - - driver: nvidia - capabilities: [gpu] - runtime: nvidia - llamastack: - depends_on: - - vllm-inference: - condition: service_healthy - - vllm-${VLLM_SAFETY_MODEL:+safety}: - condition: service_healthy - # image: llamastack/distribution-remote-vllm - image: llamastack/distribution-remote-vllm:test-0.0.52rc3 - volumes: - - ~/.llama:/root/.llama - - ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml - network_mode: ${NETWORK_MODE:-bridged} - environment: - - VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1 - - VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1 - - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct} - - MAX_TOKENS=${MAX_TOKENS:-4096} - - SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm} - - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B} - ports: - - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}" - # Hack: wait for vLLM server to start before starting docker - entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 5001" - deploy: - restart_policy: - condition: on-failure - delay: 3s - max_attempts: 5 - window: 60s -volumes: - vllm-inference: - vllm-safety: - llamastack: diff --git a/distributions/remote-vllm/run-with-safety.yaml b/distributions/remote-vllm/run-with-safety.yaml deleted file mode 120000 index b2c3c36da..000000000 --- a/distributions/remote-vllm/run-with-safety.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/remote-vllm/run-with-safety.yaml \ No newline at end of file diff --git a/distributions/remote-vllm/run.yaml b/distributions/remote-vllm/run.yaml deleted file mode 120000 index ac70c0e6a..000000000 --- a/distributions/remote-vllm/run.yaml +++ /dev/null @@ -1 +0,0 @@ 
-../../llama_stack/templates/remote-vllm/run.yaml \ No newline at end of file diff --git a/distributions/runpod/build.yaml b/distributions/runpod/build.yaml deleted file mode 100644 index 9348573ef..000000000 --- a/distributions/runpod/build.yaml +++ /dev/null @@ -1,9 +0,0 @@ -name: runpod -distribution_spec: - description: Use Runpod for running LLM inference - providers: - inference: remote::runpod - memory: meta-reference - safety: meta-reference - agents: meta-reference - telemetry: meta-reference diff --git a/distributions/sambanova/build.yaml b/distributions/sambanova/build.yaml deleted file mode 100644 index dbf013d2d..000000000 --- a/distributions/sambanova/build.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/sambanova/build.yaml diff --git a/distributions/sambanova/compose.yaml b/distributions/sambanova/compose.yaml deleted file mode 100644 index 58b9fb1ef..000000000 --- a/distributions/sambanova/compose.yaml +++ /dev/null @@ -1,16 +0,0 @@ -services: - llamastack: - image: llamastack/distribution-sambanova - network_mode: "host" - volumes: - - ~/.llama:/root/.llama - - ./run.yaml:/root/llamastack-run-sambanova.yaml - ports: - - "5000:5000" - entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-sambanova.yaml" - deploy: - restart_policy: - condition: on-failure - delay: 3s - max_attempts: 5 - window: 60s diff --git a/distributions/sambanova/run.yaml b/distributions/sambanova/run.yaml deleted file mode 100644 index 385282c67..000000000 --- a/distributions/sambanova/run.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/sambanova/run.yaml diff --git a/distributions/tgi/build.yaml b/distributions/tgi/build.yaml deleted file mode 120000 index 73e59ad84..000000000 --- a/distributions/tgi/build.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/tgi/build.yaml \ No newline at end of file diff --git a/distributions/tgi/compose.yaml b/distributions/tgi/compose.yaml deleted file mode 100644 index 753b7880b..000000000 --- a/distributions/tgi/compose.yaml +++ /dev/null @@ -1,103 +0,0 @@ -services: - tgi-inference: - image: ghcr.io/huggingface/text-generation-inference:latest - volumes: - - $HOME/.cache/huggingface:/data - network_mode: ${NETWORK_MODE:-bridged} - ports: - - "${TGI_INFERENCE_PORT:-8080}:${TGI_INFERENCE_PORT:-8080}" - devices: - - nvidia.com/gpu=all - environment: - - CUDA_VISIBLE_DEVICES=${TGI_INFERENCE_GPU:-0} - - HF_TOKEN=$HF_TOKEN - - HF_HOME=/data - - HF_DATASETS_CACHE=/data - - HF_MODULES_CACHE=/data - - HF_HUB_CACHE=/data - command: > - --dtype bfloat16 - --usage-stats off - --sharded false - --model-id ${TGI_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct} - --port ${TGI_INFERENCE_PORT:-8080} - --cuda-memory-fraction 0.75 - healthcheck: - test: ["CMD", "curl", "-f", "http://tgi-inference:${TGI_INFERENCE_PORT:-8080}/health"] - interval: 5s - timeout: 5s - retries: 30 - deploy: - resources: - reservations: - devices: - - driver: nvidia - capabilities: [gpu] - runtime: nvidia - - tgi-${TGI_SAFETY_MODEL:+safety}: - image: ghcr.io/huggingface/text-generation-inference:latest - volumes: - - $HOME/.cache/huggingface:/data - network_mode: ${NETWORK_MODE:-bridged} - ports: - - "${TGI_SAFETY_PORT:-8081}:${TGI_SAFETY_PORT:-8081}" - devices: - - nvidia.com/gpu=all - environment: - - CUDA_VISIBLE_DEVICES=${TGI_SAFETY_GPU:-1} - - HF_TOKEN=$HF_TOKEN - - HF_HOME=/data - - HF_DATASETS_CACHE=/data - - HF_MODULES_CACHE=/data - - HF_HUB_CACHE=/data - command: > - --dtype bfloat16 - --usage-stats off - 
--sharded false - --model-id ${TGI_SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B} - --port ${TGI_SAFETY_PORT:-8081} - --cuda-memory-fraction 0.75 - healthcheck: - test: ["CMD", "curl", "-f", "http://tgi-safety:${TGI_SAFETY_PORT:-8081}/health"] - interval: 5s - timeout: 5s - retries: 30 - deploy: - resources: - reservations: - devices: - - driver: nvidia - capabilities: [gpu] - runtime: nvidia - - llamastack: - depends_on: - tgi-inference: - condition: service_healthy - tgi-${TGI_SAFETY_MODEL:+safety}: - condition: service_healthy - image: llamastack/distribution-tgi:test-0.0.52rc3 - network_mode: ${NETWORK_MODE:-bridged} - volumes: - - ~/.llama:/root/.llama - - ./run${TGI_SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml - ports: - - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}" - # Hack: wait for TGI server to start before starting docker - entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml" - restart_policy: - condition: on-failure - delay: 3s - max_attempts: 5 - window: 60s - environment: - - TGI_URL=http://tgi-inference:${TGI_INFERENCE_PORT:-8080} - - SAFETY_TGI_URL=http://tgi-safety:${TGI_SAFETY_PORT:-8081} - - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct} - - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B} - -volumes: - tgi-inference: - tgi-safety: - llamastack: diff --git a/distributions/tgi/run-with-safety.yaml b/distributions/tgi/run-with-safety.yaml deleted file mode 120000 index 62d26708e..000000000 --- a/distributions/tgi/run-with-safety.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/tgi/run-with-safety.yaml \ No newline at end of file diff --git a/distributions/tgi/run.yaml b/distributions/tgi/run.yaml deleted file mode 120000 index f3cc3a502..000000000 --- a/distributions/tgi/run.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/tgi/run.yaml \ No newline at end of file diff --git a/distributions/together/build.yaml b/distributions/together/build.yaml deleted file mode 120000 index 3877a9c96..000000000 --- a/distributions/together/build.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/together/build.yaml \ No newline at end of file diff --git a/distributions/together/compose.yaml b/distributions/together/compose.yaml deleted file mode 100644 index f66ee69f9..000000000 --- a/distributions/together/compose.yaml +++ /dev/null @@ -1,14 +0,0 @@ -services: - llamastack: - image: llamastack/distribution-together - ports: - - "8321:8321" - environment: - - TOGETHER_API_KEY=${TOGETHER_API_KEY} - entrypoint: bash -c "python -m llama_stack.distribution.server.server --template together" - deploy: - restart_policy: - condition: on-failure - delay: 3s - max_attempts: 5 - window: 60s diff --git a/distributions/together/run.yaml b/distributions/together/run.yaml deleted file mode 120000 index 102d9866e..000000000 --- a/distributions/together/run.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/together/run.yaml \ No newline at end of file diff --git a/distributions/vllm-gpu/build.yaml b/distributions/vllm-gpu/build.yaml deleted file mode 120000 index a95d34c1f..000000000 --- a/distributions/vllm-gpu/build.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/inline-vllm/build.yaml \ No newline at end of file diff --git a/distributions/vllm-gpu/compose.yaml b/distributions/vllm-gpu/compose.yaml deleted file mode 100644 index 98267cdc3..000000000 --- a/distributions/vllm-gpu/compose.yaml +++ /dev/null @@ -1,35 +0,0 @@ -services: - 
llamastack: - image: llamastack/distribution-inline-vllm - network_mode: "host" - volumes: - - ~/.llama:/root/.llama - - ./run.yaml:/root/my-run.yaml - ports: - - "8321:8321" - devices: - - nvidia.com/gpu=all - environment: - - CUDA_VISIBLE_DEVICES=0 - command: [] - deploy: - resources: - reservations: - devices: - - driver: nvidia - # that's the closest analogue to --gpus; provide - # an integer amount of devices or 'all' - count: 1 - # Devices are reserved using a list of capabilities, making - # capabilities the only required field. A device MUST - # satisfy all the requested capabilities for a successful - # reservation. - capabilities: [gpu] - runtime: nvidia - entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml" - deploy: - restart_policy: - condition: on-failure - delay: 3s - max_attempts: 5 - window: 60s diff --git a/distributions/vllm-gpu/run.yaml b/distributions/vllm-gpu/run.yaml deleted file mode 100644 index a75a4c451..000000000 --- a/distributions/vllm-gpu/run.yaml +++ /dev/null @@ -1,66 +0,0 @@ -version: '2' -image_name: local -container_image: null -conda_env: local -apis: -- shields -- agents -- models -- memory -- memory_banks -- inference -- safety -providers: - inference: - - provider_id: vllm-inference - provider_type: inline::vllm - config: - model: Llama3.2-3B-Instruct - tensor_parallel_size: 1 - gpu_memory_utilization: 0.4 - enforce_eager: true - max_tokens: 4096 - - provider_id: vllm-inference-safety - provider_type: inline::vllm - config: - model: Llama-Guard-3-1B - tensor_parallel_size: 1 - gpu_memory_utilization: 0.2 - enforce_eager: true - max_tokens: 4096 - safety: - - provider_id: meta0 - provider_type: inline::llama-guard - config: - model: Llama-Guard-3-1B - excluded_categories: [] - # Uncomment to use prompt guard - # - provider_id: meta1 - # provider_type: inline::prompt-guard - # config: - # model: Prompt-Guard-86M - memory: - - provider_id: meta0 - provider_type: inline::meta-reference - config: {} - # Uncomment to use pgvector - # - provider_id: pgvector - # provider_type: remote::pgvector - # config: - # host: 127.0.0.1 - # port: 5432 - # db: postgres - # user: postgres - # password: mysecretpassword - agents: - - provider_id: meta0 - provider_type: inline::meta-reference - config: - persistence_store: - namespace: null - type: sqlite - db_path: ~/.llama/runtime/agents_store.db - telemetry: - - provider_id: meta0 - provider_type: inline::meta-reference - config: {} diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 1a8169090..1d1b14b4a 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -6,8 +6,8 @@ OpenAPI specification - - + +