diff --git a/.github/workflows/update-readthedocs.yml b/.github/workflows/update-readthedocs.yml index 70369bcc4..23bafa1e5 100644 --- a/.github/workflows/update-readthedocs.yml +++ b/.github/workflows/update-readthedocs.yml @@ -11,17 +11,42 @@ on: branches: - main paths: - - 'docs/source/**' - - 'docs/resources/**' + - 'docs/**' + - '.github/workflows/update-readthedocs.yml' + pull_request: + branches: + - main + paths: + - 'docs/**' - '.github/workflows/update-readthedocs.yml' jobs: update-readthedocs: - runs-on: ubuntu-latest + runs-on: ubuntu-latest env: TOKEN: ${{ secrets.READTHEDOCS_TOKEN }} steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install the latest version of uv + uses: astral-sh/setup-uv@v5 + + - name: Sync with uv + run: uv sync --extra docs + + - name: Build HTML + run: | + cd docs + uv run make html + - name: Trigger ReadTheDocs build + if: github.event_name != 'pull_request' run: | if [ -z "$TOKEN" ]; then echo "READTHEDOCS_TOKEN is not set" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 56e35aa6e..70af72a62 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,6 +30,7 @@ repos: rev: v0.9.4 hooks: - id: ruff + args: [ --fix ] exclude: ^llama_stack/strong_typing/.*$ - id: ruff-format @@ -45,23 +46,26 @@ repos: hooks: - id: uv-export args: [ - "--frozen", - "--no-hashes", - "--no-emit-project", + "--frozen", + "--no-hashes", + "--no-emit-project", "--output-file=requirements.txt" ] files: ^pyproject\.toml$ - id: uv-sync -# - repo: https://github.com/pre-commit/mirrors-mypy -# rev: v1.14.0 -# hooks: -# - id: mypy -# additional_dependencies: -# - types-requests -# - types-setuptools -# - pydantic -# args: [--ignore-missing-imports] +- repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.15.0 + hooks: + - id: mypy + additional_dependencies: + - uv==0.6.2 + - mypy + - pytest + - rich + - types-requests + - pydantic + pass_filenames: false # - repo: https://github.com/jsh9/pydoclint # rev: d88180a8632bb1602a4d81344085cf320f288c5a diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 83a717273..1e4a88f13 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -134,9 +134,11 @@ If you are making changes to the documentation at [https://llama-stack.readthedo $ cd llama-stack/docs $ uv sync --extra docs +# This rebuilds the documentation pages. +$ uv run make html + # This will start a local server (usually at http://127.0.0.1:8000) that automatically rebuilds and refreshes when you make changes to the documentation. -$ make html -$ uv run sphinx-autobuild source build/html +$ uv run sphinx-autobuild source build/html --write-all ``` ### Update API Documentation @@ -145,7 +147,7 @@ If you modify or add new API endpoints, update the API documentation accordingly ```bash $ uv sync --extra dev -$ ./docs/openapi_generator/run_openapi_generator.sh +$ uv run ./docs/openapi_generator/run_openapi_generator.sh ``` The generated API documentation will be available in `docs/_static/`. Make sure to review the changes before committing. 
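Not part of the diff above, but since the pre-commit changes enable `--fix` for ruff and re-enable the mypy hook, a minimal sketch of how a contributor might exercise the updated hooks locally may help; it assumes `pre-commit` itself is available in the environment (e.g. installed with `uv pip install pre-commit`), and the hook ids come from the `.pre-commit-config.yaml` shown above.

```bash
# One-time setup: install pre-commit and register the git hook for this clone.
uv pip install pre-commit
pre-commit install

# Run every configured hook (ruff with --fix, ruff-format, uv-export, uv-sync, mypy, ...) across the tree.
pre-commit run --all-files

# Iterate on a single hook by its id, e.g. the newly enabled mypy hook.
pre-commit run mypy --all-files
```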
diff --git a/MANIFEST.in b/MANIFEST.in index 9d9048983..ec45d8f08 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,3 +3,4 @@ include distributions/dependencies.json include llama_stack/distribution/*.sh include llama_stack/cli/scripts/*.sh include llama_stack/templates/*/*.yaml +include llama_stack/providers/tests/test_cases/*.json diff --git a/README.md b/README.md index baec8c1bd..3946deea6 100644 --- a/README.md +++ b/README.md @@ -78,18 +78,14 @@ You have two ways to install this repository: ``` * **Install from source**: - If you prefer to install from the source code, make sure you have [conda installed](https://docs.conda.io/projects/conda/en/stable). + If you prefer to install from the source code, we recommend using [uv](https://github.com/astral-sh/uv). Then, run the following commands: ```bash - mkdir -p ~/local - cd ~/local git clone git@github.com:meta-llama/llama-stack.git - - conda create -n stack python=3.10 - conda activate stack - cd llama-stack - pip install -e . + + uv sync + uv pip install -e . ``` ### Documentation diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 345a29f33..9e468f08d 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -30,9 +30,7 @@ "sentencepiece", "tqdm", "transformers", - "uvicorn", - "sentence-transformers --no-deps", - "torch torchvision --index-url https://download.pytorch.org/whl/cpu" + "uvicorn" ], "cerebras": [ "aiosqlite", @@ -170,9 +168,7 @@ "sentencepiece", "tqdm", "transformers", - "uvicorn", - "sentence-transformers --no-deps", - "torch torchvision --index-url https://download.pytorch.org/whl/cpu" + "uvicorn" ], "hf-serverless": [ "aiohttp", @@ -247,9 +243,7 @@ "tqdm", "transformers", "uvicorn", - "zmq", - "sentence-transformers --no-deps", - "torch torchvision --index-url https://download.pytorch.org/whl/cpu" + "zmq" ], "meta-reference-quantized-gpu": [ "accelerate", @@ -290,9 +284,7 @@ "tqdm", "transformers", "uvicorn", - "zmq", - "sentence-transformers --no-deps", - "torch torchvision --index-url https://download.pytorch.org/whl/cpu" + "zmq" ], "nvidia": [ "aiosqlite", @@ -323,9 +315,7 @@ "sentencepiece", "tqdm", "transformers", - "uvicorn", - "sentence-transformers --no-deps", - "torch torchvision --index-url https://download.pytorch.org/whl/cpu" + "uvicorn" ], "ollama": [ "aiohttp", @@ -335,7 +325,6 @@ "chardet", "chromadb-client", "datasets", - "faiss-cpu", "fastapi", "fire", "httpx", @@ -356,11 +345,10 @@ "scikit-learn", "scipy", "sentencepiece", + "sqlite-vec", "tqdm", "transformers", - "uvicorn", - "sentence-transformers --no-deps", - "torch torchvision --index-url https://download.pytorch.org/whl/cpu" + "uvicorn" ], "remote-vllm": [ "aiosqlite", @@ -423,9 +411,7 @@ "sentencepiece", "tqdm", "transformers", - "uvicorn", - "sentence-transformers --no-deps", - "torch torchvision --index-url https://download.pytorch.org/whl/cpu" + "uvicorn" ], "tgi": [ "aiohttp", diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 40c167685..2a9f4b6f7 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -2315,6 +2315,70 @@ } } }, + "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume": { + "post": { + "responses": { + "200": { + "description": "A Turn object if stream is False, otherwise an AsyncIterator of AgentTurnResponseStreamChunk objects.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Turn" + } + }, + "text/event-stream": { + "schema": { + 
"$ref": "#/components/schemas/AgentTurnResponseStreamChunk" + } + } + } + } + }, + "tags": [ + "Agents" + ], + "description": "Resume an agent turn with executed tool call responses.\nWhen a Turn has the status `awaiting_input` due to pending input from client side tool calls, this endpoint can be used to submit the outputs from the tool calls once they are ready.", + "parameters": [ + { + "name": "agent_id", + "in": "path", + "description": "The ID of the agent to resume.", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "session_id", + "in": "path", + "description": "The ID of the session to resume.", + "required": true, + "schema": { + "type": "string" + } + }, + { + "name": "turn_id", + "in": "path", + "description": "The ID of the turn to resume.", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ResumeAgentTurnRequest" + } + } + }, + "required": true + } + } + }, "/v1/eval/benchmarks/{benchmark_id}/jobs": { "post": { "responses": { @@ -4226,6 +4290,9 @@ }, "tool_config": { "$ref": "#/components/schemas/ToolConfig" + }, + "allow_turn_resume": { + "type": "boolean" } }, "additionalProperties": false, @@ -4454,6 +4521,31 @@ }, "content": { "$ref": "#/components/schemas/InterleavedContent" + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } } }, "additionalProperties": false, @@ -4612,6 +4704,9 @@ }, { "$ref": "#/components/schemas/AgentTurnResponseTurnCompletePayload" + }, + { + "$ref": "#/components/schemas/AgentTurnResponseTurnAwaitingInputPayload" } ], "discriminator": { @@ -4621,7 +4716,8 @@ "step_progress": "#/components/schemas/AgentTurnResponseStepProgressPayload", "step_complete": "#/components/schemas/AgentTurnResponseStepCompletePayload", "turn_start": "#/components/schemas/AgentTurnResponseTurnStartPayload", - "turn_complete": "#/components/schemas/AgentTurnResponseTurnCompletePayload" + "turn_complete": "#/components/schemas/AgentTurnResponseTurnCompletePayload", + "turn_awaiting_input": "#/components/schemas/AgentTurnResponseTurnAwaitingInputPayload" } } }, @@ -4784,6 +4880,25 @@ "title": "AgentTurnResponseStreamChunk", "description": "streamed agent turn completion response." }, + "AgentTurnResponseTurnAwaitingInputPayload": { + "type": "object", + "properties": { + "event_type": { + "type": "string", + "const": "turn_awaiting_input", + "default": "turn_awaiting_input" + }, + "turn": { + "$ref": "#/components/schemas/Turn" + } + }, + "additionalProperties": false, + "required": [ + "event_type", + "turn" + ], + "title": "AgentTurnResponseTurnAwaitingInputPayload" + }, "AgentTurnResponseTurnCompletePayload": { "type": "object", "properties": { @@ -4929,11 +5044,42 @@ "description": "The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint." }, "contents": { - "type": "array", - "items": { - "$ref": "#/components/schemas/InterleavedContent" - }, - "description": "List of contents to generate embeddings for. Note that content can be multimodal. The behavior depends on the model and provider. Some models may only support text." 
+ "oneOf": [ + { + "type": "array", + "items": { + "type": "string" + } + }, + { + "type": "array", + "items": { + "$ref": "#/components/schemas/InterleavedContentItem" + } + } + ], + "description": "List of contents to generate embeddings for. Each content can be a string or an InterleavedContentItem (and hence can be multimodal). The behavior depends on the model and provider. Some models may only support text." + }, + "text_truncation": { + "type": "string", + "enum": [ + "none", + "start", + "end" + ], + "description": "(Optional) Config for how to truncate text for embedding when text is longer than the model's max sequence length." + }, + "output_dimension": { + "type": "integer", + "description": "(Optional) Output dimensionality for the embeddings. Only supported by Matryoshka models." + }, + "task_type": { + "type": "string", + "enum": [ + "query", + "document" + ], + "description": "(Optional) How is the embedding being used? This is only supported by asymmetric embedding models." } }, "additionalProperties": false, @@ -6625,6 +6771,31 @@ }, "error_code": { "type": "integer" + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } } }, "additionalProperties": false, @@ -7474,9 +7645,37 @@ "properties": { "content": { "$ref": "#/components/schemas/InterleavedContent" + }, + "metadata": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } } }, "additionalProperties": false, + "required": [ + "metadata" + ], "title": "RAGQueryResult" }, "QueryChunksRequest": { @@ -8015,6 +8214,27 @@ ], "title": "RegisterVectorDbRequest" }, + "ResumeAgentTurnRequest": { + "type": "object", + "properties": { + "tool_responses": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ToolResponseMessage" + }, + "description": "The tool call responses to resume the turn with." + }, + "stream": { + "type": "boolean", + "description": "Whether to stream the response." + } + }, + "additionalProperties": false, + "required": [ + "tool_responses" + ], + "title": "ResumeAgentTurnRequest" + }, "RunEvalRequest": { "type": "object", "properties": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index c5043665b..a2329e47a 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -1401,6 +1401,53 @@ paths: schema: $ref: '#/components/schemas/QueryTracesRequest' required: true + /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume: + post: + responses: + '200': + description: >- + A Turn object if stream is False, otherwise an AsyncIterator of AgentTurnResponseStreamChunk + objects. + content: + application/json: + schema: + $ref: '#/components/schemas/Turn' + text/event-stream: + schema: + $ref: '#/components/schemas/AgentTurnResponseStreamChunk' + tags: + - Agents + description: >- + Resume an agent turn with executed tool call responses. + + When a Turn has the status `awaiting_input` due to pending input from client + side tool calls, this endpoint can be used to submit the outputs from the + tool calls once they are ready. + parameters: + - name: agent_id + in: path + description: The ID of the agent to resume. 
+ required: true + schema: + type: string + - name: session_id + in: path + description: The ID of the session to resume. + required: true + schema: + type: string + - name: turn_id + in: path + description: The ID of the turn to resume. + required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/ResumeAgentTurnRequest' + required: true /v1/eval/benchmarks/{benchmark_id}/jobs: post: responses: @@ -2740,6 +2787,8 @@ components: $ref: '#/components/schemas/AgentTool' tool_config: $ref: '#/components/schemas/ToolConfig' + allow_turn_resume: + type: boolean additionalProperties: false required: - messages @@ -2896,6 +2945,16 @@ components: - type: string content: $ref: '#/components/schemas/InterleavedContent' + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object additionalProperties: false required: - call_id @@ -2992,6 +3051,7 @@ components: - $ref: '#/components/schemas/AgentTurnResponseStepCompletePayload' - $ref: '#/components/schemas/AgentTurnResponseTurnStartPayload' - $ref: '#/components/schemas/AgentTurnResponseTurnCompletePayload' + - $ref: '#/components/schemas/AgentTurnResponseTurnAwaitingInputPayload' discriminator: propertyName: event_type mapping: @@ -3000,6 +3060,7 @@ components: step_complete: '#/components/schemas/AgentTurnResponseStepCompletePayload' turn_start: '#/components/schemas/AgentTurnResponseTurnStartPayload' turn_complete: '#/components/schemas/AgentTurnResponseTurnCompletePayload' + turn_awaiting_input: '#/components/schemas/AgentTurnResponseTurnAwaitingInputPayload' AgentTurnResponseStepCompletePayload: type: object properties: @@ -3106,6 +3167,21 @@ components: - event title: AgentTurnResponseStreamChunk description: streamed agent turn completion response. + "AgentTurnResponseTurnAwaitingInputPayload": + type: object + properties: + event_type: + type: string + const: turn_awaiting_input + default: turn_awaiting_input + turn: + $ref: '#/components/schemas/Turn' + additionalProperties: false + required: + - event_type + - turn + title: >- + AgentTurnResponseTurnAwaitingInputPayload AgentTurnResponseTurnCompletePayload: type: object properties: @@ -3224,13 +3300,39 @@ components: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint. contents: - type: array - items: - $ref: '#/components/schemas/InterleavedContent' + oneOf: + - type: array + items: + type: string + - type: array + items: + $ref: '#/components/schemas/InterleavedContentItem' description: >- - List of contents to generate embeddings for. Note that content can be - multimodal. The behavior depends on the model and provider. Some models - may only support text. + List of contents to generate embeddings for. Each content can be a string + or an InterleavedContentItem (and hence can be multimodal). The behavior + depends on the model and provider. Some models may only support text. + text_truncation: + type: string + enum: + - none + - start + - end + description: >- + (Optional) Config for how to truncate text for embedding when text is + longer than the model's max sequence length. + output_dimension: + type: integer + description: >- + (Optional) Output dimensionality for the embeddings. Only supported by + Matryoshka models. 
+ task_type: + type: string + enum: + - query + - document + description: >- + (Optional) How is the embedding being used? This is only supported by + asymmetric embedding models. additionalProperties: false required: - model_id @@ -4289,6 +4391,16 @@ components: type: string error_code: type: integer + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object additionalProperties: false required: - content @@ -4862,7 +4974,19 @@ components: properties: content: $ref: '#/components/schemas/InterleavedContent' + metadata: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object additionalProperties: false + required: + - metadata title: RAGQueryResult QueryChunksRequest: type: object @@ -5179,6 +5303,22 @@ components: - vector_db_id - embedding_model title: RegisterVectorDbRequest + ResumeAgentTurnRequest: + type: object + properties: + tool_responses: + type: array + items: + $ref: '#/components/schemas/ToolResponseMessage' + description: >- + The tool call responses to resume the turn with. + stream: + type: boolean + description: Whether to stream the response. + additionalProperties: false + required: + - tool_responses + title: ResumeAgentTurnRequest RunEvalRequest: type: object properties: diff --git a/docs/getting_started.ipynb b/docs/getting_started.ipynb index 51ae945f4..7f9afd647 100644 --- a/docs/getting_started.ipynb +++ b/docs/getting_started.ipynb @@ -86,8 +86,6 @@ "# NBVAL_SKIP\n", "\n", "!apt-get install -y bubblewrap\n", - "import os\n", - "os.environ[\"UV_SYSTEM_PYTHON\"] = \"1\"\n", "!pip install uv\n", "!uv pip install llama-stack" ] @@ -3632,7 +3630,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "master", + "display_name": "toolchain", "language": "python", "name": "python3" }, diff --git a/docs/source/concepts/index.md b/docs/source/concepts/index.md index df46e0134..c839266b6 100644 --- a/docs/source/concepts/index.md +++ b/docs/source/concepts/index.md @@ -25,7 +25,7 @@ We are working on adding a few more APIs to complete the application lifecycle. ## API Providers The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Examples for these include: -- LLM inference providers (e.g., Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, etc.), +- LLM inference providers (e.g., Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, vLLM, etc.), - Vector databases (e.g., ChromaDB, Weaviate, Qdrant, FAISS, PGVector, etc.), - Safety providers (e.g., Meta's Llama Guard, AWS Bedrock Guardrails, etc.) @@ -33,7 +33,7 @@ Providers come in two flavors: - **Remote**: the provider runs as a separate service external to the Llama Stack codebase. Llama Stack contains a small amount of adapter code. - **Inline**: the provider is fully specified and implemented within the Llama Stack codebase. It may be a simple wrapper around an existing library, or a full fledged implementation within Llama Stack. -Most importantly, Llama Stack always strives to provide at least one fully "local" provider for each API so you can iterate on a fully featured environment locally. +Most importantly, Llama Stack always strives to provide at least one fully inline provider for each API so you can iterate on a fully featured environment locally. ## Resources Some of these APIs are associated with a set of **Resources**. 
Here is the mapping of APIs to resources: diff --git a/docs/source/conf.py b/docs/source/conf.py index a876333db..fd105a6cf 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -15,7 +15,7 @@ from docutils import nodes project = "llama-stack" -copyright = "2024, Meta" +copyright = "2025, Meta" author = "Meta" # -- General configuration --------------------------------------------------- diff --git a/docs/source/distributions/remote_hosted_distro/nvidia.md b/docs/source/distributions/remote_hosted_distro/nvidia.md index e2a0e1253..2d6e85260 100644 --- a/docs/source/distributions/remote_hosted_distro/nvidia.md +++ b/docs/source/distributions/remote_hosted_distro/nvidia.md @@ -38,6 +38,7 @@ The following models are available by default: - `meta-llama/Llama-3.2-3B-Instruct (meta/llama-3.2-3b-instruct)` - `meta-llama/Llama-3.2-11B-Vision-Instruct (meta/llama-3.2-11b-vision-instruct)` - `meta-llama/Llama-3.2-90B-Vision-Instruct (meta/llama-3.2-90b-vision-instruct)` +- `baai/bge-m3 (baai/bge-m3)` ### Prerequisite: API Keys diff --git a/docs/source/distributions/selection.md b/docs/source/distributions/selection.md index da1b0df9c..269b14bce 100644 --- a/docs/source/distributions/selection.md +++ b/docs/source/distributions/selection.md @@ -17,7 +17,7 @@ Which templates / distributions to choose depends on the hardware you have for r - {dockerhub}`distribution-nvidia` ([Guide](self_hosted_distro/nvidia)) - **Are you running on a "regular" desktop or laptop ?** We suggest using the ollama template for quick prototyping and get started without having to worry about needing GPUs. - - {dockerhub}`distribution-ollama` ([link](self_hosted_distro/ollama)) + - {dockerhub}`distribution-ollama` ([Guide](self_hosted_distro/ollama)) - **Do you have an API key for a remote inference provider like Fireworks, Together, etc.?** If so, we suggest: - {dockerhub}`distribution-together` ([Guide](self_hosted_distro/together)) @@ -28,7 +28,7 @@ Which templates / distributions to choose depends on the hardware you have for r - [Android](ondevice_distro/android_sdk) -- **If none of the above fit your needs, you can also build your own [custom distribution](building_distro).** +- **If none of the above fit your needs, you can also build your own [custom distribution](building_distro.md).** ### Distribution Details diff --git a/docs/source/distributions/self_hosted_distro/cerebras.md b/docs/source/distributions/self_hosted_distro/cerebras.md index a0c9eb263..6e2af14fd 100644 --- a/docs/source/distributions/self_hosted_distro/cerebras.md +++ b/docs/source/distributions/self_hosted_distro/cerebras.md @@ -8,7 +8,7 @@ The `llamastack/distribution-cerebras` distribution consists of the following pr | agents | `inline::meta-reference` | | datasetio | `remote::huggingface`, `inline::localfs` | | eval | `inline::meta-reference` | -| inference | `remote::cerebras` | +| inference | `remote::cerebras`, `inline::sentence-transformers` | | safety | `inline::llama-guard` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | diff --git a/docs/source/distributions/self_hosted_distro/dell.md b/docs/source/distributions/self_hosted_distro/dell.md index aef3ecf58..f49b332a9 100644 --- a/docs/source/distributions/self_hosted_distro/dell.md +++ b/docs/source/distributions/self_hosted_distro/dell.md @@ -19,7 +19,7 @@ The `llamastack/distribution-dell` distribution consists of the following provid | agents | `inline::meta-reference` | | datasetio | `remote::huggingface`, 
`inline::localfs` | | eval | `inline::meta-reference` | -| inference | `remote::tgi` | +| inference | `remote::tgi`, `inline::sentence-transformers` | | safety | `inline::llama-guard` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | diff --git a/docs/source/distributions/self_hosted_distro/fireworks.md b/docs/source/distributions/self_hosted_distro/fireworks.md index 7951e148e..f69e6d963 100644 --- a/docs/source/distributions/self_hosted_distro/fireworks.md +++ b/docs/source/distributions/self_hosted_distro/fireworks.md @@ -18,7 +18,7 @@ The `llamastack/distribution-fireworks` distribution consists of the following p | agents | `inline::meta-reference` | | datasetio | `remote::huggingface`, `inline::localfs` | | eval | `inline::meta-reference` | -| inference | `remote::fireworks` | +| inference | `remote::fireworks`, `inline::sentence-transformers` | | safety | `inline::llama-guard` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | diff --git a/docs/source/distributions/self_hosted_distro/ollama.md b/docs/source/distributions/self_hosted_distro/ollama.md index 2fa796e81..a487109c8 100644 --- a/docs/source/distributions/self_hosted_distro/ollama.md +++ b/docs/source/distributions/self_hosted_distro/ollama.md @@ -23,7 +23,7 @@ The `llamastack/distribution-ollama` distribution consists of the following prov | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` | -| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | +| vector_io | `inline::sqlite-vec`, `remote::chromadb`, `remote::pgvector` | You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration. 
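The `ollama` distribution above swaps its default `vector_io` provider from `inline::faiss` to `inline::sqlite-vec`, and `distributions/dependencies.json` now pulls in `sqlite-vec` instead of `faiss-cpu`. As an illustrative aside (not part of the diff), a quick standalone check that the `sqlite-vec` package loads might look like the sketch below; it assumes a Python build with SQLite extension loading enabled.

```bash
# sqlite-vec is a plain Python package; no separate server process is needed.
uv pip install sqlite-vec

# Load the extension into an in-memory SQLite database and print its version.
python -c "import sqlite3, sqlite_vec; db = sqlite3.connect(':memory:'); db.enable_load_extension(True); sqlite_vec.load(db); print(db.execute('select vec_version()').fetchone()[0])"
```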
diff --git a/docs/source/distributions/self_hosted_distro/remote-vllm.md b/docs/source/distributions/self_hosted_distro/remote-vllm.md index 6c3bbd1d0..01f38807b 100644 --- a/docs/source/distributions/self_hosted_distro/remote-vllm.md +++ b/docs/source/distributions/self_hosted_distro/remote-vllm.md @@ -17,7 +17,7 @@ The `llamastack/distribution-remote-vllm` distribution consists of the following | agents | `inline::meta-reference` | | datasetio | `remote::huggingface`, `inline::localfs` | | eval | `inline::meta-reference` | -| inference | `remote::vllm` | +| inference | `remote::vllm`, `inline::sentence-transformers` | | safety | `inline::llama-guard` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | diff --git a/docs/source/distributions/self_hosted_distro/tgi.md b/docs/source/distributions/self_hosted_distro/tgi.md index f4eecf2cd..80baf9c81 100644 --- a/docs/source/distributions/self_hosted_distro/tgi.md +++ b/docs/source/distributions/self_hosted_distro/tgi.md @@ -19,7 +19,7 @@ The `llamastack/distribution-tgi` distribution consists of the following provide | agents | `inline::meta-reference` | | datasetio | `remote::huggingface`, `inline::localfs` | | eval | `inline::meta-reference` | -| inference | `remote::tgi` | +| inference | `remote::tgi`, `inline::sentence-transformers` | | safety | `inline::llama-guard` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | diff --git a/docs/source/distributions/self_hosted_distro/together.md b/docs/source/distributions/self_hosted_distro/together.md index 936ae58f5..7af0dcf4d 100644 --- a/docs/source/distributions/self_hosted_distro/together.md +++ b/docs/source/distributions/self_hosted_distro/together.md @@ -18,7 +18,7 @@ The `llamastack/distribution-together` distribution consists of the following pr | agents | `inline::meta-reference` | | datasetio | `remote::huggingface`, `inline::localfs` | | eval | `inline::meta-reference` | -| inference | `remote::together` | +| inference | `remote::together`, `inline::sentence-transformers` | | safety | `inline::llama-guard` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | diff --git a/docs/source/index.md b/docs/source/index.md index cb2355bfd..b6fd314b7 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -67,6 +67,7 @@ A number of "adapters" are available for some popular Inference and Vector Store | **Provider** | **Environments** | | :----: | :----: | | FAISS | Single Node | +| SQLite-Vec| Single Node | | Chroma | Hosted and Single Node | | Postgres (PGVector) | Hosted and Single Node | | Weaviate | Hosted | @@ -88,6 +89,7 @@ self introduction/index getting_started/index concepts/index +providers/index distributions/index distributions/selection building_applications/index diff --git a/docs/source/providers/index.md b/docs/source/providers/index.md new file mode 100644 index 000000000..e039e90b0 --- /dev/null +++ b/docs/source/providers/index.md @@ -0,0 +1,59 @@ +# Providers Overview + +The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. 
Examples for these include: +- LLM inference providers (e.g., Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, vLLM, etc.), +- Vector databases (e.g., ChromaDB, Weaviate, Qdrant, FAISS, PGVector, etc.), +- Safety providers (e.g., Meta's Llama Guard, AWS Bedrock Guardrails, etc.) + +Providers come in two flavors: +- **Remote**: the provider runs as a separate service external to the Llama Stack codebase. Llama Stack contains a small amount of adapter code. +- **Inline**: the provider is fully specified and implemented within the Llama Stack codebase. It may be a simple wrapper around an existing library, or a full fledged implementation within Llama Stack. + +Importantly, Llama Stack always strives to provide at least one fully inline provider for each API so you can iterate on a fully featured environment locally. + +## Agents +Run multi-step agentic workflows with LLMs, with tool usage, memory (RAG), etc. + +## DatasetIO +Interfaces with datasets and data loaders. + +## Eval +Generates outputs (via Inference or Agents) and performs scoring. + +## Inference +Runs inference with an LLM. + +## Post Training +Fine-tunes a model. + +## Safety +Applies safety policies to the output at a system (not only model) level. + +## Scoring +Evaluates the outputs of the system. + +## Telemetry +Collects telemetry data from the system. + +## Tool Runtime +Is associated with the ToolGroup resources. + +## Vector IO + +Vector IO refers to operations on vector databases, such as adding documents, searching, and deleting documents. +Vector IO plays a crucial role in [Retrieval Augmented Generation (RAG)](../../building_applications/rag), where the vector +io provider and database are used to store and retrieve documents. + +#### Vector IO Providers +The following providers (i.e., databases) are available for Vector IO: + +```{toctree} +:maxdepth: 1 + +vector_io/faiss +vector_io/sqlite-vec +vector_io/chromadb +vector_io/pgvector +vector_io/qdrant +vector_io/weaviate +``` diff --git a/docs/source/providers/vector_io/chromadb.md b/docs/source/providers/vector_io/chromadb.md new file mode 100644 index 000000000..4a7caf2e1 --- /dev/null +++ b/docs/source/providers/vector_io/chromadb.md @@ -0,0 +1,36 @@ +--- +orphan: true +--- +# Chroma + +[Chroma](https://www.trychroma.com/) is an inline and remote vector +database provider for Llama Stack. It allows you to store and query vectors directly within a Chroma database. +That means you're not limited to storing vectors in memory or in a separate service. + +## Features +Chroma supports: +- Store embeddings and their metadata +- Vector search +- Full-text search +- Document storage +- Metadata filtering +- Multi-modal retrieval + +## Usage + +To use Chroma in your Llama Stack project, follow these steps: + +1. Install the necessary dependencies. +2. Configure your Llama Stack project to use Chroma. +3. Start storing and querying vectors. + +## Installation + +You can install Chroma using pip: + +```bash +pip install chromadb +``` + +## Documentation +See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introduction) for more details about Chroma in general. diff --git a/docs/source/providers/vector_io/faiss.md b/docs/source/providers/vector_io/faiss.md new file mode 100644 index 000000000..f894190eb --- /dev/null +++ b/docs/source/providers/vector_io/faiss.md @@ -0,0 +1,33 @@ +--- +orphan: true +--- +# Faiss + +[Faiss](https://github.com/facebookresearch/faiss) is an inline vector database provider for Llama Stack.
It +allows you to store and query vectors directly in memory. +That means you'll get fast and efficient vector retrieval. + +## Features + +- Lightweight and easy to use +- Fully integrated with Llama Stack +- GPU support + +## Usage + +To use Faiss in your Llama Stack project, follow these steps: + +1. Install the necessary dependencies. +2. Configure your Llama Stack project to use Faiss. +3. Start storing and querying vectors. + +## Installation + +You can install Faiss using pip: + +```bash +pip install faiss-cpu +``` +## Documentation +See [Faiss' documentation](https://faiss.ai/) or the [Faiss Wiki](https://github.com/facebookresearch/faiss/wiki) for +more details about Faiss in general. diff --git a/docs/source/providers/vector_io/pgvector.md b/docs/source/providers/vector_io/pgvector.md new file mode 100644 index 000000000..919eb88d8 --- /dev/null +++ b/docs/source/providers/vector_io/pgvector.md @@ -0,0 +1,31 @@ +--- +orphan: true +--- +# Postgres PGVector + +[PGVector](https://github.com/pgvector/pgvector) is a remote vector database provider for Llama Stack. It +allows you to store and query vectors directly within a Postgres database. +That means your vectors live alongside your relational data. + +## Features + +- Easy to use +- Fully integrated with Llama Stack + +## Usage + +To use PGVector in your Llama Stack project, follow these steps: + +1. Install the necessary dependencies. +2. Configure your Llama Stack project to use PGVector. +3. Start storing and querying vectors. + +## Installation + +You can install PGVector using docker: + +```bash +docker pull pgvector/pgvector:pg17 +``` +## Documentation +See [PGVector's documentation](https://github.com/pgvector/pgvector) for more details about PGVector in general. diff --git a/docs/source/providers/vector_io/qdrant.md b/docs/source/providers/vector_io/qdrant.md new file mode 100644 index 000000000..c374ade98 --- /dev/null +++ b/docs/source/providers/vector_io/qdrant.md @@ -0,0 +1,31 @@ +--- +orphan: true +--- +# Qdrant + +[Qdrant](https://qdrant.tech/documentation/) is a remote vector database provider for Llama Stack. It +allows you to store and query vectors in a dedicated Qdrant server. +That means you'll get fast and efficient vector retrieval. + +## Features + +- Easy to use +- Fully integrated with Llama Stack + +## Usage + +To use Qdrant in your Llama Stack project, follow these steps: + +1. Install the necessary dependencies. +2. Configure your Llama Stack project to use Qdrant. +3. Start storing and querying vectors. + +## Installation + +You can install Qdrant using docker: + +```bash +docker pull qdrant/qdrant +``` +## Documentation +See the [Qdrant documentation](https://qdrant.tech/documentation/) for more details about Qdrant in general. diff --git a/docs/source/providers/vector_io/sqlite-vec.md b/docs/source/providers/vector_io/sqlite-vec.md new file mode 100644 index 000000000..f5ce4c003 --- /dev/null +++ b/docs/source/providers/vector_io/sqlite-vec.md @@ -0,0 +1,33 @@ +--- +orphan: true +--- +# SQLite-Vec + +[SQLite-Vec](https://github.com/asg017/sqlite-vec) is an inline vector database provider for Llama Stack. It +allows you to store and query vectors directly within an SQLite database. +That means you're not limited to storing vectors in memory or in a separate service. + +## Features + +- Lightweight and easy to use +- Fully integrated with Llama Stack + +## Usage + +To use SQLite-Vec in your Llama Stack project, follow these steps: + +1. Install the necessary dependencies. +2. Configure your Llama Stack project to use SQLite-Vec. +3.
Start storing and querying vectors. + +## Installation + +You can install SQLite-Vec using pip: + +```bash +pip install sqlite-vec +``` + +## Documentation + +See [sqlite-vec's GitHub repo](https://github.com/asg017/sqlite-vec/tree/main) for more details about sqlite-vec in general. diff --git a/docs/source/providers/vector_io/weaviate.md b/docs/source/providers/vector_io/weaviate.md new file mode 100644 index 000000000..47321781c --- /dev/null +++ b/docs/source/providers/vector_io/weaviate.md @@ -0,0 +1,33 @@ +--- +orphan: true +--- +# Weaviate + +[Weaviate](https://weaviate.io/) is a vector database provider for Llama Stack. +It allows you to store and query vectors directly within a Weaviate database. +That means you're not limited to storing vectors in memory or in a separate service. + +## Features +Weaviate supports: +- Store embeddings and their metadata +- Vector search +- Full-text search +- Hybrid search +- Document storage +- Metadata filtering +- Multi-modal retrieval + +## Usage + +To use Weaviate in your Llama Stack project, follow these steps: + +1. Install the necessary dependencies. +2. Configure your Llama Stack project to use Weaviate. +3. Start storing and querying vectors. + +## Installation + +To install Weaviate see the [Weaviate quickstart documentation](https://weaviate.io/developers/weaviate/quickstart). + +## Documentation +See [Weaviate's documentation](https://weaviate.io/developers/weaviate) for more details about Weaviate in general. diff --git a/docs/source/references/llama_cli_reference/index.md b/docs/source/references/llama_cli_reference/index.md index 76abce544..a43666963 100644 --- a/docs/source/references/llama_cli_reference/index.md +++ b/docs/source/references/llama_cli_reference/index.md @@ -171,7 +171,7 @@ The `llama model` command helps you explore the model’s interface. llama model --help ``` ``` -usage: llama model [-h] {download,list,prompt-format,describe} ... +usage: llama model [-h] {download,list,prompt-format,describe,verify-download,remove} ... Work with llama models @@ -179,15 +179,15 @@ options: -h, --help show this help message and exit model_subcommands: - {download,list,prompt-format,describe} + {download,list,prompt-format,describe,verify-download,remove} ``` +### Describe + You can use the describe command to know more about a model: ``` llama model describe -m Llama3.2-3B-Instruct ``` -### Describe - ``` +-----------------------------+----------------------------------+ | Model | Llama3.2-3B-Instruct | @@ -234,3 +234,10 @@ llama model prompt-format -m Llama3.2-3B-Instruct You will be shown a Markdown formatted description of the model interface and how prompts / messages are formatted for various scenarios. **NOTE**: Outputs in terminal are color printed to show special tokens.
+ +### Remove model +You can run `llama model remove` to remove unecessary model: + +``` +llama model remove -m Llama-Guard-3-8B-int8 +``` diff --git a/llama_stack/apis/agents/agents.py b/llama_stack/apis/agents/agents.py index 367648ded..c904fdbef 100644 --- a/llama_stack/apis/agents/agents.py +++ b/llama_stack/apis/agents/agents.py @@ -194,6 +194,7 @@ class AgentTurnResponseEventType(Enum): turn_start = "turn_start" turn_complete = "turn_complete" + turn_awaiting_input = "turn_awaiting_input" @json_schema_type @@ -235,6 +236,14 @@ class AgentTurnResponseTurnCompletePayload(BaseModel): turn: Turn +@json_schema_type +class AgentTurnResponseTurnAwaitingInputPayload(BaseModel): + event_type: Literal[AgentTurnResponseEventType.turn_awaiting_input.value] = ( + AgentTurnResponseEventType.turn_awaiting_input.value + ) + turn: Turn + + AgentTurnResponseEventPayload = register_schema( Annotated[ Union[ @@ -243,6 +252,7 @@ AgentTurnResponseEventPayload = register_schema( AgentTurnResponseStepCompletePayload, AgentTurnResponseTurnStartPayload, AgentTurnResponseTurnCompletePayload, + AgentTurnResponseTurnAwaitingInputPayload, ], Field(discriminator="event_type"), ], @@ -286,6 +296,18 @@ class AgentTurnCreateRequest(AgentConfigOverridablePerTurn): stream: Optional[bool] = False tool_config: Optional[ToolConfig] = None + # TODO (xiyan): temporary flag, will remove for 0.1.5 + allow_turn_resume: Optional[bool] = False + + +@json_schema_type +class AgentTurnResumeRequest(BaseModel): + agent_id: str + session_id: str + turn_id: str + tool_responses: List[ToolResponseMessage] + stream: Optional[bool] = False + @json_schema_type class AgentTurnResponseStreamChunk(BaseModel): @@ -333,8 +355,34 @@ class Agents(Protocol): documents: Optional[List[Document]] = None, toolgroups: Optional[List[AgentToolGroup]] = None, tool_config: Optional[ToolConfig] = None, + allow_turn_resume: Optional[bool] = False, ) -> Union[Turn, AsyncIterator[AgentTurnResponseStreamChunk]]: ... + @webmethod( + route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume", + method="POST", + ) + async def resume_agent_turn( + self, + agent_id: str, + session_id: str, + turn_id: str, + tool_responses: List[ToolResponseMessage], + stream: Optional[bool] = False, + ) -> Union[Turn, AsyncIterator[AgentTurnResponseStreamChunk]]: + """Resume an agent turn with executed tool call responses. + + When a Turn has the status `awaiting_input` due to pending input from client side tool calls, this endpoint can be used to submit the outputs from the tool calls once they are ready. + + :param agent_id: The ID of the agent to resume. + :param session_id: The ID of the session to resume. + :param turn_id: The ID of the turn to resume. + :param tool_responses: The tool call responses to resume the turn with. + :param stream: Whether to stream the response. + :returns: A Turn object if stream is False, otherwise an AsyncIterator of AgentTurnResponseStreamChunk objects. + """ + ... 
+ @webmethod( route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}", method="GET", diff --git a/llama_stack/apis/common/type_system.py b/llama_stack/apis/common/type_system.py index 139ae8875..d7746df8d 100644 --- a/llama_stack/apis/common/type_system.py +++ b/llama_stack/apis/common/type_system.py @@ -91,15 +91,18 @@ ParamType = register_schema( name="ParamType", ) +""" # TODO: recursive definition of ParamType in these containers # will cause infinite recursion in OpenAPI generation script # since we are going with ChatCompletionInputType and CompletionInputType # we don't need to worry about ArrayType/ObjectType/UnionType for now -# ArrayType.model_rebuild() -# ObjectType.model_rebuild() -# UnionType.model_rebuild() +ArrayType.model_rebuild() +ObjectType.model_rebuild() +UnionType.model_rebuild() -# class CustomType(BaseModel): -# type: Literal["custom"] = "custom" -# validator_class: str +class CustomType(BaseModel): +pylint: disable=syntax-error + type: Literal["custom"] = "custom" + validator_class: str +""" diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py index a3fb69477..e517d9c3c 100644 --- a/llama_stack/apis/inference/inference.py +++ b/llama_stack/apis/inference/inference.py @@ -20,7 +20,7 @@ from typing import ( from pydantic import BaseModel, Field, field_validator from typing_extensions import Annotated -from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent +from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent, InterleavedContentItem from llama_stack.apis.models import Model from llama_stack.apis.telemetry.telemetry import MetricResponseMixin from llama_stack.models.llama.datatypes import ( @@ -165,6 +165,7 @@ class ToolResponse(BaseModel): call_id: str tool_name: Union[BuiltinTool, str] content: InterleavedContent + metadata: Optional[Dict[str, Any]] = None @field_validator("tool_name", mode="before") @classmethod @@ -402,6 +403,30 @@ class ModelStore(Protocol): def get_model(self, identifier: str) -> Model: ... +class TextTruncation(Enum): + """Config for how to truncate text for embedding when text is longer than the model's max sequence length. Start and End semantics depend on whether the language is left-to-right or right-to-left. + + :cvar none: No truncation (default). If the text is longer than the model's max sequence length, you will get an error. + :cvar start: Truncate from the start + :cvar end: Truncate from the end + """ + + none = "none" + start = "start" + end = "end" + + +class EmbeddingTaskType(Enum): + """How is the embedding being used? This is only supported by asymmetric embedding models. + + :cvar query: Used for a query for semantic search. + :cvar document: Used at indexing time when ingesting documents. + """ + + query = "query" + document = "document" + + @runtime_checkable @trace_protocol class Inference(Protocol): @@ -481,12 +506,18 @@ class Inference(Protocol): async def embeddings( self, model_id: str, - contents: List[InterleavedContent], + contents: List[str] | List[InterleavedContentItem], + text_truncation: Optional[TextTruncation] = TextTruncation.none, + output_dimension: Optional[int] = None, + task_type: Optional[EmbeddingTaskType] = None, ) -> EmbeddingsResponse: """Generate embeddings for content pieces using the specified model. :param model_id: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint. 
- :param contents: List of contents to generate embeddings for. Note that content can be multimodal. The behavior depends on the model and provider. Some models may only support text. + :param contents: List of contents to generate embeddings for. Each content can be a string or an InterleavedContentItem (and hence can be multimodal). The behavior depends on the model and provider. Some models may only support text. + :param output_dimension: (Optional) Output dimensionality for the embeddings. Only supported by Matryoshka models. + :param text_truncation: (Optional) Config for how to truncate text for embedding when text is longer than the model's max sequence length. + :param task_type: (Optional) How is the embedding being used? This is only supported by asymmetric embedding models. :returns: An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id} """ ... diff --git a/llama_stack/apis/tools/rag_tool.py b/llama_stack/apis/tools/rag_tool.py index cff8eeefe..2b9ef10d8 100644 --- a/llama_stack/apis/tools/rag_tool.py +++ b/llama_stack/apis/tools/rag_tool.py @@ -26,6 +26,7 @@ class RAGDocument(BaseModel): @json_schema_type class RAGQueryResult(BaseModel): content: Optional[InterleavedContent] = None + metadata: Dict[str, Any] = Field(default_factory=dict) @json_schema_type diff --git a/llama_stack/apis/tools/tools.py b/llama_stack/apis/tools/tools.py index b83be127f..a4d84edbe 100644 --- a/llama_stack/apis/tools/tools.py +++ b/llama_stack/apis/tools/tools.py @@ -72,6 +72,7 @@ class ToolInvocationResult(BaseModel): content: InterleavedContent error_message: Optional[str] = None error_code: Optional[int] = None + metadata: Optional[Dict[str, Any]] = None class ToolStore(Protocol): diff --git a/llama_stack/cli/model/list.py b/llama_stack/cli/model/list.py index 2f62cb9ce..b9499f06d 100644 --- a/llama_stack/cli/model/list.py +++ b/llama_stack/cli/model/list.py @@ -19,6 +19,13 @@ def _get_model_size(model_dir): return sum(f.stat().st_size for f in Path(model_dir).rglob("*") if f.is_file()) +def _convert_to_model_descriptor(model): + for m in all_registered_models(): + if model == m.descriptor().replace(":", "-"): + return str(m.descriptor()) + return str(model) + + def _run_model_list_downloaded_cmd() -> None: headers = ["Model", "Size", "Modified Time"] @@ -30,7 +37,7 @@ def _run_model_list_downloaded_cmd() -> None: modified_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(os.path.getmtime(abs_path))) rows.append( [ - model, + _convert_to_model_descriptor(model), model_size, modified_time, ] @@ -68,6 +75,13 @@ class ModelList(Subcommand): action="store_true", help="List the downloaded models", ) + self.parser.add_argument( + "-s", + "--search", + type=str, + required=False, + help="Search for the input string as a substring in the model descriptor(ID)", + ) def _run_model_list_cmd(self, args: argparse.Namespace) -> None: from .safety_models import prompt_guard_model_sku @@ -87,15 +101,19 @@ class ModelList(Subcommand): continue descriptor = model.descriptor() - rows.append( - [ - descriptor, - model.huggingface_repo, - f"{model.max_seq_length // 1024}K", - ] + if not args.search or args.search.lower() in descriptor.lower(): + rows.append( + [ + descriptor, + model.huggingface_repo, + f"{model.max_seq_length // 1024}K", + ] + ) + if len(rows) == 0: + print(f"Did not find any model matching `{args.search}`.") + else: + print_table( + rows, + headers, + 
separate_rows=True, ) - print_table( - rows, - headers, - separate_rows=True, - ) diff --git a/llama_stack/cli/model/model.py b/llama_stack/cli/model/model.py index 3f8f55773..2f4065b83 100644 --- a/llama_stack/cli/model/model.py +++ b/llama_stack/cli/model/model.py @@ -10,6 +10,7 @@ from llama_stack.cli.model.describe import ModelDescribe from llama_stack.cli.model.download import ModelDownload from llama_stack.cli.model.list import ModelList from llama_stack.cli.model.prompt_format import ModelPromptFormat +from llama_stack.cli.model.remove import ModelRemove from llama_stack.cli.model.verify_download import ModelVerifyDownload from llama_stack.cli.subcommand import Subcommand @@ -35,3 +36,4 @@ class ModelParser(Subcommand): ModelPromptFormat.create(subparsers) ModelDescribe.create(subparsers) ModelVerifyDownload.create(subparsers) + ModelRemove.create(subparsers) diff --git a/llama_stack/cli/model/remove.py b/llama_stack/cli/model/remove.py new file mode 100644 index 000000000..ee8d6299d --- /dev/null +++ b/llama_stack/cli/model/remove.py @@ -0,0 +1,67 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import argparse +import os +import shutil + +from llama_stack.cli.subcommand import Subcommand +from llama_stack.distribution.utils.config_dirs import DEFAULT_CHECKPOINT_DIR +from llama_stack.models.llama.sku_list import resolve_model + + +class ModelRemove(Subcommand): + """Remove the downloaded llama model""" + + def __init__(self, subparsers: argparse._SubParsersAction): + super().__init__() + self.parser = subparsers.add_parser( + "remove", + prog="llama model remove", + description="Remove the downloaded llama model", + formatter_class=argparse.RawTextHelpFormatter, + ) + self._add_arguments() + self.parser.set_defaults(func=self._run_model_remove_cmd) + + def _add_arguments(self): + self.parser.add_argument( + "-m", + "--model", + required=True, + help="Specify the llama downloaded model name, see `llama model list --downloaded`", + ) + self.parser.add_argument( + "-f", + "--force", + action="store_true", + help="Used to forcefully remove the llama model from the storage without further confirmation", + ) + + def _run_model_remove_cmd(self, args: argparse.Namespace) -> None: + from .safety_models import prompt_guard_model_sku + + prompt_guard = prompt_guard_model_sku() + if args.model == prompt_guard.model_id: + model = prompt_guard + else: + model = resolve_model(args.model) + + model_path = os.path.join(DEFAULT_CHECKPOINT_DIR, args.model.replace(":", "-")) + + if model is None or not os.path.isdir(model_path): + print(f"'{args.model}' is not a valid llama model or does not exist.") + return + + if args.force: + shutil.rmtree(model_path) + print(f"{args.model} removed.") + else: + if input(f"Are you sure you want to remove {args.model}? 
(y/n): ").strip().lower() == "y": + shutil.rmtree(model_path) + print(f"{args.model} removed.") + else: + print("Removal aborted.") diff --git a/llama_stack/cli/stack/_build.py b/llama_stack/cli/stack/_build.py index 76f03aa5c..96382d428 100644 --- a/llama_stack/cli/stack/_build.py +++ b/llama_stack/cli/stack/_build.py @@ -9,6 +9,7 @@ import importlib.resources import json import os import shutil +import sys import textwrap from functools import lru_cache from pathlib import Path @@ -23,10 +24,10 @@ from termcolor import cprint from llama_stack.cli.table import print_table from llama_stack.distribution.build import ( SERVER_DEPENDENCIES, - ImageType, build_image, get_provider_dependencies, ) +from llama_stack.distribution.configure import parse_and_maybe_upgrade_config from llama_stack.distribution.datatypes import ( BuildConfig, DistributionSpec, @@ -37,6 +38,8 @@ from llama_stack.distribution.distribution import get_provider_registry from llama_stack.distribution.resolver import InvalidProviderError from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR from llama_stack.distribution.utils.dynamic import instantiate_class_type +from llama_stack.distribution.utils.exec import formulate_run_args, in_notebook, run_with_pty +from llama_stack.distribution.utils.image_types import ImageType from llama_stack.providers.datatypes import Api TEMPLATES_PATH = Path(__file__).parent.parent.parent / "templates" @@ -59,8 +62,16 @@ def run_stack_build_command(args: argparse.Namespace) -> None: if args.list_templates: return _run_template_list_cmd() - current_conda_env = os.environ.get("CONDA_DEFAULT_ENV") - image_name = args.image_name or current_conda_env + if args.image_type == "venv": + current_venv = os.environ.get("VIRTUAL_ENV") + image_name = args.image_name or current_venv + if not image_name and in_notebook(): + image_name = "__system__" + elif args.image_type == "conda": + current_conda_env = os.environ.get("CONDA_DEFAULT_ENV") + image_name = args.image_name or current_conda_env + else: + image_name = args.image_name if args.template: available_templates = available_templates_specs() @@ -69,7 +80,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None: f"Could not find template {args.template}. Please run `llama stack build --list-templates` to check out the available templates", color="red", ) - return + sys.exit(1) build_config = available_templates[args.template] if args.image_type: build_config.image_type = args.image_type @@ -78,7 +89,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None: f"Please specify a image-type (container | conda | venv) for {args.template}", color="red", ) - return + sys.exit(1) elif not args.config and not args.template: name = prompt( "> Enter a name for your Llama Stack (e.g. 
my-local-stack): ", @@ -159,14 +170,14 @@ def run_stack_build_command(args: argparse.Namespace) -> None: f"Could not parse config file {args.config}: {e}", color="red", ) - return + sys.exit(1) if build_config.image_type == ImageType.container.value and not args.image_name: cprint( "Please specify --image-name when building a container from a config file", color="red", ) - return + sys.exit(1) if args.print_deps_only: print(f"# Dependencies for {args.template or args.config or image_name}") @@ -177,19 +188,41 @@ def run_stack_build_command(args: argparse.Namespace) -> None: print(f"uv pip install {special_dep}") return - _run_stack_build_command_from_build_config( - build_config, - image_name=image_name, - config_path=args.config, - template_name=args.template, - ) + try: + run_config = _run_stack_build_command_from_build_config( + build_config, + image_name=image_name, + config_path=args.config, + template_name=args.template, + ) + + except (Exception, RuntimeError) as exc: + cprint( + f"Error building stack: {exc}", + color="red", + ) + sys.exit(1) + if run_config is None: + cprint( + "Run config path is empty", + color="red", + ) + sys.exit(1) + + if args.run: + run_config = Path(run_config) + config_dict = yaml.safe_load(run_config.read_text()) + config = parse_and_maybe_upgrade_config(config_dict) + run_args = formulate_run_args(args.image_type, args.image_name, config, args.template) + run_args.extend([run_config, str(os.getenv("LLAMA_STACK_PORT", 8321))]) + run_with_pty(run_args) def _generate_run_config( build_config: BuildConfig, build_dir: Path, image_name: str, -) -> None: +) -> str: """ Generate a run.yaml template file for user to edit from a build.yaml file """ @@ -239,6 +272,7 @@ def _generate_run_config( f"You can now run your stack with `llama stack run {run_config_file}`", color="green", ) + return run_config_file def _run_stack_build_command_from_build_config( @@ -246,7 +280,7 @@ def _run_stack_build_command_from_build_config( image_name: Optional[str] = None, template_name: Optional[str] = None, config_path: Optional[str] = None, -) -> None: +) -> str: if build_config.image_type == ImageType.container.value: if template_name: image_name = f"distribution-{template_name}" @@ -256,6 +290,9 @@ def _run_stack_build_command_from_build_config( elif build_config.image_type == ImageType.conda.value: if not image_name: raise ValueError("Please specify an image name when building a conda image") + elif build_config.image_type == ImageType.venv.value: + if not image_name: + raise ValueError("Please specify an image name when building a venv image") if template_name: build_dir = DISTRIBS_BASE_DIR / template_name @@ -276,7 +313,7 @@ def _run_stack_build_command_from_build_config( template_or_config=template_name or config_path, ) if return_code != 0: - return + raise RuntimeError(f"Failed to build image {image_name}") if template_name: # copy run.yaml from template to build_dir instead of generating it again @@ -286,8 +323,9 @@ def _run_stack_build_command_from_build_config( shutil.copy(path, run_config_file) cprint("Build Successful!", color="green") + return template_path else: - _generate_run_config(build_config, build_dir, image_name) + return _generate_run_config(build_config, build_dir, image_name) def _run_template_list_cmd() -> None: diff --git a/llama_stack/cli/stack/build.py b/llama_stack/cli/stack/build.py index 7b17a960a..ceee725e6 100644 --- a/llama_stack/cli/stack/build.py +++ b/llama_stack/cli/stack/build.py @@ -68,6 +68,13 @@ the build. 
If not specified, currently active Conda environment will be used if help="Print the dependencies for the stack only, without building the stack", ) + self.parser.add_argument( + "--run", + action="store_true", + default=False, + help="Run the stack after building using the same image type, name, and other applicable arguments", + ) + def _run_stack_build_command(self, args: argparse.Namespace) -> None: # always keep implementation completely silo-ed away from CLI so CLI # can be fast to load and reduces dependencies diff --git a/llama_stack/cli/stack/configure.py b/llama_stack/cli/stack/configure.py deleted file mode 100644 index 2bb3f7313..000000000 --- a/llama_stack/cli/stack/configure.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -import argparse - -from llama_stack.cli.subcommand import Subcommand - - -class StackConfigure(Subcommand): - """Llama cli for configuring llama toolchain configs""" - - def __init__(self, subparsers: argparse._SubParsersAction): - super().__init__() - self.parser = subparsers.add_parser( - "configure", - prog="llama stack configure", - description="Configure a llama stack distribution", - formatter_class=argparse.RawTextHelpFormatter, - ) - self._add_arguments() - self.parser.set_defaults(func=self._run_stack_configure_cmd) - - def _add_arguments(self): - self.parser.add_argument( - "config", - type=str, - help="Path to the build config file (e.g. ~/.llama/builds//-build.yaml). For container, this could also be the name of the container image. ", - ) - - self.parser.add_argument( - "--output-dir", - type=str, - help="Path to the output directory to store generated run.yaml config file. If not specified, will use ~/.llama/build//-run.yaml", - ) - - def _run_stack_configure_cmd(self, args: argparse.Namespace) -> None: - self.parser.error( - """ - DEPRECATED! llama stack configure has been deprecated. - Please use llama stack run instead. - Please see example run.yaml in /distributions folder. 
- """ - ) diff --git a/llama_stack/cli/stack/run.py b/llama_stack/cli/stack/run.py index 73536491b..627ee829a 100644 --- a/llama_stack/cli/stack/run.py +++ b/llama_stack/cli/stack/run.py @@ -74,10 +74,6 @@ class StackRun(Subcommand): ) def _run_stack_run_cmd(self, args: argparse.Namespace) -> None: - import importlib.resources - import json - import subprocess - import yaml from termcolor import cprint @@ -87,7 +83,7 @@ class StackRun(Subcommand): BUILDS_BASE_DIR, DISTRIBS_BASE_DIR, ) - from llama_stack.distribution.utils.exec import run_with_pty + from llama_stack.distribution.utils.exec import formulate_run_args, run_with_pty if not args.config: self.parser.error("Must specify a config file to run") @@ -125,64 +121,7 @@ class StackRun(Subcommand): config_dict = yaml.safe_load(config_file.read_text()) config = parse_and_maybe_upgrade_config(config_dict) - if args.image_type == ImageType.container.value or config.container_image: - script = importlib.resources.files("llama_stack") / "distribution/start_container.sh" - image_name = f"distribution-{template_name}" if template_name else config.container_image - run_args = [script, image_name] - elif args.image_type == ImageType.conda.value: - current_conda_env = os.environ.get("CONDA_DEFAULT_ENV") - image_name = args.image_name or current_conda_env - if not image_name: - cprint( - "No current conda environment detected, please specify a conda environment name with --image-name", - color="red", - ) - return - - def get_conda_prefix(env_name): - # Conda "base" environment does not end with "base" in the - # prefix, so should be handled separately. - if env_name == "base": - return os.environ.get("CONDA_PREFIX") - # Get conda environments info - conda_env_info = json.loads(subprocess.check_output(["conda", "info", "--envs", "--json"]).decode()) - envs = conda_env_info["envs"] - for envpath in envs: - if envpath.endswith(env_name): - return envpath - return None - - print(f"Using conda environment: {image_name}") - conda_prefix = get_conda_prefix(image_name) - if not conda_prefix: - cprint( - f"Conda environment {image_name} does not exist.", - color="red", - ) - return - - build_file = Path(conda_prefix) / "llamastack-build.yaml" - if not build_file.exists(): - cprint( - f"Build file {build_file} does not exist.\n\nPlease run `llama stack build` or specify the correct conda environment name with --image-name", - color="red", - ) - return - - script = importlib.resources.files("llama_stack") / "distribution/start_conda_env.sh" - run_args = [ - script, - image_name, - ] - else: - # else must be venv since that is the only valid option left. 
- current_venv = os.environ.get("VIRTUAL_ENV") - venv = args.image_name or current_venv - script = importlib.resources.files("llama_stack") / "distribution/start_venv.sh" - run_args = [ - script, - venv, - ] + run_args = formulate_run_args(args.image_type, args.image_name, config, template_name) run_args.extend([str(config_file), str(args.port)]) if args.disable_ipv6: @@ -206,5 +145,4 @@ class StackRun(Subcommand): if args.tls_keyfile and args.tls_certfile: run_args.extend(["--tls-keyfile", args.tls_keyfile, "--tls-certfile", args.tls_certfile]) - run_with_pty(run_args) diff --git a/llama_stack/cli/stack/stack.py b/llama_stack/cli/stack/stack.py index 10e49f8c9..431f7b98e 100644 --- a/llama_stack/cli/stack/stack.py +++ b/llama_stack/cli/stack/stack.py @@ -10,7 +10,6 @@ from importlib.metadata import version from llama_stack.cli.subcommand import Subcommand from .build import StackBuild -from .configure import StackConfigure from .list_apis import StackListApis from .list_providers import StackListProviders from .run import StackRun @@ -37,7 +36,6 @@ class StackParser(Subcommand): # Add sub-commands StackBuild.create(subparsers) - StackConfigure.create(subparsers) StackListApis.create(subparsers) StackListProviders.create(subparsers) StackRun.create(subparsers) diff --git a/llama_stack/distribution/build.py b/llama_stack/distribution/build.py index 511817de8..2b43b8128 100644 --- a/llama_stack/distribution/build.py +++ b/llama_stack/distribution/build.py @@ -7,7 +7,6 @@ import importlib.resources import logging import sys -from enum import Enum from pathlib import Path from typing import Dict, List @@ -18,6 +17,7 @@ from llama_stack.distribution.datatypes import BuildConfig, Provider from llama_stack.distribution.distribution import get_provider_registry from llama_stack.distribution.utils.config_dirs import BUILDS_BASE_DIR from llama_stack.distribution.utils.exec import run_command, run_with_pty +from llama_stack.distribution.utils.image_types import ImageType from llama_stack.providers.datatypes import Api log = logging.getLogger(__name__) @@ -33,12 +33,6 @@ SERVER_DEPENDENCIES = [ ] -class ImageType(Enum): - container = "container" - conda = "conda" - venv = "venv" - - class ApiInput(BaseModel): api: Api provider: str diff --git a/llama_stack/distribution/build_container.sh b/llama_stack/distribution/build_container.sh index 4101cec44..08941a538 100755 --- a/llama_stack/distribution/build_container.sh +++ b/llama_stack/distribution/build_container.sh @@ -8,6 +8,8 @@ LLAMA_MODELS_DIR=${LLAMA_MODELS_DIR:-} LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-} +LLAMA_STACK_CLIENT_DIR=${LLAMA_STACK_CLIENT_DIR:-} + TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-} PYPI_VERSION=${PYPI_VERSION:-} BUILD_PLATFORM=${BUILD_PLATFORM:-} @@ -32,7 +34,7 @@ container_base="$3" build_file_path="$4" host_build_dir="$5" pip_dependencies="$6" -special_pip_deps="$7" +special_pip_deps="${7:-}" # Define color codes @@ -106,26 +108,39 @@ fi stack_mount="/app/llama-stack-source" models_mount="/app/llama-models-source" +client_mount="/app/llama-stack-client-source" -if [ -n "$LLAMA_STACK_DIR" ]; then - if [ ! -d "$LLAMA_STACK_DIR" ]; then - echo "${RED}Warning: LLAMA_STACK_DIR is set but directory does not exist: $LLAMA_STACK_DIR${NC}" >&2 +install_local_package() { + local dir="$1" + local mount_point="$2" + local name="$3" + + if [ ! -d "$dir" ]; then + echo "${RED}Warning: $name is set but directory does not exist: $dir${NC}" >&2 exit 1 fi - # Install in editable format. 
We will mount the source code into the container - # so that changes will be reflected in the container without having to do a - # rebuild. This is just for development convenience. - if [ "$USE_COPY_NOT_MOUNT" = "true" ]; then add_to_container << EOF -COPY $LLAMA_STACK_DIR $stack_mount +COPY $dir $mount_point EOF fi - add_to_container << EOF -RUN uv pip install --no-cache -e $stack_mount +RUN uv pip install --no-cache -e $mount_point EOF +} + + +if [ -n "$LLAMA_MODELS_DIR" ]; then + install_local_package "$LLAMA_MODELS_DIR" "$models_mount" "LLAMA_MODELS_DIR" +fi + +if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then + install_local_package "$LLAMA_STACK_CLIENT_DIR" "$client_mount" "LLAMA_STACK_CLIENT_DIR" +fi + +if [ -n "$LLAMA_STACK_DIR" ]; then + install_local_package "$LLAMA_STACK_DIR" "$stack_mount" "LLAMA_STACK_DIR" else if [ -n "$TEST_PYPI_VERSION" ]; then # these packages are damaged in test-pypi, so install them first @@ -134,6 +149,7 @@ RUN uv pip install fastapi libcst EOF add_to_container << EOF RUN uv pip install --no-cache --extra-index-url https://test.pypi.org/simple/ \ + --index-strategy unsafe-best-match \ llama-models==$TEST_PYPI_VERSION llama-stack-client==$TEST_PYPI_VERSION llama-stack==$TEST_PYPI_VERSION EOF @@ -149,23 +165,6 @@ EOF fi fi -if [ -n "$LLAMA_MODELS_DIR" ]; then - if [ ! -d "$LLAMA_MODELS_DIR" ]; then - echo "${RED}Warning: LLAMA_MODELS_DIR is set but directory does not exist: $LLAMA_MODELS_DIR${NC}" >&2 - exit 1 - fi - - if [ "$USE_COPY_NOT_MOUNT" = "true" ]; then - add_to_container << EOF -COPY $LLAMA_MODELS_DIR $models_mount -EOF - fi - add_to_container << EOF -RUN uv pip uninstall llama-models -RUN uv pip install --no-cache $models_mount -EOF -fi - # if template_or_config ends with .yaml, it is not a template and we should not use the --template flag if [[ "$template_or_config" != *.yaml ]]; then add_to_container << EOF @@ -177,6 +176,15 @@ ENTRYPOINT ["python", "-m", "llama_stack.distribution.server.server"] EOF fi +# Add other required commands generic to all containers +add_to_container << EOF + +# Allows running as non-root user +RUN mkdir -p /.llama /.cache + +RUN chmod -R g+rw /app /.llama /.cache +EOF + printf "Containerfile created successfully in $TEMP_DIR/Containerfile\n\n" cat $TEMP_DIR/Containerfile printf "\n" @@ -189,6 +197,9 @@ if [ "$USE_COPY_NOT_MOUNT" != "true" ]; then if [ -n "$LLAMA_MODELS_DIR" ]; then mounts="$mounts -v $(readlink -f $LLAMA_MODELS_DIR):$models_mount" fi + if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then + mounts="$mounts -v $(readlink -f $LLAMA_STACK_CLIENT_DIR):$client_mount" + fi fi if command -v selinuxenabled &>/dev/null && selinuxenabled; then diff --git a/llama_stack/distribution/build_venv.sh b/llama_stack/distribution/build_venv.sh index b47cfcb83..52c5c7051 100755 --- a/llama_stack/distribution/build_venv.sh +++ b/llama_stack/distribution/build_venv.sh @@ -16,6 +16,7 @@ TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-} # Reference: https://github.com/astral-sh/uv/pull/1694 UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT:-500} UV_SYSTEM_PYTHON=${UV_SYSTEM_PYTHON:-} +VIRTUAL_ENV=${VIRTUAL_ENV:-} if [ -n "$LLAMA_STACK_DIR" ]; then echo "Using llama-stack-dir=$LLAMA_STACK_DIR" @@ -24,8 +25,8 @@ if [ -n "$LLAMA_MODELS_DIR" ]; then echo "Using llama-models-dir=$LLAMA_MODELS_DIR" fi -if [ "$#" -lt 3 ]; then - echo "Usage: $0 []" >&2 +if [ "$#" -lt 2 ]; then + echo "Usage: $0 []" >&2 echo "Example: $0 mybuild ./my-stack-build.yaml 'numpy pandas scipy'" >&2 exit 1 fi @@ -34,8 +35,7 @@ special_pip_deps="$3" set -euo pipefail -build_name="$1"
-env_name="llamastack-$build_name" +env_name="$1" pip_dependencies="$2" # Define color codes @@ -74,9 +74,13 @@ run() { local env_name="$1" local pip_dependencies="$2" local special_pip_deps="$3" - - if [ -n "$UV_SYSTEM_PYTHON" ]; then + + if [ -n "$UV_SYSTEM_PYTHON" ] || [ "$env_name" == "__system__" ]; then echo "Installing dependencies in system Python environment" + # if env == __system__, ensure we set UV_SYSTEM_PYTHON + export UV_SYSTEM_PYTHON=1 + elif [ "$VIRTUAL_ENV" == "$env_name" ]; then + echo "Virtual environment $env_name is already active" else echo "Using virtual environment $env_name" uv venv "$env_name" @@ -90,6 +94,7 @@ run() { # shellcheck disable=SC2086 # we are building a command line so word splitting is expected uv pip install --extra-index-url https://test.pypi.org/simple/ \ + --index-strategy unsafe-best-match \ llama-models=="$TEST_PYPI_VERSION" llama-stack=="$TEST_PYPI_VERSION" \ $pip_dependencies if [ -n "$special_pip_deps" ]; then diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py index 639e5ee73..59189f8bb 100644 --- a/llama_stack/distribution/library_client.py +++ b/llama_stack/distribution/library_client.py @@ -41,6 +41,7 @@ from llama_stack.distribution.stack import ( redact_sensitive_fields, replace_env_vars, ) +from llama_stack.distribution.utils.exec import in_notebook from llama_stack.providers.utils.telemetry.tracing import ( end_trace, setup_logger, @@ -52,19 +53,6 @@ logger = logging.getLogger(__name__) T = TypeVar("T") -def in_notebook(): - try: - from IPython import get_ipython - - if "IPKernelApp" not in get_ipython().config: # pragma: no cover - return False - except ImportError: - return False - except AttributeError: - return False - return True - - def convert_pydantic_to_json_value(value: Any) -> Any: if isinstance(value, Enum): return value.value @@ -230,12 +218,11 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): if Api.telemetry in self.impls: setup_logger(self.impls[Api.telemetry]) - console = Console() - console.print(f"Using config [blue]{self.config_path_or_template_name}[/blue]:") - - # Redact sensitive information before printing - safe_config = redact_sensitive_fields(self.config.model_dump()) - console.print(yaml.dump(safe_config, indent=2)) + if not os.environ.get("PYTEST_CURRENT_TEST"): + console = Console() + console.print(f"Using config [blue]{self.config_path_or_template_name}[/blue]:") + safe_config = redact_sensitive_fields(self.config.model_dump()) + console.print(yaml.dump(safe_config, indent=2)) endpoints = get_all_api_endpoints() endpoint_impls = {} diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index 016ca4984..df4ed03d3 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -6,7 +6,11 @@ from typing import Any, AsyncGenerator, Dict, List, Optional -from llama_stack.apis.common.content_types import URL, InterleavedContent +from llama_stack.apis.common.content_types import ( + URL, + InterleavedContent, + InterleavedContentItem, +) from llama_stack.apis.datasetio import DatasetIO, PaginatedRowsResult from llama_stack.apis.eval import ( BenchmarkConfig, @@ -17,11 +21,13 @@ from llama_stack.apis.eval import ( ) from llama_stack.apis.inference import ( EmbeddingsResponse, + EmbeddingTaskType, Inference, LogProbConfig, Message, ResponseFormat, SamplingParams, + TextTruncation, ToolChoice, ToolConfig, ToolDefinition, @@ -214,7 +220,10 @@ class 
InferenceRouter(Inference): async def embeddings( self, model_id: str, - contents: List[InterleavedContent], + contents: List[str] | List[InterleavedContentItem], + text_truncation: Optional[TextTruncation] = TextTruncation.none, + output_dimension: Optional[int] = None, + task_type: Optional[EmbeddingTaskType] = None, ) -> EmbeddingsResponse: model = await self.routing_table.get_model(model_id) if model is None: @@ -224,6 +233,9 @@ class InferenceRouter(Inference): return await self.routing_table.get_provider_impl(model_id).embeddings( model_id=model_id, contents=contents, + text_truncation=text_truncation, + output_dimension=output_dimension, + task_type=task_type, ) diff --git a/llama_stack/distribution/start_venv.sh b/llama_stack/distribution/start_venv.sh index 1cfa7248f..195274129 100755 --- a/llama_stack/distribution/start_venv.sh +++ b/llama_stack/distribution/start_venv.sh @@ -55,6 +55,7 @@ while [[ $# -gt 0 ]]; do esac done +echo "Using virtual environment: $venv_path" # Activate virtual environment if [ ! -d "$venv_path" ]; then echo -e "${RED}Error: Virtual environment not found at $venv_path${NC}" >&2 diff --git a/llama_stack/distribution/utils/exec.py b/llama_stack/distribution/utils/exec.py index 4a3a95826..00afdadbe 100644 --- a/llama_stack/distribution/utils/exec.py +++ b/llama_stack/distribution/utils/exec.py @@ -12,8 +12,78 @@ import signal import subprocess import sys +from termcolor import cprint + log = logging.getLogger(__name__) +import importlib +import json +from pathlib import Path + +from llama_stack.distribution.utils.image_types import ImageType + + +def formulate_run_args(image_type, image_name, config, template_name) -> list: + if image_type == ImageType.container.value or config.container_image: + script = importlib.resources.files("llama_stack") / "distribution/start_container.sh" + image_name = f"distribution-{template_name}" if template_name else config.container_image + run_args = [script, image_name] + elif image_type == ImageType.conda.value: + current_conda_env = os.environ.get("CONDA_DEFAULT_ENV") + image_name = image_name or current_conda_env + if not image_name: + cprint( + "No current conda environment detected, please specify a conda environment name with --image-name", + color="red", + ) + return + + def get_conda_prefix(env_name): + # Conda "base" environment does not end with "base" in the + # prefix, so should be handled separately. + if env_name == "base": + return os.environ.get("CONDA_PREFIX") + # Get conda environments info + conda_env_info = json.loads(subprocess.check_output(["conda", "info", "--envs", "--json"]).decode()) + envs = conda_env_info["envs"] + for envpath in envs: + if envpath.endswith(env_name): + return envpath + return None + + print(f"Using conda environment: {image_name}") + conda_prefix = get_conda_prefix(image_name) + if not conda_prefix: + cprint( + f"Conda environment {image_name} does not exist.", + color="red", + ) + return + + build_file = Path(conda_prefix) / "llamastack-build.yaml" + if not build_file.exists(): + cprint( + f"Build file {build_file} does not exist.\n\nPlease run `llama stack build` or specify the correct conda environment name with --image-name", + color="red", + ) + return + + script = importlib.resources.files("llama_stack") / "distribution/start_conda_env.sh" + run_args = [ + script, + image_name, + ] + else: + # else must be venv since that is the only valid option left. 
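+ # fall back to the currently active virtual environment when no --image-name is provided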
+ current_venv = os.environ.get("VIRTUAL_ENV") + venv = image_name or current_venv + script = importlib.resources.files("llama_stack") / "distribution/start_venv.sh" + run_args = [ + script, + venv, + ] + return run_args + def run_with_pty(command): if sys.platform.startswith("win"): @@ -22,6 +92,19 @@ def run_with_pty(command): return _run_with_pty_unix(command) +def in_notebook(): + try: + from IPython import get_ipython + + if "IPKernelApp" not in get_ipython().config: # pragma: no cover + return False + except ImportError: + return False + except AttributeError: + return False + return True + + # run a command in a pseudo-terminal, with interrupt handling, # useful when you want to run interactive things def _run_with_pty_unix(command): diff --git a/llama_stack/distribution/utils/image_types.py b/llama_stack/distribution/utils/image_types.py new file mode 100644 index 000000000..1a43b092f --- /dev/null +++ b/llama_stack/distribution/utils/image_types.py @@ -0,0 +1,13 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from enum import Enum + + +class ImageType(Enum): + container = "container" + conda = "conda" + venv = "venv" diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 1c21df57f..560215b25 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -30,8 +30,10 @@ from llama_stack.apis.agents import ( AgentTurnResponseStepProgressPayload, AgentTurnResponseStepStartPayload, AgentTurnResponseStreamChunk, + AgentTurnResponseTurnAwaitingInputPayload, AgentTurnResponseTurnCompletePayload, AgentTurnResponseTurnStartPayload, + AgentTurnResumeRequest, Attachment, Document, InferenceStep, @@ -60,9 +62,13 @@ from llama_stack.apis.inference import ( UserMessage, ) from llama_stack.apis.safety import Safety -from llama_stack.apis.tools import RAGDocument, RAGQueryConfig, ToolGroups, ToolRuntime +from llama_stack.apis.tools import RAGDocument, RAGQueryConfig, ToolGroups, ToolInvocationResult, ToolRuntime from llama_stack.apis.vector_io import VectorIO -from llama_stack.models.llama.datatypes import BuiltinTool, ToolCall, ToolParamDefinition +from llama_stack.models.llama.datatypes import ( + BuiltinTool, + ToolCall, + ToolParamDefinition, +) from llama_stack.providers.utils.kvstore import KVStore from llama_stack.providers.utils.memory.vector_store import concat_interleaved_content from llama_stack.providers.utils.telemetry import tracing @@ -151,6 +157,15 @@ class ChatAgent(ShieldRunnerMixin): async def create_session(self, name: str) -> str: return await self.storage.create_session(name) + async def get_messages_from_turns(self, turns: List[Turn]) -> List[Message]: + messages = [] + if self.agent_config.instructions != "": + messages.append(SystemMessage(content=self.agent_config.instructions)) + + for turn in turns: + messages.extend(self.turn_to_messages(turn)) + return messages + async def create_and_execute_turn(self, request: AgentTurnCreateRequest) -> AsyncGenerator: with tracing.span("create_and_execute_turn") as span: span.set_attribute("session_id", request.session_id) @@ -163,14 +178,7 @@ class ChatAgent(ShieldRunnerMixin): raise ValueError(f"Session {request.session_id} not found") turns = await 
self.storage.get_session_turns(request.session_id) - - messages = [] - if self.agent_config.instructions != "": - messages.append(SystemMessage(content=self.agent_config.instructions)) - - for i, turn in enumerate(turns): - messages.extend(self.turn_to_messages(turn)) - + messages = await self.get_messages_from_turns(turns) messages.extend(request.messages) turn_id = str(uuid.uuid4()) @@ -222,13 +230,136 @@ class ChatAgent(ShieldRunnerMixin): ) await self.storage.add_turn_to_session(request.session_id, turn) - chunk = AgentTurnResponseStreamChunk( + if output_message.tool_calls and request.allow_turn_resume: + chunk = AgentTurnResponseStreamChunk( + event=AgentTurnResponseEvent( + payload=AgentTurnResponseTurnAwaitingInputPayload( + turn=turn, + ) + ) + ) + else: + chunk = AgentTurnResponseStreamChunk( + event=AgentTurnResponseEvent( + payload=AgentTurnResponseTurnCompletePayload( + turn=turn, + ) + ) + ) + + yield chunk + + async def resume_turn(self, request: AgentTurnResumeRequest) -> AsyncGenerator: + with tracing.span("resume_turn") as span: + span.set_attribute("agent_id", self.agent_id) + span.set_attribute("session_id", request.session_id) + span.set_attribute("turn_id", request.turn_id) + span.set_attribute("request", request.model_dump_json()) + assert request.stream is True, "Non-streaming not supported" + + session_info = await self.storage.get_session_info(request.session_id) + if session_info is None: + raise ValueError(f"Session {request.session_id} not found") + + turns = await self.storage.get_session_turns(request.session_id) + messages = await self.get_messages_from_turns(turns) + messages.extend(request.tool_responses) + + last_turn_messages = [ + x for x in messages if isinstance(x, UserMessage) or isinstance(x, ToolResponseMessage) + ] + + # get the steps from the turn id + steps = [] + if len(turns) > 0: + steps = turns[-1].steps + + # mark tool execution step as complete + # if there's no tool execution in progress step (due to storage, or tool call parsing on client), + # we'll create a new tool execution step with current time + in_progress_tool_call_step = await self.storage.get_in_progress_tool_call_step( + request.session_id, request.turn_id + ) + now = datetime.now() + tool_execution_step = ToolExecutionStep( + step_id=(in_progress_tool_call_step.step_id if in_progress_tool_call_step else str(uuid.uuid4())), + turn_id=request.turn_id, + tool_calls=(in_progress_tool_call_step.tool_calls if in_progress_tool_call_step else []), + tool_responses=[ + ToolResponse( + call_id=x.call_id, + tool_name=x.tool_name, + content=x.content, + ) + for x in request.tool_responses + ], + completed_at=now, + started_at=(in_progress_tool_call_step.started_at if in_progress_tool_call_step else now), + ) + steps.append(tool_execution_step) + yield AgentTurnResponseStreamChunk( event=AgentTurnResponseEvent( - payload=AgentTurnResponseTurnCompletePayload( - turn=turn, + payload=AgentTurnResponseStepCompletePayload( + step_type=StepType.tool_execution.value, + step_id=tool_execution_step.step_id, + step_details=tool_execution_step, ) ) ) + + output_message = None + async for chunk in self.run( + session_id=request.session_id, + turn_id=request.turn_id, + input_messages=messages, + sampling_params=self.agent_config.sampling_params, + stream=request.stream, + ): + if isinstance(chunk, CompletionMessage): + output_message = chunk + continue + + assert isinstance(chunk, AgentTurnResponseStreamChunk), f"Unexpected type {type(chunk)}" + event = chunk.event + if event.payload.event_type == 
AgentTurnResponseEventType.step_complete.value: + steps.append(event.payload.step_details) + + yield chunk + + assert output_message is not None + + last_turn_start_time = datetime.now() + if len(turns) > 0: + last_turn_start_time = turns[-1].started_at + + turn = Turn( + turn_id=request.turn_id, + session_id=request.session_id, + input_messages=last_turn_messages, + output_message=output_message, + started_at=last_turn_start_time, + completed_at=datetime.now(), + steps=steps, + ) + await self.storage.add_turn_to_session(request.session_id, turn) + + if output_message.tool_calls: + chunk = AgentTurnResponseStreamChunk( + event=AgentTurnResponseEvent( + payload=AgentTurnResponseTurnAwaitingInputPayload( + turn=turn, + ) + ) + ) + else: + chunk = AgentTurnResponseStreamChunk( + event=AgentTurnResponseEvent( + payload=AgentTurnResponseTurnCompletePayload( + turn=turn, + ) + ) + ) + yield chunk async def run( @@ -456,6 +587,7 @@ class ChatAgent(ShieldRunnerMixin): call_id="", tool_name=MEMORY_QUERY_TOOL, content=retrieved_context or [], + metadata=result.metadata, ) ], ), @@ -611,11 +743,7 @@ class ChatAgent(ShieldRunnerMixin): input_messages = input_messages + [message] else: log.info(f"{str(message)}") - tool_call = message.tool_calls[0] - if tool_call.tool_name in client_tools: - yield message - return - + # 1. Start the tool execution step and progress step_id = str(uuid.uuid4()) yield AgentTurnResponseStreamChunk( event=AgentTurnResponseEvent( @@ -625,6 +753,7 @@ class ChatAgent(ShieldRunnerMixin): ) ) ) + tool_call = message.tool_calls[0] yield AgentTurnResponseStreamChunk( event=AgentTurnResponseEvent( payload=AgentTurnResponseStepProgressPayload( @@ -639,6 +768,23 @@ class ChatAgent(ShieldRunnerMixin): ) ) + # If tool is a client tool, yield CompletionMessage and return + if tool_call.tool_name in client_tools: + await self.storage.set_in_progress_tool_call_step( + session_id, + turn_id, + ToolExecutionStep( + step_id=step_id, + turn_id=turn_id, + tool_calls=[tool_call], + tool_responses=[], + started_at=datetime.now(), + ), + ) + yield message + return + + # If tool is a builtin server tool, execute it tool_name = tool_call.tool_name if isinstance(tool_name, BuiltinTool): tool_name = tool_name.value @@ -650,13 +796,21 @@ class ChatAgent(ShieldRunnerMixin): }, ) as span: tool_execution_start_time = datetime.now() - result_messages = await execute_tool_call_maybe( + tool_call = message.tool_calls[0] + tool_result = await execute_tool_call_maybe( self.tool_runtime_api, session_id, - [message], + tool_call, toolgroup_args, tool_to_group, ) + result_messages = [ + ToolResponseMessage( + call_id=tool_call.call_id, + tool_name=tool_call.tool_name, + content=tool_result.content, + ) + ] assert len(result_messages) == 1, "Currently not supporting multiple messages" result_message = result_messages[0] span.set_attribute("output", result_message.model_dump_json()) @@ -675,6 +829,7 @@ class ChatAgent(ShieldRunnerMixin): call_id=result_message.call_id, tool_name=result_message.tool_name, content=result_message.content, + metadata=tool_result.metadata, ) ], started_at=tool_execution_start_time, @@ -913,19 +1068,10 @@ async def attachment_message(tempdir: str, urls: List[URL]) -> ToolResponseMessa async def execute_tool_call_maybe( tool_runtime_api: ToolRuntime, session_id: str, - messages: List[CompletionMessage], + tool_call: ToolCall, toolgroup_args: Dict[str, Dict[str, Any]], tool_to_group: Dict[str, str], -) -> List[ToolResponseMessage]: - # While Tools.run interface takes a list of messages, - 
# All tools currently only run on a single message - # When this changes, we can drop this assert - # Whether to call tools on each message and aggregate - # or aggregate and call tool once, reamins to be seen. - assert len(messages) == 1, "Expected single message" - message = messages[0] - - tool_call = message.tool_calls[0] +) -> ToolInvocationResult: name = tool_call.tool_name group_name = tool_to_group.get(name, None) if group_name is None: @@ -946,14 +1092,7 @@ async def execute_tool_call_maybe( **tool_call_args, ), ) - - return [ - ToolResponseMessage( - call_id=tool_call.call_id, - tool_name=tool_call.tool_name, - content=result.content, - ) - ] + return result def _interpret_content_as_attachment( diff --git a/llama_stack/providers/inline/agents/meta_reference/agents.py b/llama_stack/providers/inline/agents/meta_reference/agents.py index e3c18d112..72c1a0f34 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agents.py +++ b/llama_stack/providers/inline/agents/meta_reference/agents.py @@ -11,8 +11,6 @@ import tempfile import uuid from typing import AsyncGenerator, List, Optional, Union -from termcolor import colored - from llama_stack.apis.agents import ( AgentConfig, AgentCreateResponse, @@ -21,6 +19,7 @@ from llama_stack.apis.agents import ( AgentStepResponse, AgentToolGroup, AgentTurnCreateRequest, + AgentTurnResumeRequest, Document, Session, Turn, @@ -68,12 +67,7 @@ class MetaReferenceAgentsImpl(Agents): # check if "bwrap" is available if not shutil.which("bwrap"): - print( - colored( - "Warning: `bwrap` is not available. Code interpreter tool will not work correctly.", - "yellow", - ) - ) + logger.warning("Warning: `bwrap` is not available. Code interpreter tool will not work correctly.") async def create_agent( self, @@ -146,6 +140,7 @@ class MetaReferenceAgentsImpl(Agents): documents: Optional[List[Document]] = None, stream: Optional[bool] = False, tool_config: Optional[ToolConfig] = None, + allow_turn_resume: Optional[bool] = False, ) -> AsyncGenerator: request = AgentTurnCreateRequest( agent_id=agent_id, @@ -155,6 +150,7 @@ class MetaReferenceAgentsImpl(Agents): toolgroups=toolgroups, documents=documents, tool_config=tool_config, + allow_turn_resume=allow_turn_resume, ) if stream: return self._create_agent_turn_streaming(request) @@ -169,6 +165,34 @@ class MetaReferenceAgentsImpl(Agents): async for event in agent.create_and_execute_turn(request): yield event + async def resume_agent_turn( + self, + agent_id: str, + session_id: str, + turn_id: str, + tool_responses: List[ToolResponseMessage], + stream: Optional[bool] = False, + ) -> AsyncGenerator: + request = AgentTurnResumeRequest( + agent_id=agent_id, + session_id=session_id, + turn_id=turn_id, + tool_responses=tool_responses, + stream=stream, + ) + if stream: + return self._continue_agent_turn_streaming(request) + else: + raise NotImplementedError("Non-streaming agent turns not yet implemented") + + async def _continue_agent_turn_streaming( + self, + request: AgentTurnResumeRequest, + ) -> AsyncGenerator: + agent = await self.get_agent(request.agent_id) + async for event in agent.resume_turn(request): + yield event + async def get_agents_turn(self, agent_id: str, session_id: str, turn_id: str) -> Turn: turn = await self.persistence_store.get(f"session:{agent_id}:{session_id}:{turn_id}") turn = json.loads(turn) diff --git a/llama_stack/providers/inline/agents/meta_reference/persistence.py b/llama_stack/providers/inline/agents/meta_reference/persistence.py index 4b8ad6d4a..3c3866873 100644 --- 
a/llama_stack/providers/inline/agents/meta_reference/persistence.py +++ b/llama_stack/providers/inline/agents/meta_reference/persistence.py @@ -12,7 +12,7 @@ from typing import List, Optional from pydantic import BaseModel -from llama_stack.apis.agents import Turn +from llama_stack.apis.agents import ToolExecutionStep, Turn from llama_stack.providers.utils.kvstore import KVStore log = logging.getLogger(__name__) @@ -84,3 +84,15 @@ class AgentPersistence: continue turns.sort(key=lambda x: (x.completed_at or datetime.min)) return turns + + async def set_in_progress_tool_call_step(self, session_id: str, turn_id: str, step: ToolExecutionStep): + await self.kvstore.set( + key=f"in_progress_tool_call_step:{self.agent_id}:{session_id}:{turn_id}", + value=step.model_dump_json(), + ) + + async def get_in_progress_tool_call_step(self, session_id: str, turn_id: str) -> Optional[ToolExecutionStep]: + value = await self.kvstore.get( + key=f"in_progress_tool_call_step:{self.agent_id}:{session_id}:{turn_id}", + ) + return ToolExecutionStep(**json.loads(value)) if value else None diff --git a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py index 6a83836e6..bfb09af53 100644 --- a/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py +++ b/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py @@ -44,7 +44,6 @@ class SentenceTransformersInferenceImpl( pass async def register_model(self, model: Model) -> None: - _ = self._load_sentence_transformer_model(model.provider_resource_id) return model async def unregister_model(self, model_id: str) -> None: diff --git a/llama_stack/providers/inline/inference/vllm/vllm.py b/llama_stack/providers/inline/inference/vllm/vllm.py index 5b0df91e7..d03ea933a 100644 --- a/llama_stack/providers/inline/inference/vllm/vllm.py +++ b/llama_stack/providers/inline/inference/vllm/vllm.py @@ -22,11 +22,14 @@ from llama_stack.apis.inference import ( CompletionResponse, CompletionResponseStreamChunk, EmbeddingsResponse, + EmbeddingTaskType, Inference, + InterleavedContentItem, LogProbConfig, Message, ResponseFormat, SamplingParams, + TextTruncation, ToolChoice, ToolConfig, ToolDefinition, @@ -230,5 +233,12 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate): async for chunk in process_chat_completion_stream_response(stream, request): yield chunk - async def embeddings(self, model_id: str, contents: List[InterleavedContent]) -> EmbeddingsResponse: + async def embeddings( + self, + model_id: str, + contents: List[str] | List[InterleavedContentItem], + text_truncation: Optional[TextTruncation] = TextTruncation.none, + output_dimension: Optional[int] = None, + task_type: Optional[EmbeddingTaskType] = None, + ) -> EmbeddingsResponse: raise NotImplementedError() diff --git a/llama_stack/providers/inline/tool_runtime/rag/memory.py b/llama_stack/providers/inline/tool_runtime/rag/memory.py index a6cd57923..306bd78a6 100644 --- a/llama_stack/providers/inline/tool_runtime/rag/memory.py +++ b/llama_stack/providers/inline/tool_runtime/rag/memory.py @@ -119,10 +119,10 @@ class MemoryToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, RAGToolRuntime): # sort by score chunks, scores = zip(*sorted(zip(chunks, scores, strict=False), key=lambda x: x[1], reverse=True), strict=False) - + chunks = chunks[: query_config.max_chunks] tokens = 0 picked = [] - for c in chunks[: query_config.max_chunks]: + for c in 
chunks: metadata = c.metadata tokens += metadata["token_count"] if tokens > query_config.max_tokens_in_context: @@ -146,6 +146,9 @@ class MemoryToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, RAGToolRuntime): text="\n=== END-RETRIEVED-CONTEXT ===\n", ), ], + metadata={ + "document_ids": [c.metadata["document_id"] for c in chunks[: len(picked)]], + }, ) async def list_runtime_tools( diff --git a/llama_stack/providers/inline/vector_io/sqlite_vec/config.py b/llama_stack/providers/inline/vector_io/sqlite_vec/config.py index 5a830ff27..e5e3581c6 100644 --- a/llama_stack/providers/inline/vector_io/sqlite_vec/config.py +++ b/llama_stack/providers/inline/vector_io/sqlite_vec/config.py @@ -4,26 +4,16 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -# config.py from typing import Any, Dict from pydantic import BaseModel -from llama_stack.providers.utils.kvstore.config import ( - KVStoreConfig, - SqliteKVStoreConfig, -) - class SQLiteVectorIOConfig(BaseModel): db_path: str - kvstore: KVStoreConfig @classmethod def sample_run_config(cls, __distro_dir__: str) -> Dict[str, Any]: return { - "kvstore": SqliteKVStoreConfig.sample_run_config( - __distro_dir__=__distro_dir__, - db_name="sqlite_vec.db", - ) + "db_path": "${env.SQLITE_STORE_DIR:~/.llama/" + __distro_dir__ + "}/" + "sqlite_vec.db", } diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py index 346a2bd73..b0402f6a5 100644 --- a/llama_stack/providers/registry/inference.py +++ b/llama_stack/providers/registry/inference.py @@ -61,7 +61,10 @@ def available_providers() -> List[ProviderSpec]: InlineProviderSpec( api=Api.inference, provider_type="inline::sentence-transformers", - pip_packages=["sentence-transformers"], + pip_packages=[ + "torch torchvision --index-url https://download.pytorch.org/whl/cpu", + "sentence-transformers --no-deps", + ], module="llama_stack.providers.inline.inference.sentence_transformers", config_class="llama_stack.providers.inline.inference.sentence_transformers.config.SentenceTransformersInferenceConfig", ), diff --git a/llama_stack/providers/registry/tool_runtime.py b/llama_stack/providers/registry/tool_runtime.py index 33d880f30..95ea2dcf9 100644 --- a/llama_stack/providers/registry/tool_runtime.py +++ b/llama_stack/providers/registry/tool_runtime.py @@ -20,7 +20,18 @@ def available_providers() -> List[ProviderSpec]: InlineProviderSpec( api=Api.tool_runtime, provider_type="inline::rag-runtime", - pip_packages=[], + pip_packages=[ + "blobfile", + "chardet", + "pypdf", + "tqdm", + "numpy", + "scikit-learn", + "scipy", + "nltk", + "sentencepiece", + "transformers", + ], module="llama_stack.providers.inline.tool_runtime.rag", config_class="llama_stack.providers.inline.tool_runtime.rag.config.RagToolRuntimeConfig", api_dependencies=[Api.vector_io, Api.inference], diff --git a/llama_stack/providers/registry/vector_io.py b/llama_stack/providers/registry/vector_io.py index 88a65397a..ff4f9caf5 100644 --- a/llama_stack/providers/registry/vector_io.py +++ b/llama_stack/providers/registry/vector_io.py @@ -14,33 +14,13 @@ from llama_stack.providers.datatypes import ( remote_provider_spec, ) -EMBEDDING_DEPS = [ - "blobfile", - "chardet", - "pypdf", - "tqdm", - "numpy", - "scikit-learn", - "scipy", - "nltk", - "sentencepiece", - "transformers", - # this happens to work because special dependencies are always installed last - # so if there was a regular torch installed first, this would be ignored - # we need 
a better way to do this to identify potential conflicts, etc. - # for now, this lets us significantly reduce the size of the container which - # does not have any "local" inference code (and hence does not need GPU-enabled torch) - "torch torchvision --index-url https://download.pytorch.org/whl/cpu", - "sentence-transformers --no-deps", -] - def available_providers() -> List[ProviderSpec]: return [ InlineProviderSpec( api=Api.vector_io, provider_type="inline::meta-reference", - pip_packages=EMBEDDING_DEPS + ["faiss-cpu"], + pip_packages=["faiss-cpu"], module="llama_stack.providers.inline.vector_io.faiss", config_class="llama_stack.providers.inline.vector_io.faiss.FaissVectorIOConfig", deprecation_warning="Please use the `inline::faiss` provider instead.", @@ -49,24 +29,33 @@ def available_providers() -> List[ProviderSpec]: InlineProviderSpec( api=Api.vector_io, provider_type="inline::faiss", - pip_packages=EMBEDDING_DEPS + ["faiss-cpu"], + pip_packages=["faiss-cpu"], module="llama_stack.providers.inline.vector_io.faiss", config_class="llama_stack.providers.inline.vector_io.faiss.FaissVectorIOConfig", api_dependencies=[Api.inference], ), InlineProviderSpec( api=Api.vector_io, - provider_type="inline::sqlite_vec", - pip_packages=EMBEDDING_DEPS + ["sqlite-vec"], + provider_type="inline::sqlite-vec", + pip_packages=["sqlite-vec"], module="llama_stack.providers.inline.vector_io.sqlite_vec", config_class="llama_stack.providers.inline.vector_io.sqlite_vec.SQLiteVectorIOConfig", api_dependencies=[Api.inference], ), + InlineProviderSpec( + api=Api.vector_io, + provider_type="inline::sqlite_vec", + pip_packages=["sqlite-vec"], + module="llama_stack.providers.inline.vector_io.sqlite_vec", + config_class="llama_stack.providers.inline.vector_io.sqlite_vec.SQLiteVectorIOConfig", + deprecation_warning="Please use the `inline::sqlite-vec` provider (notice the hyphen instead of underscore) instead.", + api_dependencies=[Api.inference], + ), remote_provider_spec( Api.vector_io, AdapterSpec( adapter_type="chromadb", - pip_packages=EMBEDDING_DEPS + ["chromadb-client"], + pip_packages=["chromadb-client"], module="llama_stack.providers.remote.vector_io.chroma", config_class="llama_stack.providers.remote.vector_io.chroma.ChromaVectorIOConfig", ), @@ -75,7 +64,7 @@ def available_providers() -> List[ProviderSpec]: InlineProviderSpec( api=Api.vector_io, provider_type="inline::chromadb", - pip_packages=EMBEDDING_DEPS + ["chromadb"], + pip_packages=["chromadb"], module="llama_stack.providers.inline.vector_io.chroma", config_class="llama_stack.providers.inline.vector_io.chroma.ChromaVectorIOConfig", api_dependencies=[Api.inference], @@ -84,7 +73,7 @@ def available_providers() -> List[ProviderSpec]: Api.vector_io, AdapterSpec( adapter_type="pgvector", - pip_packages=EMBEDDING_DEPS + ["psycopg2-binary"], + pip_packages=["psycopg2-binary"], module="llama_stack.providers.remote.vector_io.pgvector", config_class="llama_stack.providers.remote.vector_io.pgvector.PGVectorVectorIOConfig", ), @@ -94,7 +83,7 @@ def available_providers() -> List[ProviderSpec]: Api.vector_io, AdapterSpec( adapter_type="weaviate", - pip_packages=EMBEDDING_DEPS + ["weaviate-client"], + pip_packages=["weaviate-client"], module="llama_stack.providers.remote.vector_io.weaviate", config_class="llama_stack.providers.remote.vector_io.weaviate.WeaviateVectorIOConfig", provider_data_validator="llama_stack.providers.remote.vector_io.weaviate.WeaviateRequestProviderData", @@ -115,7 +104,7 @@ def available_providers() -> List[ProviderSpec]: Api.vector_io, 
AdapterSpec( adapter_type="qdrant", - pip_packages=EMBEDDING_DEPS + ["qdrant-client"], + pip_packages=["qdrant-client"], module="llama_stack.providers.remote.vector_io.qdrant", config_class="llama_stack.providers.remote.vector_io.qdrant.QdrantVectorIOConfig", ), diff --git a/llama_stack/providers/remote/inference/bedrock/bedrock.py b/llama_stack/providers/remote/inference/bedrock/bedrock.py index 9c5a291db..b82a4c752 100644 --- a/llama_stack/providers/remote/inference/bedrock/bedrock.py +++ b/llama_stack/providers/remote/inference/bedrock/bedrock.py @@ -9,17 +9,22 @@ from typing import AsyncGenerator, AsyncIterator, Dict, List, Optional, Union from botocore.client import BaseClient -from llama_stack.apis.common.content_types import InterleavedContent +from llama_stack.apis.common.content_types import ( + InterleavedContent, + InterleavedContentItem, +) from llama_stack.apis.inference import ( ChatCompletionRequest, ChatCompletionResponse, ChatCompletionResponseStreamChunk, EmbeddingsResponse, + EmbeddingTaskType, Inference, LogProbConfig, Message, ResponseFormat, SamplingParams, + TextTruncation, ToolChoice, ToolConfig, ToolDefinition, @@ -162,7 +167,10 @@ class BedrockInferenceAdapter(ModelRegistryHelper, Inference): async def embeddings( self, model_id: str, - contents: List[InterleavedContent], + contents: List[str] | List[InterleavedContentItem], + text_truncation: Optional[TextTruncation] = TextTruncation.none, + output_dimension: Optional[int] = None, + task_type: Optional[EmbeddingTaskType] = None, ) -> EmbeddingsResponse: model = await self.model_store.get_model(model_id) embeddings = [] diff --git a/llama_stack/providers/remote/inference/cerebras/cerebras.py b/llama_stack/providers/remote/inference/cerebras/cerebras.py index 0a27d81d7..4deeea630 100644 --- a/llama_stack/providers/remote/inference/cerebras/cerebras.py +++ b/llama_stack/providers/remote/inference/cerebras/cerebras.py @@ -8,17 +8,22 @@ from typing import AsyncGenerator, List, Optional, Union from cerebras.cloud.sdk import AsyncCerebras -from llama_stack.apis.common.content_types import InterleavedContent +from llama_stack.apis.common.content_types import ( + InterleavedContent, + InterleavedContentItem, +) from llama_stack.apis.inference import ( ChatCompletionRequest, CompletionRequest, CompletionResponse, EmbeddingsResponse, + EmbeddingTaskType, Inference, LogProbConfig, Message, ResponseFormat, SamplingParams, + TextTruncation, ToolChoice, ToolConfig, ToolDefinition, @@ -172,6 +177,9 @@ class CerebrasInferenceAdapter(ModelRegistryHelper, Inference): async def embeddings( self, model_id: str, - contents: List[InterleavedContent], + contents: List[str] | List[InterleavedContentItem], + text_truncation: Optional[TextTruncation] = TextTruncation.none, + output_dimension: Optional[int] = None, + task_type: Optional[EmbeddingTaskType] = None, ) -> EmbeddingsResponse: raise NotImplementedError() diff --git a/llama_stack/providers/remote/inference/databricks/databricks.py b/llama_stack/providers/remote/inference/databricks/databricks.py index de13638f5..75751c8b1 100644 --- a/llama_stack/providers/remote/inference/databricks/databricks.py +++ b/llama_stack/providers/remote/inference/databricks/databricks.py @@ -8,16 +8,21 @@ from typing import AsyncGenerator, List, Optional from openai import OpenAI -from llama_stack.apis.common.content_types import InterleavedContent +from llama_stack.apis.common.content_types import ( + InterleavedContent, + InterleavedContentItem, +) from llama_stack.apis.inference import ( 
ChatCompletionRequest, ChatCompletionResponse, EmbeddingsResponse, + EmbeddingTaskType, Inference, LogProbConfig, Message, ResponseFormat, SamplingParams, + TextTruncation, ToolChoice, ToolDefinition, ToolPromptFormat, @@ -130,7 +135,10 @@ class DatabricksInferenceAdapter(ModelRegistryHelper, Inference): async def embeddings( self, - model: str, - contents: List[InterleavedContent], + model_id: str, + contents: List[str] | List[InterleavedContentItem], + text_truncation: Optional[TextTruncation] = TextTruncation.none, + output_dimension: Optional[int] = None, + task_type: Optional[EmbeddingTaskType] = None, ) -> EmbeddingsResponse: raise NotImplementedError() diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py index 3f455da3c..90fe70cbf 100644 --- a/llama_stack/providers/remote/inference/fireworks/fireworks.py +++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py @@ -8,19 +8,24 @@ from typing import AsyncGenerator, List, Optional, Union from fireworks.client import Fireworks -from llama_stack.apis.common.content_types import InterleavedContent +from llama_stack.apis.common.content_types import ( + InterleavedContent, + InterleavedContentItem, +) from llama_stack.apis.inference import ( ChatCompletionRequest, ChatCompletionResponse, CompletionRequest, CompletionResponse, EmbeddingsResponse, + EmbeddingTaskType, Inference, LogProbConfig, Message, ResponseFormat, ResponseFormatType, SamplingParams, + TextTruncation, ToolChoice, ToolConfig, ToolDefinition, @@ -204,15 +209,14 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv input_dict = {} media_present = request_has_media(request) + llama_model = self.get_llama_model(request.model) if isinstance(request, ChatCompletionRequest): - if media_present: + if media_present or not llama_model: input_dict["messages"] = [ await convert_message_to_openai_dict(m, download=True) for m in request.messages ] else: - input_dict["prompt"] = await chat_completion_request_to_prompt( - request, self.get_llama_model(request.model) - ) + input_dict["prompt"] = await chat_completion_request_to_prompt(request, llama_model) else: assert not media_present, "Fireworks does not support media for Completion requests" input_dict["prompt"] = await completion_request_to_prompt(request) @@ -232,7 +236,10 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv async def embeddings( self, model_id: str, - contents: List[InterleavedContent], + contents: List[str] | List[InterleavedContentItem], + text_truncation: Optional[TextTruncation] = TextTruncation.none, + output_dimension: Optional[int] = None, + task_type: Optional[EmbeddingTaskType] = None, ) -> EmbeddingsResponse: model = await self.model_store.get_model(model_id) diff --git a/llama_stack/providers/remote/inference/groq/groq.py b/llama_stack/providers/remote/inference/groq/groq.py index c75e92dfe..45c15a467 100644 --- a/llama_stack/providers/remote/inference/groq/groq.py +++ b/llama_stack/providers/remote/inference/groq/groq.py @@ -17,16 +17,23 @@ from llama_stack.apis.inference import ( CompletionResponse, CompletionResponseStreamChunk, EmbeddingsResponse, + EmbeddingTaskType, Inference, InterleavedContent, + InterleavedContentItem, LogProbConfig, Message, ResponseFormat, + TextTruncation, ToolChoice, ToolConfig, ) from llama_stack.distribution.request_headers import NeedsRequestProviderData -from llama_stack.models.llama.datatypes import SamplingParams, 
ToolDefinition, ToolPromptFormat +from llama_stack.models.llama.datatypes import ( + SamplingParams, + ToolDefinition, + ToolPromptFormat, +) from llama_stack.models.llama.sku_list import CoreModelId from llama_stack.providers.remote.inference.groq.config import GroqConfig from llama_stack.providers.utils.inference.model_registry import ( @@ -140,7 +147,10 @@ class GroqInferenceAdapter(Inference, ModelRegistryHelper, NeedsRequestProviderD async def embeddings( self, model_id: str, - contents: List[InterleavedContent], + contents: List[str] | List[InterleavedContentItem], + text_truncation: Optional[TextTruncation] = TextTruncation.none, + output_dimension: Optional[int] = None, + task_type: Optional[EmbeddingTaskType] = None, ) -> EmbeddingsResponse: raise NotImplementedError() diff --git a/llama_stack/providers/remote/inference/nvidia/models.py b/llama_stack/providers/remote/inference/nvidia/models.py index c432861ee..4305f4c6f 100644 --- a/llama_stack/providers/remote/inference/nvidia/models.py +++ b/llama_stack/providers/remote/inference/nvidia/models.py @@ -4,8 +4,10 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from llama_stack.apis.models import ModelType from llama_stack.models.llama.datatypes import CoreModelId from llama_stack.providers.utils.inference.model_registry import ( + ProviderModelEntry, build_hf_repo_model_entry, ) @@ -46,6 +48,14 @@ _MODEL_ENTRIES = [ "meta/llama-3.2-90b-vision-instruct", CoreModelId.llama3_2_90b_vision_instruct.value, ), + ProviderModelEntry( + provider_model_id="baai/bge-m3", + model_type=ModelType.embedding, + metadata={ + "embedding_dimension": 1024, + "context_length": 8192, + }, + ), # TODO(mf): how do we handle Nemotron models? 
# "Llama3.1-Nemotron-51B-Instruct" -> "meta/llama-3.1-nemotron-51b-instruct", ] diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py index 824389577..ecd53e91c 100644 --- a/llama_stack/providers/remote/inference/nvidia/nvidia.py +++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py @@ -10,6 +10,11 @@ from typing import AsyncIterator, List, Optional, Union from openai import APIConnectionError, AsyncOpenAI +from llama_stack.apis.common.content_types import ( + InterleavedContent, + InterleavedContentItem, + TextContentItem, +) from llama_stack.apis.inference import ( ChatCompletionRequest, ChatCompletionResponse, @@ -18,15 +23,20 @@ from llama_stack.apis.inference import ( CompletionResponse, CompletionResponseStreamChunk, EmbeddingsResponse, + EmbeddingTaskType, Inference, - InterleavedContent, LogProbConfig, Message, ResponseFormat, + TextTruncation, ToolChoice, ToolConfig, ) -from llama_stack.models.llama.datatypes import SamplingParams, ToolDefinition, ToolPromptFormat +from llama_stack.models.llama.datatypes import ( + SamplingParams, + ToolDefinition, + ToolPromptFormat, +) from llama_stack.providers.utils.inference.model_registry import ( ModelRegistryHelper, ) @@ -117,9 +127,41 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper): async def embeddings( self, model_id: str, - contents: List[InterleavedContent], + contents: List[str] | List[InterleavedContentItem], + text_truncation: Optional[TextTruncation] = TextTruncation.none, + output_dimension: Optional[int] = None, + task_type: Optional[EmbeddingTaskType] = None, ) -> EmbeddingsResponse: - raise NotImplementedError() + if any(content_has_media(content) for content in contents): + raise NotImplementedError("Media is not supported") + + # + # Llama Stack: contents = List[str] | List[InterleavedContentItem] + # -> + # OpenAI: input = str | List[str] + # + # we can ignore str and always pass List[str] to OpenAI + # + flat_contents = [ + item.text if isinstance(item, TextContentItem) else item + for content in contents + for item in (content if isinstance(content, list) else [content]) + ] + input = [content.text if isinstance(content, TextContentItem) else content for content in flat_contents] + model = self.get_provider_model_id(model_id) + + response = await self._client.embeddings.create( + model=model, + input=input, + # extra_body={"input_type": "passage"|"query"}, # TODO(mf): how to tell caller's intent? + ) + + # + # OpenAI: CreateEmbeddingResponse(data=[Embedding(embedding=List[float], ...)], ...) 
+ # -> + # Llama Stack: EmbeddingsResponse(embeddings=List[List[float]]) + # + return EmbeddingsResponse(embeddings=[embedding.embedding for embedding in response.data]) async def chat_completion( self, diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py index 1dbcbc294..6fcfd2e99 100644 --- a/llama_stack/providers/remote/inference/ollama/ollama.py +++ b/llama_stack/providers/remote/inference/ollama/ollama.py @@ -13,6 +13,7 @@ from ollama import AsyncClient from llama_stack.apis.common.content_types import ( ImageContentItem, InterleavedContent, + InterleavedContentItem, TextContentItem, ) from llama_stack.apis.inference import ( @@ -20,11 +21,13 @@ from llama_stack.apis.inference import ( ChatCompletionResponse, CompletionRequest, EmbeddingsResponse, + EmbeddingTaskType, Inference, LogProbConfig, Message, ResponseFormat, SamplingParams, + TextTruncation, ToolChoice, ToolConfig, ToolDefinition, @@ -175,8 +178,9 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate): input_dict = {} media_present = request_has_media(request) + llama_model = self.register_helper.get_llama_model(request.model) if isinstance(request, ChatCompletionRequest): - if media_present: + if media_present or not llama_model: contents = [await convert_message_to_openai_dict_for_ollama(m) for m in request.messages] # flatten the list of lists input_dict["messages"] = [item for sublist in contents for item in sublist] @@ -184,7 +188,7 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate): input_dict["raw"] = True input_dict["prompt"] = await chat_completion_request_to_prompt( request, - self.register_helper.get_llama_model(request.model), + llama_model, ) else: assert not media_present, "Ollama does not support media for Completion requests" @@ -258,7 +262,10 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate): async def embeddings( self, model_id: str, - contents: List[InterleavedContent], + contents: List[str] | List[InterleavedContentItem], + text_truncation: Optional[TextTruncation] = TextTruncation.none, + output_dimension: Optional[int] = None, + task_type: Optional[EmbeddingTaskType] = None, ) -> EmbeddingsResponse: model = await self.model_store.get_model(model_id) @@ -274,7 +281,10 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate): return EmbeddingsResponse(embeddings=embeddings) async def register_model(self, model: Model) -> Model: + model = await self.register_helper.register_model(model) if model.model_type == ModelType.embedding: + log.info(f"Pulling embedding model `{model.provider_resource_id}` if necessary...") + await self.client.pull(model.provider_resource_id) response = await self.client.list() else: response = await self.client.ps() @@ -284,7 +294,7 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate): f"Model '{model.provider_resource_id}' is not available in Ollama. 
Available models: {', '.join(available_models)}" ) - return await self.register_helper.register_model(model) + return model async def convert_message_to_openai_dict_for_ollama(message: Message) -> List[dict]: diff --git a/llama_stack/providers/remote/inference/passthrough/passthrough.py b/llama_stack/providers/remote/inference/passthrough/passthrough.py index a34c34f69..11da6bb9e 100644 --- a/llama_stack/providers/remote/inference/passthrough/passthrough.py +++ b/llama_stack/providers/remote/inference/passthrough/passthrough.py @@ -11,11 +11,13 @@ from llama_stack_client import LlamaStackClient from llama_stack.apis.common.content_types import InterleavedContent from llama_stack.apis.inference import ( EmbeddingsResponse, + EmbeddingTaskType, Inference, LogProbConfig, Message, ResponseFormat, SamplingParams, + TextTruncation, ToolChoice, ToolConfig, ToolDefinition, @@ -138,6 +140,9 @@ class PassthroughInferenceAdapter(Inference): self, model_id: str, contents: List[InterleavedContent], + text_truncation: Optional[TextTruncation] = TextTruncation.none, + output_dimension: Optional[int] = None, + task_type: Optional[EmbeddingTaskType] = None, ) -> EmbeddingsResponse: client = self._get_client() model = await self.model_store.get_model(model_id) @@ -145,4 +150,7 @@ class PassthroughInferenceAdapter(Inference): return client.inference.embeddings( model_id=model.provider_resource_id, contents=contents, + text_truncation=text_truncation, + output_dimension=output_dimension, + task_type=task_type, ) diff --git a/llama_stack/providers/remote/inference/runpod/runpod.py b/llama_stack/providers/remote/inference/runpod/runpod.py index 09122a8e6..bd620aa64 100644 --- a/llama_stack/providers/remote/inference/runpod/runpod.py +++ b/llama_stack/providers/remote/inference/runpod/runpod.py @@ -69,9 +69,10 @@ class RunpodInferenceAdapter(ModelRegistryHelper, Inference): response_format: Optional[ResponseFormat] = None, tools: Optional[List[ToolDefinition]] = None, tool_choice: Optional[ToolChoice] = ToolChoice.auto, - tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json, + tool_prompt_format: Optional[ToolPromptFormat] = None, stream: Optional[bool] = False, logprobs: Optional[LogProbConfig] = None, + tool_config: Optional[ToolConfig] = None, ) -> AsyncGenerator: request = ChatCompletionRequest( model=model, @@ -119,6 +120,9 @@ class RunpodInferenceAdapter(ModelRegistryHelper, Inference): async def embeddings( self, model: str, - contents: List[InterleavedContent], + contents: List[str] | List[InterleavedContentItem], + text_truncation: Optional[TextTruncation] = TextTruncation.none, + output_dimension: Optional[int] = None, + task_type: Optional[EmbeddingTaskType] = None, ) -> EmbeddingsResponse: raise NotImplementedError() diff --git a/llama_stack/providers/remote/inference/sambanova/sambanova.py b/llama_stack/providers/remote/inference/sambanova/sambanova.py index c05284d7d..57a296258 100644 --- a/llama_stack/providers/remote/inference/sambanova/sambanova.py +++ b/llama_stack/providers/remote/inference/sambanova/sambanova.py @@ -5,16 +5,38 @@ # the root directory of this source tree. 
import json -from typing import AsyncGenerator +from typing import AsyncGenerator, List, Optional from openai import OpenAI from llama_stack.apis.common.content_types import ( ImageContentItem, InterleavedContent, + InterleavedContentItem, TextContentItem, ) -from llama_stack.apis.inference import * # noqa: F403 +from llama_stack.apis.inference import ( + ChatCompletionRequest, + ChatCompletionResponse, + CompletionMessage, + EmbeddingsResponse, + EmbeddingTaskType, + Inference, + LogProbConfig, + Message, + ResponseFormat, + SamplingParams, + StopReason, + SystemMessage, + TextTruncation, + ToolCall, + ToolChoice, + ToolConfig, + ToolDefinition, + ToolPromptFormat, + ToolResponseMessage, + UserMessage, +) from llama_stack.models.llama.datatypes import ( GreedySamplingStrategy, TopKSamplingStrategy, @@ -119,7 +141,10 @@ class SambaNovaInferenceAdapter(ModelRegistryHelper, Inference): async def embeddings( self, model_id: str, - contents: List[InterleavedContent], + contents: List[str] | List[InterleavedContentItem], + text_truncation: Optional[TextTruncation] = TextTruncation.none, + output_dimension: Optional[int] = None, + task_type: Optional[EmbeddingTaskType] = None, ) -> EmbeddingsResponse: raise NotImplementedError() diff --git a/llama_stack/providers/remote/inference/tgi/tgi.py b/llama_stack/providers/remote/inference/tgi/tgi.py index 1a50e3b61..d09ca241f 100644 --- a/llama_stack/providers/remote/inference/tgi/tgi.py +++ b/llama_stack/providers/remote/inference/tgi/tgi.py @@ -10,18 +10,23 @@ from typing import AsyncGenerator, List, Optional from huggingface_hub import AsyncInferenceClient, HfApi -from llama_stack.apis.common.content_types import InterleavedContent +from llama_stack.apis.common.content_types import ( + InterleavedContent, + InterleavedContentItem, +) from llama_stack.apis.inference import ( ChatCompletionRequest, ChatCompletionResponse, CompletionRequest, EmbeddingsResponse, + EmbeddingTaskType, Inference, LogProbConfig, Message, ResponseFormat, ResponseFormatType, SamplingParams, + TextTruncation, ToolChoice, ToolConfig, ToolDefinition, @@ -268,7 +273,10 @@ class _HfAdapter(Inference, ModelsProtocolPrivate): async def embeddings( self, model_id: str, - contents: List[InterleavedContent], + contents: List[str] | List[InterleavedContentItem], + text_truncation: Optional[TextTruncation] = TextTruncation.none, + output_dimension: Optional[int] = None, + task_type: Optional[EmbeddingTaskType] = None, ) -> EmbeddingsResponse: raise NotImplementedError() diff --git a/llama_stack/providers/remote/inference/together/together.py b/llama_stack/providers/remote/inference/together/together.py index 8afd3e85b..040f04e77 100644 --- a/llama_stack/providers/remote/inference/together/together.py +++ b/llama_stack/providers/remote/inference/together/together.py @@ -8,18 +8,23 @@ from typing import AsyncGenerator, List, Optional, Union from together import Together -from llama_stack.apis.common.content_types import InterleavedContent +from llama_stack.apis.common.content_types import ( + InterleavedContent, + InterleavedContentItem, +) from llama_stack.apis.inference import ( ChatCompletionRequest, ChatCompletionResponse, CompletionRequest, EmbeddingsResponse, + EmbeddingTaskType, Inference, LogProbConfig, Message, ResponseFormat, ResponseFormatType, SamplingParams, + TextTruncation, ToolChoice, ToolConfig, ToolDefinition, @@ -198,13 +203,12 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi async def _get_params(self, request: 
Union[ChatCompletionRequest, CompletionRequest]) -> dict: input_dict = {} media_present = request_has_media(request) + llama_model = self.get_llama_model(request.model) if isinstance(request, ChatCompletionRequest): - if media_present: + if media_present or not llama_model: input_dict["messages"] = [await convert_message_to_openai_dict(m) for m in request.messages] else: - input_dict["prompt"] = await chat_completion_request_to_prompt( - request, self.get_llama_model(request.model) - ) + input_dict["prompt"] = await chat_completion_request_to_prompt(request, llama_model) else: assert not media_present, "Together does not support media for Completion requests" input_dict["prompt"] = await completion_request_to_prompt(request) @@ -219,7 +223,10 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi async def embeddings( self, model_id: str, - contents: List[InterleavedContent], + contents: List[str] | List[InterleavedContentItem], + text_truncation: Optional[TextTruncation] = TextTruncation.none, + output_dimension: Optional[int] = None, + task_type: Optional[EmbeddingTaskType] = None, ) -> EmbeddingsResponse: model = await self.model_store.get_model(model_id) assert all(not content_has_media(content) for content in contents), ( diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py index b37b0d305..b9422d85d 100644 --- a/llama_stack/providers/remote/inference/vllm/vllm.py +++ b/llama_stack/providers/remote/inference/vllm/vllm.py @@ -10,7 +10,13 @@ from typing import AsyncGenerator, List, Optional, Union from llama_models.datatypes import StopReason, ToolCall from openai import OpenAI -from llama_stack.apis.common.content_types import InterleavedContent, TextDelta, ToolCallDelta, ToolCallParseStatus +from llama_stack.apis.common.content_types import ( + InterleavedContent, + InterleavedContentItem, + TextDelta, + ToolCallDelta, + ToolCallParseStatus, +) from llama_stack.apis.inference import ( ChatCompletionRequest, ChatCompletionResponse, @@ -22,18 +28,21 @@ from llama_stack.apis.inference import ( CompletionResponse, CompletionResponseStreamChunk, EmbeddingsResponse, + EmbeddingTaskType, Inference, LogProbConfig, Message, ResponseFormat, ResponseFormatType, SamplingParams, + TextTruncation, ToolChoice, ToolConfig, ToolDefinition, ToolPromptFormat, ) from llama_stack.apis.models import Model, ModelType +from llama_stack.models.llama.datatypes import BuiltinTool from llama_stack.models.llama.sku_list import all_registered_models from llama_stack.providers.datatypes import ModelsProtocolPrivate from llama_stack.providers.utils.inference.model_registry import ( @@ -112,10 +121,16 @@ def _convert_to_vllm_tools_in_request(tools: List[ToolDefinition]) -> List[dict] if tool_param.required: compat_required.append(tool_key) + # The tool.tool_name can be a str or a BuiltinTool enum. If + # it's the latter, convert to a string. 
+ tool_name = tool.tool_name + if isinstance(tool_name, BuiltinTool): + tool_name = tool_name.value + compat_tool = { "type": "function", "function": { - "name": tool.tool_name, + "name": tool_name, "description": tool.description, "parameters": { "type": "object", @@ -369,7 +384,10 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): async def embeddings( self, model_id: str, - contents: List[InterleavedContent], + contents: List[str] | List[InterleavedContentItem], + text_truncation: Optional[TextTruncation] = TextTruncation.none, + output_dimension: Optional[int] = None, + task_type: Optional[EmbeddingTaskType] = None, ) -> EmbeddingsResponse: model = await self.model_store.get_model(model_id) diff --git a/llama_stack/providers/tests/inference/test_prompt_adapter.py b/llama_stack/providers/tests/inference/test_prompt_adapter.py index 323c6cb6a..2a6dbb561 100644 --- a/llama_stack/providers/tests/inference/test_prompt_adapter.py +++ b/llama_stack/providers/tests/inference/test_prompt_adapter.py @@ -8,7 +8,10 @@ import unittest from llama_stack.apis.inference import ( ChatCompletionRequest, + CompletionMessage, + StopReason, SystemMessage, + ToolCall, ToolConfig, UserMessage, ) @@ -20,6 +23,7 @@ from llama_stack.models.llama.datatypes import ( ) from llama_stack.providers.utils.inference.prompt_adapter import ( chat_completion_request_to_messages, + chat_completion_request_to_prompt, ) MODEL = "Llama3.1-8B-Instruct" @@ -119,6 +123,46 @@ class PrepareMessagesTests(unittest.IsolatedAsyncioTestCase): self.assertTrue("Return function calls in JSON format" in messages[1].content) self.assertEqual(messages[-1].content, content) + async def test_completion_message_encoding(self): + request = ChatCompletionRequest( + model=MODEL3_2, + messages=[ + UserMessage(content="hello"), + CompletionMessage( + content="", + stop_reason=StopReason.end_of_turn, + tool_calls=[ + ToolCall( + tool_name="custom1", + arguments={"param1": "value1"}, + call_id="123", + ) + ], + ), + ], + tools=[ + ToolDefinition( + tool_name="custom1", + description="custom1 tool", + parameters={ + "param1": ToolParamDefinition( + param_type="str", + description="param1 description", + required=True, + ), + }, + ), + ], + tool_config=ToolConfig(tool_prompt_format=ToolPromptFormat.python_list), + ) + prompt = await chat_completion_request_to_prompt(request, request.model) + self.assertIn('[custom1(param1="value1")]', prompt) + + request.model = MODEL + request.tool_config.tool_prompt_format = ToolPromptFormat.json + prompt = await chat_completion_request_to_prompt(request, request.model) + self.assertIn('{"type": "function", "name": "custom1", "parameters": {"param1": "value1"}}', prompt) + async def test_user_provided_system_message(self): content = "Hello !" 
system_prompt = "You are a pirate" diff --git a/llama_stack/providers/tests/vector_io/fixtures.py b/llama_stack/providers/tests/vector_io/fixtures.py index 1797d47a5..c29717a27 100644 --- a/llama_stack/providers/tests/vector_io/fixtures.py +++ b/llama_stack/providers/tests/vector_io/fixtures.py @@ -61,7 +61,7 @@ def vector_io_sqlite_vec() -> ProviderFixture: providers=[ Provider( provider_id="sqlite_vec", - provider_type="inline::sqlite_vec", + provider_type="inline::sqlite-vec", config=SQLiteVectorIOConfig( kvstore=SqliteKVStoreConfig(db_path=temp_file.name).model_dump(), ).model_dump(), diff --git a/llama_stack/providers/utils/inference/embedding_mixin.py b/llama_stack/providers/utils/inference/embedding_mixin.py index a84c2eecb..f43475554 100644 --- a/llama_stack/providers/utils/inference/embedding_mixin.py +++ b/llama_stack/providers/utils/inference/embedding_mixin.py @@ -5,13 +5,16 @@ # the root directory of this source tree. import logging -from typing import List +from typing import List, Optional from llama_stack.apis.inference import ( EmbeddingsResponse, - InterleavedContent, + EmbeddingTaskType, + InterleavedContentItem, ModelStore, + TextTruncation, ) +from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str EMBEDDING_MODELS = {} @@ -25,11 +28,16 @@ class SentenceTransformerEmbeddingMixin: async def embeddings( self, model_id: str, - contents: List[InterleavedContent], + contents: List[str] | List[InterleavedContentItem], + text_truncation: Optional[TextTruncation] = TextTruncation.none, + output_dimension: Optional[int] = None, + task_type: Optional[EmbeddingTaskType] = None, ) -> EmbeddingsResponse: model = await self.model_store.get_model(model_id) embedding_model = self._load_sentence_transformer_model(model.provider_resource_id) - embeddings = embedding_model.encode(contents) + embeddings = embedding_model.encode( + [interleaved_content_as_str(content) for content in contents], show_progress_bar=False + ) return EmbeddingsResponse(embeddings=embeddings) def _load_sentence_transformer_model(self, model: str) -> "SentenceTransformer": diff --git a/llama_stack/providers/utils/inference/model_registry.py b/llama_stack/providers/utils/inference/model_registry.py index 0882019e3..d9e24662a 100644 --- a/llama_stack/providers/utils/inference/model_registry.py +++ b/llama_stack/providers/utils/inference/model_registry.py @@ -79,28 +79,28 @@ class ModelRegistryHelper(ModelsProtocolPrivate): provider_resource_id = model.provider_resource_id else: provider_resource_id = self.get_provider_model_id(model.provider_resource_id) + if provider_resource_id: model.provider_resource_id = provider_resource_id else: - if model.metadata.get("llama_model") is None: - raise ValueError( - f"Model '{model.provider_resource_id}' is not available and no llama_model was specified in metadata. 
" - "Please specify a llama_model in metadata or use a supported model identifier" - ) + llama_model = model.metadata.get("llama_model") + if llama_model is None: + return model + existing_llama_model = self.get_llama_model(model.provider_resource_id) if existing_llama_model: - if existing_llama_model != model.metadata["llama_model"]: + if existing_llama_model != llama_model: raise ValueError( f"Provider model id '{model.provider_resource_id}' is already registered to a different llama model: '{existing_llama_model}'" ) else: - if model.metadata["llama_model"] not in ALL_HUGGINGFACE_REPOS_TO_MODEL_DESCRIPTOR: + if llama_model not in ALL_HUGGINGFACE_REPOS_TO_MODEL_DESCRIPTOR: raise ValueError( - f"Invalid llama_model '{model.metadata['llama_model']}' specified in metadata. " + f"Invalid llama_model '{llama_model}' specified in metadata. " f"Must be one of: {', '.join(ALL_HUGGINGFACE_REPOS_TO_MODEL_DESCRIPTOR.keys())}" ) self.provider_id_to_llama_model_map[model.provider_resource_id] = ( - ALL_HUGGINGFACE_REPOS_TO_MODEL_DESCRIPTOR[model.metadata["llama_model"]] + ALL_HUGGINGFACE_REPOS_TO_MODEL_DESCRIPTOR[llama_model] ) return model diff --git a/llama_stack/providers/utils/inference/prompt_adapter.py b/llama_stack/providers/utils/inference/prompt_adapter.py index 10fe442e8..ca6fe04fd 100644 --- a/llama_stack/providers/utils/inference/prompt_adapter.py +++ b/llama_stack/providers/utils/inference/prompt_adapter.py @@ -252,7 +252,9 @@ async def chat_completion_request_to_prompt(request: ChatCompletionRequest, llam request = await convert_request_to_raw(request) formatter = ChatFormat(tokenizer=Tokenizer.get_instance()) - model_input = formatter.encode_dialog_prompt(request.messages) + model_input = formatter.encode_dialog_prompt( + request.messages, tool_prompt_format=request.tool_config.tool_prompt_format + ) return formatter.tokenizer.decode(model_input.tokens) @@ -264,7 +266,9 @@ async def chat_completion_request_to_model_input_info( request = await convert_request_to_raw(request) formatter = ChatFormat(tokenizer=Tokenizer.get_instance()) - model_input = formatter.encode_dialog_prompt(request.messages) + model_input = formatter.encode_dialog_prompt( + request.messages, tool_prompt_format=request.tool_config.tool_prompt_format + ) return ( formatter.tokenizer.decode(model_input.tokens), len(model_input.tokens), diff --git a/llama_stack/schema_utils.py b/llama_stack/schema_utils.py index 581404844..ad92338e6 100644 --- a/llama_stack/schema_utils.py +++ b/llama_stack/schema_utils.py @@ -5,12 +5,10 @@ # the root directory of this source tree. 
from dataclasses import dataclass -from typing import Any, Callable, List, Optional, TypeVar +from typing import Any, Callable, List, Optional, Protocol, TypeVar from .strong_typing.schema import json_schema_type, register_schema # noqa: F401 -T = TypeVar("T") - @dataclass class WebMethod: @@ -22,6 +20,13 @@ class WebMethod: raw_bytes_request_body: Optional[bool] = False +class HasWebMethod(Protocol): + __webmethod__: WebMethod + + +T = TypeVar("T", bound=HasWebMethod) # Bound T to classes that match this protocol + + def webmethod( route: Optional[str] = None, method: Optional[str] = None, diff --git a/llama_stack/scripts/distro_codegen.py b/llama_stack/scripts/distro_codegen.py index 1c44b4625..76c7283eb 100644 --- a/llama_stack/scripts/distro_codegen.py +++ b/llama_stack/scripts/distro_codegen.py @@ -11,7 +11,7 @@ import subprocess import sys from functools import partial from pathlib import Path -from typing import Iterator +from typing import Iterable from rich.progress import Progress, SpinnerColumn, TextColumn @@ -39,7 +39,7 @@ class ChangedPathTracker: return self._changed_paths -def find_template_dirs(templates_dir: Path) -> Iterator[Path]: +def find_template_dirs(templates_dir: Path) -> Iterable[Path]: """Find immediate subdirectories in the templates folder.""" if not templates_dir.exists(): raise FileNotFoundError(f"Templates directory not found: {templates_dir}") @@ -90,7 +90,7 @@ def check_for_changes(change_tracker: ChangedPathTracker) -> bool: return has_changes -def collect_template_dependencies(template_dir: Path) -> tuple[str, list[str]]: +def collect_template_dependencies(template_dir: Path) -> tuple[str | None, list[str]]: try: module_name = f"llama_stack.templates.{template_dir.name}" module = importlib.import_module(module_name) diff --git a/llama_stack/scripts/run_client_sdk_tests.py b/llama_stack/scripts/run_client_sdk_tests.py index 1e2ef1ac8..6aaeb3273 100644 --- a/llama_stack/scripts/run_client_sdk_tests.py +++ b/llama_stack/scripts/run_client_sdk_tests.py @@ -52,7 +52,7 @@ def main(parser: argparse.ArgumentParser): pytest_args, "-s", "-v", - REPO_ROOT / CLIENT_SDK_TESTS_RELATIVE_PATH, + str(REPO_ROOT / CLIENT_SDK_TESTS_RELATIVE_PATH), ] ) diff --git a/llama_stack/templates/cerebras/build.yaml b/llama_stack/templates/cerebras/build.yaml index 9d5ab1a52..ef6c43212 100644 --- a/llama_stack/templates/cerebras/build.yaml +++ b/llama_stack/templates/cerebras/build.yaml @@ -4,6 +4,7 @@ distribution_spec: providers: inference: - remote::cerebras + - inline::sentence-transformers safety: - inline::llama-guard vector_io: diff --git a/llama_stack/templates/cerebras/cerebras.py b/llama_stack/templates/cerebras/cerebras.py index c467579ac..544a50c03 100644 --- a/llama_stack/templates/cerebras/cerebras.py +++ b/llama_stack/templates/cerebras/cerebras.py @@ -20,7 +20,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin def get_distribution_template() -> DistributionTemplate: providers = { - "inference": ["remote::cerebras"], + "inference": ["remote::cerebras", "inline::sentence-transformers"], "safety": ["inline::llama-guard"], "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], "agents": ["inline::meta-reference"], diff --git a/llama_stack/templates/dell/build.yaml b/llama_stack/templates/dell/build.yaml index e2edb9386..05b98d56f 100644 --- a/llama_stack/templates/dell/build.yaml +++ b/llama_stack/templates/dell/build.yaml @@ -5,6 +5,7 @@ distribution_spec: providers: inference: - remote::tgi + - 
inline::sentence-transformers vector_io: - inline::faiss - remote::chromadb diff --git a/llama_stack/templates/dell/dell.py b/llama_stack/templates/dell/dell.py index 116fbd285..8348beafd 100644 --- a/llama_stack/templates/dell/dell.py +++ b/llama_stack/templates/dell/dell.py @@ -20,7 +20,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin def get_distribution_template() -> DistributionTemplate: providers = { - "inference": ["remote::tgi"], + "inference": ["remote::tgi", "inline::sentence-transformers"], "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], diff --git a/llama_stack/templates/fireworks/build.yaml b/llama_stack/templates/fireworks/build.yaml index cdd60ec2a..a9c472c53 100644 --- a/llama_stack/templates/fireworks/build.yaml +++ b/llama_stack/templates/fireworks/build.yaml @@ -4,6 +4,7 @@ distribution_spec: providers: inference: - remote::fireworks + - inline::sentence-transformers vector_io: - inline::faiss - remote::chromadb diff --git a/llama_stack/templates/fireworks/fireworks.py b/llama_stack/templates/fireworks/fireworks.py index 06b851551..4457296b0 100644 --- a/llama_stack/templates/fireworks/fireworks.py +++ b/llama_stack/templates/fireworks/fireworks.py @@ -25,7 +25,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin def get_distribution_template() -> DistributionTemplate: providers = { - "inference": ["remote::fireworks"], + "inference": ["remote::fireworks", "inline::sentence-transformers"], "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], diff --git a/llama_stack/templates/hf-serverless/build.yaml b/llama_stack/templates/hf-serverless/build.yaml index f9303cfab..c0cc1e2c2 100644 --- a/llama_stack/templates/hf-serverless/build.yaml +++ b/llama_stack/templates/hf-serverless/build.yaml @@ -4,6 +4,7 @@ distribution_spec: providers: inference: - remote::hf::serverless + - inline::sentence-transformers vector_io: - inline::faiss - remote::chromadb diff --git a/llama_stack/templates/hf-serverless/hf_serverless.py b/llama_stack/templates/hf-serverless/hf_serverless.py index 46efb6f0b..af04e39d4 100644 --- a/llama_stack/templates/hf-serverless/hf_serverless.py +++ b/llama_stack/templates/hf-serverless/hf_serverless.py @@ -21,7 +21,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin def get_distribution_template() -> DistributionTemplate: providers = { - "inference": ["remote::hf::serverless"], + "inference": ["remote::hf::serverless", "inline::sentence-transformers"], "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], diff --git a/llama_stack/templates/nvidia/nvidia.py b/llama_stack/templates/nvidia/nvidia.py index 1181706e1..98fd2747c 100644 --- a/llama_stack/templates/nvidia/nvidia.py +++ b/llama_stack/templates/nvidia/nvidia.py @@ -55,9 +55,11 @@ def get_distribution_template() -> DistributionTemplate: core_model_to_hf_repo = {m.descriptor(): m.huggingface_repo for m in all_registered_models()} default_models = [ ModelInput( - model_id=core_model_to_hf_repo[m.llama_model], + model_id=core_model_to_hf_repo[m.llama_model] if m.llama_model else m.provider_model_id, provider_model_id=m.provider_model_id, provider_id="nvidia", + model_type=m.model_type, + metadata=m.metadata, ) for m in 
_MODEL_ENTRIES ] diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml index 14fb28354..4c38ec24e 100644 --- a/llama_stack/templates/nvidia/run.yaml +++ b/llama_stack/templates/nvidia/run.yaml @@ -135,6 +135,13 @@ models: provider_id: nvidia provider_model_id: meta/llama-3.2-90b-vision-instruct model_type: llm +- metadata: + embedding_dimension: 1024 + context_length: 8192 + model_id: baai/bge-m3 + provider_id: nvidia + provider_model_id: baai/bge-m3 + model_type: embedding shields: [] vector_dbs: [] datasets: [] diff --git a/llama_stack/templates/ollama/build.yaml b/llama_stack/templates/ollama/build.yaml index 0fee6808c..52a50b38a 100644 --- a/llama_stack/templates/ollama/build.yaml +++ b/llama_stack/templates/ollama/build.yaml @@ -5,7 +5,7 @@ distribution_spec: inference: - remote::ollama vector_io: - - inline::faiss + - inline::sqlite-vec - remote::chromadb - remote::pgvector safety: diff --git a/llama_stack/templates/ollama/ollama.py b/llama_stack/templates/ollama/ollama.py index 31119e040..4f644c270 100644 --- a/llama_stack/templates/ollama/ollama.py +++ b/llama_stack/templates/ollama/ollama.py @@ -13,10 +13,6 @@ from llama_stack.distribution.datatypes import ( ShieldInput, ToolGroupInput, ) -from llama_stack.providers.inline.inference.sentence_transformers import ( - SentenceTransformersInferenceConfig, -) -from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig from llama_stack.providers.inline.vector_io.sqlite_vec.config import SQLiteVectorIOConfig from llama_stack.providers.remote.inference.ollama import OllamaImplConfig from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -25,7 +21,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin def get_distribution_template() -> DistributionTemplate: providers = { "inference": ["remote::ollama"], - "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "vector_io": ["inline::sqlite-vec", "remote::chromadb", "remote::pgvector"], "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], @@ -45,19 +41,9 @@ def get_distribution_template() -> DistributionTemplate: provider_type="remote::ollama", config=OllamaImplConfig.sample_run_config(), ) - embedding_provider = Provider( - provider_id="sentence-transformers", - provider_type="inline::sentence-transformers", - config=SentenceTransformersInferenceConfig.sample_run_config(), - ) - vector_io_provider_faiss = Provider( - provider_id="faiss", - provider_type="inline::faiss", - config=FaissVectorIOConfig.sample_run_config(f"distributions/{name}"), - ) vector_io_provider_sqlite = Provider( - provider_id="sqlite_vec", - provider_type="inline::sqlite_vec", + provider_id="sqlite-vec", + provider_type="inline::sqlite-vec", config=SQLiteVectorIOConfig.sample_run_config(f"distributions/{name}"), ) @@ -104,19 +90,16 @@ def get_distribution_template() -> DistributionTemplate: run_configs={ "run.yaml": RunConfigSettings( provider_overrides={ - "inference": [inference_provider, embedding_provider], - "vector_io": [vector_io_provider_faiss, vector_io_provider_sqlite], + "inference": [inference_provider], + "vector_io": [vector_io_provider_sqlite], }, - default_models=[inference_model, embedding_model], + default_models=[inference_model], default_tool_groups=default_tool_groups, ), "run-with-safety.yaml": RunConfigSettings( provider_overrides={ - "inference": [ - inference_provider, - embedding_provider, - ], 
- "vector_io": [vector_io_provider_faiss, vector_io_provider_faiss], + "inference": [inference_provider], + "vector_io": [vector_io_provider_sqlite], "safety": [ Provider( provider_id="llama-guard", diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml index 4ce64cf59..9f9a32d79 100644 --- a/llama_stack/templates/ollama/run-with-safety.yaml +++ b/llama_stack/templates/ollama/run-with-safety.yaml @@ -16,24 +16,11 @@ providers: provider_type: remote::ollama config: url: ${env.OLLAMA_URL:http://localhost:11434} - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} vector_io: - - provider_id: faiss - provider_type: inline::faiss + - provider_id: sqlite-vec + provider_type: inline::sqlite-vec config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/faiss_store.db - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/faiss_store.db + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/sqlite_vec.db safety: - provider_id: llama-guard provider_type: inline::llama-guard diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml index b4982f8e2..d78f0838f 100644 --- a/llama_stack/templates/ollama/run.yaml +++ b/llama_stack/templates/ollama/run.yaml @@ -16,24 +16,11 @@ providers: provider_type: remote::ollama config: url: ${env.OLLAMA_URL:http://localhost:11434} - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} vector_io: - - provider_id: faiss - provider_type: inline::faiss + - provider_id: sqlite-vec + provider_type: inline::sqlite-vec config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/faiss_store.db - - provider_id: sqlite_vec - provider_type: inline::sqlite_vec - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/sqlite_vec.db + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/sqlite_vec.db safety: - provider_id: llama-guard provider_type: inline::llama-guard @@ -100,11 +87,14 @@ models: model_id: ${env.INFERENCE_MODEL} provider_id: ollama model_type: llm +<<<<<<< HEAD - metadata: embedding_dimension: 384 model_id: all-MiniLM-L6-v2 provider_id: sentence-transformers model_type: embedding +======= +>>>>>>> upstream/main shields: [] vector_dbs: [] datasets: [] diff --git a/llama_stack/templates/remote-vllm/build.yaml b/llama_stack/templates/remote-vllm/build.yaml index 74d9f32d9..ccb328c1c 100644 --- a/llama_stack/templates/remote-vllm/build.yaml +++ b/llama_stack/templates/remote-vllm/build.yaml @@ -4,6 +4,7 @@ distribution_spec: providers: inference: - remote::vllm + - inline::sentence-transformers vector_io: - inline::faiss - remote::chromadb diff --git a/llama_stack/templates/remote-vllm/vllm.py b/llama_stack/templates/remote-vllm/vllm.py index 40a2d541d..10d291456 100644 --- a/llama_stack/templates/remote-vllm/vllm.py +++ b/llama_stack/templates/remote-vllm/vllm.py @@ -23,7 +23,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin def get_distribution_template() -> DistributionTemplate: providers = { - "inference": ["remote::vllm"], + "inference": ["remote::vllm", "inline::sentence-transformers"], "vector_io": ["inline::faiss", "remote::chromadb", 
"remote::pgvector"], "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], diff --git a/llama_stack/templates/tgi/build.yaml b/llama_stack/templates/tgi/build.yaml index 8bc628158..9fe79647c 100644 --- a/llama_stack/templates/tgi/build.yaml +++ b/llama_stack/templates/tgi/build.yaml @@ -4,6 +4,7 @@ distribution_spec: providers: inference: - remote::tgi + - inline::sentence-transformers vector_io: - inline::faiss - remote::chromadb diff --git a/llama_stack/templates/tgi/tgi.py b/llama_stack/templates/tgi/tgi.py index 71718a93d..9b80414f9 100644 --- a/llama_stack/templates/tgi/tgi.py +++ b/llama_stack/templates/tgi/tgi.py @@ -23,7 +23,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin def get_distribution_template() -> DistributionTemplate: providers = { - "inference": ["remote::tgi"], + "inference": ["remote::tgi", "inline::sentence-transformers"], "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], diff --git a/llama_stack/templates/together/build.yaml b/llama_stack/templates/together/build.yaml index 90ee5bcee..a8a6de28d 100644 --- a/llama_stack/templates/together/build.yaml +++ b/llama_stack/templates/together/build.yaml @@ -4,6 +4,7 @@ distribution_spec: providers: inference: - remote::together + - inline::sentence-transformers vector_io: - inline::faiss - remote::chromadb diff --git a/llama_stack/templates/together/together.py b/llama_stack/templates/together/together.py index d275b7238..8d0e2353c 100644 --- a/llama_stack/templates/together/together.py +++ b/llama_stack/templates/together/together.py @@ -25,7 +25,7 @@ from llama_stack.templates.template import DistributionTemplate, RunConfigSettin def get_distribution_template() -> DistributionTemplate: providers = { - "inference": ["remote::together"], + "inference": ["remote::together", "inline::sentence-transformers"], "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], diff --git a/llama_stack/templates/vllm-gpu/build.yaml b/llama_stack/templates/vllm-gpu/build.yaml index d24046613..8eb44dc1b 100644 --- a/llama_stack/templates/vllm-gpu/build.yaml +++ b/llama_stack/templates/vllm-gpu/build.yaml @@ -4,6 +4,7 @@ distribution_spec: providers: inference: - inline::vllm + - inline::sentence-transformers vector_io: - inline::faiss - remote::chromadb diff --git a/llama_stack/templates/vllm-gpu/vllm.py b/llama_stack/templates/vllm-gpu/vllm.py index 31900687b..8cdec589e 100644 --- a/llama_stack/templates/vllm-gpu/vllm.py +++ b/llama_stack/templates/vllm-gpu/vllm.py @@ -20,7 +20,7 @@ from llama_stack.templates.template import ( def get_distribution_template() -> DistributionTemplate: providers = { - "inference": ["inline::vllm"], + "inference": ["inline::vllm", "inline::sentence-transformers"], "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], diff --git a/pyproject.toml b/pyproject.toml index c8ed5737b..2bad04163 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -158,3 +158,26 @@ ignore = [ "B007", "B008", ] + +[tool.mypy] +mypy_path = ["llama_stack"] +packages = ["llama_stack"] +disable_error_code = [] +warn_return_any = true +# # honor excludes by not following there through imports +follow_imports = "silent" +exclude = [ + # As we fix more and more of these, we should remove them from the list + 
"llama_stack/providers", + "llama_stack/distribution", + "llama_stack/apis", + "llama_stack/cli", + "llama_stack/models", + "llama_stack/strong_typing", + "llama_stack/templates", +] + +[[tool.mypy.overrides]] +# packages that lack typing annotations, do not have stubs, or are unavailable. +module = ["llama_models.*", "yaml", "fire"] +ignore_missing_imports = true diff --git a/requirements.txt b/requirements.txt index 02e1a8655..014db083a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,7 +16,7 @@ fsspec==2025.2.0 h11==0.14.0 httpcore==1.0.7 httpx==0.28.1 -huggingface-hub==0.28.1 +huggingface-hub==0.29.0 idna==3.10 jinja2==3.1.5 jsonschema==4.23.0 diff --git a/tests/client-sdk/agents/test_agents.py b/tests/client-sdk/agents/test_agents.py index e5380d357..e5606b50b 100644 --- a/tests/client-sdk/agents/test_agents.py +++ b/tests/client-sdk/agents/test_agents.py @@ -19,8 +19,12 @@ from llama_stack_client.types.shared.completion_message import CompletionMessage from llama_stack_client.types.shared_params.agent_config import AgentConfig, ToolConfig from llama_stack_client.types.tool_def_param import Parameter -from llama_stack.apis.agents.agents import AgentConfig as Server__AgentConfig -from llama_stack.apis.agents.agents import ToolChoice +from llama_stack.apis.agents.agents import ( + AgentConfig as Server__AgentConfig, +) +from llama_stack.apis.agents.agents import ( + ToolChoice, +) class TestClientTool(ClientTool): @@ -86,7 +90,6 @@ class TestClientTool(ClientTool): def agent_config(llama_stack_client, text_model_id): available_shields = [shield.identifier for shield in llama_stack_client.shields.list()] available_shields = available_shields[:1] - print(f"Using shield: {available_shields}") agent_config = AgentConfig( model=text_model_id, instructions="You are a helpful assistant", @@ -329,8 +332,11 @@ def test_tool_choice(llama_stack_client, agent_config): ] client_tool = TestClientTool() for tool_choice, expected_tool in data: - agent_config["tool_config"] = {"tool_choice": tool_choice} - agent_config["client_tools"] = [client_tool.get_tool_definition()] + agent_config = { + **agent_config, + "tool_config": {"tool_choice": tool_choice}, + "client_tools": [client_tool.get_tool_definition()], + } agent = Agent(llama_stack_client, agent_config, client_tools=(client_tool,)) session_id = agent.create_session(f"test-session-{uuid4()}") @@ -378,7 +384,6 @@ def xtest_override_system_message_behavior(llama_stack_client, agent_config): logs = [str(log) for log in EventLogger().log(response) if log is not None] logs_str = "".join(logs) - print(logs_str) # can't tell a joke: "I don't have a function" assert "function" in logs_str @@ -417,7 +422,6 @@ def xtest_override_system_message_behavior(llama_stack_client, agent_config): logs = [str(log) for log in EventLogger().log(response) if log is not None] logs_str = "".join(logs) - print(logs_str) assert "bicycle" in logs_str response = agent.create_turn( @@ -432,7 +436,6 @@ def xtest_override_system_message_behavior(llama_stack_client, agent_config): logs = [str(log) for log in EventLogger().log(response) if log is not None] logs_str = "".join(logs) - print(logs_str) assert "-100" in logs_str assert "get_boiling_point" in logs_str @@ -453,6 +456,7 @@ def test_rag_agent(llama_stack_client, agent_config): vector_db_id=vector_db_id, embedding_model="all-MiniLM-L6-v2", embedding_dimension=384, + provider_id="faiss", ) llama_stack_client.tool_runtime.rag_tool.insert( documents=documents, @@ -484,15 +488,17 @@ def 
test_rag_agent(llama_stack_client, agent_config): ), ] for prompt, expected_kw in user_prompts: - print(f"User> {prompt}") response = rag_agent.create_turn( messages=[{"role": "user", "content": prompt}], session_id=session_id, + stream=False, ) - logs = [str(log) for log in EventLogger().log(response) if log is not None] - logs_str = "".join(logs) - assert "Tool:query_from_memory" in logs_str - assert expected_kw in logs_str.lower() + # rag is called + tool_execution_step = next(step for step in response.steps if step.step_type == "tool_execution") + assert tool_execution_step.tool_calls[0].tool_name == "query_from_memory" + # document ids are present in metadata + assert "num-0" in tool_execution_step.tool_responses[0].metadata["document_ids"] + assert expected_kw in response.output_message.content.lower() def test_rag_and_code_agent(llama_stack_client, agent_config): @@ -548,7 +554,6 @@ def test_rag_and_code_agent(llama_stack_client, agent_config): ] for prompt, docs, tool_name in user_prompts: - print(f"User> {prompt}") session_id = agent.create_session(f"test-session-{uuid4()}") response = agent.create_turn( messages=[{"role": "user", "content": prompt}], diff --git a/tests/client-sdk/conftest.py b/tests/client-sdk/conftest.py index b397f7ab3..13dee0ba3 100644 --- a/tests/client-sdk/conftest.py +++ b/tests/client-sdk/conftest.py @@ -42,22 +42,30 @@ def pytest_addoption(parser): ) parser.addoption( "--inference-model", - action="store", default=TEXT_MODEL, help="Specify the inference model to use for testing", ) parser.addoption( "--vision-inference-model", - action="store", default=VISION_MODEL, help="Specify the vision inference model to use for testing", ) parser.addoption( "--safety-shield", - action="store", default="meta-llama/Llama-Guard-3-1B", help="Specify the safety shield model to use for testing", ) + parser.addoption( + "--embedding-model", + default=None, + help="Specify the embedding model to use for testing", + ) + parser.addoption( + "--embedding-dimension", + type=int, + default=384, + help="Output dimensionality of the embedding model to use for testing", + ) @pytest.fixture(scope="session") @@ -72,7 +80,7 @@ def provider_data(): @pytest.fixture(scope="session") -def llama_stack_client(provider_data): +def llama_stack_client(provider_data, text_model_id): if os.environ.get("LLAMA_STACK_CONFIG"): client = LlamaStackAsLibraryClient( get_env_or_fail("LLAMA_STACK_CONFIG"), @@ -89,19 +97,91 @@ def llama_stack_client(provider_data): ) else: raise ValueError("LLAMA_STACK_CONFIG or LLAMA_STACK_BASE_URL must be set") + return client +@pytest.fixture(scope="session") +def inference_provider_type(llama_stack_client): + providers = llama_stack_client.providers.list() + inference_providers = [p for p in providers if p.api == "inference"] + assert len(inference_providers) > 0, "No inference providers found" + return inference_providers[0].provider_type + + +@pytest.fixture(scope="session") +def client_with_models(llama_stack_client, text_model_id, vision_model_id, embedding_model_id, embedding_dimension): + client = llama_stack_client + + providers = [p for p in client.providers.list() if p.api == "inference"] + assert len(providers) > 0, "No inference providers found" + inference_providers = [p.provider_id for p in providers if p.provider_type != "inline::sentence-transformers"] + if text_model_id: + client.models.register(model_id=text_model_id, provider_id=inference_providers[0]) + if vision_model_id: + client.models.register(model_id=vision_model_id, 
provider_id=inference_providers[0]) + + if embedding_model_id and embedding_dimension: + # try to find a provider that supports embeddings, if sentence-transformers is not available + selected_provider = None + for p in providers: + if p.provider_type == "inline::sentence-transformers": + selected_provider = p + break + + selected_provider = selected_provider or providers[0] + client.models.register( + model_id=embedding_model_id, + provider_id=selected_provider.provider_id, + model_type="embedding", + metadata={"embedding_dimension": embedding_dimension}, + ) + return client + + +MODEL_SHORT_IDS = { + "meta-llama/Llama-3.1-8B-Instruct": "8B", + "meta-llama/Llama-3.2-11B-Vision-Instruct": "11B", + "all-MiniLM-L6-v2": "MiniLM", +} + + +def get_short_id(value): + return MODEL_SHORT_IDS.get(value, value) + + def pytest_generate_tests(metafunc): + params = [] + values = [] + id_parts = [] + if "text_model_id" in metafunc.fixturenames: - metafunc.parametrize( - "text_model_id", - [metafunc.config.getoption("--inference-model")], - scope="session", - ) + params.append("text_model_id") + val = metafunc.config.getoption("--inference-model") + values.append(val) + id_parts.append(f"txt={get_short_id(val)}") + if "vision_model_id" in metafunc.fixturenames: - metafunc.parametrize( - "vision_model_id", - [metafunc.config.getoption("--vision-inference-model")], - scope="session", - ) + params.append("vision_model_id") + val = metafunc.config.getoption("--vision-inference-model") + values.append(val) + id_parts.append(f"vis={get_short_id(val)}") + + if "embedding_model_id" in metafunc.fixturenames: + params.append("embedding_model_id") + val = metafunc.config.getoption("--embedding-model") + values.append(val) + if val is not None: + id_parts.append(f"emb={get_short_id(val)}") + + if "embedding_dimension" in metafunc.fixturenames: + params.append("embedding_dimension") + val = metafunc.config.getoption("--embedding-dimension") + values.append(val) + if val != 384: + id_parts.append(f"dim={val}") + + if params: + # Create a single test ID string + test_id = ":".join(id_parts) + metafunc.parametrize(params, [values], scope="session", ids=[test_id]) diff --git a/tests/client-sdk/inference/test_embedding.py b/tests/client-sdk/inference/test_embedding.py new file mode 100644 index 000000000..3304406a9 --- /dev/null +++ b/tests/client-sdk/inference/test_embedding.py @@ -0,0 +1,98 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
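With the parametrization consolidated in pytest_generate_tests above, the selected model options collapse into a single combined test ID; the following is a minimal standalone sketch of that ID construction, with option values taken from the defaults referenced elsewhere in this diff and shown only for illustration.
# Mirrors the id_parts logic in pytest_generate_tests for the case where all four
# fixtures are requested; values and the helper name are illustrative.
MODEL_SHORT_IDS = {
    "meta-llama/Llama-3.1-8B-Instruct": "8B",
    "meta-llama/Llama-3.2-11B-Vision-Instruct": "11B",
    "all-MiniLM-L6-v2": "MiniLM",
}

def sketch_test_id(text_model, vision_model, embedding_model=None, embedding_dimension=384):
    parts = [
        f"txt={MODEL_SHORT_IDS.get(text_model, text_model)}",
        f"vis={MODEL_SHORT_IDS.get(vision_model, vision_model)}",
    ]
    if embedding_model is not None:
        parts.append(f"emb={MODEL_SHORT_IDS.get(embedding_model, embedding_model)}")
    if embedding_dimension != 384:
        parts.append(f"dim={embedding_dimension}")
    return ":".join(parts)

# sketch_test_id("meta-llama/Llama-3.1-8B-Instruct",
#                "meta-llama/Llama-3.2-11B-Vision-Instruct",
#                "all-MiniLM-L6-v2")  -> "txt=8B:vis=11B:emb=MiniLM"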
+ + +# +# Test plan: +# +# Types of input: +# - array of a string +# - array of a image (ImageContentItem, either URL or base64 string) +# - array of a text (TextContentItem) +# Types of output: +# - list of list of floats +# +# Todo: +# - negative tests +# - empty +# - empty list +# - empty string +# - empty text +# - empty image +# - long +# - long string +# - long text +# - large image +# - appropriate combinations +# - batch size +# - many inputs +# - invalid +# - invalid URL +# - invalid base64 +# +# Notes: +# - use llama_stack_client fixture +# - use pytest.mark.parametrize when possible +# - no accuracy tests: only check the type of output, not the content +# + +import pytest +from llama_stack_client.types import EmbeddingsResponse +from llama_stack_client.types.shared.interleaved_content import ( + ImageContentItem, + ImageContentItemImage, + ImageContentItemImageURL, + TextContentItem, +) + +DUMMY_STRING = "hello" +DUMMY_STRING2 = "world" +DUMMY_TEXT = TextContentItem(text=DUMMY_STRING, type="text") +DUMMY_TEXT2 = TextContentItem(text=DUMMY_STRING2, type="text") +# TODO(mf): add a real image URL and base64 string +DUMMY_IMAGE_URL = ImageContentItem( + image=ImageContentItemImage(url=ImageContentItemImageURL(uri="https://example.com/image.jpg")), type="image" +) +DUMMY_IMAGE_BASE64 = ImageContentItem(image=ImageContentItemImage(data="base64string"), type="image") + + +@pytest.mark.parametrize( + "contents", + [ + [DUMMY_STRING, DUMMY_STRING2], + [DUMMY_TEXT, DUMMY_TEXT2], + ], + ids=[ + "list[string]", + "list[text]", + ], +) +def test_embedding_text(llama_stack_client, embedding_model_id, contents): + response = llama_stack_client.inference.embeddings(model_id=embedding_model_id, contents=contents) + assert isinstance(response, EmbeddingsResponse) + assert len(response.embeddings) == sum(len(content) if isinstance(content, list) else 1 for content in contents) + assert isinstance(response.embeddings[0], list) + assert isinstance(response.embeddings[0][0], float) + + +@pytest.mark.parametrize( + "contents", + [ + [DUMMY_IMAGE_URL, DUMMY_IMAGE_BASE64], + [DUMMY_IMAGE_URL, DUMMY_STRING, DUMMY_IMAGE_BASE64, DUMMY_TEXT], + ], + ids=[ + "list[url,base64]", + "list[url,string,base64,text]", + ], +) +@pytest.mark.skip(reason="Media is not supported") +def test_embedding_image(llama_stack_client, embedding_model_id, contents): + response = llama_stack_client.inference.embeddings(model_id=embedding_model_id, contents=contents) + assert isinstance(response, EmbeddingsResponse) + assert len(response.embeddings) == sum(len(content) if isinstance(content, list) else 1 for content in contents) + assert isinstance(response.embeddings[0], list) + assert isinstance(response.embeddings[0][0], float) diff --git a/tests/client-sdk/inference/test_text_inference.py b/tests/client-sdk/inference/test_text_inference.py index 1fe53ab86..75d932380 100644 --- a/tests/client-sdk/inference/test_text_inference.py +++ b/tests/client-sdk/inference/test_text_inference.py @@ -28,14 +28,6 @@ def provider_tool_format(inference_provider_type): ) -@pytest.fixture(scope="session") -def inference_provider_type(llama_stack_client): - providers = llama_stack_client.providers.list() - inference_providers = [p for p in providers if p.api == "inference"] - assert len(inference_providers) > 0, "No inference providers found" - return inference_providers[0].provider_type - - @pytest.fixture def get_weather_tool_definition(): return { @@ -50,8 +42,8 @@ def get_weather_tool_definition(): } -def 
test_text_completion_non_streaming(llama_stack_client, text_model_id): - response = llama_stack_client.inference.completion( +def test_text_completion_non_streaming(client_with_models, text_model_id): + response = client_with_models.inference.completion( content="Complete the sentence using one word: Roses are red, violets are ", stream=False, model_id=text_model_id, @@ -63,8 +55,8 @@ def test_text_completion_non_streaming(llama_stack_client, text_model_id): # assert "blue" in response.content.lower().strip() -def test_text_completion_streaming(llama_stack_client, text_model_id): - response = llama_stack_client.inference.completion( +def test_text_completion_streaming(client_with_models, text_model_id): + response = client_with_models.inference.completion( content="Complete the sentence using one word: Roses are red, violets are ", stream=True, model_id=text_model_id, @@ -78,11 +70,11 @@ def test_text_completion_streaming(llama_stack_client, text_model_id): assert len(content_str) > 10 -def test_completion_log_probs_non_streaming(llama_stack_client, text_model_id, inference_provider_type): +def test_completion_log_probs_non_streaming(client_with_models, text_model_id, inference_provider_type): if inference_provider_type not in PROVIDER_LOGPROBS_TOP_K: pytest.xfail(f"{inference_provider_type} doesn't support log probs yet") - response = llama_stack_client.inference.completion( + response = client_with_models.inference.completion( content="Complete the sentence: Micheael Jordan is born in ", stream=False, model_id=text_model_id, @@ -98,11 +90,11 @@ def test_completion_log_probs_non_streaming(llama_stack_client, text_model_id, i assert all(len(logprob.logprobs_by_token) == 1 for logprob in response.logprobs) -def test_completion_log_probs_streaming(llama_stack_client, text_model_id, inference_provider_type): +def test_completion_log_probs_streaming(client_with_models, text_model_id, inference_provider_type): if inference_provider_type not in PROVIDER_LOGPROBS_TOP_K: pytest.xfail(f"{inference_provider_type} doesn't support log probs yet") - response = llama_stack_client.inference.completion( + response = client_with_models.inference.completion( content="Complete the sentence: Micheael Jordan is born in ", stream=True, model_id=text_model_id, @@ -123,7 +115,7 @@ def test_completion_log_probs_streaming(llama_stack_client, text_model_id, infer @pytest.mark.parametrize("test_case", ["completion-01"]) -def test_text_completion_structured_output(llama_stack_client, text_model_id, inference_provider_type, test_case): +def test_text_completion_structured_output(client_with_models, text_model_id, test_case): class AnswerFormat(BaseModel): name: str year_born: str @@ -132,7 +124,7 @@ def test_text_completion_structured_output(llama_stack_client, text_model_id, in tc = TestCase(test_case) user_input = tc["user_input"] - response = llama_stack_client.inference.completion( + response = client_with_models.inference.completion( model_id=text_model_id, content=user_input, stream=False, @@ -161,8 +153,8 @@ def test_text_completion_structured_output(llama_stack_client, text_model_id, in ), ], ) -def test_text_chat_completion_non_streaming(llama_stack_client, text_model_id, question, expected): - response = llama_stack_client.inference.chat_completion( +def test_text_chat_completion_non_streaming(client_with_models, text_model_id, question, expected): + response = client_with_models.inference.chat_completion( model_id=text_model_id, messages=[ { @@ -184,8 +176,8 @@ def 
test_text_chat_completion_non_streaming(llama_stack_client, text_model_id, q
         ("What is the name of the US captial?", "Washington"),
     ],
 )
-def test_text_chat_completion_streaming(llama_stack_client, text_model_id, question, expected):
-    response = llama_stack_client.inference.chat_completion(
+def test_text_chat_completion_streaming(client_with_models, text_model_id, question, expected):
+    response = client_with_models.inference.chat_completion(
         model_id=text_model_id,
         messages=[{"role": "user", "content": question}],
         stream=True,
@@ -196,9 +188,9 @@ def test_text_chat_completion_streaming(llama_stack_client, text_model_id, quest


 def test_text_chat_completion_with_tool_calling_and_non_streaming(
-    llama_stack_client, text_model_id, get_weather_tool_definition, provider_tool_format
+    client_with_models, text_model_id, get_weather_tool_definition, provider_tool_format
 ):
-    response = llama_stack_client.inference.chat_completion(
+    response = client_with_models.inference.chat_completion(
         model_id=text_model_id,
         messages=[
             {"role": "system", "content": "You are a helpful assistant."},
@@ -233,9 +225,9 @@ def extract_tool_invocation_content(response):


 def test_text_chat_completion_with_tool_calling_and_streaming(
-    llama_stack_client, text_model_id, get_weather_tool_definition, provider_tool_format
+    client_with_models, text_model_id, get_weather_tool_definition, provider_tool_format
 ):
-    response = llama_stack_client.inference.chat_completion(
+    response = client_with_models.inference.chat_completion(
         model_id=text_model_id,
         messages=[
             {"role": "system", "content": "You are a helpful assistant."},
@@ -250,18 +242,23 @@ def test_text_chat_completion_with_tool_calling_and_streaming(
     assert tool_invocation_content == "[get_weather, {'location': 'San Francisco, CA'}]"


-@pytest.mark.parametrize("test_case", ["chat_completion-01"])
 def test_text_chat_completion_with_tool_choice_required(
-    llama_stack_client, text_model_id, get_weather_tool_definition, provider_tool_format, inference_provider_type
+    client_with_models,
+    text_model_id,
+    get_weather_tool_definition,
+    provider_tool_format,
 ):
-    response = llama_stack_client.inference.chat_completion(
+    response = client_with_models.inference.chat_completion(
         model_id=text_model_id,
         messages=[
             {"role": "system", "content": "You are a helpful assistant."},
             {"role": "user", "content": "What's the weather like in San Francisco?"},
         ],
         tools=[get_weather_tool_definition],
-        tool_config={"tool_choice": "required", "tool_prompt_format": provider_tool_format},
+        tool_config={
+            "tool_choice": "required",
+            "tool_prompt_format": provider_tool_format,
+        },
         stream=True,
     )
     tool_invocation_content = extract_tool_invocation_content(response)
@@ -269,9 +266,9 @@ def test_text_chat_completion_with_tool_choice_required(


 def test_text_chat_completion_with_tool_choice_none(
-    llama_stack_client, text_model_id, get_weather_tool_definition, provider_tool_format
+    client_with_models, text_model_id, get_weather_tool_definition, provider_tool_format
 ):
-    response = llama_stack_client.inference.chat_completion(
+    response = client_with_models.inference.chat_completion(
         model_id=text_model_id,
         messages=[
             {"role": "system", "content": "You are a helpful assistant."},
@@ -285,7 +282,8 @@ def test_text_chat_completion_with_tool_choice_none(
     assert tool_invocation_content == ""


-def test_text_chat_completion_structured_output(llama_stack_client, text_model_id, inference_provider_type, test_case):
+@pytest.mark.parametrize("test_case", ["chat_completion-01"])
+def test_text_chat_completion_structured_output(client_with_models, text_model_id, test_case):
     class AnswerFormat(BaseModel):
         first_name: str
         last_name: str
@@ -294,7 +292,7 @@ def test_text_chat_completion_structured_output(llama_stack_client, text_model_i

     tc = TestCase(test_case)

-    response = llama_stack_client.inference.chat_completion(
+    response = client_with_models.inference.chat_completion(
         model_id=text_model_id,
         messages=tc["messages"],
         response_format={
@@ -318,7 +316,7 @@ def test_text_chat_completion_structured_output(llama_stack_client, text_model_i
         False,
     ],
 )
-def test_text_chat_completion_tool_calling_tools_not_in_request(llama_stack_client, text_model_id, streaming):
+def test_text_chat_completion_tool_calling_tools_not_in_request(client_with_models, text_model_id, streaming):
     # TODO: more dynamic lookup on tool_prompt_format for model family
     tool_prompt_format = "json" if "3.1" in text_model_id else "python_list"
     request = {
@@ -374,7 +372,7 @@ def test_text_chat_completion_tool_calling_tools_not_in_request(llama_stack_clie
         "stream": streaming,
     }

-    response = llama_stack_client.inference.chat_completion(**request)
+    response = client_with_models.inference.chat_completion(**request)

     if streaming:
         for chunk in response:
diff --git a/tests/client-sdk/inference/test_vision_inference.py b/tests/client-sdk/inference/test_vision_inference.py
index b23089747..8fa0d8023 100644
--- a/tests/client-sdk/inference/test_vision_inference.py
+++ b/tests/client-sdk/inference/test_vision_inference.py
@@ -10,14 +10,6 @@ import pathlib
 import pytest


-@pytest.fixture(scope="session")
-def inference_provider_type(llama_stack_client):
-    providers = llama_stack_client.providers.list()
-    inference_providers = [p for p in providers if p.api == "inference"]
-    assert len(inference_providers) > 0, "No inference providers found"
-    return inference_providers[0].provider_type
-
-
 @pytest.fixture
 def image_path():
     return pathlib.Path(__file__).parent / "dog.png"
@@ -35,7 +27,7 @@ def base64_image_url(base64_image_data, image_path):
     return f"data:image/{image_path.suffix[1:]};base64,{base64_image_data}"


-def test_image_chat_completion_non_streaming(llama_stack_client, vision_model_id):
+def test_image_chat_completion_non_streaming(client_with_models, vision_model_id):
     message = {
         "role": "user",
         "content": [
@@ -53,7 +45,7 @@ def test_image_chat_completion_non_streaming(llama_stack_client, vision_model_id
             },
         ],
     }
-    response = llama_stack_client.inference.chat_completion(
+    response = client_with_models.inference.chat_completion(
         model_id=vision_model_id,
         messages=[message],
         stream=False,
@@ -63,7 +55,7 @@ def test_image_chat_completion_non_streaming(llama_stack_client, vision_model_id
     assert any(expected in message_content for expected in {"dog", "puppy", "pup"})


-def test_image_chat_completion_streaming(llama_stack_client, vision_model_id):
+def test_image_chat_completion_streaming(client_with_models, vision_model_id):
     message = {
         "role": "user",
         "content": [
@@ -81,7 +73,7 @@ def test_image_chat_completion_streaming(llama_stack_client, vision_model_id):
             },
         ],
     }
-    response = llama_stack_client.inference.chat_completion(
+    response = client_with_models.inference.chat_completion(
         model_id=vision_model_id,
         messages=[message],
         stream=True,
@@ -94,7 +86,7 @@


 @pytest.mark.parametrize("type_", ["url", "data"])
-def test_image_chat_completion_base64(llama_stack_client, vision_model_id, base64_image_data, base64_image_url, type_):
+def test_image_chat_completion_base64(client_with_models, vision_model_id, base64_image_data, base64_image_url, type_):
     image_spec = {
         "url": {
             "type": "image",
@@ -122,7 +114,7 @@ def test_image_chat_completion_base64(llama_stack_client, vision_model_id, base6
             },
         ],
     }
-    response = llama_stack_client.inference.chat_completion(
+    response = client_with_models.inference.chat_completion(
         model_id=vision_model_id,
         messages=[message],
         stream=False,
diff --git a/tests/client-sdk/vector_io/conftest.py b/tests/client-sdk/vector_io/conftest.py
deleted file mode 100644
index 64cac27d2..000000000
--- a/tests/client-sdk/vector_io/conftest.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-def pytest_addoption(parser):
-    parser.addoption(
-        "--embedding-model",
-        action="store",
-        default="all-MiniLM-L6-v2",
-        help="Specify the embedding model to use for testing",
-    )
-
-
-def pytest_generate_tests(metafunc):
-    if "embedding_model" in metafunc.fixturenames:
-        metafunc.parametrize(
-            "embedding_model",
-            [metafunc.config.getoption("--embedding-model")],
-        )
diff --git a/tests/client-sdk/vector_io/test_vector_io.py b/tests/client-sdk/vector_io/test_vector_io.py
index c7e4040b6..e093548b5 100644
--- a/tests/client-sdk/vector_io/test_vector_io.py
+++ b/tests/client-sdk/vector_io/test_vector_io.py
@@ -36,12 +36,12 @@ def single_entry_vector_db_registry(llama_stack_client, empty_vector_db_registry


 @pytest.mark.parametrize("provider_id", INLINE_VECTOR_DB_PROVIDERS)
-def test_vector_db_retrieve(llama_stack_client, embedding_model, empty_vector_db_registry, provider_id):
+def test_vector_db_retrieve(llama_stack_client, embedding_model_id, empty_vector_db_registry, provider_id):
     # Register a memory bank first
     vector_db_id = f"test_vector_db_{random.randint(1000, 9999)}"
     llama_stack_client.vector_dbs.register(
         vector_db_id=vector_db_id,
-        embedding_model=embedding_model,
+        embedding_model=embedding_model_id,
         embedding_dimension=384,
         provider_id=provider_id,
     )
@@ -50,7 +50,7 @@ def test_vector_db_retrieve(llama_stack_client, embedding_model, empty_vector_db
     response = llama_stack_client.vector_dbs.retrieve(vector_db_id=vector_db_id)
     assert response is not None
     assert response.identifier == vector_db_id
-    assert response.embedding_model == embedding_model
+    assert response.embedding_model == embedding_model_id
     assert response.provider_id == provider_id
     assert response.provider_resource_id == vector_db_id

@@ -61,11 +61,11 @@ def test_vector_db_list(llama_stack_client, empty_vector_db_registry):


 @pytest.mark.parametrize("provider_id", INLINE_VECTOR_DB_PROVIDERS)
-def test_vector_db_register(llama_stack_client, embedding_model, empty_vector_db_registry, provider_id):
+def test_vector_db_register(llama_stack_client, embedding_model_id, empty_vector_db_registry, provider_id):
     vector_db_id = f"test_vector_db_{random.randint(1000, 9999)}"
     llama_stack_client.vector_dbs.register(
         vector_db_id=vector_db_id,
-        embedding_model=embedding_model,
+        embedding_model=embedding_model_id,
         embedding_dimension=384,
         provider_id=provider_id,
     )
diff --git a/uv.lock b/uv.lock
index ce633c174..3cf05f17d 100644
--- a/uv.lock
+++ b/uv.lock
@@ -584,7 +584,7 @@ wheels = [

 [[package]]
 name = "huggingface-hub"
-version = "0.28.1"
+version = "0.29.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "filelock" },
@@ -595,9 +595,9 @@
dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/e7/ce/a734204aaae6c35a22f9956ebcd8d8708ae5b842e15d6f42bd6f49e634a4/huggingface_hub-0.28.1.tar.gz", hash = "sha256:893471090c98e3b6efbdfdacafe4052b20b84d59866fb6f54c33d9af18c303ae", size = 387074 } +sdist = { url = "https://files.pythonhosted.org/packages/e2/ac/9f7010c8b050d80b64bfddcc09ef4a4450ae4369940d1b01fa13f5d083de/huggingface_hub-0.29.0.tar.gz", hash = "sha256:64034c852be270cac16c5743fe1f659b14515a9de6342d6f42cbb2ede191fc80", size = 389753 } wheels = [ - { url = "https://files.pythonhosted.org/packages/ea/da/6c2bea5327b640920267d3bf2c9fc114cfbd0a5de234d81cda80cc9e33c8/huggingface_hub-0.28.1-py3-none-any.whl", hash = "sha256:aa6b9a3ffdae939b72c464dbb0d7f99f56e649b55c3d52406f49e0a5a620c0a7", size = 464068 }, + { url = "https://files.pythonhosted.org/packages/2a/4d/8092df2cb0cafa9fcaf691db851b2fccfe9cad4048e081436bbbdf56e4e1/huggingface_hub-0.29.0-py3-none-any.whl", hash = "sha256:c02daa0b6bafbdacb1320fdfd1dc7151d0940825c88c4ef89837fdb1f6ea0afe", size = 468012 }, ] [[package]] @@ -994,7 +994,7 @@ wheels = [ [[package]] name = "lm-format-enforcer" -version = "0.10.9" +version = "0.10.10" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "interegular" }, @@ -1002,9 +1002,9 @@ dependencies = [ { name = "pydantic" }, { name = "pyyaml" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/73/5d/401ffb7a8895e0f3206345e96c52b428c81e4a2af049d426023cb9cb0cdb/lm_format_enforcer-0.10.9.tar.gz", hash = "sha256:3e0bfeaf9fac9f69c8947da554db9a19a76d0be6e85075055f2c70d0aca420da", size = 39713 } +sdist = { url = "https://files.pythonhosted.org/packages/9d/3f/1ec9e91208a2b8af28ef2caf096e70446d7b3c7218c891fffa899608bf08/lm_format_enforcer-0.10.10.tar.gz", hash = "sha256:b1ff9530ccf73097e35bded94737677c9768a235d74b26af8cd25414efdf85f5", size = 39393 } wheels = [ - { url = "https://files.pythonhosted.org/packages/c1/01/e78fdf09de2b4e7750a402eaa4f6783c7215ededd4bc6fe4a3f6d69c49da/lm_format_enforcer-0.10.9-py3-none-any.whl", hash = "sha256:6f3602d3470f54b3ba10d356ea34cc136afbd13394a360949dd8d943a2f2471e", size = 43940 }, + { url = "https://files.pythonhosted.org/packages/32/55/9b91312b7b59903ffa2d1c4310cbeecfea0f8e8e12b154d7ad1d093d0b03/lm_format_enforcer-0.10.10-py3-none-any.whl", hash = "sha256:c5e4330c717780b046c77f46699f8a668cb2b806da540c0127da942538d13695", size = 44231 }, ] [[package]] @@ -1362,7 +1362,7 @@ wheels = [ [[package]] name = "openai" -version = "1.63.0" +version = "1.63.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -1374,9 +1374,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/4f/32/2049e973a646801df425aecdf88c6504ca878bdb3951fe12076fc30f2977/openai-1.63.0.tar.gz", hash = "sha256:597d7a1b35b113e5a09fcb953bdb1eef44f404a39985f3d7573b3ab09221fd66", size = 356710 } +sdist = { url = "https://files.pythonhosted.org/packages/e6/1c/11b520deb71f9ea54ced3c52cd6a5f7131215deba63ad07f23982e328141/openai-1.63.2.tar.gz", hash = "sha256:aeabeec984a7d2957b4928ceaa339e2ead19c61cfcf35ae62b7c363368d26360", size = 356902 } wheels = [ - { url = "https://files.pythonhosted.org/packages/67/a0/e1fe4e87218639fc0a0927da5266c2978eaa0e2eb5437479ee64a11535bb/openai-1.63.0-py3-none-any.whl", hash = "sha256:a664dfc78f0a05ca46c3e21f344f840cf6bf7174f13cfa9de214ed28bfca1dda", size = 472282 }, + { url = 
"https://files.pythonhosted.org/packages/15/64/db3462b358072387b8e93e6e6a38d3c741a17b4a84171ef01d6c85c63f25/openai-1.63.2-py3-none-any.whl", hash = "sha256:1f38b27b5a40814c2b7d8759ec78110df58c4a614c25f182809ca52b080ff4d4", size = 472282 }, ] [[package]] @@ -2577,14 +2577,14 @@ wheels = [ [[package]] name = "sphinxcontrib-video" -version = "0.4.0" +version = "0.4.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "sphinx" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/c7/58/b41664ea7522e886fb33c85a562fe05fc44e1e53bc59da7466d4d7b65787/sphinxcontrib_video-0.4.0.tar.gz", hash = "sha256:1052553faf5f0e255e5e292fae3f5f2fdd295f8a80745d649bfcdbcb12581a69", size = 11324 } +sdist = { url = "https://files.pythonhosted.org/packages/16/48/063e167b6e692bc84bbad74df30bcb27e460a7c620af7824729db8dba606/sphinxcontrib_video-0.4.1.tar.gz", hash = "sha256:75a033e71b7de124cc5902430b7ba818a1c6c377be6401d07e9f2329a95d5ca4", size = 11362 } wheels = [ - { url = "https://files.pythonhosted.org/packages/b3/d5/fa5544847af0e9d335dfa6ece10860abf61b8305365fbb2afe4e9f396b04/sphinxcontrib_video-0.4.0-py3-none-any.whl", hash = "sha256:b94212a6a3489f399ab8287db01536cdd018b5410bbf78d0685db96777ce44e8", size = 10045 }, + { url = "https://files.pythonhosted.org/packages/5d/8b/a0271fe65357860ccc52168181891e9fc9d354bfdc9be273e6a77b84f905/sphinxcontrib_video-0.4.1-py3-none-any.whl", hash = "sha256:d63ec68983dac36960557973281a616b5d9e68838369763313fc80533b1ad774", size = 10066 }, ] [[package]] @@ -2950,61 +2950,61 @@ wheels = [ [[package]] name = "websockets" -version = "14.2" +version = "15.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/94/54/8359678c726243d19fae38ca14a334e740782336c9f19700858c4eb64a1e/websockets-14.2.tar.gz", hash = "sha256:5059ed9c54945efb321f097084b4c7e52c246f2c869815876a69d1efc4ad6eb5", size = 164394 } +sdist = { url = "https://files.pythonhosted.org/packages/2e/7a/8bc4d15af7ff30f7ba34f9a172063bfcee9f5001d7cef04bee800a658f33/websockets-15.0.tar.gz", hash = "sha256:ca36151289a15b39d8d683fd8b7abbe26fc50be311066c5f8dcf3cb8cee107ab", size = 175574 } wheels = [ - { url = "https://files.pythonhosted.org/packages/28/fa/76607eb7dcec27b2d18d63f60a32e60e2b8629780f343bb83a4dbb9f4350/websockets-14.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e8179f95323b9ab1c11723e5d91a89403903f7b001828161b480a7810b334885", size = 163089 }, - { url = "https://files.pythonhosted.org/packages/9e/00/ad2246b5030575b79e7af0721810fdaecaf94c4b2625842ef7a756fa06dd/websockets-14.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0d8c3e2cdb38f31d8bd7d9d28908005f6fa9def3324edb9bf336d7e4266fd397", size = 160741 }, - { url = "https://files.pythonhosted.org/packages/72/f7/60f10924d333a28a1ff3fcdec85acf226281331bdabe9ad74947e1b7fc0a/websockets-14.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:714a9b682deb4339d39ffa674f7b674230227d981a37d5d174a4a83e3978a610", size = 160996 }, - { url = "https://files.pythonhosted.org/packages/63/7c/c655789cf78648c01ac6ecbe2d6c18f91b75bdc263ffee4d08ce628d12f0/websockets-14.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2e53c72052f2596fb792a7acd9704cbc549bf70fcde8a99e899311455974ca3", size = 169974 }, - { url = "https://files.pythonhosted.org/packages/fb/5b/013ed8b4611857ac92ac631079c08d9715b388bd1d88ec62e245f87a39df/websockets-14.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:e3fbd68850c837e57373d95c8fe352203a512b6e49eaae4c2f4088ef8cf21980", size = 168985 }, - { url = "https://files.pythonhosted.org/packages/cd/33/aa3e32fd0df213a5a442310754fe3f89dd87a0b8e5b4e11e0991dd3bcc50/websockets-14.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b27ece32f63150c268593d5fdb82819584831a83a3f5809b7521df0685cd5d8", size = 169297 }, - { url = "https://files.pythonhosted.org/packages/93/17/dae0174883d6399f57853ac44abf5f228eaba86d98d160f390ffabc19b6e/websockets-14.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4daa0faea5424d8713142b33825fff03c736f781690d90652d2c8b053345b0e7", size = 169677 }, - { url = "https://files.pythonhosted.org/packages/42/e2/0375af7ac00169b98647c804651c515054b34977b6c1354f1458e4116c1e/websockets-14.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:bc63cee8596a6ec84d9753fd0fcfa0452ee12f317afe4beae6b157f0070c6c7f", size = 169089 }, - { url = "https://files.pythonhosted.org/packages/73/8d/80f71d2a351a44b602859af65261d3dde3a0ce4e76cf9383738a949e0cc3/websockets-14.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7a570862c325af2111343cc9b0257b7119b904823c675b22d4ac547163088d0d", size = 169026 }, - { url = "https://files.pythonhosted.org/packages/48/97/173b1fa6052223e52bb4054a141433ad74931d94c575e04b654200b98ca4/websockets-14.2-cp310-cp310-win32.whl", hash = "sha256:75862126b3d2d505e895893e3deac0a9339ce750bd27b4ba515f008b5acf832d", size = 163967 }, - { url = "https://files.pythonhosted.org/packages/c0/5b/2fcf60f38252a4562b28b66077e0d2b48f91fef645d5f78874cd1dec807b/websockets-14.2-cp310-cp310-win_amd64.whl", hash = "sha256:cc45afb9c9b2dc0852d5c8b5321759cf825f82a31bfaf506b65bf4668c96f8b2", size = 164413 }, - { url = "https://files.pythonhosted.org/packages/15/b6/504695fb9a33df0ca56d157f5985660b5fc5b4bf8c78f121578d2d653392/websockets-14.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3bdc8c692c866ce5fefcaf07d2b55c91d6922ac397e031ef9b774e5b9ea42166", size = 163088 }, - { url = "https://files.pythonhosted.org/packages/81/26/ebfb8f6abe963c795122439c6433c4ae1e061aaedfc7eff32d09394afbae/websockets-14.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c93215fac5dadc63e51bcc6dceca72e72267c11def401d6668622b47675b097f", size = 160745 }, - { url = "https://files.pythonhosted.org/packages/a1/c6/1435ad6f6dcbff80bb95e8986704c3174da8866ddb751184046f5c139ef6/websockets-14.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1c9b6535c0e2cf8a6bf938064fb754aaceb1e6a4a51a80d884cd5db569886910", size = 160995 }, - { url = "https://files.pythonhosted.org/packages/96/63/900c27cfe8be1a1f2433fc77cd46771cf26ba57e6bdc7cf9e63644a61863/websockets-14.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a52a6d7cf6938e04e9dceb949d35fbdf58ac14deea26e685ab6368e73744e4c", size = 170543 }, - { url = "https://files.pythonhosted.org/packages/00/8b/bec2bdba92af0762d42d4410593c1d7d28e9bfd952c97a3729df603dc6ea/websockets-14.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9f05702e93203a6ff5226e21d9b40c037761b2cfb637187c9802c10f58e40473", size = 169546 }, - { url = "https://files.pythonhosted.org/packages/6b/a9/37531cb5b994f12a57dec3da2200ef7aadffef82d888a4c29a0d781568e4/websockets-14.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22441c81a6748a53bfcb98951d58d1af0661ab47a536af08920d129b4d1c3473", size = 169911 }, - { url = 
"https://files.pythonhosted.org/packages/60/d5/a6eadba2ed9f7e65d677fec539ab14a9b83de2b484ab5fe15d3d6d208c28/websockets-14.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:efd9b868d78b194790e6236d9cbc46d68aba4b75b22497eb4ab64fa640c3af56", size = 170183 }, - { url = "https://files.pythonhosted.org/packages/76/57/a338ccb00d1df881c1d1ee1f2a20c9c1b5b29b51e9e0191ee515d254fea6/websockets-14.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:1a5a20d5843886d34ff8c57424cc65a1deda4375729cbca4cb6b3353f3ce4142", size = 169623 }, - { url = "https://files.pythonhosted.org/packages/64/22/e5f7c33db0cb2c1d03b79fd60d189a1da044e2661f5fd01d629451e1db89/websockets-14.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:34277a29f5303d54ec6468fb525d99c99938607bc96b8d72d675dee2b9f5bf1d", size = 169583 }, - { url = "https://files.pythonhosted.org/packages/aa/2e/2b4662237060063a22e5fc40d46300a07142afe30302b634b4eebd717c07/websockets-14.2-cp311-cp311-win32.whl", hash = "sha256:02687db35dbc7d25fd541a602b5f8e451a238ffa033030b172ff86a93cb5dc2a", size = 163969 }, - { url = "https://files.pythonhosted.org/packages/94/a5/0cda64e1851e73fc1ecdae6f42487babb06e55cb2f0dc8904b81d8ef6857/websockets-14.2-cp311-cp311-win_amd64.whl", hash = "sha256:862e9967b46c07d4dcd2532e9e8e3c2825e004ffbf91a5ef9dde519ee2effb0b", size = 164408 }, - { url = "https://files.pythonhosted.org/packages/c1/81/04f7a397653dc8bec94ddc071f34833e8b99b13ef1a3804c149d59f92c18/websockets-14.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1f20522e624d7ffbdbe259c6b6a65d73c895045f76a93719aa10cd93b3de100c", size = 163096 }, - { url = "https://files.pythonhosted.org/packages/ec/c5/de30e88557e4d70988ed4d2eabd73fd3e1e52456b9f3a4e9564d86353b6d/websockets-14.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:647b573f7d3ada919fd60e64d533409a79dcf1ea21daeb4542d1d996519ca967", size = 160758 }, - { url = "https://files.pythonhosted.org/packages/e5/8c/d130d668781f2c77d106c007b6c6c1d9db68239107c41ba109f09e6c218a/websockets-14.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6af99a38e49f66be5a64b1e890208ad026cda49355661549c507152113049990", size = 160995 }, - { url = "https://files.pythonhosted.org/packages/a6/bc/f6678a0ff17246df4f06765e22fc9d98d1b11a258cc50c5968b33d6742a1/websockets-14.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:091ab63dfc8cea748cc22c1db2814eadb77ccbf82829bac6b2fbe3401d548eda", size = 170815 }, - { url = "https://files.pythonhosted.org/packages/d8/b2/8070cb970c2e4122a6ef38bc5b203415fd46460e025652e1ee3f2f43a9a3/websockets-14.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b374e8953ad477d17e4851cdc66d83fdc2db88d9e73abf755c94510ebddceb95", size = 169759 }, - { url = "https://files.pythonhosted.org/packages/81/da/72f7caabd94652e6eb7e92ed2d3da818626e70b4f2b15a854ef60bf501ec/websockets-14.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a39d7eceeea35db85b85e1169011bb4321c32e673920ae9c1b6e0978590012a3", size = 170178 }, - { url = "https://files.pythonhosted.org/packages/31/e0/812725b6deca8afd3a08a2e81b3c4c120c17f68c9b84522a520b816cda58/websockets-14.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0a6f3efd47ffd0d12080594f434faf1cd2549b31e54870b8470b28cc1d3817d9", size = 170453 }, - { url = "https://files.pythonhosted.org/packages/66/d3/8275dbc231e5ba9bb0c4f93144394b4194402a7a0c8ffaca5307a58ab5e3/websockets-14.2-cp312-cp312-musllinux_1_2_i686.whl", hash = 
"sha256:065ce275e7c4ffb42cb738dd6b20726ac26ac9ad0a2a48e33ca632351a737267", size = 169830 }, - { url = "https://files.pythonhosted.org/packages/a3/ae/e7d1a56755ae15ad5a94e80dd490ad09e345365199600b2629b18ee37bc7/websockets-14.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e9d0e53530ba7b8b5e389c02282f9d2aa47581514bd6049d3a7cffe1385cf5fe", size = 169824 }, - { url = "https://files.pythonhosted.org/packages/b6/32/88ccdd63cb261e77b882e706108d072e4f1c839ed723bf91a3e1f216bf60/websockets-14.2-cp312-cp312-win32.whl", hash = "sha256:20e6dd0984d7ca3037afcb4494e48c74ffb51e8013cac71cf607fffe11df7205", size = 163981 }, - { url = "https://files.pythonhosted.org/packages/b3/7d/32cdb77990b3bdc34a306e0a0f73a1275221e9a66d869f6ff833c95b56ef/websockets-14.2-cp312-cp312-win_amd64.whl", hash = "sha256:44bba1a956c2c9d268bdcdf234d5e5ff4c9b6dc3e300545cbe99af59dda9dcce", size = 164421 }, - { url = "https://files.pythonhosted.org/packages/82/94/4f9b55099a4603ac53c2912e1f043d6c49d23e94dd82a9ce1eb554a90215/websockets-14.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:6f1372e511c7409a542291bce92d6c83320e02c9cf392223272287ce55bc224e", size = 163102 }, - { url = "https://files.pythonhosted.org/packages/8e/b7/7484905215627909d9a79ae07070057afe477433fdacb59bf608ce86365a/websockets-14.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4da98b72009836179bb596a92297b1a61bb5a830c0e483a7d0766d45070a08ad", size = 160766 }, - { url = "https://files.pythonhosted.org/packages/a3/a4/edb62efc84adb61883c7d2c6ad65181cb087c64252138e12d655989eec05/websockets-14.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f8a86a269759026d2bde227652b87be79f8a734e582debf64c9d302faa1e9f03", size = 160998 }, - { url = "https://files.pythonhosted.org/packages/f5/79/036d320dc894b96af14eac2529967a6fc8b74f03b83c487e7a0e9043d842/websockets-14.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86cf1aaeca909bf6815ea714d5c5736c8d6dd3a13770e885aafe062ecbd04f1f", size = 170780 }, - { url = "https://files.pythonhosted.org/packages/63/75/5737d21ee4dd7e4b9d487ee044af24a935e36a9ff1e1419d684feedcba71/websockets-14.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9b0f6c3ba3b1240f602ebb3971d45b02cc12bd1845466dd783496b3b05783a5", size = 169717 }, - { url = "https://files.pythonhosted.org/packages/2c/3c/bf9b2c396ed86a0b4a92ff4cdaee09753d3ee389be738e92b9bbd0330b64/websockets-14.2-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:669c3e101c246aa85bc8534e495952e2ca208bd87994650b90a23d745902db9a", size = 170155 }, - { url = "https://files.pythonhosted.org/packages/75/2d/83a5aca7247a655b1da5eb0ee73413abd5c3a57fc8b92915805e6033359d/websockets-14.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:eabdb28b972f3729348e632ab08f2a7b616c7e53d5414c12108c29972e655b20", size = 170495 }, - { url = "https://files.pythonhosted.org/packages/79/dd/699238a92761e2f943885e091486378813ac8f43e3c84990bc394c2be93e/websockets-14.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2066dc4cbcc19f32c12a5a0e8cc1b7ac734e5b64ac0a325ff8353451c4b15ef2", size = 169880 }, - { url = "https://files.pythonhosted.org/packages/c8/c9/67a8f08923cf55ce61aadda72089e3ed4353a95a3a4bc8bf42082810e580/websockets-14.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ab95d357cd471df61873dadf66dd05dd4709cae001dd6342edafc8dc6382f307", size = 169856 }, - { url = 
"https://files.pythonhosted.org/packages/17/b1/1ffdb2680c64e9c3921d99db460546194c40d4acbef999a18c37aa4d58a3/websockets-14.2-cp313-cp313-win32.whl", hash = "sha256:a9e72fb63e5f3feacdcf5b4ff53199ec8c18d66e325c34ee4c551ca748623bbc", size = 163974 }, - { url = "https://files.pythonhosted.org/packages/14/13/8b7fc4cb551b9cfd9890f0fd66e53c18a06240319915533b033a56a3d520/websockets-14.2-cp313-cp313-win_amd64.whl", hash = "sha256:b439ea828c4ba99bb3176dc8d9b933392a2413c0f6b149fdcba48393f573377f", size = 164420 }, - { url = "https://files.pythonhosted.org/packages/10/3d/91d3d2bb1325cd83e8e2c02d0262c7d4426dc8fa0831ef1aa4d6bf2041af/websockets-14.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:d7d9cafbccba46e768be8a8ad4635fa3eae1ffac4c6e7cb4eb276ba41297ed29", size = 160773 }, - { url = "https://files.pythonhosted.org/packages/33/7c/cdedadfef7381939577858b1b5718a4ab073adbb584e429dd9d9dc9bfe16/websockets-14.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:c76193c1c044bd1e9b3316dcc34b174bbf9664598791e6fb606d8d29000e070c", size = 161007 }, - { url = "https://files.pythonhosted.org/packages/ca/35/7a20a3c450b27c04e50fbbfc3dfb161ed8e827b2a26ae31c4b59b018b8c6/websockets-14.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd475a974d5352390baf865309fe37dec6831aafc3014ffac1eea99e84e83fc2", size = 162264 }, - { url = "https://files.pythonhosted.org/packages/e8/9c/e3f9600564b0c813f2448375cf28b47dc42c514344faed3a05d71fb527f9/websockets-14.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2c6c0097a41968b2e2b54ed3424739aab0b762ca92af2379f152c1aef0187e1c", size = 161873 }, - { url = "https://files.pythonhosted.org/packages/3f/37/260f189b16b2b8290d6ae80c9f96d8b34692cf1bb3475df54c38d3deb57d/websockets-14.2-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d7ff794c8b36bc402f2e07c0b2ceb4a2424147ed4785ff03e2a7af03711d60a", size = 161818 }, - { url = "https://files.pythonhosted.org/packages/ff/1e/e47dedac8bf7140e59aa6a679e850c4df9610ae844d71b6015263ddea37b/websockets-14.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:dec254fcabc7bd488dab64846f588fc5b6fe0d78f641180030f8ea27b76d72c3", size = 164465 }, - { url = "https://files.pythonhosted.org/packages/7b/c8/d529f8a32ce40d98309f4470780631e971a5a842b60aec864833b3615786/websockets-14.2-py3-none-any.whl", hash = "sha256:7a6ceec4ea84469f15cf15807a747e9efe57e369c384fa86e022b3bea679b79b", size = 157416 }, + { url = "https://files.pythonhosted.org/packages/3d/f1/b20cc4c1ff84911c791f36fa511a78203836bb4d603f56290de08c067437/websockets-15.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5e6ee18a53dd5743e6155b8ff7e8e477c25b29b440f87f65be8165275c87fef0", size = 174701 }, + { url = "https://files.pythonhosted.org/packages/f9/e8/4de59ee85ec86052ca574f4e5327ef948e4f77757d3c9c1503f5a0e9c039/websockets-15.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ee06405ea2e67366a661ed313e14cf2a86e84142a3462852eb96348f7219cee3", size = 172358 }, + { url = "https://files.pythonhosted.org/packages/2f/ea/b0f95815cdc83d61b1a895858671c6af38a76c23f3ea5d91e2ba11bbedc7/websockets-15.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8711682a629bbcaf492f5e0af72d378e976ea1d127a2d47584fa1c2c080b436b", size = 172610 }, + { url = 
"https://files.pythonhosted.org/packages/09/ed/c5d8f1f296f475c00611a40eff6a952248785efb125f91a0b29575f36ba6/websockets-15.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94c4a9b01eede952442c088d415861b0cf2053cbd696b863f6d5022d4e4e2453", size = 181579 }, + { url = "https://files.pythonhosted.org/packages/b7/fc/2444b5ae792d92179f20cec53475bcc25d1d7f00a2be9947de9837ef230a/websockets-15.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:45535fead66e873f411c1d3cf0d3e175e66f4dd83c4f59d707d5b3e4c56541c4", size = 180588 }, + { url = "https://files.pythonhosted.org/packages/ff/b5/0945a31562d351cff26d76a2ae9a4ba4536e698aa059a4262afd793b2a1d/websockets-15.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e389efe46ccb25a1f93d08c7a74e8123a2517f7b7458f043bd7529d1a63ffeb", size = 180902 }, + { url = "https://files.pythonhosted.org/packages/b6/7c/e9d844b87754bc83b294cc1c695cbc6c5d42e329b85d2bf2d7bb9554d09c/websockets-15.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:67a04754d121ea5ca39ddedc3f77071651fb5b0bc6b973c71c515415b44ed9c5", size = 181282 }, + { url = "https://files.pythonhosted.org/packages/9e/6c/6a5d3272f494fa2fb4806b896ecb312bd6c72bab632df4ace19946c079dc/websockets-15.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:bd66b4865c8b853b8cca7379afb692fc7f52cf898786537dfb5e5e2d64f0a47f", size = 180694 }, + { url = "https://files.pythonhosted.org/packages/b2/32/1fb4b62c2ec2c9844d4ddaa4021d993552c7c493a0acdcec95551679d501/websockets-15.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a4cc73a6ae0a6751b76e69cece9d0311f054da9b22df6a12f2c53111735657c8", size = 180631 }, + { url = "https://files.pythonhosted.org/packages/e4/9b/5ef1ddb8857ce894217bdd9572ad98c1cef20d8f9f0f43823b782b7ded6b/websockets-15.0-cp310-cp310-win32.whl", hash = "sha256:89da58e4005e153b03fe8b8794330e3f6a9774ee9e1c3bd5bc52eb098c3b0c4f", size = 175664 }, + { url = "https://files.pythonhosted.org/packages/29/63/c320572ccf813ed2bc3058a0e0291ee95eb258dc5e6b3446ca45dc1af0fd/websockets-15.0-cp310-cp310-win_amd64.whl", hash = "sha256:4ff380aabd7a74a42a760ee76c68826a8f417ceb6ea415bd574a035a111fd133", size = 176109 }, + { url = "https://files.pythonhosted.org/packages/ee/16/81a7403c8c0a33383de647e89c07824ea6a654e3877d6ff402cbae298cb8/websockets-15.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:dd24c4d256558429aeeb8d6c24ebad4e982ac52c50bc3670ae8646c181263965", size = 174702 }, + { url = "https://files.pythonhosted.org/packages/ef/40/4629202386a3bf1195db9fe41baeb1d6dfd8d72e651d9592d81dae7fdc7c/websockets-15.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f83eca8cbfd168e424dfa3b3b5c955d6c281e8fc09feb9d870886ff8d03683c7", size = 172359 }, + { url = "https://files.pythonhosted.org/packages/7b/33/dfb650e822bc7912d8c542c452497867af91dec81e7b5bf96aca5b419d58/websockets-15.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4095a1f2093002c2208becf6f9a178b336b7572512ee0a1179731acb7788e8ad", size = 172604 }, + { url = "https://files.pythonhosted.org/packages/2e/52/666743114513fcffd43ee5df261a1eb5d41f8e9861b7a190b730732c19ba/websockets-15.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb915101dfbf318486364ce85662bb7b020840f68138014972c08331458d41f3", size = 182145 }, + { url = 
"https://files.pythonhosted.org/packages/9c/63/5273f146b13aa4a057a95ab0855d9990f3a1ced63693f4365135d1abfacc/websockets-15.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:45d464622314973d78f364689d5dbb9144e559f93dca11b11af3f2480b5034e1", size = 181152 }, + { url = "https://files.pythonhosted.org/packages/0f/ae/075697f3f97de7c26b73ae96d952e13fa36393e0db3f028540b28954e0a9/websockets-15.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ace960769d60037ca9625b4c578a6f28a14301bd2a1ff13bb00e824ac9f73e55", size = 181523 }, + { url = "https://files.pythonhosted.org/packages/25/87/06d091bbcbe01903bed3dad3bb4a1a3c516f61e611ec31fffb28abe4974b/websockets-15.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c7cd4b1015d2f60dfe539ee6c95bc968d5d5fad92ab01bb5501a77393da4f596", size = 181791 }, + { url = "https://files.pythonhosted.org/packages/77/08/5063b6cc1b2aa1fba2ee3b578b777db22fde7145f121d07fd878811e983b/websockets-15.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4f7290295794b5dec470867c7baa4a14182b9732603fd0caf2a5bf1dc3ccabf3", size = 181231 }, + { url = "https://files.pythonhosted.org/packages/86/ff/af23084df0a7405bb2add12add8c17d6192a8de9480f1b90d12352ba2b7d/websockets-15.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3abd670ca7ce230d5a624fd3d55e055215d8d9b723adee0a348352f5d8d12ff4", size = 181191 }, + { url = "https://files.pythonhosted.org/packages/21/ce/b2bdfcf49201dee0b899edc6a814755763ec03d74f2714923d38453a9e8d/websockets-15.0-cp311-cp311-win32.whl", hash = "sha256:110a847085246ab8d4d119632145224d6b49e406c64f1bbeed45c6f05097b680", size = 175666 }, + { url = "https://files.pythonhosted.org/packages/8d/7b/444edcd5365538c226b631897975a65bbf5ccf27c77102e17d8f12a306ea/websockets-15.0-cp311-cp311-win_amd64.whl", hash = "sha256:8d7bbbe2cd6ed80aceef2a14e9f1c1b61683194c216472ed5ff33b700e784e37", size = 176105 }, + { url = "https://files.pythonhosted.org/packages/22/1e/92c4547d7b2a93f848aedaf37e9054111bc00dc11bff4385ca3f80dbb412/websockets-15.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:cccc18077acd34c8072578394ec79563664b1c205f7a86a62e94fafc7b59001f", size = 174709 }, + { url = "https://files.pythonhosted.org/packages/9f/37/eae4830a28061ba552516d84478686b637cd9e57d6a90b45ad69e89cb0af/websockets-15.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d4c22992e24f12de340ca5f824121a5b3e1a37ad4360b4e1aaf15e9d1c42582d", size = 172372 }, + { url = "https://files.pythonhosted.org/packages/46/2f/b409f8b8aa9328d5a47f7a301a43319d540d70cf036d1e6443675978a988/websockets-15.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1206432cc6c644f6fc03374b264c5ff805d980311563202ed7fef91a38906276", size = 172607 }, + { url = "https://files.pythonhosted.org/packages/d6/81/d7e2e4542d4b4df849b0110df1b1f94f2647b71ab4b65d672090931ad2bb/websockets-15.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d3cc75ef3e17490042c47e0523aee1bcc4eacd2482796107fd59dd1100a44bc", size = 182422 }, + { url = "https://files.pythonhosted.org/packages/b6/91/3b303160938d123eea97f58be363f7dbec76e8c59d587e07b5bc257dd584/websockets-15.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b89504227a5311610e4be16071465885a0a3d6b0e82e305ef46d9b064ce5fb72", size = 181362 }, + { url = 
"https://files.pythonhosted.org/packages/f2/8b/df6807f1ca339c567aba9a7ab03bfdb9a833f625e8d2b4fc7529e4c701de/websockets-15.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56e3efe356416bc67a8e093607315951d76910f03d2b3ad49c4ade9207bf710d", size = 181787 }, + { url = "https://files.pythonhosted.org/packages/21/37/e6d3d5ebb0ebcaf98ae84904205c9dcaf3e0fe93e65000b9f08631ed7309/websockets-15.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0f2205cdb444a42a7919690238fb5979a05439b9dbb73dd47c863d39640d85ab", size = 182058 }, + { url = "https://files.pythonhosted.org/packages/c9/df/6aca296f2be4c638ad20908bb3d7c94ce7afc8d9b4b2b0780d1fc59b359c/websockets-15.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:aea01f40995fa0945c020228ab919b8dfc93fc8a9f2d3d705ab5b793f32d9e99", size = 181434 }, + { url = "https://files.pythonhosted.org/packages/88/f1/75717a982bab39bbe63c83f9df0e7753e5c98bab907eb4fb5d97fe5c8c11/websockets-15.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a9f8e33747b1332db11cf7fcf4a9512bef9748cb5eb4d3f7fbc8c30d75dc6ffc", size = 181431 }, + { url = "https://files.pythonhosted.org/packages/e7/15/cee9e63ed9ac5bfc1a3ae8fc6c02c41745023c21eed622eef142d8fdd749/websockets-15.0-cp312-cp312-win32.whl", hash = "sha256:32e02a2d83f4954aa8c17e03fe8ec6962432c39aca4be7e8ee346b05a3476904", size = 175678 }, + { url = "https://files.pythonhosted.org/packages/4e/00/993974c60f40faabb725d4dbae8b072ef73b4c4454bd261d3b1d34ace41f/websockets-15.0-cp312-cp312-win_amd64.whl", hash = "sha256:ffc02b159b65c05f2ed9ec176b715b66918a674bd4daed48a9a7a590dd4be1aa", size = 176119 }, + { url = "https://files.pythonhosted.org/packages/12/23/be28dc1023707ac51768f848d28a946443041a348ee3a54abdf9f6283372/websockets-15.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:d2244d8ab24374bed366f9ff206e2619345f9cd7fe79aad5225f53faac28b6b1", size = 174714 }, + { url = "https://files.pythonhosted.org/packages/8f/ff/02b5e9fbb078e7666bf3d25c18c69b499747a12f3e7f2776063ef3fb7061/websockets-15.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3a302241fbe825a3e4fe07666a2ab513edfdc6d43ce24b79691b45115273b5e7", size = 172374 }, + { url = "https://files.pythonhosted.org/packages/8e/61/901c8d4698e0477eff4c3c664d53f898b601fa83af4ce81946650ec2a4cb/websockets-15.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:10552fed076757a70ba2c18edcbc601c7637b30cdfe8c24b65171e824c7d6081", size = 172605 }, + { url = "https://files.pythonhosted.org/packages/d2/4b/dc47601a80dff317aecf8da7b4ab278d11d3494b2c373b493e4887561f90/websockets-15.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c53f97032b87a406044a1c33d1e9290cc38b117a8062e8a8b285175d7e2f99c9", size = 182380 }, + { url = "https://files.pythonhosted.org/packages/83/f7/b155d2b38f05ed47a0b8de1c9ea245fcd7fc625d89f35a37eccba34b42de/websockets-15.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1caf951110ca757b8ad9c4974f5cac7b8413004d2f29707e4d03a65d54cedf2b", size = 181325 }, + { url = "https://files.pythonhosted.org/packages/d3/ff/040a20c01c294695cac0e361caf86f33347acc38f164f6d2be1d3e007d9f/websockets-15.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8bf1ab71f9f23b0a1d52ec1682a3907e0c208c12fef9c3e99d2b80166b17905f", size = 181763 }, + { url = 
"https://files.pythonhosted.org/packages/cb/6a/af23e93678fda8341ac8775e85123425e45c608389d3514863c702896ea5/websockets-15.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bfcd3acc1a81f106abac6afd42327d2cf1e77ec905ae11dc1d9142a006a496b6", size = 182097 }, + { url = "https://files.pythonhosted.org/packages/7e/3e/1069e159c30129dc03c01513b5830237e576f47cedb888777dd885cae583/websockets-15.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:c8c5c8e1bac05ef3c23722e591ef4f688f528235e2480f157a9cfe0a19081375", size = 181485 }, + { url = "https://files.pythonhosted.org/packages/9a/a7/c91c47103f1cd941b576bbc452601e9e01f67d5c9be3e0a9abe726491ab5/websockets-15.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:86bfb52a9cfbcc09aba2b71388b0a20ea5c52b6517c0b2e316222435a8cdab72", size = 181466 }, + { url = "https://files.pythonhosted.org/packages/16/32/a4ca6e3d56c24aac46b0cf5c03b841379f6409d07fc2044b244f90f54105/websockets-15.0-cp313-cp313-win32.whl", hash = "sha256:26ba70fed190708551c19a360f9d7eca8e8c0f615d19a574292b7229e0ae324c", size = 175673 }, + { url = "https://files.pythonhosted.org/packages/c0/31/25a417a23e985b61ffa5544f9facfe4a118cb64d664c886f1244a8baeca5/websockets-15.0-cp313-cp313-win_amd64.whl", hash = "sha256:ae721bcc8e69846af00b7a77a220614d9b2ec57d25017a6bbde3a99473e41ce8", size = 176115 }, + { url = "https://files.pythonhosted.org/packages/42/52/359467c7ca12721a04520da9ba9fc29da2cd176c30992f6f81fa881bb3e5/websockets-15.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:b499caef4bca9cbd0bd23cd3386f5113ee7378094a3cb613a2fa543260fe9506", size = 172384 }, + { url = "https://files.pythonhosted.org/packages/7c/ff/36fd8a45fac404d8f109e03ca06328f49847d71c0c048414c76bb2db91c4/websockets-15.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:17f2854c6bd9ee008c4b270f7010fe2da6c16eac5724a175e75010aacd905b31", size = 172616 }, + { url = "https://files.pythonhosted.org/packages/b1/a8/65496a87984815e2837835d5ac3c9f81ea82031036877e8f80953c59dbd9/websockets-15.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:89f72524033abbfde880ad338fd3c2c16e31ae232323ebdfbc745cbb1b3dcc03", size = 173871 }, + { url = "https://files.pythonhosted.org/packages/23/89/9441e1e0818d46fe22d78b3e5c8fe2316516211330e138231c90dce5559e/websockets-15.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1657a9eecb29d7838e3b415458cc494e6d1b194f7ac73a34aa55c6fb6c72d1f3", size = 173477 }, + { url = "https://files.pythonhosted.org/packages/2f/1b/80460b3ac9795ef7bbaa074c603d64e009dbb2ceb11008416efab0dcc811/websockets-15.0-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e413352a921f5ad5d66f9e2869b977e88d5103fc528b6deb8423028a2befd842", size = 173425 }, + { url = "https://files.pythonhosted.org/packages/56/d1/8da7e733ed266f342e8c544c3b8338449de9b860d85d9a0bfd4fe1857d6e/websockets-15.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:8561c48b0090993e3b2a54db480cab1d23eb2c5735067213bb90f402806339f5", size = 176160 }, + { url = "https://files.pythonhosted.org/packages/e8/b2/31eec524b53f01cd8343f10a8e429730c52c1849941d1f530f8253b6d934/websockets-15.0-py3-none-any.whl", hash = "sha256:51ffd53c53c4442415b613497a34ba0aa7b99ac07f1e4a62db5dcd640ae6c3c3", size = 169023 }, ] [[package]]