diff --git a/.github/actions/setup-runner/action.yml b/.github/actions/setup-runner/action.yml index 0be999fe2..1ca02bbff 100644 --- a/.github/actions/setup-runner/action.yml +++ b/.github/actions/setup-runner/action.yml @@ -28,7 +28,7 @@ runs: # Install llama-stack-client-python based on the client-version input if [ "${{ inputs.client-version }}" = "latest" ]; then echo "Installing latest llama-stack-client-python from main branch" - uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main + uv pip install git+https://github.com/llamastack/llama-stack-client-python.git@main elif [ "${{ inputs.client-version }}" = "published" ]; then echo "Installing published llama-stack-client-python from PyPI" uv pip install llama-stack-client diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index a38d4971a..9ef49fba3 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -52,7 +52,8 @@ jobs: run: | # Get test directories dynamically, excluding non-test directories # NOTE: we are excluding post_training since the tests take too long - TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" | + TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d | + sed 's|tests/integration/||' | grep -Ev "^(__pycache__|fixtures|test_cases|recordings|non_ci|post_training)$" | sort | jq -R -s -c 'split("\n")[:-1]') echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT diff --git a/.github/workflows/integration-vector-io-tests.yml b/.github/workflows/integration-vector-io-tests.yml index aa239572b..f4d28e407 100644 --- a/.github/workflows/integration-vector-io-tests.yml +++ b/.github/workflows/integration-vector-io-tests.yml @@ -164,9 +164,9 @@ jobs: ENABLE_WEAVIATE: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'true' || '' }} WEAVIATE_CLUSTER_URL: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'localhost:8080' || '' }} run: 
| - uv run pytest -sv --stack-config="inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \ + uv run pytest -sv --stack-config="files=inline::localfs,inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \ tests/integration/vector_io \ - --embedding-model sentence-transformers/all-MiniLM-L6-v2 + --embedding-model inline::sentence-transformers/all-MiniLM-L6-v2 - name: Check Storage and Memory Available After Tests if: ${{ always() }} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 066fcecf0..c81e9e7b1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,13 +1,82 @@ -# Contributing to Llama-Stack +# Contributing to Llama Stack We want to make contributing to this project as easy and transparent as possible. +## Set up your development environment + +We use [uv](https://github.com/astral-sh/uv) to manage python dependencies and virtual environments. +You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting-started/installation/). + +You can install the dependencies by running: + +```bash +cd llama-stack +uv sync --group dev +uv pip install -e . +source .venv/bin/activate +``` + +```{note} +You can use a specific version of Python with `uv` by adding the `--python ` flag (e.g. `--python 3.12`). +Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`. +For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/). 
+``` + +Note that you can create a dotenv file `.env` that includes necessary environment variables: +``` +LLAMA_STACK_BASE_URL=http://localhost:8321 +LLAMA_STACK_CLIENT_LOG=debug +LLAMA_STACK_PORT=8321 +LLAMA_STACK_CONFIG= +TAVILY_SEARCH_API_KEY= +BRAVE_SEARCH_API_KEY= +``` + +And then use this dotenv file when running client SDK tests via the following: +```bash +uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py --text-model=meta-llama/Llama-3.1-8B-Instruct +``` + +### Pre-commit Hooks + +We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running: + +```bash +uv run pre-commit install +``` + +After that, pre-commit hooks will run automatically before each commit. + +Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually by running: + +```bash +uv run pre-commit run --all-files +``` + +```{caution} +Before pushing your changes, make sure that the pre-commit hooks have passed successfully. +``` + ## Discussions -> Issues -> Pull Requests We actively welcome your pull requests. However, please read the following. This is heavily inspired by [Ghostty](https://github.com/ghostty-org/ghostty/blob/main/CONTRIBUTING.md). If in doubt, please open a [discussion](https://github.com/meta-llama/llama-stack/discussions); we can always convert that to an issue later. +### Issues +We use GitHub issues to track public bugs. Please ensure your description is +clear and has sufficient instructions to be able to reproduce the issue. + +Meta has a [bounty program](http://facebook.com/whitehat/info) for the safe +disclosure of security bugs. In those cases, please go through the process +outlined on that page and do not file a public issue. + +### Contributor License Agreement ("CLA") +In order to accept your pull request, we need you to submit a CLA. 
You only need +to do this once to work on any of Meta's open source projects. + +Complete your CLA here: + **I'd like to contribute!** If you are new to the project, start by looking at the issues tagged with "good first issue". If you're interested @@ -51,93 +120,15 @@ Please avoid picking up too many issues at once. This helps you stay focused and Please keep pull requests (PRs) small and focused. If you have a large set of changes, consider splitting them into logically grouped, smaller PRs to facilitate review and testing. -> [!TIP] -> As a general guideline: -> - Experienced contributors should try to keep no more than 5 open PRs at a time. -> - New contributors are encouraged to have only one open PR at a time until they’re familiar with the codebase and process. - -## Contributor License Agreement ("CLA") -In order to accept your pull request, we need you to submit a CLA. You only need -to do this once to work on any of Meta's open source projects. - -Complete your CLA here: - -## Issues -We use GitHub issues to track public bugs. Please ensure your description is -clear and has sufficient instructions to be able to reproduce the issue. - -Meta has a [bounty program](http://facebook.com/whitehat/info) for the safe -disclosure of security bugs. In those cases, please go through the process -outlined on that page and do not file a public issue. - - -## Set up your development environment - -We use [uv](https://github.com/astral-sh/uv) to manage python dependencies and virtual environments. -You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting-started/installation/). - -You can install the dependencies by running: - -```bash -cd llama-stack -uv sync --group dev -uv pip install -e . -source .venv/bin/activate +```{tip} +As a general guideline: +- Experienced contributors should try to keep no more than 5 open PRs at a time. 
+- New contributors are encouraged to have only one open PR at a time until they’re familiar with the codebase and process. ``` -> [!NOTE] -> You can use a specific version of Python with `uv` by adding the `--python ` flag (e.g. `--python 3.12`) -> Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`. -> For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/). +## Repository guidelines -Note that you can create a dotenv file `.env` that includes necessary environment variables: -``` -LLAMA_STACK_BASE_URL=http://localhost:8321 -LLAMA_STACK_CLIENT_LOG=debug -LLAMA_STACK_PORT=8321 -LLAMA_STACK_CONFIG= -TAVILY_SEARCH_API_KEY= -BRAVE_SEARCH_API_KEY= -``` - -And then use this dotenv file when running client SDK tests via the following: -```bash -uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py --text-model=meta-llama/Llama-3.1-8B-Instruct -``` - -## Pre-commit Hooks - -We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running: - -```bash -uv run pre-commit install -``` - -After that, pre-commit hooks will run automatically before each commit. - -Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually by running: - -```bash -uv run pre-commit run --all-files -``` - -> [!CAUTION] -> Before pushing your changes, make sure that the pre-commit hooks have passed successfully. - -## Running tests - -You can find the Llama Stack testing documentation [here](https://github.com/meta-llama/llama-stack/blob/main/tests/README.md). - -## Adding a new dependency to the project - -To add a new dependency to the project, you can use the `uv` command. 
For example, to add `foo` to the project, you can run: - -```bash -uv add foo -uv sync -``` - -## Coding Style +### Coding Style * Comments should provide meaningful insights into the code. Avoid filler comments that simply describe the next step, as they create unnecessary clutter, same goes for docstrings. @@ -159,6 +150,10 @@ uv sync * When possible, use keyword arguments only when calling functions. * Llama Stack utilizes [custom Exception classes](llama_stack/apis/common/errors.py) for certain Resources that should be used where applicable. +### License +By contributing to Llama, you agree that your contributions will be licensed +under the LICENSE file in the root directory of this source tree. + ## Common Tasks Some tips about common tasks you work on while contributing to Llama Stack: @@ -210,8 +205,4 @@ If you modify or add new API endpoints, update the API documentation accordingly uv run ./docs/openapi_generator/run_openapi_generator.sh ``` -The generated API documentation will be available in `docs/_static/`. Make sure to review the changes before committing. - -## License -By contributing to Llama, you agree that your contributions will be licensed -under the LICENSE file in the root directory of this source tree. +The generated API documentation will be available in `docs/_static/`. Make sure to review the changes before committing. 
\ No newline at end of file diff --git a/README.md b/README.md index 03aa3dd50..8db4580a2 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,8 @@ # Llama Stack +meta-llama%2Fllama-stack | Trendshift + +----- [![PyPI version](https://img.shields.io/pypi/v/llama_stack.svg)](https://pypi.org/project/llama_stack/) [![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-stack)](https://pypi.org/project/llama-stack/) [![License](https://img.shields.io/pypi/l/llama_stack.svg)](https://github.com/meta-llama/llama-stack/blob/main/LICENSE) @@ -9,6 +12,7 @@ [**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack) + ### ✨🎉 Llama 4 Support 🎉✨ We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta. @@ -179,3 +183,17 @@ Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest Check out our client SDKs for connecting to a Llama Stack server in your preferred language, you can choose from [python](https://github.com/meta-llama/llama-stack-client-python), [typescript](https://github.com/meta-llama/llama-stack-client-typescript), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) programming languages to quickly build your applications. You can find more example scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo. 
+ + +## 🌟 GitHub Star History +## Star History + +[![Star History Chart](https://api.star-history.com/svg?repos=meta-llama/llama-stack&type=Date)](https://www.star-history.com/#meta-llama/llama-stack&Date) + +## ✨ Contributors + +Thanks to all of our amazing contributors! + + + + \ No newline at end of file diff --git a/docs/_static/js/keyboard_shortcuts.js b/docs/_static/js/keyboard_shortcuts.js new file mode 100644 index 000000000..81d0b7c65 --- /dev/null +++ b/docs/_static/js/keyboard_shortcuts.js @@ -0,0 +1,14 @@ +document.addEventListener('keydown', function(event) { + // command+K or ctrl+K + if ((event.metaKey || event.ctrlKey) && event.key === 'k') { + event.preventDefault(); + document.querySelector('.search-input, .search-field, input[name="q"]').focus(); + } + + // forward slash + if (event.key === '/' && + !event.target.matches('input, textarea, select')) { + event.preventDefault(); + document.querySelector('.search-input, .search-field, input[name="q"]').focus(); + } +}); diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 9896b36cd..e160d4f98 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -8293,28 +8293,60 @@ "type": "array", "items": { "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" + "properties": { + "attributes": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } + "description": "(Optional) Key-value attributes associated with the file" + }, + "file_id": { + "type": "string", + "description": "Unique identifier of the file containing the result" + }, + "filename": { + "type": "string", + "description": "Name 
of the file containing the result" + }, + "score": { + "type": "number", + "description": "Relevance score for this search result (between 0 and 1)" + }, + "text": { + "type": "string", + "description": "Text content of the search result" + } + }, + "additionalProperties": false, + "required": [ + "attributes", + "file_id", + "filename", + "score", + "text" + ], + "title": "OpenAIResponseOutputMessageFileSearchToolCallResults", + "description": "Search results returned by the file search operation." }, "description": "(Optional) Search results returned by the file search operation" } @@ -8515,6 +8547,13 @@ "$ref": "#/components/schemas/OpenAIResponseInputTool" } }, + "include": { + "type": "array", + "items": { + "type": "string" + }, + "description": "(Optional) Additional fields to include in the response." + }, "max_infer_iters": { "type": "integer" } diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 15d491a65..6a377a846 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -6021,14 +6021,44 @@ components: type: array items: type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object + properties: + attributes: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + description: >- + (Optional) Key-value attributes associated with the file + file_id: + type: string + description: >- + Unique identifier of the file containing the result + filename: + type: string + description: Name of the file containing the result + score: + type: number + description: >- + Relevance score for this search result (between 0 and 1) + text: + type: string + description: Text content of the search result + additionalProperties: false + required: + - attributes + - file_id + - filename + - score + - text + title: >- + 
OpenAIResponseOutputMessageFileSearchToolCallResults + description: >- + Search results returned by the file search operation. description: >- (Optional) Search results returned by the file search operation additionalProperties: false @@ -6188,6 +6218,12 @@ components: type: array items: $ref: '#/components/schemas/OpenAIResponseInputTool' + include: + type: array + items: + type: string + description: >- + (Optional) Additional fields to include in the response. max_infer_iters: type: integer additionalProperties: false diff --git a/docs/source/apis/external.md b/docs/source/apis/external.md index cc13deb9b..5831990b0 100644 --- a/docs/source/apis/external.md +++ b/docs/source/apis/external.md @@ -111,7 +111,7 @@ name = "llama-stack-api-weather" version = "0.1.0" description = "Weather API for Llama Stack" readme = "README.md" -requires-python = ">=3.10" +requires-python = ">=3.12" dependencies = ["llama-stack", "pydantic"] [build-system] @@ -231,7 +231,7 @@ name = "llama-stack-provider-kaze" version = "0.1.0" description = "Kaze weather provider for Llama Stack" readme = "README.md" -requires-python = ">=3.10" +requires-python = ">=3.12" dependencies = ["llama-stack", "pydantic", "aiohttp"] [build-system] diff --git a/docs/source/conf.py b/docs/source/conf.py index 20f1abf00..3f84d1310 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -131,6 +131,7 @@ html_static_path = ["../_static"] def setup(app): app.add_css_file("css/my_theme.css") app.add_js_file("js/detect_theme.js") + app.add_js_file("js/keyboard_shortcuts.js") def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]): url = f"https://hub.docker.com/r/llamastack/{text}" diff --git a/docs/source/contributing/index.md b/docs/source/contributing/index.md index 1e067ea6c..7a3a1c2e2 100644 --- a/docs/source/contributing/index.md +++ b/docs/source/contributing/index.md @@ -2,14 +2,28 @@ ```{include} ../../../CONTRIBUTING.md ``` -See the [Adding a New API 
Provider](new_api_provider.md) which describes how to add new API providers to the Stack. +## Adding a New Provider +See the [Adding a New API Provider Page](new_api_provider.md) which describes how to add new API providers to the Stack. +See the [Vector Database Page](new_vector_database.md) which describes how to add a new vector database to Llama Stack. +See the [External Provider Page](../providers/external/index.md) which describes how to add external providers to the Stack. ```{toctree} :maxdepth: 1 :hidden: new_api_provider -testing +new_vector_database ``` + +## Testing + +See the [Test Page](testing.md) which describes how to test your changes. +```{toctree} +:maxdepth: 1 +:hidden: +:caption: Testing + +testing +``` \ No newline at end of file diff --git a/docs/source/contributing/new_vector_database.md b/docs/source/contributing/new_vector_database.md new file mode 100644 index 000000000..83c0f55bc --- /dev/null +++ b/docs/source/contributing/new_vector_database.md @@ -0,0 +1,75 @@ +# Adding a New Vector Database + +This guide will walk you through the process of adding a new vector database to Llama Stack. + +> **_NOTE:_** Here's an example Pull Request of the [Milvus Vector Database Provider](https://github.com/meta-llama/llama-stack/pull/1467). + +Vector Database providers are used to store and retrieve vector embeddings. Vector databases are not limited to vector +search but can support keyword and hybrid search. Additionally, vector databases can also support operations like +filtering, sorting, and aggregating vectors. + +## Steps to Add a New Vector Database Provider +1. **Choose the Database Type**: Determine if your vector database is a remote service, inline, or both. + - Remote databases make requests to external services, while inline databases execute locally. Some providers support both. +2. **Implement the Provider**: Create a new provider class that inherits from `VectorDatabaseProvider` and implements the required methods. 
+ - Implement methods for vector storage, retrieval, search, and any additional features your database supports. + - You will need to implement the following methods for `YourVectorIndex`: + - `YourVectorIndex.create()` + - `YourVectorIndex.initialize()` + - `YourVectorIndex.add_chunks()` + - `YourVectorIndex.delete_chunk()` + - `YourVectorIndex.query_vector()` + - `YourVectorIndex.query_keyword()` + - `YourVectorIndex.query_hybrid()` + - You will need to implement the following methods for `YourVectorIOAdapter`: + - `YourVectorIOAdapter.initialize()` + - `YourVectorIOAdapter.shutdown()` + - `YourVectorIOAdapter.list_vector_dbs()` + - `YourVectorIOAdapter.register_vector_db()` + - `YourVectorIOAdapter.unregister_vector_db()` + - `YourVectorIOAdapter.insert_chunks()` + - `YourVectorIOAdapter.query_chunks()` + - `YourVectorIOAdapter.delete_chunks()` +3. **Add to Registry**: Register your provider in the appropriate registry file. + - Update {repopath}`llama_stack/providers/registry/vector_io.py` to include your new provider. +```python +from llama_stack.providers.registry.specs import InlineProviderSpec +from llama_stack.providers.registry.api import Api + +InlineProviderSpec( + api=Api.vector_io, + provider_type="inline::milvus", + pip_packages=["pymilvus>=2.4.10"], + module="llama_stack.providers.inline.vector_io.milvus", + config_class="llama_stack.providers.inline.vector_io.milvus.MilvusVectorIOConfig", + api_dependencies=[Api.inference], + optional_api_dependencies=[Api.files], + description="", +), +``` +4. **Add Tests**: Create unit tests and integration tests for your provider in the `tests/` directory. + - Unit Tests + - By following the structure of the class methods, you will be able to easily run unit and integration tests for your database. + 1. You have to configure the tests for your provider in `/tests/unit/providers/vector_io/conftest.py`. + 2. Update the `vector_provider` fixture to include your provider if it is an inline provider. + 3. 
Create a `your_vectorprovider_index` fixture that initializes your vector index. + 4. Create a `your_vectorprovider_adapter` fixture that initializes your vector adapter. + 5. Add your provider to the `vector_io_providers` fixture dictionary. + - Please follow the naming convention of `your_vectorprovider_index` and `your_vectorprovider_adapter` as the tests require this to execute properly. + - Integration Tests + - Integration tests are located in {repopath}`tests/integration`. These tests use the python client-SDK APIs (from the `llama_stack_client` package) to test functionality. + - The two sets of integration tests are: + - `tests/integration/vector_io/test_vector_io.py`: This file tests registration, insertion, and retrieval. + - `tests/integration/vector_io/test_openai_vector_stores.py`: These tests are for OpenAI-compatible vector stores and test the OpenAI API compatibility. + - You will need to update `skip_if_provider_doesnt_support_openai_vector_stores` to include your provider as well as `skip_if_provider_doesnt_support_openai_vector_stores_search` to test the appropriate search functionality. + - Running the tests in the GitHub CI + - You will need to update the `.github/workflows/integration-vector-io-tests.yml` file to include your provider. + - If your provider is a remote provider, you will also have to add a container to spin up and run it in the action. + - Updating the pyproject.toml + - If you are adding tests for the `inline` provider you will have to update the `unit` group. + - `uv add new_pip_package --group unit` + - If you are adding tests for the `remote` provider you will have to update the `test` group, which is used in the GitHub CI for integration tests. + - `uv add new_pip_package --group test` +5. **Update Documentation**: Please update the documentation for end users + - Generate the provider documentation by running {repopath}`./scripts/provider_codegen.py`. 
+ - Update the autogenerated content in the registry/vector_io.py file with information about your provider. Please see other providers for examples. \ No newline at end of file diff --git a/docs/source/contributing/testing.md b/docs/source/contributing/testing.md index 47bf9dea7..454ded266 100644 --- a/docs/source/contributing/testing.md +++ b/docs/source/contributing/testing.md @@ -1,6 +1,8 @@ -# Testing Llama Stack +```{include} ../../../tests/README.md +``` -Tests are of three different kinds: -- Unit tests -- Provider focused integration tests -- Client SDK tests +```{include} ../../../tests/unit/README.md +``` + +```{include} ../../../tests/integration/README.md +``` diff --git a/docs/source/providers/external/external-providers-guide.md b/docs/source/providers/external/external-providers-guide.md index 2479d406f..e2d4ebea9 100644 --- a/docs/source/providers/external/external-providers-guide.md +++ b/docs/source/providers/external/external-providers-guide.md @@ -226,7 +226,7 @@ uv init name = "llama-stack-provider-ollama" version = "0.1.0" description = "Ollama provider for Llama Stack" -requires-python = ">=3.10" +requires-python = ">=3.12" dependencies = ["llama-stack", "pydantic", "ollama", "aiohttp"] ``` diff --git a/docs/source/providers/inference/index.md b/docs/source/providers/inference/index.md index cdde3a18a..b6d215474 100644 --- a/docs/source/providers/inference/index.md +++ b/docs/source/providers/inference/index.md @@ -35,6 +35,7 @@ remote_runpod remote_sambanova remote_tgi remote_together +remote_vertexai remote_vllm remote_watsonx ``` diff --git a/docs/source/providers/inference/remote_vertexai.md b/docs/source/providers/inference/remote_vertexai.md new file mode 100644 index 000000000..962bbd76f --- /dev/null +++ b/docs/source/providers/inference/remote_vertexai.md @@ -0,0 +1,40 @@ +# remote::vertexai + +## Description + +Google Vertex AI inference provider enables you to use Google's Gemini models through Google Cloud's Vertex AI platform, 
providing several advantages: + +• Enterprise-grade security: Uses Google Cloud's security controls and IAM +• Better integration: Seamless integration with other Google Cloud services +• Advanced features: Access to additional Vertex AI features like model tuning and monitoring +• Authentication: Uses Google Cloud Application Default Credentials (ADC) instead of API keys + +Configuration: +- Set VERTEX_AI_PROJECT environment variable (required) +- Set VERTEX_AI_LOCATION environment variable (optional, defaults to us-central1) +- Use Google Cloud Application Default Credentials or service account key + +Authentication Setup: +Option 1 (Recommended): gcloud auth application-default login +Option 2: Set GOOGLE_APPLICATION_CREDENTIALS to service account key path + +Available Models: +- vertex_ai/gemini-2.0-flash +- vertex_ai/gemini-2.5-flash +- vertex_ai/gemini-2.5-pro + +## Configuration + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `project` | `` | No | | Google Cloud project ID for Vertex AI | +| `location` | `` | No | us-central1 | Google Cloud location for Vertex AI | + +## Sample Configuration + +```yaml +project: ${env.VERTEX_AI_PROJECT:=} +location: ${env.VERTEX_AI_LOCATION:=us-central1} + +``` + diff --git a/docs/source/providers/vector_io/inline_faiss.md b/docs/source/providers/vector_io/inline_faiss.md index bcff66f3f..cfa18a839 100644 --- a/docs/source/providers/vector_io/inline_faiss.md +++ b/docs/source/providers/vector_io/inline_faiss.md @@ -12,6 +12,18 @@ That means you'll get fast and efficient vector retrieval. 
- Lightweight and easy to use - Fully integrated with Llama Stack - GPU support +- **Vector search** - FAISS supports pure vector similarity search using embeddings + +## Search Modes + +**Supported:** +- **Vector Search** (`mode="vector"`): Performs vector similarity search using embeddings + +**Not Supported:** +- **Keyword Search** (`mode="keyword"`): Not supported by FAISS +- **Hybrid Search** (`mode="hybrid"`): Not supported by FAISS + +> **Note**: FAISS is designed as a pure vector similarity search library. See the [FAISS GitHub repository](https://github.com/facebookresearch/faiss) for more details about FAISS's core functionality. ## Usage diff --git a/docs/source/providers/vector_io/remote_milvus.md b/docs/source/providers/vector_io/remote_milvus.md index 3646f4acc..2af64b8bb 100644 --- a/docs/source/providers/vector_io/remote_milvus.md +++ b/docs/source/providers/vector_io/remote_milvus.md @@ -11,6 +11,7 @@ That means you're not limited to storing vectors in memory or in a separate serv - Easy to use - Fully integrated with Llama Stack +- Supports all search modes: vector, keyword, and hybrid search (both inline and remote configurations) ## Usage @@ -101,6 +102,92 @@ vector_io: - **`client_pem_path`**: Path to the **client certificate** file (required for mTLS). - **`client_key_path`**: Path to the **client private key** file (required for mTLS). +## Search Modes + +Milvus supports three different search modes for both inline and remote configurations: + +### Vector Search +Vector search uses semantic similarity to find the most relevant chunks based on embedding vectors. This is the default search mode and works well for finding conceptually similar content. 
+ +```python +# Vector search example +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="What is machine learning?", + search_mode="vector", + max_num_results=5, +) +``` + +### Keyword Search +Keyword search uses traditional text-based matching to find chunks containing specific terms or phrases. This is useful when you need exact term matches. + +```python +# Keyword search example +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="Python programming language", + search_mode="keyword", + max_num_results=5, +) +``` + +### Hybrid Search +Hybrid search combines both vector and keyword search methods to provide more comprehensive results. It leverages the strengths of both semantic similarity and exact term matching. + +#### Basic Hybrid Search +```python +# Basic hybrid search example (uses RRF ranker with default impact_factor=60.0) +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="neural networks in Python", + search_mode="hybrid", + max_num_results=5, +) +``` + +**Note**: The default `impact_factor` value of 60.0 was empirically determined to be optimal in the original RRF research paper: ["Reciprocal Rank Fusion outperforms Condorcet and individual Rank Learning Methods"](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) (Cormack et al., 2009). + +#### Hybrid Search with RRF (Reciprocal Rank Fusion) Ranker +RRF combines rankings from vector and keyword search by using reciprocal ranks. The impact factor controls how much weight is given to higher-ranked results. 
+ +```python +# Hybrid search with custom RRF parameters +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="neural networks in Python", + search_mode="hybrid", + max_num_results=5, + ranking_options={ + "ranker": { + "type": "rrf", + "impact_factor": 100.0, # Higher values give more weight to top-ranked results + } + }, +) +``` + +#### Hybrid Search with Weighted Ranker +Weighted ranker linearly combines normalized scores from vector and keyword search. The alpha parameter controls the balance between the two search methods. + +```python +# Hybrid search with weighted ranker +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="neural networks in Python", + search_mode="hybrid", + max_num_results=5, + ranking_options={ + "ranker": { + "type": "weighted", + "alpha": 0.7, # 70% vector search, 30% keyword search + } + }, +) +``` + +For detailed documentation on RRF and Weighted rankers, please refer to the [Milvus Reranking Guide](https://milvus.io/docs/reranking.md). + ## Documentation See the [Milvus documentation](https://milvus.io/docs/install-overview.md) for more details about Milvus in general. diff --git a/llama_stack/apis/agents/agents.py b/llama_stack/apis/agents/agents.py index e816da766..7dd3e9289 100644 --- a/llama_stack/apis/agents/agents.py +++ b/llama_stack/apis/agents/agents.py @@ -706,6 +706,7 @@ class Agents(Protocol): temperature: float | None = None, text: OpenAIResponseText | None = None, tools: list[OpenAIResponseInputTool] | None = None, + include: list[str] | None = None, max_infer_iters: int | None = 10, # this is an extension to the OpenAI API ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]: """Create a new OpenAI response. @@ -713,6 +714,7 @@ class Agents(Protocol): :param input: Input message(s) to create the response. :param model: The underlying LLM used for completions. 
:param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses. + :param include: (Optional) Additional fields to include in the response. :returns: An OpenAIResponseObject. """ ... diff --git a/llama_stack/apis/agents/openai_responses.py b/llama_stack/apis/agents/openai_responses.py index 10cadf38f..8574104dc 100644 --- a/llama_stack/apis/agents/openai_responses.py +++ b/llama_stack/apis/agents/openai_responses.py @@ -170,6 +170,23 @@ class OpenAIResponseOutputMessageWebSearchToolCall(BaseModel): type: Literal["web_search_call"] = "web_search_call" +class OpenAIResponseOutputMessageFileSearchToolCallResults(BaseModel): + """Search results returned by the file search operation. + + :param attributes: (Optional) Key-value attributes associated with the file + :param file_id: Unique identifier of the file containing the result + :param filename: Name of the file containing the result + :param score: Relevance score for this search result (between 0 and 1) + :param text: Text content of the search result + """ + + attributes: dict[str, Any] + file_id: str + filename: str + score: float + text: str + + @json_schema_type class OpenAIResponseOutputMessageFileSearchToolCall(BaseModel): """File search tool call output message for OpenAI responses. 
@@ -185,7 +202,7 @@ class OpenAIResponseOutputMessageFileSearchToolCall(BaseModel): queries: list[str] status: str type: Literal["file_search_call"] = "file_search_call" - results: list[dict[str, Any]] | None = None + results: list[OpenAIResponseOutputMessageFileSearchToolCallResults] | None = None @json_schema_type diff --git a/llama_stack/apis/common/errors.py b/llama_stack/apis/common/errors.py index c47c99f8d..7104d8db6 100644 --- a/llama_stack/apis/common/errors.py +++ b/llama_stack/apis/common/errors.py @@ -67,5 +67,14 @@ class SessionNotFoundError(ValueError): class ConflictError(ValueError): """raised when an operation cannot be performed due to a conflict with the current state""" - def __init__(self, message: str) -> None: + pass + + +class ModelTypeError(TypeError): + """raised when a model is present but not the correct type""" + + def __init__(self, model_name: str, model_type: str, expected_model_type: str) -> None: + message = ( + f"Model '{model_name}' is of type '{model_type}' rather than the expected type '{expected_model_type}'" + ) super().__init__(message) diff --git a/llama_stack/core/build.py b/llama_stack/core/build.py index b3e35ecef..4b20588fd 100644 --- a/llama_stack/core/build.py +++ b/llama_stack/core/build.py @@ -91,7 +91,7 @@ def get_provider_dependencies( def print_pip_install_help(config: BuildConfig): - normal_deps, special_deps = get_provider_dependencies(config) + normal_deps, special_deps, _ = get_provider_dependencies(config) cprint( f"Please install needed dependencies using the following commands:\n\nuv pip install {' '.join(normal_deps)}", diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py index 79ab7c34f..6a3f07247 100644 --- a/llama_stack/core/routers/inference.py +++ b/llama_stack/core/routers/inference.py @@ -18,7 +18,7 @@ from llama_stack.apis.common.content_types import ( InterleavedContent, InterleavedContentItem, ) -from llama_stack.apis.common.errors import ModelNotFoundError 
+from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError from llama_stack.apis.inference import ( BatchChatCompletionResponse, BatchCompletionResponse, @@ -65,7 +65,7 @@ from llama_stack.providers.datatypes import HealthResponse, HealthStatus, Routin from llama_stack.providers.utils.inference.inference_store import InferenceStore from llama_stack.providers.utils.telemetry.tracing import get_current_span -logger = get_logger(name=__name__, category="core") +logger = get_logger(name=__name__, category="inference") class InferenceRouter(Inference): @@ -177,6 +177,15 @@ class InferenceRouter(Inference): encoded = self.formatter.encode_content(messages) return len(encoded.tokens) if encoded and encoded.tokens else 0 + async def _get_model(self, model_id: str, expected_model_type: str) -> Model: + """takes a model id and gets model after ensuring that it is accessible and of the correct type""" + model = await self.routing_table.get_model(model_id) + if model is None: + raise ModelNotFoundError(model_id) + if model.model_type != expected_model_type: + raise ModelTypeError(model_id, model.model_type, expected_model_type) + return model + async def chat_completion( self, model_id: str, @@ -195,11 +204,7 @@ class InferenceRouter(Inference): ) if sampling_params is None: sampling_params = SamplingParams() - model = await self.routing_table.get_model(model_id) - if model is None: - raise ModelNotFoundError(model_id) - if model.model_type == ModelType.embedding: - raise ValueError(f"Model '{model_id}' is an embedding model and does not support chat completions") + model = await self._get_model(model_id, ModelType.llm) if tool_config: if tool_choice and tool_choice != tool_config.tool_choice: raise ValueError("tool_choice and tool_config.tool_choice must match") @@ -301,11 +306,7 @@ class InferenceRouter(Inference): logger.debug( f"InferenceRouter.completion: {model_id=}, {stream=}, {content=}, {sampling_params=}, {response_format=}", ) - model = await 
self.routing_table.get_model(model_id) - if model is None: - raise ModelNotFoundError(model_id) - if model.model_type == ModelType.embedding: - raise ValueError(f"Model '{model_id}' is an embedding model and does not support chat completions") + model = await self._get_model(model_id, ModelType.llm) provider = await self.routing_table.get_provider_impl(model_id) params = dict( model_id=model_id, @@ -355,11 +356,7 @@ class InferenceRouter(Inference): task_type: EmbeddingTaskType | None = None, ) -> EmbeddingsResponse: logger.debug(f"InferenceRouter.embeddings: {model_id}") - model = await self.routing_table.get_model(model_id) - if model is None: - raise ModelNotFoundError(model_id) - if model.model_type == ModelType.llm: - raise ValueError(f"Model '{model_id}' is an LLM model and does not support embeddings") + await self._get_model(model_id, ModelType.embedding) provider = await self.routing_table.get_provider_impl(model_id) return await provider.embeddings( model_id=model_id, @@ -395,12 +392,7 @@ class InferenceRouter(Inference): logger.debug( f"InferenceRouter.openai_completion: {model=}, {stream=}, {prompt=}", ) - model_obj = await self.routing_table.get_model(model) - if model_obj is None: - raise ModelNotFoundError(model) - if model_obj.model_type == ModelType.embedding: - raise ValueError(f"Model '{model}' is an embedding model and does not support completions") - + model_obj = await self._get_model(model, ModelType.llm) params = dict( model=model_obj.identifier, prompt=prompt, @@ -476,11 +468,7 @@ class InferenceRouter(Inference): logger.debug( f"InferenceRouter.openai_chat_completion: {model=}, {stream=}, {messages=}", ) - model_obj = await self.routing_table.get_model(model) - if model_obj is None: - raise ModelNotFoundError(model) - if model_obj.model_type == ModelType.embedding: - raise ValueError(f"Model '{model}' is an embedding model and does not support chat completions") + model_obj = await self._get_model(model, ModelType.llm) # Use the OpenAI 
client for a bit of extra input validation without # exposing the OpenAI client itself as part of our API surface @@ -567,12 +555,7 @@ class InferenceRouter(Inference): logger.debug( f"InferenceRouter.openai_embeddings: {model=}, input_type={type(input)}, {encoding_format=}, {dimensions=}", ) - model_obj = await self.routing_table.get_model(model) - if model_obj is None: - raise ModelNotFoundError(model) - if model_obj.model_type != ModelType.embedding: - raise ValueError(f"Model '{model}' is not an embedding model") - + model_obj = await self._get_model(model, ModelType.embedding) params = dict( model=model_obj.identifier, input=input, @@ -871,4 +854,5 @@ class InferenceRouter(Inference): model=model.identifier, object="chat.completion", ) + logger.debug(f"InferenceRouter.completion_response: {final_response}") await self.store.store_chat_completion(final_response, messages) diff --git a/llama_stack/core/routing_tables/models.py b/llama_stack/core/routing_tables/models.py index c76619271..34c431e00 100644 --- a/llama_stack/core/routing_tables/models.py +++ b/llama_stack/core/routing_tables/models.py @@ -63,6 +63,8 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models): async def get_provider_impl(self, model_id: str) -> Any: model = await lookup_model(self, model_id) + if model.provider_id not in self.impls_by_provider_id: + raise ValueError(f"Provider {model.provider_id} not found in the routing table") return self.impls_by_provider_id[model.provider_id] async def register_model( diff --git a/llama_stack/core/routing_tables/toolgroups.py b/llama_stack/core/routing_tables/toolgroups.py index e172af991..6910b3906 100644 --- a/llama_stack/core/routing_tables/toolgroups.py +++ b/llama_stack/core/routing_tables/toolgroups.py @@ -124,10 +124,7 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups): return toolgroup async def unregister_toolgroup(self, toolgroup_id: str) -> None: - tool_group = await self.get_tool_group(toolgroup_id) - if tool_group is 
None: - raise ToolGroupNotFoundError(toolgroup_id) - await self.unregister_object(tool_group) + await self.unregister_object(await self.get_tool_group(toolgroup_id)) async def shutdown(self) -> None: pass diff --git a/llama_stack/core/routing_tables/vector_dbs.py b/llama_stack/core/routing_tables/vector_dbs.py index c81a27a3b..e8dc46997 100644 --- a/llama_stack/core/routing_tables/vector_dbs.py +++ b/llama_stack/core/routing_tables/vector_dbs.py @@ -8,7 +8,7 @@ from typing import Any from pydantic import TypeAdapter -from llama_stack.apis.common.errors import ModelNotFoundError, VectorStoreNotFoundError +from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError, VectorStoreNotFoundError from llama_stack.apis.models import ModelType from llama_stack.apis.resource import ResourceType from llama_stack.apis.vector_dbs import ListVectorDBsResponse, VectorDB, VectorDBs @@ -66,7 +66,7 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs): if model is None: raise ModelNotFoundError(embedding_model) if model.model_type != ModelType.embedding: - raise ValueError(f"Model {embedding_model} is not an embedding model") + raise ModelTypeError(embedding_model, model.model_type, ModelType.embedding) if "embedding_dimension" not in model.metadata: raise ValueError(f"Model {embedding_model} does not have an embedding dimension") vector_db_data = { diff --git a/llama_stack/distributions/ci-tests/build.yaml b/llama_stack/distributions/ci-tests/build.yaml index 2f9ae8682..e6e699b62 100644 --- a/llama_stack/distributions/ci-tests/build.yaml +++ b/llama_stack/distributions/ci-tests/build.yaml @@ -14,6 +14,7 @@ distribution_spec: - provider_type: remote::openai - provider_type: remote::anthropic - provider_type: remote::gemini + - provider_type: remote::vertexai - provider_type: remote::groq - provider_type: remote::sambanova - provider_type: inline::sentence-transformers diff --git a/llama_stack/distributions/ci-tests/run.yaml 
b/llama_stack/distributions/ci-tests/run.yaml index 188c66275..05e1b4576 100644 --- a/llama_stack/distributions/ci-tests/run.yaml +++ b/llama_stack/distributions/ci-tests/run.yaml @@ -65,6 +65,11 @@ providers: provider_type: remote::gemini config: api_key: ${env.GEMINI_API_KEY:=} + - provider_id: ${env.VERTEX_AI_PROJECT:+vertexai} + provider_type: remote::vertexai + config: + project: ${env.VERTEX_AI_PROJECT:=} + location: ${env.VERTEX_AI_LOCATION:=us-central1} - provider_id: groq provider_type: remote::groq config: diff --git a/llama_stack/distributions/starter/build.yaml b/llama_stack/distributions/starter/build.yaml index f95a03a9e..1a4f81d49 100644 --- a/llama_stack/distributions/starter/build.yaml +++ b/llama_stack/distributions/starter/build.yaml @@ -14,6 +14,7 @@ distribution_spec: - provider_type: remote::openai - provider_type: remote::anthropic - provider_type: remote::gemini + - provider_type: remote::vertexai - provider_type: remote::groq - provider_type: remote::sambanova - provider_type: inline::sentence-transformers diff --git a/llama_stack/distributions/starter/run.yaml b/llama_stack/distributions/starter/run.yaml index 8bd737686..46bd12956 100644 --- a/llama_stack/distributions/starter/run.yaml +++ b/llama_stack/distributions/starter/run.yaml @@ -65,6 +65,11 @@ providers: provider_type: remote::gemini config: api_key: ${env.GEMINI_API_KEY:=} + - provider_id: ${env.VERTEX_AI_PROJECT:+vertexai} + provider_type: remote::vertexai + config: + project: ${env.VERTEX_AI_PROJECT:=} + location: ${env.VERTEX_AI_LOCATION:=us-central1} - provider_id: groq provider_type: remote::groq config: diff --git a/llama_stack/distributions/starter/starter.py b/llama_stack/distributions/starter/starter.py index a970f2d1c..0270b68ad 100644 --- a/llama_stack/distributions/starter/starter.py +++ b/llama_stack/distributions/starter/starter.py @@ -56,6 +56,7 @@ ENABLED_INFERENCE_PROVIDERS = [ "fireworks", "together", "gemini", + "vertexai", "groq", "sambanova", "anthropic", @@ 
-71,6 +72,7 @@ INFERENCE_PROVIDER_IDS = { "tgi": "${env.TGI_URL:+tgi}", "cerebras": "${env.CEREBRAS_API_KEY:+cerebras}", "nvidia": "${env.NVIDIA_API_KEY:+nvidia}", + "vertexai": "${env.VERTEX_AI_PROJECT:+vertexai}", } @@ -246,6 +248,14 @@ def get_distribution_template() -> DistributionTemplate: "", "Gemini API Key", ), + "VERTEX_AI_PROJECT": ( + "", + "Google Cloud Project ID for Vertex AI", + ), + "VERTEX_AI_LOCATION": ( + "us-central1", + "Google Cloud Location for Vertex AI", + ), "SAMBANOVA_API_KEY": ( "", "SambaNova API Key", diff --git a/llama_stack/log.py b/llama_stack/log.py index ab53e08c0..7507aface 100644 --- a/llama_stack/log.py +++ b/llama_stack/log.py @@ -32,6 +32,7 @@ CATEGORIES = [ "tools", "client", "telemetry", + "openai_responses", ] # Initialize category levels with default level @@ -99,7 +100,8 @@ def parse_environment_config(env_config: str) -> dict[str, int]: Dict[str, int]: A dictionary mapping categories to their log levels. """ category_levels = {} - for pair in env_config.split(";"): + delimiter = "," + for pair in env_config.split(delimiter): if not pair.strip(): continue diff --git a/llama_stack/models/llama/llama3/chat_format.py b/llama_stack/models/llama/llama3/chat_format.py index 0a973cf0c..1f88a1699 100644 --- a/llama_stack/models/llama/llama3/chat_format.py +++ b/llama_stack/models/llama/llama3/chat_format.py @@ -236,6 +236,7 @@ class ChatFormat: arguments_json=json.dumps(tool_arguments), ) ) + content = "" return RawMessage( role="assistant", diff --git a/llama_stack/providers/inline/agents/meta_reference/agents.py b/llama_stack/providers/inline/agents/meta_reference/agents.py index 15695ec48..0f12a0865 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agents.py +++ b/llama_stack/providers/inline/agents/meta_reference/agents.py @@ -327,10 +327,21 @@ class MetaReferenceAgentsImpl(Agents): temperature: float | None = None, text: OpenAIResponseText | None = None, tools: list[OpenAIResponseInputTool] | None = None, + 
include: list[str] | None = None, max_infer_iters: int | None = 10, ) -> OpenAIResponseObject: return await self.openai_responses_impl.create_openai_response( - input, model, instructions, previous_response_id, store, stream, temperature, text, tools, max_infer_iters + input, + model, + instructions, + previous_response_id, + store, + stream, + temperature, + text, + tools, + include, + max_infer_iters, ) async def list_openai_responses( diff --git a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py index 7eb2b3897..347954908 100644 --- a/llama_stack/providers/inline/agents/meta_reference/openai_responses.py +++ b/llama_stack/providers/inline/agents/meta_reference/openai_responses.py @@ -38,6 +38,7 @@ from llama_stack.apis.agents.openai_responses import ( OpenAIResponseOutputMessageContent, OpenAIResponseOutputMessageContentOutputText, OpenAIResponseOutputMessageFileSearchToolCall, + OpenAIResponseOutputMessageFileSearchToolCallResults, OpenAIResponseOutputMessageFunctionToolCall, OpenAIResponseOutputMessageMCPListTools, OpenAIResponseOutputMessageWebSearchToolCall, @@ -333,6 +334,7 @@ class OpenAIResponsesImpl: temperature: float | None = None, text: OpenAIResponseText | None = None, tools: list[OpenAIResponseInputTool] | None = None, + include: list[str] | None = None, max_infer_iters: int | None = 10, ): stream = bool(stream) @@ -486,8 +488,12 @@ class OpenAIResponsesImpl: # Convert collected chunks to complete response if chat_response_tool_calls: tool_calls = [chat_response_tool_calls[i] for i in sorted(chat_response_tool_calls.keys())] + + # when there are tool calls, we need to clear the content + chat_response_content = [] else: tool_calls = None + assistant_message = OpenAIAssistantMessageParam( content="".join(chat_response_content), tool_calls=tool_calls, @@ -826,12 +832,13 @@ class OpenAIResponsesImpl: text = result.metadata["chunks"][i] if "chunks" in 
result.metadata else None score = result.metadata["scores"][i] if "scores" in result.metadata else None message.results.append( - { - "file_id": doc_id, - "filename": doc_id, - "text": text, - "score": score, - } + OpenAIResponseOutputMessageFileSearchToolCallResults( + file_id=doc_id, + filename=doc_id, + text=text, + score=score, + attributes={}, + ) ) if error_exc or (result.error_code and result.error_code > 0) or result.error_message: message.status = "failed" diff --git a/llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py b/llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py index 796771ee1..801500dee 100644 --- a/llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +++ b/llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py @@ -15,6 +15,7 @@ from llama_stack.apis.safety import ( RunShieldResponse, Safety, SafetyViolation, + ShieldStore, ViolationLevel, ) from llama_stack.apis.shields import Shield @@ -32,6 +33,8 @@ PROMPT_GUARD_MODEL = "Prompt-Guard-86M" class PromptGuardSafetyImpl(Safety, ShieldsProtocolPrivate): + shield_store: ShieldStore + def __init__(self, config: PromptGuardConfig, _deps) -> None: self.config = config @@ -53,7 +56,7 @@ class PromptGuardSafetyImpl(Safety, ShieldsProtocolPrivate): self, shield_id: str, messages: list[Message], - params: dict[str, Any] = None, + params: dict[str, Any], ) -> RunShieldResponse: shield = await self.shield_store.get_shield(shield_id) if not shield: @@ -61,6 +64,9 @@ class PromptGuardSafetyImpl(Safety, ShieldsProtocolPrivate): return await self.shield.run(messages) + async def run_moderation(self, input: str | list[str], model: str): + raise NotImplementedError("run_moderation not implemented for PromptGuard") + class PromptGuardShield: def __init__( @@ -117,8 +123,10 @@ class PromptGuardShield: elif self.config.guard_type == PromptGuardType.jailbreak.value and score_malicious > self.threshold: violation = SafetyViolation( violation_level=ViolationLevel.ERROR, 
- violation_type=f"prompt_injection:malicious={score_malicious}", - violation_return_message="Sorry, I cannot do this.", + user_message="Sorry, I cannot do this.", + metadata={ + "violation_type": f"prompt_injection:malicious={score_malicious}", + }, ) return RunShieldResponse(violation=violation) diff --git a/llama_stack/providers/inline/vector_io/faiss/faiss.py b/llama_stack/providers/inline/vector_io/faiss/faiss.py index 7a5373726..af61da59b 100644 --- a/llama_stack/providers/inline/vector_io/faiss/faiss.py +++ b/llama_stack/providers/inline/vector_io/faiss/faiss.py @@ -33,6 +33,7 @@ from llama_stack.providers.utils.kvstore import kvstore_impl from llama_stack.providers.utils.kvstore.api import KVStore from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin from llama_stack.providers.utils.memory.vector_store import ( + ChunkForDeletion, EmbeddingIndex, VectorDBWithIndex, ) @@ -128,11 +129,12 @@ class FaissIndex(EmbeddingIndex): # Save updated index await self._save_index() - async def delete_chunk(self, chunk_id: str) -> None: - if chunk_id not in self.chunk_ids: + async def delete_chunks(self, chunks_for_deletion: list[ChunkForDeletion]) -> None: + chunk_ids = [c.chunk_id for c in chunks_for_deletion] + if not set(chunk_ids).issubset(self.chunk_ids): return - async with self.chunk_id_lock: + def remove_chunk(chunk_id: str): index = self.chunk_ids.index(chunk_id) self.index.remove_ids(np.array([index])) @@ -146,6 +148,10 @@ class FaissIndex(EmbeddingIndex): self.chunk_by_index = new_chunk_by_index self.chunk_ids.pop(index) + async with self.chunk_id_lock: + for chunk_id in chunk_ids: + remove_chunk(chunk_id) + await self._save_index() async def query_vector( @@ -174,7 +180,9 @@ class FaissIndex(EmbeddingIndex): k: int, score_threshold: float, ) -> QueryChunksResponse: - raise NotImplementedError("Keyword search is not supported in FAISS") + raise NotImplementedError( + "Keyword search is not supported - underlying DB 
FAISS does not support this search mode" + ) async def query_hybrid( self, @@ -185,7 +193,9 @@ class FaissIndex(EmbeddingIndex): reranker_type: str, reranker_params: dict[str, Any] | None = None, ) -> QueryChunksResponse: - raise NotImplementedError("Hybrid search is not supported in FAISS") + raise NotImplementedError( + "Hybrid search is not supported - underlying DB FAISS does not support this search mode" + ) class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate): @@ -293,8 +303,7 @@ class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPr return await index.query_chunks(query, params) - async def delete_chunks(self, store_id: str, chunk_ids: list[str]) -> None: - """Delete a chunk from a faiss index""" + async def delete_chunks(self, store_id: str, chunks_for_deletion: list[ChunkForDeletion]) -> None: + """Delete chunks from a faiss index""" faiss_index = self.cache[store_id].index - for chunk_id in chunk_ids: - await faiss_index.delete_chunk(chunk_id) + await faiss_index.delete_chunks(chunks_for_deletion) diff --git a/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py b/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py index 1fff7b484..cc1982f3b 100644 --- a/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +++ b/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py @@ -31,6 +31,7 @@ from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIV from llama_stack.providers.utils.memory.vector_store import ( RERANKER_TYPE_RRF, RERANKER_TYPE_WEIGHTED, + ChunkForDeletion, EmbeddingIndex, VectorDBWithIndex, ) @@ -426,34 +427,36 @@ class SQLiteVecIndex(EmbeddingIndex): return QueryChunksResponse(chunks=chunks, scores=scores) - async def delete_chunk(self, chunk_id: str) -> None: + async def delete_chunks(self, chunks_for_deletion: list[ChunkForDeletion]) -> None: """Remove a chunk from the SQLite vector store.""" + chunk_ids = [c.chunk_id for 
c in chunks_for_deletion] - def _delete_chunk(): + def _delete_chunks(): connection = _create_sqlite_connection(self.db_path) cur = connection.cursor() try: cur.execute("BEGIN TRANSACTION") # Delete from metadata table - cur.execute(f"DELETE FROM {self.metadata_table} WHERE id = ?", (chunk_id,)) + placeholders = ",".join("?" * len(chunk_ids)) + cur.execute(f"DELETE FROM {self.metadata_table} WHERE id IN ({placeholders})", chunk_ids) # Delete from vector table - cur.execute(f"DELETE FROM {self.vector_table} WHERE id = ?", (chunk_id,)) + cur.execute(f"DELETE FROM {self.vector_table} WHERE id IN ({placeholders})", chunk_ids) # Delete from FTS table - cur.execute(f"DELETE FROM {self.fts_table} WHERE id = ?", (chunk_id,)) + cur.execute(f"DELETE FROM {self.fts_table} WHERE id IN ({placeholders})", chunk_ids) connection.commit() except Exception as e: connection.rollback() - logger.error(f"Error deleting chunk {chunk_id}: {e}") + logger.error(f"Error deleting chunks: {e}") raise finally: cur.close() connection.close() - await asyncio.to_thread(_delete_chunk) + await asyncio.to_thread(_delete_chunks) class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate): @@ -551,12 +554,10 @@ class SQLiteVecVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoc raise VectorStoreNotFoundError(vector_db_id) return await index.query_chunks(query, params) - async def delete_chunks(self, store_id: str, chunk_ids: list[str]) -> None: - """Delete a chunk from a sqlite_vec index.""" + async def delete_chunks(self, store_id: str, chunks_for_deletion: list[ChunkForDeletion]) -> None: + """Delete chunks from a sqlite_vec index.""" index = await self._get_and_cache_vector_db_index(store_id) if not index: raise VectorStoreNotFoundError(store_id) - for chunk_id in chunk_ids: - # Use the index's delete_chunk method - await index.index.delete_chunk(chunk_id) + await index.index.delete_chunks(chunks_for_deletion) diff --git 
a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py index a8bc96a77..1801cdcad 100644 --- a/llama_stack/providers/registry/inference.py +++ b/llama_stack/providers/registry/inference.py @@ -213,6 +213,36 @@ def available_providers() -> list[ProviderSpec]: description="Google Gemini inference provider for accessing Gemini models and Google's AI services.", ), ), + remote_provider_spec( + api=Api.inference, + adapter=AdapterSpec( + adapter_type="vertexai", + pip_packages=["litellm", "google-cloud-aiplatform"], + module="llama_stack.providers.remote.inference.vertexai", + config_class="llama_stack.providers.remote.inference.vertexai.VertexAIConfig", + provider_data_validator="llama_stack.providers.remote.inference.vertexai.config.VertexAIProviderDataValidator", + description="""Google Vertex AI inference provider enables you to use Google's Gemini models through Google Cloud's Vertex AI platform, providing several advantages: + +• Enterprise-grade security: Uses Google Cloud's security controls and IAM +• Better integration: Seamless integration with other Google Cloud services +• Advanced features: Access to additional Vertex AI features like model tuning and monitoring +• Authentication: Uses Google Cloud Application Default Credentials (ADC) instead of API keys + +Configuration: +- Set VERTEX_AI_PROJECT environment variable (required) +- Set VERTEX_AI_LOCATION environment variable (optional, defaults to us-central1) +- Use Google Cloud Application Default Credentials or service account key + +Authentication Setup: +Option 1 (Recommended): gcloud auth application-default login +Option 2: Set GOOGLE_APPLICATION_CREDENTIALS to service account key path + +Available Models: +- vertex_ai/gemini-2.0-flash +- vertex_ai/gemini-2.5-flash +- vertex_ai/gemini-2.5-pro""", + ), + ), remote_provider_spec( api=Api.inference, adapter=AdapterSpec( diff --git a/llama_stack/providers/registry/vector_io.py 
b/llama_stack/providers/registry/vector_io.py index 846f7b88e..70148eb15 100644 --- a/llama_stack/providers/registry/vector_io.py +++ b/llama_stack/providers/registry/vector_io.py @@ -45,6 +45,18 @@ That means you'll get fast and efficient vector retrieval. - Lightweight and easy to use - Fully integrated with Llama Stack - GPU support +- **Vector search** - FAISS supports pure vector similarity search using embeddings + +## Search Modes + +**Supported:** +- **Vector Search** (`mode="vector"`): Performs vector similarity search using embeddings + +**Not Supported:** +- **Keyword Search** (`mode="keyword"`): Not supported by FAISS +- **Hybrid Search** (`mode="hybrid"`): Not supported by FAISS + +> **Note**: FAISS is designed as a pure vector similarity search library. See the [FAISS GitHub repository](https://github.com/facebookresearch/faiss) for more details about FAISS's core functionality. ## Usage @@ -330,6 +342,7 @@ See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introducti """, ), api_dependencies=[Api.inference], + optional_api_dependencies=[Api.files], ), InlineProviderSpec( api=Api.vector_io, @@ -338,6 +351,7 @@ See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introducti module="llama_stack.providers.inline.vector_io.chroma", config_class="llama_stack.providers.inline.vector_io.chroma.ChromaVectorIOConfig", api_dependencies=[Api.inference], + optional_api_dependencies=[Api.files], description=""" [Chroma](https://www.trychroma.com/) is an inline and remote vector database provider for Llama Stack. It allows you to store and query vectors directly within a Chroma database. 
@@ -452,6 +466,7 @@ See [Weaviate's documentation](https://weaviate.io/developers/weaviate) for more """, ), api_dependencies=[Api.inference], + optional_api_dependencies=[Api.files], ), InlineProviderSpec( api=Api.vector_io, @@ -535,6 +550,7 @@ That means you're not limited to storing vectors in memory or in a separate serv - Easy to use - Fully integrated with Llama Stack +- Supports all search modes: vector, keyword, and hybrid search (both inline and remote configurations) ## Usage @@ -625,6 +641,92 @@ vector_io: - **`client_pem_path`**: Path to the **client certificate** file (required for mTLS). - **`client_key_path`**: Path to the **client private key** file (required for mTLS). +## Search Modes + +Milvus supports three different search modes for both inline and remote configurations: + +### Vector Search +Vector search uses semantic similarity to find the most relevant chunks based on embedding vectors. This is the default search mode and works well for finding conceptually similar content. + +```python +# Vector search example +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="What is machine learning?", + search_mode="vector", + max_num_results=5, +) +``` + +### Keyword Search +Keyword search uses traditional text-based matching to find chunks containing specific terms or phrases. This is useful when you need exact term matches. + +```python +# Keyword search example +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="Python programming language", + search_mode="keyword", + max_num_results=5, +) +``` + +### Hybrid Search +Hybrid search combines both vector and keyword search methods to provide more comprehensive results. It leverages the strengths of both semantic similarity and exact term matching. 
+ +#### Basic Hybrid Search +```python +# Basic hybrid search example (uses RRF ranker with default impact_factor=60.0) +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="neural networks in Python", + search_mode="hybrid", + max_num_results=5, +) +``` + +**Note**: The default `impact_factor` value of 60.0 was empirically determined to be optimal in the original RRF research paper: ["Reciprocal Rank Fusion outperforms Condorcet and individual Rank Learning Methods"](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) (Cormack et al., 2009). + +#### Hybrid Search with RRF (Reciprocal Rank Fusion) Ranker +RRF combines rankings from vector and keyword search by using reciprocal ranks. The impact factor controls how much weight is given to higher-ranked results. + +```python +# Hybrid search with custom RRF parameters +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="neural networks in Python", + search_mode="hybrid", + max_num_results=5, + ranking_options={ + "ranker": { + "type": "rrf", + "impact_factor": 100.0, # Higher values give more weight to top-ranked results + } + }, +) +``` + +#### Hybrid Search with Weighted Ranker +Weighted ranker linearly combines normalized scores from vector and keyword search. The alpha parameter controls the balance between the two search methods. + +```python +# Hybrid search with weighted ranker +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="neural networks in Python", + search_mode="hybrid", + max_num_results=5, + ranking_options={ + "ranker": { + "type": "weighted", + "alpha": 0.7, # 70% vector search, 30% keyword search + } + }, +) +``` + +For detailed documentation on RRF and Weighted rankers, please refer to the [Milvus Reranking Guide](https://milvus.io/docs/reranking.md). 
+ ## Documentation See the [Milvus documentation](https://milvus.io/docs/install-overview.md) for more details about Milvus in general. @@ -632,6 +734,7 @@ For more details on TLS configuration, refer to the [TLS setup guide](https://mi """, ), api_dependencies=[Api.inference], + optional_api_dependencies=[Api.files], ), InlineProviderSpec( api=Api.vector_io, diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py index ca4c7b578..bd86f7238 100644 --- a/llama_stack/providers/remote/inference/fireworks/fireworks.py +++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py @@ -235,6 +235,7 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv llama_model = self.get_llama_model(request.model) if isinstance(request, ChatCompletionRequest): + # TODO: tools are never added to the request, so we need to add them here if media_present or not llama_model: input_dict["messages"] = [ await convert_message_to_openai_dict(m, download=True) for m in request.messages @@ -378,6 +379,7 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv # Fireworks chat completions OpenAI-compatible API does not support # tool calls properly. 
llama_model = self.get_llama_model(model_obj.provider_resource_id) + if llama_model: return await OpenAIChatCompletionToLlamaStackMixin.openai_chat_completion( self, @@ -431,4 +433,5 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv user=user, ) + logger.debug(f"fireworks params: {params}") return await self._get_openai_client().chat.completions.create(model=model_obj.provider_resource_id, **params) diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py index 26b4dec76..a93421536 100644 --- a/llama_stack/providers/remote/inference/ollama/ollama.py +++ b/llama_stack/providers/remote/inference/ollama/ollama.py @@ -457,9 +457,6 @@ class OllamaInferenceAdapter( user: str | None = None, ) -> OpenAIEmbeddingsResponse: model_obj = await self._get_model(model) - if model_obj.model_type != ModelType.embedding: - raise ValueError(f"Model {model} is not an embedding model") - if model_obj.provider_resource_id is None: raise ValueError(f"Model {model} has no provider_resource_id set") diff --git a/llama_stack/providers/remote/inference/vertexai/__init__.py b/llama_stack/providers/remote/inference/vertexai/__init__.py new file mode 100644 index 000000000..d9e9419be --- /dev/null +++ b/llama_stack/providers/remote/inference/vertexai/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from .config import VertexAIConfig + + +async def get_adapter_impl(config: VertexAIConfig, _deps): + from .vertexai import VertexAIInferenceAdapter + + impl = VertexAIInferenceAdapter(config) + await impl.initialize() + return impl diff --git a/llama_stack/providers/remote/inference/vertexai/config.py b/llama_stack/providers/remote/inference/vertexai/config.py new file mode 100644 index 000000000..659de653e --- /dev/null +++ b/llama_stack/providers/remote/inference/vertexai/config.py @@ -0,0 +1,45 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from typing import Any + +from pydantic import BaseModel, Field + +from llama_stack.schema_utils import json_schema_type + + +class VertexAIProviderDataValidator(BaseModel): + vertex_project: str | None = Field( + default=None, + description="Google Cloud project ID for Vertex AI", + ) + vertex_location: str | None = Field( + default=None, + description="Google Cloud location for Vertex AI (e.g., us-central1)", + ) + + +@json_schema_type +class VertexAIConfig(BaseModel): + project: str = Field( + description="Google Cloud project ID for Vertex AI", + ) + location: str = Field( + default="us-central1", + description="Google Cloud location for Vertex AI", + ) + + @classmethod + def sample_run_config( + cls, + project: str = "${env.VERTEX_AI_PROJECT:=}", + location: str = "${env.VERTEX_AI_LOCATION:=us-central1}", + **kwargs, + ) -> dict[str, Any]: + return { + "project": project, + "location": location, + } diff --git a/llama_stack/providers/remote/inference/vertexai/models.py b/llama_stack/providers/remote/inference/vertexai/models.py new file mode 100644 index 000000000..e72db533d --- /dev/null +++ b/llama_stack/providers/remote/inference/vertexai/models.py @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from llama_stack.providers.utils.inference.model_registry import ( + ProviderModelEntry, +) + +# Vertex AI model IDs with vertex_ai/ prefix as required by litellm +LLM_MODEL_IDS = [ + "vertex_ai/gemini-2.0-flash", + "vertex_ai/gemini-2.5-flash", + "vertex_ai/gemini-2.5-pro", +] + +SAFETY_MODELS_ENTRIES = list[ProviderModelEntry]() + +MODEL_ENTRIES = [ProviderModelEntry(provider_model_id=m) for m in LLM_MODEL_IDS] + SAFETY_MODELS_ENTRIES diff --git a/llama_stack/providers/remote/inference/vertexai/vertexai.py b/llama_stack/providers/remote/inference/vertexai/vertexai.py new file mode 100644 index 000000000..8807fd0e6 --- /dev/null +++ b/llama_stack/providers/remote/inference/vertexai/vertexai.py @@ -0,0 +1,52 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from typing import Any + +from llama_stack.apis.inference import ChatCompletionRequest +from llama_stack.providers.utils.inference.litellm_openai_mixin import ( + LiteLLMOpenAIMixin, +) + +from .config import VertexAIConfig +from .models import MODEL_ENTRIES + + +class VertexAIInferenceAdapter(LiteLLMOpenAIMixin): + def __init__(self, config: VertexAIConfig) -> None: + LiteLLMOpenAIMixin.__init__( + self, + MODEL_ENTRIES, + litellm_provider_name="vertex_ai", + api_key_from_config=None, # Vertex AI uses ADC, not API keys + provider_data_api_key_field="vertex_project", # Use project for validation + ) + self.config = config + + def get_api_key(self) -> str: + # Vertex AI doesn't use API keys, it uses Application Default Credentials + # Return empty string to let litellm handle authentication via ADC + return "" + + async def _get_params(self, request: ChatCompletionRequest) -> dict[str, Any]: + # Get base parameters from parent + params = await super()._get_params(request) + + # Add Vertex AI specific parameters + provider_data = self.get_request_provider_data() + if provider_data: + if getattr(provider_data, "vertex_project", None): + params["vertex_project"] = provider_data.vertex_project + if getattr(provider_data, "vertex_location", None): + params["vertex_location"] = provider_data.vertex_location + else: + params["vertex_project"] = self.config.project + params["vertex_location"] = self.config.location + + # Remove api_key since Vertex AI uses ADC + params.pop("api_key", None) + + return params diff --git a/llama_stack/providers/remote/vector_io/chroma/chroma.py b/llama_stack/providers/remote/vector_io/chroma/chroma.py index 26aeaedfb..8f252711b 100644 --- a/llama_stack/providers/remote/vector_io/chroma/chroma.py +++ b/llama_stack/providers/remote/vector_io/chroma/chroma.py @@ -26,6 +26,7 @@ from llama_stack.providers.utils.kvstore import kvstore_impl from llama_stack.providers.utils.kvstore.api import KVStore from 
llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin from llama_stack.providers.utils.memory.vector_store import ( + ChunkForDeletion, EmbeddingIndex, VectorDBWithIndex, ) @@ -115,8 +116,10 @@ class ChromaIndex(EmbeddingIndex): ) -> QueryChunksResponse: raise NotImplementedError("Keyword search is not supported in Chroma") - async def delete_chunk(self, chunk_id: str) -> None: - raise NotImplementedError("delete_chunk is not supported in Chroma") + async def delete_chunks(self, chunks_for_deletion: list[ChunkForDeletion]) -> None: + """Delete a single chunk from the Chroma collection by its ID.""" + ids = [f"{chunk.document_id}:{chunk.chunk_id}" for chunk in chunks_for_deletion] + await maybe_await(self.collection.delete(ids=ids)) async def query_hybrid( self, @@ -144,6 +147,7 @@ class ChromaVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolP self.cache = {} self.kvstore: KVStore | None = None self.vector_db_store = None + self.files_api = files_api async def initialize(self) -> None: self.kvstore = await kvstore_impl(self.config.kvstore) @@ -227,5 +231,10 @@ class ChromaVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolP self.cache[vector_db_id] = index return index - async def delete_chunks(self, store_id: str, chunk_ids: list[str]) -> None: - raise NotImplementedError("OpenAI Vector Stores API is not supported in Chroma") + async def delete_chunks(self, store_id: str, chunks_for_deletion: list[ChunkForDeletion]) -> None: + """Delete chunks from a Chroma vector store.""" + index = await self._get_and_cache_vector_db_index(store_id) + if not index: + raise ValueError(f"Vector DB {store_id} not found") + + await index.index.delete_chunks(chunks_for_deletion) diff --git a/llama_stack/providers/remote/vector_io/milvus/milvus.py b/llama_stack/providers/remote/vector_io/milvus/milvus.py index b09edb65c..0eaae81b3 100644 --- a/llama_stack/providers/remote/vector_io/milvus/milvus.py +++ 
b/llama_stack/providers/remote/vector_io/milvus/milvus.py @@ -28,6 +28,7 @@ from llama_stack.providers.utils.kvstore.api import KVStore from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin from llama_stack.providers.utils.memory.vector_store import ( RERANKER_TYPE_WEIGHTED, + ChunkForDeletion, EmbeddingIndex, VectorDBWithIndex, ) @@ -287,14 +288,17 @@ class MilvusIndex(EmbeddingIndex): return QueryChunksResponse(chunks=filtered_chunks, scores=filtered_scores) - async def delete_chunk(self, chunk_id: str) -> None: + async def delete_chunks(self, chunks_for_deletion: list[ChunkForDeletion]) -> None: """Remove a chunk from the Milvus collection.""" + chunk_ids = [c.chunk_id for c in chunks_for_deletion] try: + # Use IN clause with square brackets and single quotes for VARCHAR field + chunk_ids_str = ", ".join(f"'{chunk_id}'" for chunk_id in chunk_ids) await asyncio.to_thread( - self.client.delete, collection_name=self.collection_name, filter=f'chunk_id == "{chunk_id}"' + self.client.delete, collection_name=self.collection_name, filter=f"chunk_id in [{chunk_ids_str}]" ) except Exception as e: - logger.error(f"Error deleting chunk {chunk_id} from Milvus collection {self.collection_name}: {e}") + logger.error(f"Error deleting chunks from Milvus collection {self.collection_name}: {e}") raise @@ -420,12 +424,10 @@ class MilvusVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolP return await index.query_chunks(query, params) - async def delete_chunks(self, store_id: str, chunk_ids: list[str]) -> None: + async def delete_chunks(self, store_id: str, chunks_for_deletion: list[ChunkForDeletion]) -> None: """Delete a chunk from a milvus vector store.""" index = await self._get_and_cache_vector_db_index(store_id) if not index: raise VectorStoreNotFoundError(store_id) - for chunk_id in chunk_ids: - # Use the index's delete_chunk method - await index.index.delete_chunk(chunk_id) + await 
index.index.delete_chunks(chunks_for_deletion) diff --git a/llama_stack/providers/remote/vector_io/pgvector/pgvector.py b/llama_stack/providers/remote/vector_io/pgvector/pgvector.py index b1645ac5a..d2a5d910b 100644 --- a/llama_stack/providers/remote/vector_io/pgvector/pgvector.py +++ b/llama_stack/providers/remote/vector_io/pgvector/pgvector.py @@ -27,6 +27,7 @@ from llama_stack.providers.utils.kvstore import kvstore_impl from llama_stack.providers.utils.kvstore.api import KVStore from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin from llama_stack.providers.utils.memory.vector_store import ( + ChunkForDeletion, EmbeddingIndex, VectorDBWithIndex, ) @@ -163,10 +164,11 @@ class PGVectorIndex(EmbeddingIndex): with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: cur.execute(f"DROP TABLE IF EXISTS {self.table_name}") - async def delete_chunk(self, chunk_id: str) -> None: + async def delete_chunks(self, chunks_for_deletion: list[ChunkForDeletion]) -> None: """Remove a chunk from the PostgreSQL table.""" + chunk_ids = [c.chunk_id for c in chunks_for_deletion] with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: - cur.execute(f"DELETE FROM {self.table_name} WHERE id = %s", (chunk_id,)) + cur.execute(f"DELETE FROM {self.table_name} WHERE id = ANY(%s)", (chunk_ids,)) class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate): @@ -275,12 +277,10 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoco self.cache[vector_db_id] = VectorDBWithIndex(vector_db, index, self.inference_api) return self.cache[vector_db_id] - async def delete_chunks(self, store_id: str, chunk_ids: list[str]) -> None: + async def delete_chunks(self, store_id: str, chunks_for_deletion: list[ChunkForDeletion]) -> None: """Delete a chunk from a PostgreSQL vector store.""" index = await self._get_and_cache_vector_db_index(store_id) if not index: raise 
VectorStoreNotFoundError(store_id) - for chunk_id in chunk_ids: - # Use the index's delete_chunk method - await index.index.delete_chunk(chunk_id) + await index.index.delete_chunks(chunks_for_deletion) diff --git a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py index 144da0f4f..018015780 100644 --- a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py +++ b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py @@ -29,6 +29,7 @@ from llama_stack.providers.inline.vector_io.qdrant import QdrantVectorIOConfig a from llama_stack.providers.utils.kvstore import KVStore, kvstore_impl from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin from llama_stack.providers.utils.memory.vector_store import ( + ChunkForDeletion, EmbeddingIndex, VectorDBWithIndex, ) @@ -88,15 +89,16 @@ class QdrantIndex(EmbeddingIndex): await self.client.upsert(collection_name=self.collection_name, points=points) - async def delete_chunk(self, chunk_id: str) -> None: + async def delete_chunks(self, chunks_for_deletion: list[ChunkForDeletion]) -> None: """Remove a chunk from the Qdrant collection.""" + chunk_ids = [convert_id(c.chunk_id) for c in chunks_for_deletion] try: await self.client.delete( collection_name=self.collection_name, - points_selector=models.PointIdsList(points=[convert_id(chunk_id)]), + points_selector=models.PointIdsList(points=chunk_ids), ) except Exception as e: - log.error(f"Error deleting chunk {chunk_id} from Qdrant collection {self.collection_name}: {e}") + log.error(f"Error deleting chunks from Qdrant collection {self.collection_name}: {e}") raise async def query_vector(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse: @@ -264,12 +266,14 @@ class QdrantVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolP ) -> VectorStoreFileObject: # Qdrant doesn't allow multiple clients to access the same storage path 
simultaneously. async with self._qdrant_lock: - await super().openai_attach_file_to_vector_store(vector_store_id, file_id, attributes, chunking_strategy) + return await super().openai_attach_file_to_vector_store( + vector_store_id, file_id, attributes, chunking_strategy + ) - async def delete_chunks(self, store_id: str, chunk_ids: list[str]) -> None: + async def delete_chunks(self, store_id: str, chunks_for_deletion: list[ChunkForDeletion]) -> None: """Delete chunks from a Qdrant vector store.""" index = await self._get_and_cache_vector_db_index(store_id) if not index: raise ValueError(f"Vector DB {store_id} not found") - for chunk_id in chunk_ids: - await index.index.delete_chunk(chunk_id) + + await index.index.delete_chunks(chunks_for_deletion) diff --git a/llama_stack/providers/remote/vector_io/weaviate/weaviate.py b/llama_stack/providers/remote/vector_io/weaviate/weaviate.py index 11da8902c..966724848 100644 --- a/llama_stack/providers/remote/vector_io/weaviate/weaviate.py +++ b/llama_stack/providers/remote/vector_io/weaviate/weaviate.py @@ -26,6 +26,7 @@ from llama_stack.providers.utils.memory.openai_vector_store_mixin import ( OpenAIVectorStoreMixin, ) from llama_stack.providers.utils.memory.vector_store import ( + ChunkForDeletion, EmbeddingIndex, VectorDBWithIndex, ) @@ -67,6 +68,7 @@ class WeaviateIndex(EmbeddingIndex): data_objects.append( wvc.data.DataObject( properties={ + "chunk_id": chunk.chunk_id, "chunk_content": chunk.model_dump_json(), }, vector=embeddings[i].tolist(), @@ -79,10 +81,11 @@ class WeaviateIndex(EmbeddingIndex): # TODO: make this async friendly collection.data.insert_many(data_objects) - async def delete_chunk(self, chunk_id: str) -> None: + async def delete_chunks(self, chunks_for_deletion: list[ChunkForDeletion]) -> None: sanitized_collection_name = sanitize_collection_name(self.collection_name, weaviate_format=True) collection = self.client.collections.get(sanitized_collection_name) - 
collection.data.delete_many(where=Filter.by_property("id").contains_any([chunk_id])) + chunk_ids = [chunk.chunk_id for chunk in chunks_for_deletion] + collection.data.delete_many(where=Filter.by_property("chunk_id").contains_any(chunk_ids)) async def query_vector(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse: sanitized_collection_name = sanitize_collection_name(self.collection_name, weaviate_format=True) @@ -307,10 +310,10 @@ class WeaviateVectorIOAdapter( return await index.query_chunks(query, params) - async def delete_chunks(self, store_id: str, chunk_ids: list[str]) -> None: + async def delete_chunks(self, store_id: str, chunks_for_deletion: list[ChunkForDeletion]) -> None: sanitized_collection_name = sanitize_collection_name(store_id, weaviate_format=True) index = await self._get_and_cache_vector_db_index(sanitized_collection_name) if not index: raise ValueError(f"Vector DB {sanitized_collection_name} not found") - await index.delete(chunk_ids) + await index.index.delete_chunks(chunks_for_deletion) diff --git a/llama_stack/providers/utils/inference/openai_compat.py b/llama_stack/providers/utils/inference/openai_compat.py index e6e5ccc8a..9a77c8cc4 100644 --- a/llama_stack/providers/utils/inference/openai_compat.py +++ b/llama_stack/providers/utils/inference/openai_compat.py @@ -70,7 +70,7 @@ from openai.types.chat.chat_completion_chunk import ( from openai.types.chat.chat_completion_content_part_image_param import ( ImageURL as OpenAIImageURL, ) -from openai.types.chat.chat_completion_message_tool_call_param import ( +from openai.types.chat.chat_completion_message_tool_call import ( Function as OpenAIFunction, ) from pydantic import BaseModel diff --git a/llama_stack/providers/utils/memory/openai_vector_store_mixin.py b/llama_stack/providers/utils/memory/openai_vector_store_mixin.py index 7b6e69df1..120d0d4fc 100644 --- a/llama_stack/providers/utils/memory/openai_vector_store_mixin.py +++ 
b/llama_stack/providers/utils/memory/openai_vector_store_mixin.py @@ -6,7 +6,6 @@ import asyncio import json -import logging import mimetypes import time import uuid @@ -37,10 +36,15 @@ from llama_stack.apis.vector_io import ( VectorStoreSearchResponse, VectorStoreSearchResponsePage, ) +from llama_stack.log import get_logger from llama_stack.providers.utils.kvstore.api import KVStore -from llama_stack.providers.utils.memory.vector_store import content_from_data_and_mime_type, make_overlapped_chunks +from llama_stack.providers.utils.memory.vector_store import ( + ChunkForDeletion, + content_from_data_and_mime_type, + make_overlapped_chunks, +) -logger = logging.getLogger(__name__) +logger = get_logger(__name__, category="vector_io") # Constants for OpenAI vector stores CHUNK_MULTIPLIER = 5 @@ -154,8 +158,8 @@ class OpenAIVectorStoreMixin(ABC): self.openai_vector_stores = await self._load_openai_vector_stores() @abstractmethod - async def delete_chunks(self, store_id: str, chunk_ids: list[str]) -> None: - """Delete a chunk from a vector store.""" + async def delete_chunks(self, store_id: str, chunks_for_deletion: list[ChunkForDeletion]) -> None: + """Delete chunks from a vector store.""" pass @abstractmethod @@ -614,7 +618,7 @@ class OpenAIVectorStoreMixin(ABC): ) vector_store_file_object.status = "completed" except Exception as e: - logger.error(f"Error attaching file to vector store: {e}") + logger.exception("Error attaching file to vector store") vector_store_file_object.status = "failed" vector_store_file_object.last_error = VectorStoreFileLastError( code="server_error", @@ -767,7 +771,21 @@ class OpenAIVectorStoreMixin(ABC): dict_chunks = await self._load_openai_vector_store_file_contents(vector_store_id, file_id) chunks = [Chunk.model_validate(c) for c in dict_chunks] - await self.delete_chunks(vector_store_id, [str(c.chunk_id) for c in chunks if c.chunk_id]) + + # Create ChunkForDeletion objects with both chunk_id and document_id + chunks_for_deletion = [] + 
for c in chunks: + if c.chunk_id: + document_id = c.metadata.get("document_id") or ( + c.chunk_metadata.document_id if c.chunk_metadata else None + ) + if document_id: + chunks_for_deletion.append(ChunkForDeletion(chunk_id=str(c.chunk_id), document_id=document_id)) + else: + logger.warning(f"Chunk {c.chunk_id} has no document_id, skipping deletion") + + if chunks_for_deletion: + await self.delete_chunks(vector_store_id, chunks_for_deletion) store_info = self.openai_vector_stores[vector_store_id].copy() diff --git a/llama_stack/providers/utils/memory/vector_store.py b/llama_stack/providers/utils/memory/vector_store.py index bb9002f30..6ae5bb521 100644 --- a/llama_stack/providers/utils/memory/vector_store.py +++ b/llama_stack/providers/utils/memory/vector_store.py @@ -16,6 +16,7 @@ from urllib.parse import unquote import httpx import numpy as np from numpy.typing import NDArray +from pydantic import BaseModel from llama_stack.apis.common.content_types import ( URL, @@ -34,6 +35,18 @@ from llama_stack.providers.utils.vector_io.vector_utils import generate_chunk_id log = logging.getLogger(__name__) + +class ChunkForDeletion(BaseModel): + """Information needed to delete a chunk from a vector store. 
+ + :param chunk_id: The ID of the chunk to delete + :param document_id: The ID of the document this chunk belongs to + """ + + chunk_id: str + document_id: str + + # Constants for reranker types RERANKER_TYPE_RRF = "rrf" RERANKER_TYPE_WEIGHTED = "weighted" @@ -232,7 +245,7 @@ class EmbeddingIndex(ABC): raise NotImplementedError() @abstractmethod - async def delete_chunk(self, chunk_id: str): + async def delete_chunks(self, chunks_for_deletion: list[ChunkForDeletion]): raise NotImplementedError() @abstractmethod diff --git a/llama_stack/ui/app/chat-playground/page.tsx b/llama_stack/ui/app/chat-playground/page.tsx index c31248b78..d8094af85 100644 --- a/llama_stack/ui/app/chat-playground/page.tsx +++ b/llama_stack/ui/app/chat-playground/page.tsx @@ -175,7 +175,7 @@ const handleSubmitWithContent = async (content: string) => { return (
-

Chat Playground

+

Chat Playground (Completions)