From ae3cbde8a4fe3e3777184c3014223bd1bdfdd965 Mon Sep 17 00:00:00 2001 From: Francisco Javier Arceo Date: Mon, 11 Aug 2025 09:41:16 -0400 Subject: [PATCH] docs: Add documentation on how to contribute a Vector DB provider and updated Test documentation and added a shortcut Signed-off-by: Francisco Javier Arceo --- README.md | 18 ++++ docs/_static/js/keyboard_shortcuts.js | 14 +++ docs/source/conf.py | 1 + docs/source/contributing/index.md | 18 +++- .../contributing/new_vector_database.md | 75 ++++++++++++++ docs/source/contributing/testing.md | 12 ++- docs/source/providers/inference/index.md | 1 + .../providers/inference/remote_vertexai.md | 40 ++++++++ .../providers/vector_io/inline_faiss.md | 12 +++ .../providers/vector_io/remote_milvus.md | 87 ++++++++++++++++ llama_stack/core/routing_tables/toolgroups.py | 5 +- llama_stack/distributions/ci-tests/build.yaml | 1 + llama_stack/distributions/ci-tests/run.yaml | 5 + llama_stack/distributions/starter/build.yaml | 1 + llama_stack/distributions/starter/run.yaml | 5 + llama_stack/distributions/starter/starter.py | 10 ++ .../providers/inline/vector_io/faiss/faiss.py | 8 +- llama_stack/providers/registry/inference.py | 30 ++++++ llama_stack/providers/registry/vector_io.py | 99 +++++++++++++++++++ .../remote/inference/vertexai/__init__.py | 15 +++ .../remote/inference/vertexai/config.py | 45 +++++++++ .../remote/inference/vertexai/models.py | 20 ++++ .../remote/inference/vertexai/vertexai.py | 52 ++++++++++ .../inference/test_openai_completion.py | 1 + .../inference/test_text_inference.py | 1 + 25 files changed, 563 insertions(+), 13 deletions(-) create mode 100644 docs/_static/js/keyboard_shortcuts.js create mode 100644 docs/source/contributing/new_vector_database.md create mode 100644 docs/source/providers/inference/remote_vertexai.md create mode 100644 llama_stack/providers/remote/inference/vertexai/__init__.py create mode 100644 llama_stack/providers/remote/inference/vertexai/config.py create mode 100644 llama_stack/providers/remote/inference/vertexai/models.py create mode 100644 llama_stack/providers/remote/inference/vertexai/vertexai.py diff --git a/README.md b/README.md index 03aa3dd50..8db4580a2 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,8 @@ # Llama Stack +meta-llama%2Fllama-stack | Trendshift + +----- [![PyPI version](https://img.shields.io/pypi/v/llama_stack.svg)](https://pypi.org/project/llama_stack/) [![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-stack)](https://pypi.org/project/llama-stack/) [![License](https://img.shields.io/pypi/l/llama_stack.svg)](https://github.com/meta-llama/llama-stack/blob/main/LICENSE) @@ -9,6 +12,7 @@ [**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack) + ### ✨🎉 Llama 4 Support 🎉✨ We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta. 
@@ -179,3 +183,17 @@ Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest
Check out our client SDKs for connecting to a Llama Stack server in your preferred language, you can choose from [python](https://github.com/meta-llama/llama-stack-client-python), [typescript](https://github.com/meta-llama/llama-stack-client-typescript), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) programming languages to quickly build your applications.

You can find more example scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo.
+
+
+
+## 🌟 GitHub Star History
+
+[![Star History Chart](https://api.star-history.com/svg?repos=meta-llama/llama-stack&type=Date)](https://www.star-history.com/#meta-llama/llama-stack&Date)
+
+## ✨ Contributors
+
+Thanks to all of our amazing contributors!
+
+
+
+
\ No newline at end of file
diff --git a/docs/_static/js/keyboard_shortcuts.js b/docs/_static/js/keyboard_shortcuts.js
new file mode 100644
index 000000000..81d0b7c65
--- /dev/null
+++ b/docs/_static/js/keyboard_shortcuts.js
@@ -0,0 +1,14 @@
+document.addEventListener('keydown', function(event) {
+  // command+K or ctrl+K
+  if ((event.metaKey || event.ctrlKey) && event.key === 'k') {
+    event.preventDefault();
+    document.querySelector('.search-input, .search-field, input[name="q"]').focus();
+  }
+
+  // forward slash
+  if (event.key === '/' &&
+      !event.target.matches('input, textarea, select')) {
+    event.preventDefault();
+    document.querySelector('.search-input, .search-field, input[name="q"]').focus();
+  }
+});
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 20f1abf00..3f84d1310 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -131,6 +131,7 @@ html_static_path = ["../_static"]
def setup(app):
    app.add_css_file("css/my_theme.css")
    app.add_js_file("js/detect_theme.js")
+    app.add_js_file("js/keyboard_shortcuts.js")

def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
    url = f"https://hub.docker.com/r/llamastack/{text}"
diff --git a/docs/source/contributing/index.md b/docs/source/contributing/index.md
index 1e067ea6c..79c3861ea 100644
--- a/docs/source/contributing/index.md
+++ b/docs/source/contributing/index.md
@@ -2,14 +2,28 @@

```{include} ../../../CONTRIBUTING.md
```

-See the [Adding a New API Provider](new_api_provider.md) which describes how to add new API providers to the Stack.
+## Testing
+See the [Test Page](testing.md) which describes how to test your changes.
+```{toctree}
+:maxdepth: 1
+:hidden:
+:caption: Testing
+
+testing
+```
+
+## Adding a New Provider
+
+See the [Adding a New API Provider Page](new_api_provider.md) which describes how to add new API providers to the Stack.
+
+See the [Vector Database Page](new_vector_database.md) which describes how to add a new vector database to Llama Stack.
+
+See the [External Provider Page](../providers/external/index.md) which describes how to add external providers to the Stack.
```{toctree}
:maxdepth: 1
:hidden:

new_api_provider
-testing
+new_vector_database
```
diff --git a/docs/source/contributing/new_vector_database.md b/docs/source/contributing/new_vector_database.md
new file mode 100644
index 000000000..83c0f55bc
--- /dev/null
+++ b/docs/source/contributing/new_vector_database.md
@@ -0,0 +1,75 @@
+# Adding a New Vector Database
+
+This guide will walk you through the process of adding a new vector database to Llama Stack.
+
+> **_NOTE:_** Here's an example Pull Request of the [Milvus Vector Database Provider](https://github.com/meta-llama/llama-stack/pull/1467).
+
+Vector Database providers are used to store and retrieve vector embeddings. Vector databases are not limited to vector
+search but can support keyword and hybrid search. Additionally, vector databases can also support operations like
+filtering, sorting, and aggregating vectors.
+
+## Steps to Add a New Vector Database Provider
+1. **Choose the Database Type**: Determine if your vector database is a remote service, inline, or both.
+   - Remote databases make requests to external services, while inline databases execute locally. Some providers support both.
+2. **Implement the Provider**: Create a new provider class that inherits from `VectorDatabaseProvider` and implements the required methods.
+   - Implement methods for vector storage, retrieval, search, and any additional features your database supports.
+   - You will need to implement the following methods for `YourVectorIndex`:
+     - `YourVectorIndex.create()`
+     - `YourVectorIndex.initialize()`
+     - `YourVectorIndex.add_chunks()`
+     - `YourVectorIndex.delete_chunk()`
+     - `YourVectorIndex.query_vector()`
+     - `YourVectorIndex.query_keyword()`
+     - `YourVectorIndex.query_hybrid()`
+   - You will need to implement the following methods for `YourVectorIOAdapter`:
+     - `YourVectorIOAdapter.initialize()`
+     - `YourVectorIOAdapter.shutdown()`
+     - `YourVectorIOAdapter.list_vector_dbs()`
+     - `YourVectorIOAdapter.register_vector_db()`
+     - `YourVectorIOAdapter.unregister_vector_db()`
+     - `YourVectorIOAdapter.insert_chunks()`
+     - `YourVectorIOAdapter.query_chunks()`
+     - `YourVectorIOAdapter.delete_chunks()`
+3. **Add to Registry**: Register your provider in the appropriate registry file.
+   - Update {repopath}`llama_stack/providers/registry/vector_io.py` to include your new provider.
+```python
+from llama_stack.providers.datatypes import Api, InlineProviderSpec
+
+InlineProviderSpec(
+    api=Api.vector_io,
+    provider_type="inline::milvus",
+    pip_packages=["pymilvus>=2.4.10"],
+    module="llama_stack.providers.inline.vector_io.milvus",
+    config_class="llama_stack.providers.inline.vector_io.milvus.MilvusVectorIOConfig",
+    api_dependencies=[Api.inference],
+    optional_api_dependencies=[Api.files],
+    description="",
+),
+```
+4. **Add Tests**: Create unit tests and integration tests for your provider in the `tests/` directory.
+   - Unit Tests
+     - By following the structure of the class methods, you will be able to easily run unit and integration tests for your database.
+       1. You have to configure the tests for your provider in `/tests/unit/providers/vector_io/conftest.py`.
+       2. Update the `vector_provider` fixture to include your provider if it is an inline provider.
+       3. Create a `your_vectorprovider_index` fixture that initializes your vector index.
+       4. Create a `your_vectorprovider_adapter` fixture that initializes your vector adapter.
+       5. Add your provider to the `vector_io_providers` fixture dictionary.
+       - Please follow the naming convention of `your_vectorprovider_index` and `your_vectorprovider_adapter` as the tests require this to execute properly (see the illustrative fixture sketch at the end of this guide).
+   - Integration Tests
+     - Integration tests are located in {repopath}`tests/integration`. These tests use the Python client SDK APIs (from the `llama_stack_client` package) to test functionality.
+     - The two sets of integration tests are:
+       - `tests/integration/vector_io/test_vector_io.py`: This file tests registration, insertion, and retrieval.
+       - `tests/integration/vector_io/test_openai_vector_stores.py`: These tests are for OpenAI-compatible vector stores and test the OpenAI API compatibility.
+     - You will need to update `skip_if_provider_doesnt_support_openai_vector_stores` to include your provider as well as `skip_if_provider_doesnt_support_openai_vector_stores_search` to test the appropriate search functionality.
+   - Running the tests in the GitHub CI
+     - You will need to update the `.github/workflows/integration-vector-io-tests.yml` file to include your provider.
+       - If your provider is a remote provider, you will also have to add a container to spin up and run it in the action.
+   - Updating the pyproject.toml
+     - If you are adding tests for an `inline` provider, you will have to update the `unit` group.
+       - `uv add new_pip_package --group unit`
+     - If you are adding tests for a `remote` provider, you will have to update the `test` group, which is used in the GitHub CI for integration tests.
+       - `uv add new_pip_package --group test`
+5. **Update Documentation**: Please update the documentation for end users.
+   - Generate the provider documentation by running {repopath}`./scripts/provider_codegen.py`.
+   - Update the autogenerated content in {repopath}`llama_stack/providers/registry/vector_io.py` with information about your provider. Please see other providers for examples.
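To make the unit-test wiring above concrete, here is a minimal, hypothetical sketch of what the `conftest.py` additions might look like. `YourVectorIndex`, `YourVectorIOAdapter`, their constructors, and the fixture bodies are placeholders standing in for your provider's real classes, not actual Llama Stack APIs; mirror the existing fixtures in `tests/unit/providers/vector_io/conftest.py` for the authoritative shape.

```python
# Hypothetical sketch of fixtures for a new vector provider in
# tests/unit/providers/vector_io/conftest.py. The classes below are stand-ins
# so the example is self-contained; use your provider's real index/adapter.
import pytest


class YourVectorIndex:  # placeholder for your EmbeddingIndex implementation
    def __init__(self, dimension: int) -> None:
        self.dimension = dimension


class YourVectorIOAdapter:  # placeholder for your VectorIO adapter
    def __init__(self, index: YourVectorIndex) -> None:
        self.index = index


@pytest.fixture
def your_vectorprovider_index():
    # Build the low-level index exercised by the index-level unit tests.
    return YourVectorIndex(dimension=384)


@pytest.fixture
def your_vectorprovider_adapter(your_vectorprovider_index):
    # Build the adapter exercised by the adapter-level unit tests.
    return YourVectorIOAdapter(index=your_vectorprovider_index)


@pytest.fixture
def vector_io_providers(your_vectorprovider_adapter):
    # Illustrative only: register your provider under its key so parametrized
    # tests can pick it up; match the structure already used in conftest.py.
    return {"your_provider": your_vectorprovider_adapter}
```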
\ No newline at end of file diff --git a/docs/source/contributing/testing.md b/docs/source/contributing/testing.md index 47bf9dea7..454ded266 100644 --- a/docs/source/contributing/testing.md +++ b/docs/source/contributing/testing.md @@ -1,6 +1,8 @@ -# Testing Llama Stack +```{include} ../../../tests/README.md +``` -Tests are of three different kinds: -- Unit tests -- Provider focused integration tests -- Client SDK tests +```{include} ../../../tests/unit/README.md +``` + +```{include} ../../../tests/integration/README.md +``` diff --git a/docs/source/providers/inference/index.md b/docs/source/providers/inference/index.md index 1c7bc86b9..38781e5eb 100644 --- a/docs/source/providers/inference/index.md +++ b/docs/source/providers/inference/index.md @@ -29,6 +29,7 @@ remote_runpod remote_sambanova remote_tgi remote_together +remote_vertexai remote_vllm remote_watsonx ``` diff --git a/docs/source/providers/inference/remote_vertexai.md b/docs/source/providers/inference/remote_vertexai.md new file mode 100644 index 000000000..962bbd76f --- /dev/null +++ b/docs/source/providers/inference/remote_vertexai.md @@ -0,0 +1,40 @@ +# remote::vertexai + +## Description + +Google Vertex AI inference provider enables you to use Google's Gemini models through Google Cloud's Vertex AI platform, providing several advantages: + +• Enterprise-grade security: Uses Google Cloud's security controls and IAM +• Better integration: Seamless integration with other Google Cloud services +• Advanced features: Access to additional Vertex AI features like model tuning and monitoring +• Authentication: Uses Google Cloud Application Default Credentials (ADC) instead of API keys + +Configuration: +- Set VERTEX_AI_PROJECT environment variable (required) +- Set VERTEX_AI_LOCATION environment variable (optional, defaults to us-central1) +- Use Google Cloud Application Default Credentials or service account key + +Authentication Setup: +Option 1 (Recommended): gcloud auth application-default login +Option 2: Set GOOGLE_APPLICATION_CREDENTIALS to service account key path + +Available Models: +- vertex_ai/gemini-2.0-flash +- vertex_ai/gemini-2.5-flash +- vertex_ai/gemini-2.5-pro + +## Configuration + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `project` | `` | No | | Google Cloud project ID for Vertex AI | +| `location` | `` | No | us-central1 | Google Cloud location for Vertex AI | + +## Sample Configuration + +```yaml +project: ${env.VERTEX_AI_PROJECT:=} +location: ${env.VERTEX_AI_LOCATION:=us-central1} + +``` + diff --git a/docs/source/providers/vector_io/inline_faiss.md b/docs/source/providers/vector_io/inline_faiss.md index bcff66f3f..cfa18a839 100644 --- a/docs/source/providers/vector_io/inline_faiss.md +++ b/docs/source/providers/vector_io/inline_faiss.md @@ -12,6 +12,18 @@ That means you'll get fast and efficient vector retrieval. - Lightweight and easy to use - Fully integrated with Llama Stack - GPU support +- **Vector search** - FAISS supports pure vector similarity search using embeddings + +## Search Modes + +**Supported:** +- **Vector Search** (`mode="vector"`): Performs vector similarity search using embeddings + +**Not Supported:** +- **Keyword Search** (`mode="keyword"`): Not supported by FAISS +- **Hybrid Search** (`mode="hybrid"`): Not supported by FAISS + +> **Note**: FAISS is designed as a pure vector similarity search library. 
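To illustrate the restriction above, here is a short, hedged example of querying a FAISS-backed vector store; it assumes a `client` and a registered `vector_store`, exactly as in the Milvus search examples later in this document. Only the vector mode succeeds; keyword and hybrid requests are rejected by this provider.

```python
# Vector search is the only mode the FAISS provider implements.
search_response = client.vector_stores.search(
    vector_store_id=vector_store.id,
    query="What is machine learning?",
    search_mode="vector",  # also the default if search_mode is omitted
    max_num_results=5,
)

# search_mode="keyword" and search_mode="hybrid" are not implemented for FAISS
# and will surface a NotImplementedError from the provider.
```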
See the [FAISS GitHub repository](https://github.com/facebookresearch/faiss) for more details about FAISS's core functionality. ## Usage diff --git a/docs/source/providers/vector_io/remote_milvus.md b/docs/source/providers/vector_io/remote_milvus.md index 3646f4acc..2af64b8bb 100644 --- a/docs/source/providers/vector_io/remote_milvus.md +++ b/docs/source/providers/vector_io/remote_milvus.md @@ -11,6 +11,7 @@ That means you're not limited to storing vectors in memory or in a separate serv - Easy to use - Fully integrated with Llama Stack +- Supports all search modes: vector, keyword, and hybrid search (both inline and remote configurations) ## Usage @@ -101,6 +102,92 @@ vector_io: - **`client_pem_path`**: Path to the **client certificate** file (required for mTLS). - **`client_key_path`**: Path to the **client private key** file (required for mTLS). +## Search Modes + +Milvus supports three different search modes for both inline and remote configurations: + +### Vector Search +Vector search uses semantic similarity to find the most relevant chunks based on embedding vectors. This is the default search mode and works well for finding conceptually similar content. + +```python +# Vector search example +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="What is machine learning?", + search_mode="vector", + max_num_results=5, +) +``` + +### Keyword Search +Keyword search uses traditional text-based matching to find chunks containing specific terms or phrases. This is useful when you need exact term matches. + +```python +# Keyword search example +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="Python programming language", + search_mode="keyword", + max_num_results=5, +) +``` + +### Hybrid Search +Hybrid search combines both vector and keyword search methods to provide more comprehensive results. It leverages the strengths of both semantic similarity and exact term matching. + +#### Basic Hybrid Search +```python +# Basic hybrid search example (uses RRF ranker with default impact_factor=60.0) +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="neural networks in Python", + search_mode="hybrid", + max_num_results=5, +) +``` + +**Note**: The default `impact_factor` value of 60.0 was empirically determined to be optimal in the original RRF research paper: ["Reciprocal Rank Fusion outperforms Condorcet and individual Rank Learning Methods"](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) (Cormack et al., 2009). + +#### Hybrid Search with RRF (Reciprocal Rank Fusion) Ranker +RRF combines rankings from vector and keyword search by using reciprocal ranks. The impact factor controls how much weight is given to higher-ranked results. + +```python +# Hybrid search with custom RRF parameters +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="neural networks in Python", + search_mode="hybrid", + max_num_results=5, + ranking_options={ + "ranker": { + "type": "rrf", + "impact_factor": 100.0, # Higher values give more weight to top-ranked results + } + }, +) +``` + +#### Hybrid Search with Weighted Ranker +Weighted ranker linearly combines normalized scores from vector and keyword search. The alpha parameter controls the balance between the two search methods. 
+ +```python +# Hybrid search with weighted ranker +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="neural networks in Python", + search_mode="hybrid", + max_num_results=5, + ranking_options={ + "ranker": { + "type": "weighted", + "alpha": 0.7, # 70% vector search, 30% keyword search + } + }, +) +``` + +For detailed documentation on RRF and Weighted rankers, please refer to the [Milvus Reranking Guide](https://milvus.io/docs/reranking.md). + ## Documentation See the [Milvus documentation](https://milvus.io/docs/install-overview.md) for more details about Milvus in general. diff --git a/llama_stack/core/routing_tables/toolgroups.py b/llama_stack/core/routing_tables/toolgroups.py index e172af991..6910b3906 100644 --- a/llama_stack/core/routing_tables/toolgroups.py +++ b/llama_stack/core/routing_tables/toolgroups.py @@ -124,10 +124,7 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups): return toolgroup async def unregister_toolgroup(self, toolgroup_id: str) -> None: - tool_group = await self.get_tool_group(toolgroup_id) - if tool_group is None: - raise ToolGroupNotFoundError(toolgroup_id) - await self.unregister_object(tool_group) + await self.unregister_object(await self.get_tool_group(toolgroup_id)) async def shutdown(self) -> None: pass diff --git a/llama_stack/distributions/ci-tests/build.yaml b/llama_stack/distributions/ci-tests/build.yaml index 2f9ae8682..e6e699b62 100644 --- a/llama_stack/distributions/ci-tests/build.yaml +++ b/llama_stack/distributions/ci-tests/build.yaml @@ -14,6 +14,7 @@ distribution_spec: - provider_type: remote::openai - provider_type: remote::anthropic - provider_type: remote::gemini + - provider_type: remote::vertexai - provider_type: remote::groq - provider_type: remote::sambanova - provider_type: inline::sentence-transformers diff --git a/llama_stack/distributions/ci-tests/run.yaml b/llama_stack/distributions/ci-tests/run.yaml index 188c66275..05e1b4576 100644 --- a/llama_stack/distributions/ci-tests/run.yaml +++ b/llama_stack/distributions/ci-tests/run.yaml @@ -65,6 +65,11 @@ providers: provider_type: remote::gemini config: api_key: ${env.GEMINI_API_KEY:=} + - provider_id: ${env.VERTEX_AI_PROJECT:+vertexai} + provider_type: remote::vertexai + config: + project: ${env.VERTEX_AI_PROJECT:=} + location: ${env.VERTEX_AI_LOCATION:=us-central1} - provider_id: groq provider_type: remote::groq config: diff --git a/llama_stack/distributions/starter/build.yaml b/llama_stack/distributions/starter/build.yaml index f95a03a9e..1a4f81d49 100644 --- a/llama_stack/distributions/starter/build.yaml +++ b/llama_stack/distributions/starter/build.yaml @@ -14,6 +14,7 @@ distribution_spec: - provider_type: remote::openai - provider_type: remote::anthropic - provider_type: remote::gemini + - provider_type: remote::vertexai - provider_type: remote::groq - provider_type: remote::sambanova - provider_type: inline::sentence-transformers diff --git a/llama_stack/distributions/starter/run.yaml b/llama_stack/distributions/starter/run.yaml index 8bd737686..46bd12956 100644 --- a/llama_stack/distributions/starter/run.yaml +++ b/llama_stack/distributions/starter/run.yaml @@ -65,6 +65,11 @@ providers: provider_type: remote::gemini config: api_key: ${env.GEMINI_API_KEY:=} + - provider_id: ${env.VERTEX_AI_PROJECT:+vertexai} + provider_type: remote::vertexai + config: + project: ${env.VERTEX_AI_PROJECT:=} + location: ${env.VERTEX_AI_LOCATION:=us-central1} - provider_id: groq provider_type: remote::groq config: diff --git 
a/llama_stack/distributions/starter/starter.py b/llama_stack/distributions/starter/starter.py index a970f2d1c..0270b68ad 100644 --- a/llama_stack/distributions/starter/starter.py +++ b/llama_stack/distributions/starter/starter.py @@ -56,6 +56,7 @@ ENABLED_INFERENCE_PROVIDERS = [ "fireworks", "together", "gemini", + "vertexai", "groq", "sambanova", "anthropic", @@ -71,6 +72,7 @@ INFERENCE_PROVIDER_IDS = { "tgi": "${env.TGI_URL:+tgi}", "cerebras": "${env.CEREBRAS_API_KEY:+cerebras}", "nvidia": "${env.NVIDIA_API_KEY:+nvidia}", + "vertexai": "${env.VERTEX_AI_PROJECT:+vertexai}", } @@ -246,6 +248,14 @@ def get_distribution_template() -> DistributionTemplate: "", "Gemini API Key", ), + "VERTEX_AI_PROJECT": ( + "", + "Google Cloud Project ID for Vertex AI", + ), + "VERTEX_AI_LOCATION": ( + "us-central1", + "Google Cloud Location for Vertex AI", + ), "SAMBANOVA_API_KEY": ( "", "SambaNova API Key", diff --git a/llama_stack/providers/inline/vector_io/faiss/faiss.py b/llama_stack/providers/inline/vector_io/faiss/faiss.py index 7a5373726..5a063592c 100644 --- a/llama_stack/providers/inline/vector_io/faiss/faiss.py +++ b/llama_stack/providers/inline/vector_io/faiss/faiss.py @@ -174,7 +174,9 @@ class FaissIndex(EmbeddingIndex): k: int, score_threshold: float, ) -> QueryChunksResponse: - raise NotImplementedError("Keyword search is not supported in FAISS") + raise NotImplementedError( + "Keyword search is not supported - underlying DB FAISS does not support this search mode" + ) async def query_hybrid( self, @@ -185,7 +187,9 @@ class FaissIndex(EmbeddingIndex): reranker_type: str, reranker_params: dict[str, Any] | None = None, ) -> QueryChunksResponse: - raise NotImplementedError("Hybrid search is not supported in FAISS") + raise NotImplementedError( + "Hybrid search is not supported - underlying DB FAISS does not support this search mode" + ) class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate): diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py index a8bc96a77..1801cdcad 100644 --- a/llama_stack/providers/registry/inference.py +++ b/llama_stack/providers/registry/inference.py @@ -213,6 +213,36 @@ def available_providers() -> list[ProviderSpec]: description="Google Gemini inference provider for accessing Gemini models and Google's AI services.", ), ), + remote_provider_spec( + api=Api.inference, + adapter=AdapterSpec( + adapter_type="vertexai", + pip_packages=["litellm", "google-cloud-aiplatform"], + module="llama_stack.providers.remote.inference.vertexai", + config_class="llama_stack.providers.remote.inference.vertexai.VertexAIConfig", + provider_data_validator="llama_stack.providers.remote.inference.vertexai.config.VertexAIProviderDataValidator", + description="""Google Vertex AI inference provider enables you to use Google's Gemini models through Google Cloud's Vertex AI platform, providing several advantages: + +• Enterprise-grade security: Uses Google Cloud's security controls and IAM +• Better integration: Seamless integration with other Google Cloud services +• Advanced features: Access to additional Vertex AI features like model tuning and monitoring +• Authentication: Uses Google Cloud Application Default Credentials (ADC) instead of API keys + +Configuration: +- Set VERTEX_AI_PROJECT environment variable (required) +- Set VERTEX_AI_LOCATION environment variable (optional, defaults to us-central1) +- Use Google Cloud Application Default Credentials or service account key + +Authentication Setup: +Option 1 
(Recommended): gcloud auth application-default login +Option 2: Set GOOGLE_APPLICATION_CREDENTIALS to service account key path + +Available Models: +- vertex_ai/gemini-2.0-flash +- vertex_ai/gemini-2.5-flash +- vertex_ai/gemini-2.5-pro""", + ), + ), remote_provider_spec( api=Api.inference, adapter=AdapterSpec( diff --git a/llama_stack/providers/registry/vector_io.py b/llama_stack/providers/registry/vector_io.py index 846f7b88e..ed170b508 100644 --- a/llama_stack/providers/registry/vector_io.py +++ b/llama_stack/providers/registry/vector_io.py @@ -45,6 +45,18 @@ That means you'll get fast and efficient vector retrieval. - Lightweight and easy to use - Fully integrated with Llama Stack - GPU support +- **Vector search** - FAISS supports pure vector similarity search using embeddings + +## Search Modes + +**Supported:** +- **Vector Search** (`mode="vector"`): Performs vector similarity search using embeddings + +**Not Supported:** +- **Keyword Search** (`mode="keyword"`): Not supported by FAISS +- **Hybrid Search** (`mode="hybrid"`): Not supported by FAISS + +> **Note**: FAISS is designed as a pure vector similarity search library. See the [FAISS GitHub repository](https://github.com/facebookresearch/faiss) for more details about FAISS's core functionality. ## Usage @@ -535,6 +547,7 @@ That means you're not limited to storing vectors in memory or in a separate serv - Easy to use - Fully integrated with Llama Stack +- Supports all search modes: vector, keyword, and hybrid search (both inline and remote configurations) ## Usage @@ -625,6 +638,92 @@ vector_io: - **`client_pem_path`**: Path to the **client certificate** file (required for mTLS). - **`client_key_path`**: Path to the **client private key** file (required for mTLS). +## Search Modes + +Milvus supports three different search modes for both inline and remote configurations: + +### Vector Search +Vector search uses semantic similarity to find the most relevant chunks based on embedding vectors. This is the default search mode and works well for finding conceptually similar content. + +```python +# Vector search example +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="What is machine learning?", + search_mode="vector", + max_num_results=5, +) +``` + +### Keyword Search +Keyword search uses traditional text-based matching to find chunks containing specific terms or phrases. This is useful when you need exact term matches. + +```python +# Keyword search example +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="Python programming language", + search_mode="keyword", + max_num_results=5, +) +``` + +### Hybrid Search +Hybrid search combines both vector and keyword search methods to provide more comprehensive results. It leverages the strengths of both semantic similarity and exact term matching. + +#### Basic Hybrid Search +```python +# Basic hybrid search example (uses RRF ranker with default impact_factor=60.0) +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="neural networks in Python", + search_mode="hybrid", + max_num_results=5, +) +``` + +**Note**: The default `impact_factor` value of 60.0 was empirically determined to be optimal in the original RRF research paper: ["Reciprocal Rank Fusion outperforms Condorcet and individual Rank Learning Methods"](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) (Cormack et al., 2009). 
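To make the role of the `impact_factor` (the k in the RRF formula from the paper cited above) easier to see, here is a small, self-contained sketch of reciprocal rank fusion. It applies only the textbook formula, score(d) = Σ 1/(k + rank_r(d)), to two toy rankings; it is not Milvus's internal implementation.

```python
# Toy illustration of Reciprocal Rank Fusion (RRF); not Milvus internals.
def rrf_fuse(rankings: list[list[str]], k: float = 60.0) -> dict[str, float]:
    """Fuse ranked lists: score(doc) = sum over lists of 1 / (k + rank)."""
    scores: dict[str, float] = {}
    for ranking in rankings:
        for rank, doc_id in enumerate(ranking, start=1):
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
    return scores


vector_ranking = ["doc_a", "doc_b", "doc_c"]   # order from semantic search
keyword_ranking = ["doc_b", "doc_c", "doc_a"]  # order from keyword search

# k (the impact_factor) controls how steeply each list's contribution decays
# with rank; compare the score spreads for two values of k.
print(rrf_fuse([vector_ranking, keyword_ranking], k=60.0))
print(rrf_fuse([vector_ranking, keyword_ranking], k=10.0))
```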
+ +#### Hybrid Search with RRF (Reciprocal Rank Fusion) Ranker +RRF combines rankings from vector and keyword search by using reciprocal ranks. The impact factor controls how much weight is given to higher-ranked results. + +```python +# Hybrid search with custom RRF parameters +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="neural networks in Python", + search_mode="hybrid", + max_num_results=5, + ranking_options={ + "ranker": { + "type": "rrf", + "impact_factor": 100.0, # Higher values give more weight to top-ranked results + } + }, +) +``` + +#### Hybrid Search with Weighted Ranker +Weighted ranker linearly combines normalized scores from vector and keyword search. The alpha parameter controls the balance between the two search methods. + +```python +# Hybrid search with weighted ranker +search_response = client.vector_stores.search( + vector_store_id=vector_store.id, + query="neural networks in Python", + search_mode="hybrid", + max_num_results=5, + ranking_options={ + "ranker": { + "type": "weighted", + "alpha": 0.7, # 70% vector search, 30% keyword search + } + }, +) +``` + +For detailed documentation on RRF and Weighted rankers, please refer to the [Milvus Reranking Guide](https://milvus.io/docs/reranking.md). + ## Documentation See the [Milvus documentation](https://milvus.io/docs/install-overview.md) for more details about Milvus in general. diff --git a/llama_stack/providers/remote/inference/vertexai/__init__.py b/llama_stack/providers/remote/inference/vertexai/__init__.py new file mode 100644 index 000000000..d9e9419be --- /dev/null +++ b/llama_stack/providers/remote/inference/vertexai/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .config import VertexAIConfig + + +async def get_adapter_impl(config: VertexAIConfig, _deps): + from .vertexai import VertexAIInferenceAdapter + + impl = VertexAIInferenceAdapter(config) + await impl.initialize() + return impl diff --git a/llama_stack/providers/remote/inference/vertexai/config.py b/llama_stack/providers/remote/inference/vertexai/config.py new file mode 100644 index 000000000..659de653e --- /dev/null +++ b/llama_stack/providers/remote/inference/vertexai/config.py @@ -0,0 +1,45 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from typing import Any + +from pydantic import BaseModel, Field + +from llama_stack.schema_utils import json_schema_type + + +class VertexAIProviderDataValidator(BaseModel): + vertex_project: str | None = Field( + default=None, + description="Google Cloud project ID for Vertex AI", + ) + vertex_location: str | None = Field( + default=None, + description="Google Cloud location for Vertex AI (e.g., us-central1)", + ) + + +@json_schema_type +class VertexAIConfig(BaseModel): + project: str = Field( + description="Google Cloud project ID for Vertex AI", + ) + location: str = Field( + default="us-central1", + description="Google Cloud location for Vertex AI", + ) + + @classmethod + def sample_run_config( + cls, + project: str = "${env.VERTEX_AI_PROJECT:=}", + location: str = "${env.VERTEX_AI_LOCATION:=us-central1}", + **kwargs, + ) -> dict[str, Any]: + return { + "project": project, + "location": location, + } diff --git a/llama_stack/providers/remote/inference/vertexai/models.py b/llama_stack/providers/remote/inference/vertexai/models.py new file mode 100644 index 000000000..e72db533d --- /dev/null +++ b/llama_stack/providers/remote/inference/vertexai/models.py @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from llama_stack.providers.utils.inference.model_registry import ( + ProviderModelEntry, +) + +# Vertex AI model IDs with vertex_ai/ prefix as required by litellm +LLM_MODEL_IDS = [ + "vertex_ai/gemini-2.0-flash", + "vertex_ai/gemini-2.5-flash", + "vertex_ai/gemini-2.5-pro", +] + +SAFETY_MODELS_ENTRIES = list[ProviderModelEntry]() + +MODEL_ENTRIES = [ProviderModelEntry(provider_model_id=m) for m in LLM_MODEL_IDS] + SAFETY_MODELS_ENTRIES diff --git a/llama_stack/providers/remote/inference/vertexai/vertexai.py b/llama_stack/providers/remote/inference/vertexai/vertexai.py new file mode 100644 index 000000000..8807fd0e6 --- /dev/null +++ b/llama_stack/providers/remote/inference/vertexai/vertexai.py @@ -0,0 +1,52 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from typing import Any + +from llama_stack.apis.inference import ChatCompletionRequest +from llama_stack.providers.utils.inference.litellm_openai_mixin import ( + LiteLLMOpenAIMixin, +) + +from .config import VertexAIConfig +from .models import MODEL_ENTRIES + + +class VertexAIInferenceAdapter(LiteLLMOpenAIMixin): + def __init__(self, config: VertexAIConfig) -> None: + LiteLLMOpenAIMixin.__init__( + self, + MODEL_ENTRIES, + litellm_provider_name="vertex_ai", + api_key_from_config=None, # Vertex AI uses ADC, not API keys + provider_data_api_key_field="vertex_project", # Use project for validation + ) + self.config = config + + def get_api_key(self) -> str: + # Vertex AI doesn't use API keys, it uses Application Default Credentials + # Return empty string to let litellm handle authentication via ADC + return "" + + async def _get_params(self, request: ChatCompletionRequest) -> dict[str, Any]: + # Get base parameters from parent + params = await super()._get_params(request) + + # Add Vertex AI specific parameters + provider_data = self.get_request_provider_data() + if provider_data: + if getattr(provider_data, "vertex_project", None): + params["vertex_project"] = provider_data.vertex_project + if getattr(provider_data, "vertex_location", None): + params["vertex_location"] = provider_data.vertex_location + else: + params["vertex_project"] = self.config.project + params["vertex_location"] = self.config.location + + # Remove api_key since Vertex AI uses ADC + params.pop("api_key", None) + + return params diff --git a/tests/integration/inference/test_openai_completion.py b/tests/integration/inference/test_openai_completion.py index 0222bfb79..72137662d 100644 --- a/tests/integration/inference/test_openai_completion.py +++ b/tests/integration/inference/test_openai_completion.py @@ -34,6 +34,7 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id) "remote::runpod", "remote::sambanova", "remote::tgi", + "remote::vertexai", ): pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.") diff --git a/tests/integration/inference/test_text_inference.py b/tests/integration/inference/test_text_inference.py index 08e19726e..d7ffe5929 100644 --- a/tests/integration/inference/test_text_inference.py +++ b/tests/integration/inference/test_text_inference.py @@ -29,6 +29,7 @@ def skip_if_model_doesnt_support_completion(client_with_models, model_id): "remote::openai", "remote::anthropic", "remote::gemini", + "remote::vertexai", "remote::groq", "remote::sambanova", )
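As a closing, hedged sketch of what this patch enables end to end: once a distribution with the new `remote::vertexai` provider is running (with `VERTEX_AI_PROJECT` set and Application Default Credentials configured), a chat completion can be issued through the regular client. The model identifier below follows the `vertex_ai/...` IDs listed in the provider documentation above, but the exact identifier exposed by your distribution may differ.

```python
# Hypothetical smoke test of the new Vertex AI provider against a running
# Llama Stack server (default port assumed); not part of this patch.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

response = client.inference.chat_completion(
    model_id="vertex_ai/gemini-2.0-flash",  # ID may differ per distribution
    messages=[{"role": "user", "content": "Say hello from Vertex AI."}],
)
print(response.completion_message.content)
```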