diff --git a/README.md b/README.md
index 03aa3dd50..8db4580a2 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,8 @@
# Llama Stack
+
+
+-----
[](https://pypi.org/project/llama_stack/)
[](https://pypi.org/project/llama-stack/)
[](https://github.com/meta-llama/llama-stack/blob/main/LICENSE)
@@ -9,6 +12,7 @@
[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
+
### ✨🎉 Llama 4 Support 🎉✨
We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta.
@@ -179,3 +183,17 @@ Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest
Check out our client SDKs for connecting to a Llama Stack server in your preferred language, you can choose from [python](https://github.com/meta-llama/llama-stack-client-python), [typescript](https://github.com/meta-llama/llama-stack-client-typescript), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) programming languages to quickly build your applications.
You can find more example scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo.
+
+
+## 🌟 GitHub Star History
+
+[](https://www.star-history.com/#meta-llama/llama-stack&Date)
+
+## ✨ Contributors
+
+Thanks to all of our amazing contributors!
+
+
+
+
\ No newline at end of file
diff --git a/docs/_static/js/keyboard_shortcuts.js b/docs/_static/js/keyboard_shortcuts.js
new file mode 100644
index 000000000..81d0b7c65
--- /dev/null
+++ b/docs/_static/js/keyboard_shortcuts.js
@@ -0,0 +1,14 @@
+document.addEventListener('keydown', function(event) {
+ // command+K or ctrl+K
+ if ((event.metaKey || event.ctrlKey) && event.key === 'k') {
+ event.preventDefault();
+ document.querySelector('.search-input, .search-field, input[name="q"]').focus();
+ }
+
+ // forward slash
+ if (event.key === '/' &&
+ !event.target.matches('input, textarea, select')) {
+ event.preventDefault();
+ document.querySelector('.search-input, .search-field, input[name="q"]').focus();
+ }
+});
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 20f1abf00..3f84d1310 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -131,6 +131,7 @@ html_static_path = ["../_static"]
def setup(app):
app.add_css_file("css/my_theme.css")
app.add_js_file("js/detect_theme.js")
+ app.add_js_file("js/keyboard_shortcuts.js")
def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
url = f"https://hub.docker.com/r/llamastack/{text}"
diff --git a/docs/source/contributing/index.md b/docs/source/contributing/index.md
index 1e067ea6c..79c3861ea 100644
--- a/docs/source/contributing/index.md
+++ b/docs/source/contributing/index.md
@@ -2,14 +2,28 @@
```{include} ../../../CONTRIBUTING.md
```
-See the [Adding a New API Provider](new_api_provider.md) which describes how to add new API providers to the Stack.
+## Testing
+See the [Test Page](testing.md) which describes how to test your changes.
+```{toctree}
+:maxdepth: 1
+:hidden:
+:caption: Testing
+testing
+```
+## Adding a New Provider
+
+See the [Adding a New API Provider Page](new_api_provider.md) which describes how to add new API providers to the Stack.
+
+See the [Vector Database Page](new_vector_database.md), which describes how to add a new vector database to Llama Stack.
+
+See the [External Provider Page](../providers/external/index.md) which describes how to add external providers to the Stack.
```{toctree}
:maxdepth: 1
:hidden:
new_api_provider
-testing
+new_vector_database
```
diff --git a/docs/source/contributing/new_vector_database.md b/docs/source/contributing/new_vector_database.md
new file mode 100644
index 000000000..83c0f55bc
--- /dev/null
+++ b/docs/source/contributing/new_vector_database.md
@@ -0,0 +1,75 @@
+# Adding a New Vector Database
+
+This guide will walk you through the process of adding a new vector database to Llama Stack.
+
+> **_NOTE:_** Here's an example Pull Request of the [Milvus Vector Database Provider](https://github.com/meta-llama/llama-stack/pull/1467).
+
+Vector Database providers are used to store and retrieve vector embeddings. Vector databases are not limited to vector
+search but can support keyword and hybrid search. Additionally, vector databases can also support operations like
+filtering, sorting, and aggregating vectors.
+
+## Steps to Add a New Vector Database Provider
+1. **Choose the Database Type**: Determine if your vector database is a remote service, inline, or both.
+ - Remote databases make requests to external services, while inline databases execute locally. Some providers support both.
+2. **Implement the Provider**: Create a new provider class that inherits from `VectorDatabaseProvider` and implements the required methods.
+ - Implement methods for vector storage, retrieval, search, and any additional features your database supports.
+ - You will need to implement the following methods for `YourVectorIndex`:
+ - `YourVectorIndex.create()`
+ - `YourVectorIndex.initialize()`
+ - `YourVectorIndex.add_chunks()`
+ - `YourVectorIndex.delete_chunk()`
+ - `YourVectorIndex.query_vector()`
+ - `YourVectorIndex.query_keyword()`
+ - `YourVectorIndex.query_hybrid()`
+ - You will need to implement the following methods for `YourVectorIOAdapter`:
+ - `YourVectorIOAdapter.initialize()`
+ - `YourVectorIOAdapter.shutdown()`
+ - `YourVectorIOAdapter.list_vector_dbs()`
+ - `YourVectorIOAdapter.register_vector_db()`
+ - `YourVectorIOAdapter.unregister_vector_db()`
+ - `YourVectorIOAdapter.insert_chunks()`
+ - `YourVectorIOAdapter.query_chunks()`
+ - `YourVectorIOAdapter.delete_chunks()`
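+   - A minimal sketch of these two classes is shown below. `YourVectorIndex` and `YourVectorIOAdapter` are placeholders; the exact base classes (for example, FAISS uses `EmbeddingIndex` for the index and `OpenAIVectorStoreMixin`, `VectorIO`, `VectorDBsProtocolPrivate` for the adapter) and the real method signatures should be copied from an existing provider such as FAISS or Milvus.
+```python
+# Placeholder sketch only - copy the real signatures and base classes from an existing provider.
+class YourVectorIndex:
+    @classmethod
+    async def create(cls, *args, **kwargs) -> "YourVectorIndex": ...
+
+    async def initialize(self) -> None: ...  # open connections / create the collection
+    async def add_chunks(self, chunks, embeddings) -> None: ...  # store chunks with their embeddings
+    async def delete_chunk(self, chunk_id) -> None: ...
+    async def query_vector(self, embedding, k, score_threshold): ...  # similarity search
+    async def query_keyword(self, query_string, k, score_threshold): ...
+    async def query_hybrid(self, embedding, query_string, k, score_threshold, reranker_type, reranker_params=None): ...
+
+
+class YourVectorIOAdapter:
+    async def initialize(self) -> None: ...
+    async def shutdown(self) -> None: ...
+    async def list_vector_dbs(self): ...
+    async def register_vector_db(self, vector_db) -> None: ...
+    async def unregister_vector_db(self, vector_db_id) -> None: ...
+    async def insert_chunks(self, vector_db_id, chunks, ttl_seconds=None) -> None: ...
+    async def query_chunks(self, vector_db_id, query, params=None): ...
+    async def delete_chunks(self, vector_db_id, chunk_ids) -> None: ...
+```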
+3. **Add to Registry**: Register your provider in the appropriate registry file.
+ - Update {repopath}`llama_stack/providers/registry/vector_io.py` to include your new provider.
+```python
+from llama_stack.providers.datatypes import Api, InlineProviderSpec
+
+InlineProviderSpec(
+ api=Api.vector_io,
+ provider_type="inline::milvus",
+ pip_packages=["pymilvus>=2.4.10"],
+ module="llama_stack.providers.inline.vector_io.milvus",
+ config_class="llama_stack.providers.inline.vector_io.milvus.MilvusVectorIOConfig",
+ api_dependencies=[Api.inference],
+ optional_api_dependencies=[Api.files],
+ description="",
+),
+```
+4. **Add Tests**: Create unit tests and integration tests for your provider in the `tests/` directory.
+ - Unit Tests
+ - By following the structure of the class methods, you will be able to easily run unit and integration tests for your database.
+       1. You have to configure the tests for your provider in `/tests/unit/providers/vector_io/conftest.py`.
+       2. Update the `vector_provider` fixture to include your provider if it is an inline provider.
+ 3. Create a `your_vectorprovider_index` fixture that initializes your vector index.
+ 4. Create a `your_vectorprovider_adapter` fixture that initializes your vector adapter.
+ 5. Add your provider to the `vector_io_providers` fixture dictionary.
+ - Please follow the naming convention of `your_vectorprovider_index` and `your_vectorprovider_adapter` as the tests require this to execute properly.
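+     - A sketch of what those fixtures might look like is shown below; the fixture bodies and any config classes are placeholders, so copy the actual setup from an existing provider's fixtures in `/tests/unit/providers/vector_io/conftest.py`.
+```python
+import pytest
+
+
+@pytest.fixture
+def your_vectorprovider_index():
+    # Build and return YourVectorIndex here (connection and config details are provider-specific).
+    ...
+
+
+@pytest.fixture
+def your_vectorprovider_adapter(your_vectorprovider_index):
+    # Build and return YourVectorIOAdapter wired to the index above.
+    ...
+
+
+# Finally, add your provider to the existing `vector_io_providers` fixture dictionary
+# (and to the `vector_provider` fixture if it is an inline provider).
+```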
+ - Integration Tests
+     - Integration tests are located in {repopath}`tests/integration`. These tests use the Python client SDK APIs (from the `llama_stack_client` package) to test functionality.
+     - The two sets of integration tests are:
+ - `tests/integration/vector_io/test_vector_io.py`: This file tests registration, insertion, and retrieval.
+ - `tests/integration/vector_io/test_openai_vector_stores.py`: These tests are for OpenAI-compatible vector stores and test the OpenAI API compatibility.
+ - You will need to update `skip_if_provider_doesnt_support_openai_vector_stores` to include your provider as well as `skip_if_provider_doesnt_support_openai_vector_stores_search` to test the appropriate search functionality.
+ - Running the tests in the GitHub CI
+ - You will need to update the `.github/workflows/integration-vector-io-tests.yml` file to include your provider.
+ - If your provider is a remote provider, you will also have to add a container to spin up and run it in the action.
+     - Updating the pyproject.toml
+ - If you are adding tests for the `inline` provider you will have to update the `unit` group.
+ - `uv add new_pip_package --group unit`
+ - If you are adding tests for the `remote` provider you will have to update the `test` group, which is used in the GitHub CI for integration tests.
+ - `uv add new_pip_package --group test`
+5. **Update Documentation**: Please update the documentation for end users
+   - Add a description of your provider to its registry entry in {repopath}`llama_stack/providers/registry/vector_io.py`. Please see other providers for examples.
+   - Regenerate the provider documentation by running {repopath}`./scripts/provider_codegen.py`.
\ No newline at end of file
diff --git a/docs/source/contributing/testing.md b/docs/source/contributing/testing.md
index 47bf9dea7..454ded266 100644
--- a/docs/source/contributing/testing.md
+++ b/docs/source/contributing/testing.md
@@ -1,6 +1,8 @@
-# Testing Llama Stack
+```{include} ../../../tests/README.md
+```
-Tests are of three different kinds:
-- Unit tests
-- Provider focused integration tests
-- Client SDK tests
+```{include} ../../../tests/unit/README.md
+```
+
+```{include} ../../../tests/integration/README.md
+```
diff --git a/docs/source/providers/inference/index.md b/docs/source/providers/inference/index.md
index 1c7bc86b9..38781e5eb 100644
--- a/docs/source/providers/inference/index.md
+++ b/docs/source/providers/inference/index.md
@@ -29,6 +29,7 @@ remote_runpod
remote_sambanova
remote_tgi
remote_together
+remote_vertexai
remote_vllm
remote_watsonx
```
diff --git a/docs/source/providers/inference/remote_vertexai.md b/docs/source/providers/inference/remote_vertexai.md
new file mode 100644
index 000000000..962bbd76f
--- /dev/null
+++ b/docs/source/providers/inference/remote_vertexai.md
@@ -0,0 +1,40 @@
+# remote::vertexai
+
+## Description
+
+Google Vertex AI inference provider enables you to use Google's Gemini models through Google Cloud's Vertex AI platform, providing several advantages:
+
+• Enterprise-grade security: Uses Google Cloud's security controls and IAM
+• Better integration: Seamless integration with other Google Cloud services
+• Advanced features: Access to additional Vertex AI features like model tuning and monitoring
+• Authentication: Uses Google Cloud Application Default Credentials (ADC) instead of API keys
+
+Configuration:
+- Set VERTEX_AI_PROJECT environment variable (required)
+- Set VERTEX_AI_LOCATION environment variable (optional, defaults to us-central1)
+- Use Google Cloud Application Default Credentials or service account key
+
+Authentication Setup:
+Option 1 (Recommended): gcloud auth application-default login
+Option 2: Set GOOGLE_APPLICATION_CREDENTIALS to service account key path
+
+Available Models:
+- vertex_ai/gemini-2.0-flash
+- vertex_ai/gemini-2.5-flash
+- vertex_ai/gemini-2.5-pro
+
+## Configuration
+
+| Field | Type | Required | Default | Description |
+|-------|------|----------|---------|-------------|
+| `project` | `<class 'str'>` | Yes | | Google Cloud project ID for Vertex AI |
+| `location` | `<class 'str'>` | No | us-central1 | Google Cloud location for Vertex AI |
+
+## Sample Configuration
+
+```yaml
+project: ${env.VERTEX_AI_PROJECT:=}
+location: ${env.VERTEX_AI_LOCATION:=us-central1}
+
+```
+
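+## Example Usage
+
+A minimal sketch of calling one of these models through the Python client, assuming a Llama Stack server is running locally on the default port and `gcloud auth application-default login` has been completed. The exact client method and response field names are assumptions and may differ across `llama-stack-client` versions.
+
+```python
+from llama_stack_client import LlamaStackClient
+
+# Assumes a locally running Llama Stack server with the vertexai provider enabled
+client = LlamaStackClient(base_url="http://localhost:8321")
+
+response = client.inference.chat_completion(
+    model_id="vertex_ai/gemini-2.0-flash",
+    messages=[{"role": "user", "content": "Say hello from Vertex AI"}],
+)
+print(response.completion_message.content)
+```
+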
diff --git a/docs/source/providers/vector_io/inline_faiss.md b/docs/source/providers/vector_io/inline_faiss.md
index bcff66f3f..cfa18a839 100644
--- a/docs/source/providers/vector_io/inline_faiss.md
+++ b/docs/source/providers/vector_io/inline_faiss.md
@@ -12,6 +12,18 @@ That means you'll get fast and efficient vector retrieval.
- Lightweight and easy to use
- Fully integrated with Llama Stack
- GPU support
+- **Vector search** - FAISS supports pure vector similarity search using embeddings
+
+## Search Modes
+
+**Supported:**
+- **Vector Search** (`mode="vector"`): Performs vector similarity search using embeddings
+
+**Not Supported:**
+- **Keyword Search** (`mode="keyword"`): Not supported by FAISS
+- **Hybrid Search** (`mode="hybrid"`): Not supported by FAISS
+
+> **Note**: FAISS is designed as a pure vector similarity search library. See the [FAISS GitHub repository](https://github.com/facebookresearch/faiss) for more details about FAISS's core functionality.
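+
+For example, a vector-mode query against a FAISS-backed vector store uses the same client call as the other providers (a sketch, assuming an existing `client` and a registered `vector_store`):
+
+```python
+# Vector search example (the only search mode FAISS supports)
+search_response = client.vector_stores.search(
+    vector_store_id=vector_store.id,
+    query="What is machine learning?",
+    search_mode="vector",
+    max_num_results=5,
+)
+```
+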
## Usage
diff --git a/docs/source/providers/vector_io/remote_milvus.md b/docs/source/providers/vector_io/remote_milvus.md
index 3646f4acc..2af64b8bb 100644
--- a/docs/source/providers/vector_io/remote_milvus.md
+++ b/docs/source/providers/vector_io/remote_milvus.md
@@ -11,6 +11,7 @@ That means you're not limited to storing vectors in memory or in a separate serv
- Easy to use
- Fully integrated with Llama Stack
+- Supports all search modes: vector, keyword, and hybrid search (both inline and remote configurations)
## Usage
@@ -101,6 +102,92 @@ vector_io:
- **`client_pem_path`**: Path to the **client certificate** file (required for mTLS).
- **`client_key_path`**: Path to the **client private key** file (required for mTLS).
+## Search Modes
+
+Milvus supports three different search modes for both inline and remote configurations:
+
+### Vector Search
+Vector search uses semantic similarity to find the most relevant chunks based on embedding vectors. This is the default search mode and works well for finding conceptually similar content.
+
+```python
+# Vector search example
+search_response = client.vector_stores.search(
+ vector_store_id=vector_store.id,
+ query="What is machine learning?",
+ search_mode="vector",
+ max_num_results=5,
+)
+```
+
+### Keyword Search
+Keyword search uses traditional text-based matching to find chunks containing specific terms or phrases. This is useful when you need exact term matches.
+
+```python
+# Keyword search example
+search_response = client.vector_stores.search(
+ vector_store_id=vector_store.id,
+ query="Python programming language",
+ search_mode="keyword",
+ max_num_results=5,
+)
+```
+
+### Hybrid Search
+Hybrid search combines both vector and keyword search methods to provide more comprehensive results. It leverages the strengths of both semantic similarity and exact term matching.
+
+#### Basic Hybrid Search
+```python
+# Basic hybrid search example (uses RRF ranker with default impact_factor=60.0)
+search_response = client.vector_stores.search(
+ vector_store_id=vector_store.id,
+ query="neural networks in Python",
+ search_mode="hybrid",
+ max_num_results=5,
+)
+```
+
+**Note**: The default `impact_factor` value of 60.0 was empirically determined to be optimal in the original RRF research paper: ["Reciprocal Rank Fusion outperforms Condorcet and individual Rank Learning Methods"](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) (Cormack et al., 2009).
+
+#### Hybrid Search with RRF (Reciprocal Rank Fusion) Ranker
+RRF combines rankings from vector and keyword search by using reciprocal ranks. The impact factor controls how much weight is given to higher-ranked results.
+
+```python
+# Hybrid search with custom RRF parameters
+search_response = client.vector_stores.search(
+ vector_store_id=vector_store.id,
+ query="neural networks in Python",
+ search_mode="hybrid",
+ max_num_results=5,
+ ranking_options={
+ "ranker": {
+ "type": "rrf",
+            "impact_factor": 100.0,  # The RRF k parameter; larger values reduce the dominance of top-ranked results
+ }
+ },
+)
+```
+
+#### Hybrid Search with Weighted Ranker
+Weighted ranker linearly combines normalized scores from vector and keyword search. The alpha parameter controls the balance between the two search methods.
+
+```python
+# Hybrid search with weighted ranker
+search_response = client.vector_stores.search(
+ vector_store_id=vector_store.id,
+ query="neural networks in Python",
+ search_mode="hybrid",
+ max_num_results=5,
+ ranking_options={
+ "ranker": {
+ "type": "weighted",
+ "alpha": 0.7, # 70% vector search, 30% keyword search
+ }
+ },
+)
+```
+
+For detailed documentation on RRF and Weighted rankers, please refer to the [Milvus Reranking Guide](https://milvus.io/docs/reranking.md).
+
## Documentation
See the [Milvus documentation](https://milvus.io/docs/install-overview.md) for more details about Milvus in general.
diff --git a/llama_stack/core/routing_tables/toolgroups.py b/llama_stack/core/routing_tables/toolgroups.py
index e172af991..6910b3906 100644
--- a/llama_stack/core/routing_tables/toolgroups.py
+++ b/llama_stack/core/routing_tables/toolgroups.py
@@ -124,10 +124,7 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
return toolgroup
async def unregister_toolgroup(self, toolgroup_id: str) -> None:
- tool_group = await self.get_tool_group(toolgroup_id)
- if tool_group is None:
- raise ToolGroupNotFoundError(toolgroup_id)
- await self.unregister_object(tool_group)
+ await self.unregister_object(await self.get_tool_group(toolgroup_id))
async def shutdown(self) -> None:
pass
diff --git a/llama_stack/distributions/ci-tests/build.yaml b/llama_stack/distributions/ci-tests/build.yaml
index 2f9ae8682..e6e699b62 100644
--- a/llama_stack/distributions/ci-tests/build.yaml
+++ b/llama_stack/distributions/ci-tests/build.yaml
@@ -14,6 +14,7 @@ distribution_spec:
- provider_type: remote::openai
- provider_type: remote::anthropic
- provider_type: remote::gemini
+ - provider_type: remote::vertexai
- provider_type: remote::groq
- provider_type: remote::sambanova
- provider_type: inline::sentence-transformers
diff --git a/llama_stack/distributions/ci-tests/run.yaml b/llama_stack/distributions/ci-tests/run.yaml
index 188c66275..05e1b4576 100644
--- a/llama_stack/distributions/ci-tests/run.yaml
+++ b/llama_stack/distributions/ci-tests/run.yaml
@@ -65,6 +65,11 @@ providers:
provider_type: remote::gemini
config:
api_key: ${env.GEMINI_API_KEY:=}
+ - provider_id: ${env.VERTEX_AI_PROJECT:+vertexai}
+ provider_type: remote::vertexai
+ config:
+ project: ${env.VERTEX_AI_PROJECT:=}
+ location: ${env.VERTEX_AI_LOCATION:=us-central1}
- provider_id: groq
provider_type: remote::groq
config:
diff --git a/llama_stack/distributions/starter/build.yaml b/llama_stack/distributions/starter/build.yaml
index f95a03a9e..1a4f81d49 100644
--- a/llama_stack/distributions/starter/build.yaml
+++ b/llama_stack/distributions/starter/build.yaml
@@ -14,6 +14,7 @@ distribution_spec:
- provider_type: remote::openai
- provider_type: remote::anthropic
- provider_type: remote::gemini
+ - provider_type: remote::vertexai
- provider_type: remote::groq
- provider_type: remote::sambanova
- provider_type: inline::sentence-transformers
diff --git a/llama_stack/distributions/starter/run.yaml b/llama_stack/distributions/starter/run.yaml
index 8bd737686..46bd12956 100644
--- a/llama_stack/distributions/starter/run.yaml
+++ b/llama_stack/distributions/starter/run.yaml
@@ -65,6 +65,11 @@ providers:
provider_type: remote::gemini
config:
api_key: ${env.GEMINI_API_KEY:=}
+ - provider_id: ${env.VERTEX_AI_PROJECT:+vertexai}
+ provider_type: remote::vertexai
+ config:
+ project: ${env.VERTEX_AI_PROJECT:=}
+ location: ${env.VERTEX_AI_LOCATION:=us-central1}
- provider_id: groq
provider_type: remote::groq
config:
diff --git a/llama_stack/distributions/starter/starter.py b/llama_stack/distributions/starter/starter.py
index a970f2d1c..0270b68ad 100644
--- a/llama_stack/distributions/starter/starter.py
+++ b/llama_stack/distributions/starter/starter.py
@@ -56,6 +56,7 @@ ENABLED_INFERENCE_PROVIDERS = [
"fireworks",
"together",
"gemini",
+ "vertexai",
"groq",
"sambanova",
"anthropic",
@@ -71,6 +72,7 @@ INFERENCE_PROVIDER_IDS = {
"tgi": "${env.TGI_URL:+tgi}",
"cerebras": "${env.CEREBRAS_API_KEY:+cerebras}",
"nvidia": "${env.NVIDIA_API_KEY:+nvidia}",
+ "vertexai": "${env.VERTEX_AI_PROJECT:+vertexai}",
}
@@ -246,6 +248,14 @@ def get_distribution_template() -> DistributionTemplate:
"",
"Gemini API Key",
),
+ "VERTEX_AI_PROJECT": (
+ "",
+ "Google Cloud Project ID for Vertex AI",
+ ),
+ "VERTEX_AI_LOCATION": (
+ "us-central1",
+ "Google Cloud Location for Vertex AI",
+ ),
"SAMBANOVA_API_KEY": (
"",
"SambaNova API Key",
diff --git a/llama_stack/log.py b/llama_stack/log.py
index ab53e08c0..0a2d63ef6 100644
--- a/llama_stack/log.py
+++ b/llama_stack/log.py
@@ -99,7 +99,8 @@ def parse_environment_config(env_config: str) -> dict[str, int]:
Dict[str, int]: A dictionary mapping categories to their log levels.
"""
category_levels = {}
- for pair in env_config.split(";"):
+ delimiter = ","
+ for pair in env_config.split(delimiter):
if not pair.strip():
continue
diff --git a/llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py b/llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py
index 796771ee1..e11ec5cf5 100644
--- a/llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py
+++ b/llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py
@@ -15,6 +15,7 @@ from llama_stack.apis.safety import (
RunShieldResponse,
Safety,
SafetyViolation,
+ ShieldStore,
ViolationLevel,
)
from llama_stack.apis.shields import Shield
@@ -32,6 +33,8 @@ PROMPT_GUARD_MODEL = "Prompt-Guard-86M"
class PromptGuardSafetyImpl(Safety, ShieldsProtocolPrivate):
+ shield_store: ShieldStore
+
def __init__(self, config: PromptGuardConfig, _deps) -> None:
self.config = config
@@ -53,7 +56,7 @@ class PromptGuardSafetyImpl(Safety, ShieldsProtocolPrivate):
self,
shield_id: str,
messages: list[Message],
- params: dict[str, Any] = None,
+ params: dict[str, Any],
) -> RunShieldResponse:
shield = await self.shield_store.get_shield(shield_id)
if not shield:
@@ -117,8 +120,10 @@ class PromptGuardShield:
elif self.config.guard_type == PromptGuardType.jailbreak.value and score_malicious > self.threshold:
violation = SafetyViolation(
violation_level=ViolationLevel.ERROR,
- violation_type=f"prompt_injection:malicious={score_malicious}",
- violation_return_message="Sorry, I cannot do this.",
+ user_message="Sorry, I cannot do this.",
+ metadata={
+ "violation_type": f"prompt_injection:malicious={score_malicious}",
+ },
)
return RunShieldResponse(violation=violation)
diff --git a/llama_stack/providers/inline/vector_io/faiss/faiss.py b/llama_stack/providers/inline/vector_io/faiss/faiss.py
index 7a5373726..5a063592c 100644
--- a/llama_stack/providers/inline/vector_io/faiss/faiss.py
+++ b/llama_stack/providers/inline/vector_io/faiss/faiss.py
@@ -174,7 +174,9 @@ class FaissIndex(EmbeddingIndex):
k: int,
score_threshold: float,
) -> QueryChunksResponse:
- raise NotImplementedError("Keyword search is not supported in FAISS")
+ raise NotImplementedError(
+ "Keyword search is not supported - underlying DB FAISS does not support this search mode"
+ )
async def query_hybrid(
self,
@@ -185,7 +187,9 @@ class FaissIndex(EmbeddingIndex):
reranker_type: str,
reranker_params: dict[str, Any] | None = None,
) -> QueryChunksResponse:
- raise NotImplementedError("Hybrid search is not supported in FAISS")
+ raise NotImplementedError(
+ "Hybrid search is not supported - underlying DB FAISS does not support this search mode"
+ )
class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate):
diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py
index a8bc96a77..1801cdcad 100644
--- a/llama_stack/providers/registry/inference.py
+++ b/llama_stack/providers/registry/inference.py
@@ -213,6 +213,36 @@ def available_providers() -> list[ProviderSpec]:
description="Google Gemini inference provider for accessing Gemini models and Google's AI services.",
),
),
+ remote_provider_spec(
+ api=Api.inference,
+ adapter=AdapterSpec(
+ adapter_type="vertexai",
+ pip_packages=["litellm", "google-cloud-aiplatform"],
+ module="llama_stack.providers.remote.inference.vertexai",
+ config_class="llama_stack.providers.remote.inference.vertexai.VertexAIConfig",
+ provider_data_validator="llama_stack.providers.remote.inference.vertexai.config.VertexAIProviderDataValidator",
+ description="""Google Vertex AI inference provider enables you to use Google's Gemini models through Google Cloud's Vertex AI platform, providing several advantages:
+
+• Enterprise-grade security: Uses Google Cloud's security controls and IAM
+• Better integration: Seamless integration with other Google Cloud services
+• Advanced features: Access to additional Vertex AI features like model tuning and monitoring
+• Authentication: Uses Google Cloud Application Default Credentials (ADC) instead of API keys
+
+Configuration:
+- Set VERTEX_AI_PROJECT environment variable (required)
+- Set VERTEX_AI_LOCATION environment variable (optional, defaults to us-central1)
+- Use Google Cloud Application Default Credentials or service account key
+
+Authentication Setup:
+Option 1 (Recommended): gcloud auth application-default login
+Option 2: Set GOOGLE_APPLICATION_CREDENTIALS to service account key path
+
+Available Models:
+- vertex_ai/gemini-2.0-flash
+- vertex_ai/gemini-2.5-flash
+- vertex_ai/gemini-2.5-pro""",
+ ),
+ ),
remote_provider_spec(
api=Api.inference,
adapter=AdapterSpec(
diff --git a/llama_stack/providers/registry/vector_io.py b/llama_stack/providers/registry/vector_io.py
index 846f7b88e..ed170b508 100644
--- a/llama_stack/providers/registry/vector_io.py
+++ b/llama_stack/providers/registry/vector_io.py
@@ -45,6 +45,18 @@ That means you'll get fast and efficient vector retrieval.
- Lightweight and easy to use
- Fully integrated with Llama Stack
- GPU support
+- **Vector search** - FAISS supports pure vector similarity search using embeddings
+
+## Search Modes
+
+**Supported:**
+- **Vector Search** (`mode="vector"`): Performs vector similarity search using embeddings
+
+**Not Supported:**
+- **Keyword Search** (`mode="keyword"`): Not supported by FAISS
+- **Hybrid Search** (`mode="hybrid"`): Not supported by FAISS
+
+> **Note**: FAISS is designed as a pure vector similarity search library. See the [FAISS GitHub repository](https://github.com/facebookresearch/faiss) for more details about FAISS's core functionality.
## Usage
@@ -535,6 +547,7 @@ That means you're not limited to storing vectors in memory or in a separate serv
- Easy to use
- Fully integrated with Llama Stack
+- Supports all search modes: vector, keyword, and hybrid search (both inline and remote configurations)
## Usage
@@ -625,6 +638,92 @@ vector_io:
- **`client_pem_path`**: Path to the **client certificate** file (required for mTLS).
- **`client_key_path`**: Path to the **client private key** file (required for mTLS).
+## Search Modes
+
+Milvus supports three different search modes for both inline and remote configurations:
+
+### Vector Search
+Vector search uses semantic similarity to find the most relevant chunks based on embedding vectors. This is the default search mode and works well for finding conceptually similar content.
+
+```python
+# Vector search example
+search_response = client.vector_stores.search(
+ vector_store_id=vector_store.id,
+ query="What is machine learning?",
+ search_mode="vector",
+ max_num_results=5,
+)
+```
+
+### Keyword Search
+Keyword search uses traditional text-based matching to find chunks containing specific terms or phrases. This is useful when you need exact term matches.
+
+```python
+# Keyword search example
+search_response = client.vector_stores.search(
+ vector_store_id=vector_store.id,
+ query="Python programming language",
+ search_mode="keyword",
+ max_num_results=5,
+)
+```
+
+### Hybrid Search
+Hybrid search combines both vector and keyword search methods to provide more comprehensive results. It leverages the strengths of both semantic similarity and exact term matching.
+
+#### Basic Hybrid Search
+```python
+# Basic hybrid search example (uses RRF ranker with default impact_factor=60.0)
+search_response = client.vector_stores.search(
+ vector_store_id=vector_store.id,
+ query="neural networks in Python",
+ search_mode="hybrid",
+ max_num_results=5,
+)
+```
+
+**Note**: The default `impact_factor` value of 60.0 was empirically determined to be optimal in the original RRF research paper: ["Reciprocal Rank Fusion outperforms Condorcet and individual Rank Learning Methods"](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) (Cormack et al., 2009).
+
+#### Hybrid Search with RRF (Reciprocal Rank Fusion) Ranker
+RRF combines rankings from vector and keyword search by using reciprocal ranks. The impact factor controls how much weight is given to higher-ranked results.
+
+```python
+# Hybrid search with custom RRF parameters
+search_response = client.vector_stores.search(
+ vector_store_id=vector_store.id,
+ query="neural networks in Python",
+ search_mode="hybrid",
+ max_num_results=5,
+ ranking_options={
+ "ranker": {
+ "type": "rrf",
+            "impact_factor": 100.0,  # The RRF k parameter; larger values reduce the dominance of top-ranked results
+ }
+ },
+)
+```
+
+#### Hybrid Search with Weighted Ranker
+Weighted ranker linearly combines normalized scores from vector and keyword search. The alpha parameter controls the balance between the two search methods.
+
+```python
+# Hybrid search with weighted ranker
+search_response = client.vector_stores.search(
+ vector_store_id=vector_store.id,
+ query="neural networks in Python",
+ search_mode="hybrid",
+ max_num_results=5,
+ ranking_options={
+ "ranker": {
+ "type": "weighted",
+ "alpha": 0.7, # 70% vector search, 30% keyword search
+ }
+ },
+)
+```
+
+For detailed documentation on RRF and Weighted rankers, please refer to the [Milvus Reranking Guide](https://milvus.io/docs/reranking.md).
+
## Documentation
See the [Milvus documentation](https://milvus.io/docs/install-overview.md) for more details about Milvus in general.
diff --git a/llama_stack/providers/remote/inference/vertexai/__init__.py b/llama_stack/providers/remote/inference/vertexai/__init__.py
new file mode 100644
index 000000000..d9e9419be
--- /dev/null
+++ b/llama_stack/providers/remote/inference/vertexai/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .config import VertexAIConfig
+
+
+async def get_adapter_impl(config: VertexAIConfig, _deps):
+ from .vertexai import VertexAIInferenceAdapter
+
+ impl = VertexAIInferenceAdapter(config)
+ await impl.initialize()
+ return impl
diff --git a/llama_stack/providers/remote/inference/vertexai/config.py b/llama_stack/providers/remote/inference/vertexai/config.py
new file mode 100644
index 000000000..659de653e
--- /dev/null
+++ b/llama_stack/providers/remote/inference/vertexai/config.py
@@ -0,0 +1,45 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from llama_stack.schema_utils import json_schema_type
+
+
+class VertexAIProviderDataValidator(BaseModel):
+ vertex_project: str | None = Field(
+ default=None,
+ description="Google Cloud project ID for Vertex AI",
+ )
+ vertex_location: str | None = Field(
+ default=None,
+ description="Google Cloud location for Vertex AI (e.g., us-central1)",
+ )
+
+
+@json_schema_type
+class VertexAIConfig(BaseModel):
+ project: str = Field(
+ description="Google Cloud project ID for Vertex AI",
+ )
+ location: str = Field(
+ default="us-central1",
+ description="Google Cloud location for Vertex AI",
+ )
+
+ @classmethod
+ def sample_run_config(
+ cls,
+ project: str = "${env.VERTEX_AI_PROJECT:=}",
+ location: str = "${env.VERTEX_AI_LOCATION:=us-central1}",
+ **kwargs,
+ ) -> dict[str, Any]:
+ return {
+ "project": project,
+ "location": location,
+ }
diff --git a/llama_stack/providers/remote/inference/vertexai/models.py b/llama_stack/providers/remote/inference/vertexai/models.py
new file mode 100644
index 000000000..e72db533d
--- /dev/null
+++ b/llama_stack/providers/remote/inference/vertexai/models.py
@@ -0,0 +1,20 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.providers.utils.inference.model_registry import (
+ ProviderModelEntry,
+)
+
+# Vertex AI model IDs with vertex_ai/ prefix as required by litellm
+LLM_MODEL_IDS = [
+ "vertex_ai/gemini-2.0-flash",
+ "vertex_ai/gemini-2.5-flash",
+ "vertex_ai/gemini-2.5-pro",
+]
+
+SAFETY_MODELS_ENTRIES = list[ProviderModelEntry]()
+
+MODEL_ENTRIES = [ProviderModelEntry(provider_model_id=m) for m in LLM_MODEL_IDS] + SAFETY_MODELS_ENTRIES
diff --git a/llama_stack/providers/remote/inference/vertexai/vertexai.py b/llama_stack/providers/remote/inference/vertexai/vertexai.py
new file mode 100644
index 000000000..8807fd0e6
--- /dev/null
+++ b/llama_stack/providers/remote/inference/vertexai/vertexai.py
@@ -0,0 +1,52 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+from llama_stack.apis.inference import ChatCompletionRequest
+from llama_stack.providers.utils.inference.litellm_openai_mixin import (
+ LiteLLMOpenAIMixin,
+)
+
+from .config import VertexAIConfig
+from .models import MODEL_ENTRIES
+
+
+class VertexAIInferenceAdapter(LiteLLMOpenAIMixin):
+ def __init__(self, config: VertexAIConfig) -> None:
+ LiteLLMOpenAIMixin.__init__(
+ self,
+ MODEL_ENTRIES,
+ litellm_provider_name="vertex_ai",
+ api_key_from_config=None, # Vertex AI uses ADC, not API keys
+ provider_data_api_key_field="vertex_project", # Use project for validation
+ )
+ self.config = config
+
+ def get_api_key(self) -> str:
+ # Vertex AI doesn't use API keys, it uses Application Default Credentials
+ # Return empty string to let litellm handle authentication via ADC
+ return ""
+
+ async def _get_params(self, request: ChatCompletionRequest) -> dict[str, Any]:
+ # Get base parameters from parent
+ params = await super()._get_params(request)
+
+ # Add Vertex AI specific parameters
+ provider_data = self.get_request_provider_data()
+ if provider_data:
+ if getattr(provider_data, "vertex_project", None):
+ params["vertex_project"] = provider_data.vertex_project
+ if getattr(provider_data, "vertex_location", None):
+ params["vertex_location"] = provider_data.vertex_location
+ else:
+ params["vertex_project"] = self.config.project
+ params["vertex_location"] = self.config.location
+
+ # Remove api_key since Vertex AI uses ADC
+ params.pop("api_key", None)
+
+ return params
diff --git a/llama_stack/ui/app/chat-playground/page.tsx b/llama_stack/ui/app/chat-playground/page.tsx
index c31248b78..d8094af85 100644
--- a/llama_stack/ui/app/chat-playground/page.tsx
+++ b/llama_stack/ui/app/chat-playground/page.tsx
@@ -175,7 +175,7 @@ const handleSubmitWithContent = async (content: string) => {
return (