Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-23 00:12:24 +00:00)

Merge branch 'meta-llama:main' into dell-distro

Commit 839cc911ac: 42 changed files with 786 additions and 186 deletions
@@ -62,3 +62,13 @@ class SessionNotFoundError(ValueError):
     def __init__(self, session_name: str) -> None:
         message = f"Session '{session_name}' not found or access denied."
         super().__init__(message)
+
+
+class ModelTypeError(TypeError):
+    """raised when a model is present but not the correct type"""
+
+    def __init__(self, model_name: str, model_type: str, expected_model_type: str) -> None:
+        message = (
+            f"Model '{model_name}' is of type '{model_type}' rather than the expected type '{expected_model_type}'"
+        )
+        super().__init__(message)
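A quick, illustrative check of what the new exception surfaces to callers; the model name and types below are made up, the import path and message format come from this commit:

```python
# Illustrative only: ModelTypeError subclasses TypeError, so generic handlers still catch it.
from llama_stack.apis.common.errors import ModelTypeError

try:
    raise ModelTypeError("all-MiniLM-L6-v2", "embedding", "llm")
except TypeError as err:
    print(err)
    # Model 'all-MiniLM-L6-v2' is of type 'embedding' rather than the expected type 'llm'
```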
@@ -18,7 +18,7 @@ from llama_stack.apis.common.content_types import (
     InterleavedContent,
     InterleavedContentItem,
 )
-from llama_stack.apis.common.errors import ModelNotFoundError
+from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError
 from llama_stack.apis.inference import (
     BatchChatCompletionResponse,
     BatchCompletionResponse,
@@ -177,6 +177,15 @@ class InferenceRouter(Inference):
         encoded = self.formatter.encode_content(messages)
         return len(encoded.tokens) if encoded and encoded.tokens else 0

+    async def _get_model(self, model_id: str, expected_model_type: str) -> Model:
+        """takes a model id and gets model after ensuring that it is accessible and of the correct type"""
+        model = await self.routing_table.get_model(model_id)
+        if model is None:
+            raise ModelNotFoundError(model_id)
+        if model.model_type != expected_model_type:
+            raise ModelTypeError(model_id, model.model_type, expected_model_type)
+        return model
+
     async def chat_completion(
         self,
         model_id: str,
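The hunks below collapse the same lookup-and-validate boilerplate into this helper. A stand-alone sketch of the contract it enforces (plain stand-ins, not the repo's classes): a missing id raises a not-found error, a wrong type raises a type error, otherwise the record is returned.

```python
# Stand-in sketch of the _get_model contract; FakeModel and the registry are illustrative.
from dataclasses import dataclass


@dataclass
class FakeModel:
    identifier: str
    model_type: str  # "llm" or "embedding"


registry = {"llama-3": FakeModel("llama-3", "llm")}


def get_model(model_id: str, expected_model_type: str) -> FakeModel:
    model = registry.get(model_id)
    if model is None:
        raise LookupError(f"Model '{model_id}' not found")  # ModelNotFoundError in the repo
    if model.model_type != expected_model_type:
        # ModelTypeError in the repo
        raise TypeError(f"'{model_id}' is '{model.model_type}', expected '{expected_model_type}'")
    return model


print(get_model("llama-3", "llm").identifier)  # ok
# get_model("llama-3", "embedding") -> TypeError; get_model("unknown", "llm") -> LookupError
```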
@@ -195,11 +204,7 @@ class InferenceRouter(Inference):
         )
         if sampling_params is None:
             sampling_params = SamplingParams()
-        model = await self.routing_table.get_model(model_id)
-        if model is None:
-            raise ModelNotFoundError(model_id)
-        if model.model_type == ModelType.embedding:
-            raise ValueError(f"Model '{model_id}' is an embedding model and does not support chat completions")
+        model = await self._get_model(model_id, ModelType.llm)
         if tool_config:
             if tool_choice and tool_choice != tool_config.tool_choice:
                 raise ValueError("tool_choice and tool_config.tool_choice must match")
@@ -301,11 +306,7 @@ class InferenceRouter(Inference):
         logger.debug(
             f"InferenceRouter.completion: {model_id=}, {stream=}, {content=}, {sampling_params=}, {response_format=}",
         )
-        model = await self.routing_table.get_model(model_id)
-        if model is None:
-            raise ModelNotFoundError(model_id)
-        if model.model_type == ModelType.embedding:
-            raise ValueError(f"Model '{model_id}' is an embedding model and does not support chat completions")
+        model = await self._get_model(model_id, ModelType.llm)
         provider = await self.routing_table.get_provider_impl(model_id)
         params = dict(
             model_id=model_id,
@@ -355,11 +356,7 @@ class InferenceRouter(Inference):
         task_type: EmbeddingTaskType | None = None,
     ) -> EmbeddingsResponse:
         logger.debug(f"InferenceRouter.embeddings: {model_id}")
-        model = await self.routing_table.get_model(model_id)
-        if model is None:
-            raise ModelNotFoundError(model_id)
-        if model.model_type == ModelType.llm:
-            raise ValueError(f"Model '{model_id}' is an LLM model and does not support embeddings")
+        await self._get_model(model_id, ModelType.embedding)
         provider = await self.routing_table.get_provider_impl(model_id)
         return await provider.embeddings(
             model_id=model_id,
@@ -395,12 +392,7 @@ class InferenceRouter(Inference):
         logger.debug(
             f"InferenceRouter.openai_completion: {model=}, {stream=}, {prompt=}",
         )
-        model_obj = await self.routing_table.get_model(model)
-        if model_obj is None:
-            raise ModelNotFoundError(model)
-        if model_obj.model_type == ModelType.embedding:
-            raise ValueError(f"Model '{model}' is an embedding model and does not support completions")
-
+        model_obj = await self._get_model(model, ModelType.llm)
         params = dict(
             model=model_obj.identifier,
             prompt=prompt,
@@ -476,11 +468,7 @@ class InferenceRouter(Inference):
         logger.debug(
             f"InferenceRouter.openai_chat_completion: {model=}, {stream=}, {messages=}",
         )
-        model_obj = await self.routing_table.get_model(model)
-        if model_obj is None:
-            raise ModelNotFoundError(model)
-        if model_obj.model_type == ModelType.embedding:
-            raise ValueError(f"Model '{model}' is an embedding model and does not support chat completions")
+        model_obj = await self._get_model(model, ModelType.llm)

         # Use the OpenAI client for a bit of extra input validation without
         # exposing the OpenAI client itself as part of our API surface
@@ -567,12 +555,7 @@ class InferenceRouter(Inference):
         logger.debug(
             f"InferenceRouter.openai_embeddings: {model=}, input_type={type(input)}, {encoding_format=}, {dimensions=}",
         )
-        model_obj = await self.routing_table.get_model(model)
-        if model_obj is None:
-            raise ModelNotFoundError(model)
-        if model_obj.model_type != ModelType.embedding:
-            raise ValueError(f"Model '{model}' is not an embedding model")
-
+        model_obj = await self._get_model(model, ModelType.embedding)
         params = dict(
             model=model_obj.identifier,
             input=input,
@@ -124,10 +124,7 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
         return toolgroup

     async def unregister_toolgroup(self, toolgroup_id: str) -> None:
-        tool_group = await self.get_tool_group(toolgroup_id)
-        if tool_group is None:
-            raise ToolGroupNotFoundError(toolgroup_id)
-        await self.unregister_object(tool_group)
+        await self.unregister_object(await self.get_tool_group(toolgroup_id))

     async def shutdown(self) -> None:
         pass
@@ -8,7 +8,7 @@ from typing import Any

 from pydantic import TypeAdapter

-from llama_stack.apis.common.errors import ModelNotFoundError, VectorStoreNotFoundError
+from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError, VectorStoreNotFoundError
 from llama_stack.apis.models import ModelType
 from llama_stack.apis.resource import ResourceType
 from llama_stack.apis.vector_dbs import ListVectorDBsResponse, VectorDB, VectorDBs
@@ -66,7 +66,7 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
         if model is None:
             raise ModelNotFoundError(embedding_model)
         if model.model_type != ModelType.embedding:
-            raise ValueError(f"Model {embedding_model} is not an embedding model")
+            raise ModelTypeError(embedding_model, model.model_type, ModelType.embedding)
         if "embedding_dimension" not in model.metadata:
             raise ValueError(f"Model {embedding_model} does not have an embedding dimension")
         vector_db_data = {
@@ -14,6 +14,7 @@ distribution_spec:
     - provider_type: remote::openai
     - provider_type: remote::anthropic
     - provider_type: remote::gemini
+    - provider_type: remote::vertexai
     - provider_type: remote::groq
     - provider_type: remote::sambanova
     - provider_type: inline::sentence-transformers
@@ -65,6 +65,11 @@ providers:
     provider_type: remote::gemini
     config:
       api_key: ${env.GEMINI_API_KEY:=}
+  - provider_id: ${env.VERTEX_AI_PROJECT:+vertexai}
+    provider_type: remote::vertexai
+    config:
+      project: ${env.VERTEX_AI_PROJECT:=}
+      location: ${env.VERTEX_AI_LOCATION:=us-central1}
   - provider_id: groq
     provider_type: remote::groq
     config:
@@ -14,6 +14,7 @@ distribution_spec:
     - provider_type: remote::openai
     - provider_type: remote::anthropic
    - provider_type: remote::gemini
+    - provider_type: remote::vertexai
     - provider_type: remote::groq
     - provider_type: remote::sambanova
     - provider_type: inline::sentence-transformers
@@ -65,6 +65,11 @@ providers:
     provider_type: remote::gemini
     config:
       api_key: ${env.GEMINI_API_KEY:=}
+  - provider_id: ${env.VERTEX_AI_PROJECT:+vertexai}
+    provider_type: remote::vertexai
+    config:
+      project: ${env.VERTEX_AI_PROJECT:=}
+      location: ${env.VERTEX_AI_LOCATION:=us-central1}
   - provider_id: groq
     provider_type: remote::groq
     config:
@@ -56,6 +56,7 @@ ENABLED_INFERENCE_PROVIDERS = [
     "fireworks",
     "together",
     "gemini",
+    "vertexai",
     "groq",
     "sambanova",
     "anthropic",
@@ -71,6 +72,7 @@ INFERENCE_PROVIDER_IDS = {
     "tgi": "${env.TGI_URL:+tgi}",
     "cerebras": "${env.CEREBRAS_API_KEY:+cerebras}",
     "nvidia": "${env.NVIDIA_API_KEY:+nvidia}",
+    "vertexai": "${env.VERTEX_AI_PROJECT:+vertexai}",
 }


@@ -246,6 +248,14 @@ def get_distribution_template() -> DistributionTemplate:
                 "",
                 "Gemini API Key",
             ),
+            "VERTEX_AI_PROJECT": (
+                "",
+                "Google Cloud Project ID for Vertex AI",
+            ),
+            "VERTEX_AI_LOCATION": (
+                "us-central1",
+                "Google Cloud Location for Vertex AI",
+            ),
             "SAMBANOVA_API_KEY": (
                 "",
                 "SambaNova API Key",
@@ -99,7 +99,8 @@ def parse_environment_config(env_config: str) -> dict[str, int]:
         Dict[str, int]: A dictionary mapping categories to their log levels.
     """
     category_levels = {}
-    for pair in env_config.split(";"):
+    delimiter = ","
+    for pair in env_config.split(delimiter):
         if not pair.strip():
             continue

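The separator for the logging environment config therefore changes from ";" to ",". A hedged usage sketch; the `category=level` pair format is inferred from the surrounding docstring, only the comma delimiter is established by the hunk above:

```python
# Assumed call site; category names and levels here are placeholders.
category_levels = parse_environment_config("core=debug,server=info")
```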
@@ -15,6 +15,7 @@ from llama_stack.apis.safety import (
     RunShieldResponse,
     Safety,
     SafetyViolation,
+    ShieldStore,
     ViolationLevel,
 )
 from llama_stack.apis.shields import Shield
@@ -32,6 +33,8 @@ PROMPT_GUARD_MODEL = "Prompt-Guard-86M"


 class PromptGuardSafetyImpl(Safety, ShieldsProtocolPrivate):
+    shield_store: ShieldStore
+
     def __init__(self, config: PromptGuardConfig, _deps) -> None:
         self.config = config

@@ -53,7 +56,7 @@ class PromptGuardSafetyImpl(Safety, ShieldsProtocolPrivate):
         self,
         shield_id: str,
         messages: list[Message],
-        params: dict[str, Any] = None,
+        params: dict[str, Any],
     ) -> RunShieldResponse:
         shield = await self.shield_store.get_shield(shield_id)
         if not shield:
@@ -61,6 +64,9 @@ class PromptGuardSafetyImpl(Safety, ShieldsProtocolPrivate):

         return await self.shield.run(messages)

+    async def run_moderation(self, input: str | list[str], model: str):
+        raise NotImplementedError("run_moderation not implemented for PromptGuard")
+

 class PromptGuardShield:
     def __init__(
@@ -117,8 +123,10 @@ class PromptGuardShield:
         elif self.config.guard_type == PromptGuardType.jailbreak.value and score_malicious > self.threshold:
             violation = SafetyViolation(
                 violation_level=ViolationLevel.ERROR,
-                violation_type=f"prompt_injection:malicious={score_malicious}",
-                violation_return_message="Sorry, I cannot do this.",
+                user_message="Sorry, I cannot do this.",
+                metadata={
+                    "violation_type": f"prompt_injection:malicious={score_malicious}",
+                },
             )

         return RunShieldResponse(violation=violation)
@@ -174,7 +174,9 @@ class FaissIndex(EmbeddingIndex):
         k: int,
         score_threshold: float,
     ) -> QueryChunksResponse:
-        raise NotImplementedError("Keyword search is not supported in FAISS")
+        raise NotImplementedError(
+            "Keyword search is not supported - underlying DB FAISS does not support this search mode"
+        )

     async def query_hybrid(
         self,
@@ -185,7 +187,9 @@ class FaissIndex(EmbeddingIndex):
         reranker_type: str,
         reranker_params: dict[str, Any] | None = None,
     ) -> QueryChunksResponse:
-        raise NotImplementedError("Hybrid search is not supported in FAISS")
+        raise NotImplementedError(
+            "Hybrid search is not supported - underlying DB FAISS does not support this search mode"
+        )


 class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate):
@@ -213,6 +213,36 @@ def available_providers() -> list[ProviderSpec]:
                 description="Google Gemini inference provider for accessing Gemini models and Google's AI services.",
             ),
         ),
+        remote_provider_spec(
+            api=Api.inference,
+            adapter=AdapterSpec(
+                adapter_type="vertexai",
+                pip_packages=["litellm", "google-cloud-aiplatform"],
+                module="llama_stack.providers.remote.inference.vertexai",
+                config_class="llama_stack.providers.remote.inference.vertexai.VertexAIConfig",
+                provider_data_validator="llama_stack.providers.remote.inference.vertexai.config.VertexAIProviderDataValidator",
+                description="""Google Vertex AI inference provider enables you to use Google's Gemini models through Google Cloud's Vertex AI platform, providing several advantages:
+
+• Enterprise-grade security: Uses Google Cloud's security controls and IAM
+• Better integration: Seamless integration with other Google Cloud services
+• Advanced features: Access to additional Vertex AI features like model tuning and monitoring
+• Authentication: Uses Google Cloud Application Default Credentials (ADC) instead of API keys
+
+Configuration:
+- Set VERTEX_AI_PROJECT environment variable (required)
+- Set VERTEX_AI_LOCATION environment variable (optional, defaults to us-central1)
+- Use Google Cloud Application Default Credentials or service account key
+
+Authentication Setup:
+Option 1 (Recommended): gcloud auth application-default login
+Option 2: Set GOOGLE_APPLICATION_CREDENTIALS to service account key path
+
+Available Models:
+- vertex_ai/gemini-2.0-flash
+- vertex_ai/gemini-2.5-flash
+- vertex_ai/gemini-2.5-pro""",
+            ),
+        ),
         remote_provider_spec(
             api=Api.inference,
             adapter=AdapterSpec(
@@ -45,6 +45,18 @@ That means you'll get fast and efficient vector retrieval.
 - Lightweight and easy to use
 - Fully integrated with Llama Stack
 - GPU support
+- **Vector search** - FAISS supports pure vector similarity search using embeddings
+
+## Search Modes
+
+**Supported:**
+- **Vector Search** (`mode="vector"`): Performs vector similarity search using embeddings
+
+**Not Supported:**
+- **Keyword Search** (`mode="keyword"`): Not supported by FAISS
+- **Hybrid Search** (`mode="hybrid"`): Not supported by FAISS
+
+> **Note**: FAISS is designed as a pure vector similarity search library. See the [FAISS GitHub repository](https://github.com/facebookresearch/faiss) for more details about FAISS's core functionality.

 ## Usage

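Since FAISS only accepts the vector mode, a query against a FAISS-backed store looks like the vector example added to the Milvus docs later in this diff; a sketch, where `client` and `vector_store` follow the same convention as those examples and the query text is a placeholder:

```python
# Vector-only search; "keyword" or "hybrid" modes hit the NotImplementedError
# paths added to FaissIndex earlier in this commit.
search_response = client.vector_stores.search(
    vector_store_id=vector_store.id,
    query="What is machine learning?",
    search_mode="vector",
    max_num_results=5,
)
```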
@@ -535,6 +547,7 @@ That means you're not limited to storing vectors in memory or in a separate service.

 - Easy to use
 - Fully integrated with Llama Stack
+- Supports all search modes: vector, keyword, and hybrid search (both inline and remote configurations)

 ## Usage

@@ -625,6 +638,92 @@ vector_io:
 - **`client_pem_path`**: Path to the **client certificate** file (required for mTLS).
 - **`client_key_path`**: Path to the **client private key** file (required for mTLS).

+## Search Modes
+
+Milvus supports three different search modes for both inline and remote configurations:
+
+### Vector Search
+Vector search uses semantic similarity to find the most relevant chunks based on embedding vectors. This is the default search mode and works well for finding conceptually similar content.
+
+```python
+# Vector search example
+search_response = client.vector_stores.search(
+    vector_store_id=vector_store.id,
+    query="What is machine learning?",
+    search_mode="vector",
+    max_num_results=5,
+)
+```
+
+### Keyword Search
+Keyword search uses traditional text-based matching to find chunks containing specific terms or phrases. This is useful when you need exact term matches.
+
+```python
+# Keyword search example
+search_response = client.vector_stores.search(
+    vector_store_id=vector_store.id,
+    query="Python programming language",
+    search_mode="keyword",
+    max_num_results=5,
+)
+```
+
+### Hybrid Search
+Hybrid search combines both vector and keyword search methods to provide more comprehensive results. It leverages the strengths of both semantic similarity and exact term matching.
+
+#### Basic Hybrid Search
+```python
+# Basic hybrid search example (uses RRF ranker with default impact_factor=60.0)
+search_response = client.vector_stores.search(
+    vector_store_id=vector_store.id,
+    query="neural networks in Python",
+    search_mode="hybrid",
+    max_num_results=5,
+)
+```
+
+**Note**: The default `impact_factor` value of 60.0 was empirically determined to be optimal in the original RRF research paper: ["Reciprocal Rank Fusion outperforms Condorcet and individual Rank Learning Methods"](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) (Cormack et al., 2009).
+
+#### Hybrid Search with RRF (Reciprocal Rank Fusion) Ranker
+RRF combines rankings from vector and keyword search by using reciprocal ranks. The impact factor controls how much weight is given to higher-ranked results.
+
+```python
+# Hybrid search with custom RRF parameters
+search_response = client.vector_stores.search(
+    vector_store_id=vector_store.id,
+    query="neural networks in Python",
+    search_mode="hybrid",
+    max_num_results=5,
+    ranking_options={
+        "ranker": {
+            "type": "rrf",
+            "impact_factor": 100.0,  # Higher values give more weight to top-ranked results
+        }
+    },
+)
+```
+
+#### Hybrid Search with Weighted Ranker
+Weighted ranker linearly combines normalized scores from vector and keyword search. The alpha parameter controls the balance between the two search methods.
+
+```python
+# Hybrid search with weighted ranker
+search_response = client.vector_stores.search(
+    vector_store_id=vector_store.id,
+    query="neural networks in Python",
+    search_mode="hybrid",
+    max_num_results=5,
+    ranking_options={
+        "ranker": {
+            "type": "weighted",
+            "alpha": 0.7,  # 70% vector search, 30% keyword search
+        }
+    },
+)
+```
+
+For detailed documentation on RRF and Weighted rankers, please refer to the [Milvus Reranking Guide](https://milvus.io/docs/reranking.md).
+
 ## Documentation
 See the [Milvus documentation](https://milvus.io/docs/install-overview.md) for more details about Milvus in general.

@@ -13,7 +13,9 @@ LLM_MODEL_IDS = [
     "gemini-1.5-flash",
     "gemini-1.5-pro",
     "gemini-2.0-flash",
+    "gemini-2.0-flash-lite",
     "gemini-2.5-flash",
+    "gemini-2.5-flash-lite",
     "gemini-2.5-pro",
 ]

@@ -457,9 +457,6 @@ class OllamaInferenceAdapter(
         user: str | None = None,
     ) -> OpenAIEmbeddingsResponse:
         model_obj = await self._get_model(model)
-        if model_obj.model_type != ModelType.embedding:
-            raise ValueError(f"Model {model} is not an embedding model")
-
         if model_obj.provider_resource_id is None:
             raise ValueError(f"Model {model} has no provider_resource_id set")

llama_stack/providers/remote/inference/vertexai/__init__.py (new file, 15 lines)

@@ -0,0 +1,15 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .config import VertexAIConfig
+
+
+async def get_adapter_impl(config: VertexAIConfig, _deps):
+    from .vertexai import VertexAIInferenceAdapter
+
+    impl = VertexAIInferenceAdapter(config)
+    await impl.initialize()
+    return impl
llama_stack/providers/remote/inference/vertexai/config.py (new file, 45 lines)

@@ -0,0 +1,45 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from llama_stack.schema_utils import json_schema_type
+
+
+class VertexAIProviderDataValidator(BaseModel):
+    vertex_project: str | None = Field(
+        default=None,
+        description="Google Cloud project ID for Vertex AI",
+    )
+    vertex_location: str | None = Field(
+        default=None,
+        description="Google Cloud location for Vertex AI (e.g., us-central1)",
+    )
+
+
+@json_schema_type
+class VertexAIConfig(BaseModel):
+    project: str = Field(
+        description="Google Cloud project ID for Vertex AI",
+    )
+    location: str = Field(
+        default="us-central1",
+        description="Google Cloud location for Vertex AI",
+    )
+
+    @classmethod
+    def sample_run_config(
+        cls,
+        project: str = "${env.VERTEX_AI_PROJECT:=}",
+        location: str = "${env.VERTEX_AI_LOCATION:=us-central1}",
+        **kwargs,
+    ) -> dict[str, Any]:
+        return {
+            "project": project,
+            "location": location,
+        }
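A minimal sketch of wiring the new provider up by hand, using only names added in this commit; the project id is a placeholder, and real deployments would normally go through the run.yaml entries shown earlier:

```python
# Sketch only: builds a VertexAIConfig and obtains the adapter via the factory
# in __init__.py above. Authentication still relies on ADC, not API keys.
import asyncio

from llama_stack.providers.remote.inference.vertexai import VertexAIConfig, get_adapter_impl

config = VertexAIConfig(project="my-gcp-project", location="us-central1")  # placeholder project id
# adapter = asyncio.run(get_adapter_impl(config, {}))  # requires valid Google Cloud credentials
```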
llama_stack/providers/remote/inference/vertexai/models.py (new file, 20 lines)

@@ -0,0 +1,20 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.providers.utils.inference.model_registry import (
+    ProviderModelEntry,
+)
+
+# Vertex AI model IDs with vertex_ai/ prefix as required by litellm
+LLM_MODEL_IDS = [
+    "vertex_ai/gemini-2.0-flash",
+    "vertex_ai/gemini-2.5-flash",
+    "vertex_ai/gemini-2.5-pro",
+]
+
+SAFETY_MODELS_ENTRIES = list[ProviderModelEntry]()
+
+MODEL_ENTRIES = [ProviderModelEntry(provider_model_id=m) for m in LLM_MODEL_IDS] + SAFETY_MODELS_ENTRIES
llama_stack/providers/remote/inference/vertexai/vertexai.py (new file, 52 lines)

@@ -0,0 +1,52 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+from llama_stack.apis.inference import ChatCompletionRequest
+from llama_stack.providers.utils.inference.litellm_openai_mixin import (
+    LiteLLMOpenAIMixin,
+)
+
+from .config import VertexAIConfig
+from .models import MODEL_ENTRIES
+
+
+class VertexAIInferenceAdapter(LiteLLMOpenAIMixin):
+    def __init__(self, config: VertexAIConfig) -> None:
+        LiteLLMOpenAIMixin.__init__(
+            self,
+            MODEL_ENTRIES,
+            litellm_provider_name="vertex_ai",
+            api_key_from_config=None,  # Vertex AI uses ADC, not API keys
+            provider_data_api_key_field="vertex_project",  # Use project for validation
+        )
+        self.config = config
+
+    def get_api_key(self) -> str:
+        # Vertex AI doesn't use API keys, it uses Application Default Credentials
+        # Return empty string to let litellm handle authentication via ADC
+        return ""
+
+    async def _get_params(self, request: ChatCompletionRequest) -> dict[str, Any]:
+        # Get base parameters from parent
+        params = await super()._get_params(request)
+
+        # Add Vertex AI specific parameters
+        provider_data = self.get_request_provider_data()
+        if provider_data:
+            if getattr(provider_data, "vertex_project", None):
+                params["vertex_project"] = provider_data.vertex_project
+            if getattr(provider_data, "vertex_location", None):
+                params["vertex_location"] = provider_data.vertex_location
+        else:
+            params["vertex_project"] = self.config.project
+            params["vertex_location"] = self.config.location
+
+        # Remove api_key since Vertex AI uses ADC
+        params.pop("api_key", None)
+
+        return params
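_get_params gives request-scoped provider data precedence over the static config. A self-contained illustration of that precedence; the class and values here are illustrative stand-ins, not the repo's:

```python
# Mirrors the if/else above: when provider data is present, only the fields it
# actually sets are forwarded; otherwise the static config values are used.
class FakeProviderData:
    vertex_project = "request-project"
    vertex_location = None  # unset, so it is not forwarded


def resolve(provider_data, config_project="config-project", config_location="us-central1"):
    params = {}
    if provider_data:
        if getattr(provider_data, "vertex_project", None):
            params["vertex_project"] = provider_data.vertex_project
        if getattr(provider_data, "vertex_location", None):
            params["vertex_location"] = provider_data.vertex_location
    else:
        params["vertex_project"] = config_project
        params["vertex_location"] = config_location
    return params


print(resolve(FakeProviderData()))  # {'vertex_project': 'request-project'}
print(resolve(None))                # {'vertex_project': 'config-project', 'vertex_location': 'us-central1'}
```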
@@ -70,7 +70,7 @@ from openai.types.chat.chat_completion_chunk import (
 from openai.types.chat.chat_completion_content_part_image_param import (
     ImageURL as OpenAIImageURL,
 )
-from openai.types.chat.chat_completion_message_tool_call_param import (
+from openai.types.chat.chat_completion_message_tool_call import (
     Function as OpenAIFunction,
 )
 from pydantic import BaseModel
@@ -9,7 +9,9 @@ import contextvars
 import logging
 import queue
+import random
+import sys
 import threading
 import time
 from collections.abc import Callable
 from datetime import UTC, datetime
 from functools import wraps
@@ -30,6 +32,16 @@ from llama_stack.providers.utils.telemetry.trace_protocol import serialize_value

 logger = get_logger(__name__, category="core")

+# Fallback logger that does NOT propagate to TelemetryHandler to avoid recursion
+_fallback_logger = logging.getLogger("llama_stack.telemetry.background")
+if not _fallback_logger.handlers:
+    _fallback_logger.propagate = False
+    _fallback_logger.setLevel(logging.ERROR)
+    _fallback_handler = logging.StreamHandler(sys.stderr)
+    _fallback_handler.setLevel(logging.ERROR)
+    _fallback_handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s"))
+    _fallback_logger.addHandler(_fallback_handler)
+

 INVALID_SPAN_ID = 0x0000000000000000
 INVALID_TRACE_ID = 0x00000000000000000000000000000000
@@ -79,19 +91,32 @@ def generate_trace_id() -> str:
 CURRENT_TRACE_CONTEXT = contextvars.ContextVar("trace_context", default=None)
 BACKGROUND_LOGGER = None

+LOG_QUEUE_FULL_LOG_INTERVAL_SECONDS = 60.0
+

 class BackgroundLogger:
     def __init__(self, api: Telemetry, capacity: int = 100000):
         self.api = api
-        self.log_queue = queue.Queue(maxsize=capacity)
+        self.log_queue: queue.Queue[Any] = queue.Queue(maxsize=capacity)
         self.worker_thread = threading.Thread(target=self._process_logs, daemon=True)
         self.worker_thread.start()
+        self._last_queue_full_log_time: float = 0.0
+        self._dropped_since_last_notice: int = 0

     def log_event(self, event):
         try:
             self.log_queue.put_nowait(event)
         except queue.Full:
-            logger.error("Log queue is full, dropping event")
+            # Aggregate drops and emit at most once per interval via fallback logger
+            self._dropped_since_last_notice += 1
+            current_time = time.time()
+            if current_time - self._last_queue_full_log_time >= LOG_QUEUE_FULL_LOG_INTERVAL_SECONDS:
+                _fallback_logger.error(
+                    "Log queue is full; dropped %d events since last notice",
+                    self._dropped_since_last_notice,
+                )
+                self._last_queue_full_log_time = current_time
+                self._dropped_since_last_notice = 0

     def _process_logs(self):
         while True:
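The queue-full branch above is a rate-limited drop notice: drops are counted and reported at most once per interval through the non-propagating fallback logger. A self-contained sketch of the same pattern; the names here are illustrative, not the repo's:

```python
import time


class DropNotice:
    """Count dropped events and report at most once per interval."""

    def __init__(self, report, interval_s: float = 60.0) -> None:
        self._report = report          # e.g. a logging.Logger.error bound method
        self._interval_s = interval_s
        self._last_notice = 0.0
        self._dropped = 0

    def record(self) -> None:
        self._dropped += 1
        now = time.time()
        if now - self._last_notice >= self._interval_s:
            self._report("queue full; dropped %d events since last notice", self._dropped)
            self._last_notice = now
            self._dropped = 0
```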
@@ -175,7 +175,7 @@ const handleSubmitWithContent = async (content: string) => {
   return (
     <div className="flex flex-col h-full max-w-4xl mx-auto">
       <div className="mb-4 flex justify-between items-center">
-        <h1 className="text-2xl font-bold">Chat Playground</h1>
+        <h1 className="text-2xl font-bold">Chat Playground (Completions)</h1>
         <div className="flex gap-2">
           <Select value={selectedModel} onValueChange={setSelectedModel} disabled={isModelsLoading || isGenerating}>
             <SelectTrigger className="w-[180px]">
@@ -6,6 +6,8 @@ import {
   MoveUpRight,
   Database,
   MessageCircle,
+  Settings2,
+  Compass,
 } from "lucide-react";
 import Link from "next/link";
 import { usePathname } from "next/navigation";
@@ -22,15 +24,16 @@ import {
   SidebarMenuItem,
   SidebarHeader,
 } from "@/components/ui/sidebar";
-// Extracted Chat Playground item
-const chatPlaygroundItem = {
-  title: "Chat Playground",
-  url: "/chat-playground",
-  icon: MessageCircle,
-};

-// Removed Chat Playground from log items
-const logItems = [
+const createItems = [
+  {
+    title: "Chat Playground",
+    url: "/chat-playground",
+    icon: MessageCircle,
+  },
+];
+
+const manageItems = [
   {
     title: "Chat Completions",
     url: "/logs/chat-completions",
@@ -53,77 +56,96 @@ const logItems = [
   },
 ];

+const optimizeItems: { title: string; url: string; icon: React.ElementType }[] = [
+  {
+    title: "Evaluations",
+    url: "",
+    icon: Compass,
+  },
+  {
+    title: "Fine-tuning",
+    url: "",
+    icon: Settings2,
+  },
+];
+
+interface SidebarItem {
+  title: string;
+  url: string;
+  icon: React.ElementType;
+}
+
 export function AppSidebar() {
   const pathname = usePathname();

-  return (
-    <Sidebar>
-      <SidebarHeader>
-        <Link href="/">Llama Stack</Link>
-      </SidebarHeader>
-      <SidebarContent>
-        {/* Chat Playground as its own section */}
-        <SidebarGroup>
-          <SidebarGroupContent>
-            <SidebarMenu>
-              <SidebarMenuItem>
-                <SidebarMenuButton
-                  asChild
-                  className={cn(
-                    "justify-start",
-                    pathname.startsWith(chatPlaygroundItem.url) &&
-                      "bg-gray-200 dark:bg-gray-700 hover:bg-gray-200 dark:hover:bg-gray-700 text-gray-900 dark:text-gray-100",
-                  )}
-                >
-                  <Link href={chatPlaygroundItem.url}>
-                    <chatPlaygroundItem.icon
-                      className={cn(
-                        pathname.startsWith(chatPlaygroundItem.url) && "text-gray-900 dark:text-gray-100",
-                        "mr-2 h-4 w-4",
-                      )}
-                    />
-                    <span>{chatPlaygroundItem.title}</span>
-                  </Link>
-                </SidebarMenuButton>
-              </SidebarMenuItem>
-            </SidebarMenu>
-          </SidebarGroupContent>
-        </SidebarGroup>
-
-        {/* Logs section */}
-        <SidebarGroup>
-          <SidebarGroupLabel>Logs</SidebarGroupLabel>
-          <SidebarGroupContent>
-            <SidebarMenu>
-              {logItems.map((item) => {
-                const isActive = pathname.startsWith(item.url);
-                return (
-                  <SidebarMenuItem key={item.title}>
-                    <SidebarMenuButton
-                      asChild
-                      className={cn(
-                        "justify-start",
-                        isActive &&
-                          "bg-gray-200 dark:bg-gray-700 hover:bg-gray-200 dark:hover:bg-gray-700 text-gray-900 dark:text-gray-100",
-                      )}
-                    >
-                      <Link href={item.url}>
-                        <item.icon
-                          className={cn(
-                            isActive && "text-gray-900 dark:text-gray-100",
-                            "mr-2 h-4 w-4",
-                          )}
-                        />
-                        <span>{item.title}</span>
-                      </Link>
-                    </SidebarMenuButton>
-                  </SidebarMenuItem>
-                );
-              })}
-            </SidebarMenu>
-          </SidebarGroupContent>
-        </SidebarGroup>
-      </SidebarContent>
-    </Sidebar>
-  );
-}
+  const renderSidebarItems = (items: SidebarItem[]) => {
+    return items.map((item) => {
+      const isActive = pathname.startsWith(item.url);
+      return (
+        <SidebarMenuItem key={item.title}>
+          <SidebarMenuButton
+            asChild
+            className={cn(
+              "justify-start",
+              isActive &&
+                "bg-gray-200 dark:bg-gray-700 hover:bg-gray-200 dark:hover:bg-gray-700 text-gray-900 dark:text-gray-100",
+            )}
+          >
+            <Link href={item.url}>
+              <item.icon
+                className={cn(
+                  isActive && "text-gray-900 dark:text-gray-100",
+                  "mr-2 h-4 w-4",
+                )}
+              />
+              <span>{item.title}</span>
+            </Link>
+          </SidebarMenuButton>
+        </SidebarMenuItem>
+      );
+    });
+  };
+
+  return (
+    <Sidebar>
+      <SidebarHeader>
+        <Link href="/">Llama Stack</Link>
+      </SidebarHeader>
+      <SidebarContent>
+        <SidebarGroup>
+          <SidebarGroupLabel>Create</SidebarGroupLabel>
+          <SidebarGroupContent>
+            <SidebarMenu>{renderSidebarItems(createItems)}</SidebarMenu>
+          </SidebarGroupContent>
+        </SidebarGroup>
+
+        <SidebarGroup>
+          <SidebarGroupLabel>Manage</SidebarGroupLabel>
+          <SidebarGroupContent>
+            <SidebarMenu>{renderSidebarItems(manageItems)}</SidebarMenu>
+          </SidebarGroupContent>
+        </SidebarGroup>
+
+        <SidebarGroup>
+          <SidebarGroupLabel>Optimize</SidebarGroupLabel>
+          <SidebarGroupContent>
+            <SidebarMenu>
+              {optimizeItems.map((item) => (
+                <SidebarMenuItem key={item.title}>
+                  <SidebarMenuButton
+                    disabled
+                    className="justify-start opacity-60 cursor-not-allowed"
+                  >
+                    <item.icon className="mr-2 h-4 w-4" />
+                    <span>{item.title}</span>
+                    <span className="ml-2 text-xs text-gray-500">(Coming Soon)</span>
+                  </SidebarMenuButton>
+                </SidebarMenuItem>
+              ))}
+            </SidebarMenu>
+          </SidebarGroupContent>
+        </SidebarGroup>
+      </SidebarContent>
+    </Sidebar>
+  );
+}