Mirror of https://github.com/meta-llama/llama-stack.git
Synced 2025-10-04 04:04:14 +00:00

Merge c71bcd5479 into 2f58d87c22
Commit c4f3d41c57

6 changed files with 109 additions and 24 deletions
@@ -4,12 +4,12 @@

Agents API for creating and interacting with agentic systems.

Main functionalities provided by this API:
- Create agents with specific instructions and the ability to use tools.
- Interactions with agents are grouped into sessions ("threads"), and each interaction is called a "turn".
- Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).
- Agents can be provided with various shields (see the Safety API for more details).
- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.

This section contains documentation for all available providers for the **agents** API.
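A minimal sketch of the agent → session → turn flow described above, assuming the `llama-stack-client` Python SDK and its `Agent` helper; the constructor arguments, server URL, model id, and toolgroup id shown here are assumptions and may differ between releases:

```python
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.agents.agent import Agent

# Assumed local server URL -- adjust for your deployment.
client = LlamaStackClient(base_url="http://localhost:8321")

# Create an agent with instructions and (optionally) tools; shields can be attached similarly.
agent = Agent(
    client,
    model="meta-llama/Llama-3.2-3B-Instruct",  # assumed model id
    instructions="You are a helpful assistant.",
    tools=["builtin::websearch"],  # assumed toolgroup id (see the ToolGroups API)
)

# Interactions are grouped into sessions ("threads"); each interaction is a "turn".
session_id = agent.create_session("demo-session")
turn = agent.create_turn(
    session_id=session_id,
    messages=[{"role": "user", "content": "What does the Agents API do?"}],
    stream=False,
)
print(turn.output_message.content)
```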
@@ -3,15 +3,15 @@
## Overview

The Batches API enables efficient processing of multiple requests in a single operation,
particularly useful for processing large datasets, batch evaluation workflows, and
cost-effective inference at scale.

The API is designed to allow use of the OpenAI client libraries for seamless integration.

This API provides the following extensions:
- idempotent batch creation

Note: This API is currently under active development and may undergo changes.

This section contains documentation for all available providers for the **batches** API.
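Because the API is OpenAI-compatible, a batch can be created with the standard `openai` client pointed at a Llama Stack server. In this sketch the base URL is an assumption, and the `idempotency_key` field passed through `extra_body` is a hypothetical illustration of the idempotent-creation extension, not a confirmed parameter name:

```python
from openai import OpenAI

# Assumed Llama Stack OpenAI-compatible endpoint -- adjust for your deployment.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

# Standard OpenAI batch flow: upload a JSONL file of requests, then create the batch from it.
batch_input = client.files.create(file=open("requests.jsonl", "rb"), purpose="batch")
batch = client.batches.create(
    input_file_id=batch_input.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    # Hypothetical field illustrating idempotent batch creation; the real name may differ.
    extra_body={"idempotency_key": "nightly-eval-run-1"},
)
print(client.batches.retrieve(batch.id).status)
```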
@@ -4,9 +4,9 @@

Llama Stack Inference API for generating completions, chat completions, and embeddings.

This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate "raw" and "chat" (conversational) completions.
- Embedding models: these models generate embeddings to be used for semantic search.

This section contains documentation for all available providers for the **inference** API.
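A short sketch of the two model kinds through the OpenAI-compatible surface: a chat (conversational) completion from an LLM model and vectors from an embedding model. The server URL and model ids are assumptions:

```python
from openai import OpenAI

# Assumed Llama Stack OpenAI-compatible endpoint and registered model ids.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

# LLM model: chat (conversational) completion.
chat = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Summarize what the Inference API provides."}],
)
print(chat.choices[0].message.content)

# Embedding model: vectors for semantic search.
emb = client.embeddings.create(model="all-MiniLM-L6-v2", input=["Llama Stack Inference API"])
print(len(emb.data[0].embedding))
```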
@@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import asyncio
+import heapq
 import json
 from typing import Any
 from urllib.parse import urlparse

@@ -30,6 +31,7 @@ from llama_stack.providers.utils.memory.vector_store import (
     EmbeddingIndex,
     VectorDBWithIndex,
 )
+from llama_stack.providers.utils.vector_io.vector_utils import WeightedInMemoryAggregator

 from .config import ChromaVectorIOConfig as RemoteChromaVectorIOConfig
@@ -114,7 +116,38 @@ class ChromaIndex(EmbeddingIndex):
         k: int,
         score_threshold: float,
     ) -> QueryChunksResponse:
-        raise NotImplementedError("Keyword search is not supported in Chroma")
+        results = await maybe_await(
+            self.collection.query(
+                query_texts=[query_string],
+                where_document={"$contains": query_string},
+                n_results=k,
+                include=["documents", "distances"],
+            )
+        )
+
+        distances = results["distances"][0] if results["distances"] else []
+        documents = results["documents"][0] if results["documents"] else []
+
+        chunks = []
+        scores = []
+
+        for dist, doc in zip(distances, documents, strict=False):
+            try:
+                doc_data = json.loads(doc)
+                chunk = Chunk(**doc_data)
+            except Exception:
+                log.exception(f"Failed to load chunk: {doc}")
+                continue
+
+            score = 1.0 / (1.0 + float(dist)) if dist is not None else 1.0
+
+            if score < score_threshold:
+                continue
+
+            chunks.append(chunk)
+            scores.append(score)
+
+        return QueryChunksResponse(chunks=chunks, scores=scores)

     async def delete_chunks(self, chunks_for_deletion: list[ChunkForDeletion]) -> None:
         """Delete a single chunk from the Chroma collection by its ID."""
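A quick note on the `1.0 / (1.0 + float(dist))` conversion above: Chroma returns distances where smaller means more similar, so the new code maps them into similarity-style scores in (0, 1] before `score_threshold` is applied. A tiny standalone check of that mapping (not part of the diff):

```python
# distance 0.0 -> 1.0, 1.0 -> 0.5, 3.0 -> 0.25; larger distances trend toward 0.
for dist in (0.0, 1.0, 3.0):
    print(dist, 1.0 / (1.0 + dist))
```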
@@ -130,7 +163,57 @@ class ChromaIndex(EmbeddingIndex):
         reranker_type: str,
         reranker_params: dict[str, Any] | None = None,
     ) -> QueryChunksResponse:
-        raise NotImplementedError("Hybrid search is not supported in Chroma")
+        """
+        Hybrid search combining vector similarity and keyword search using configurable reranking.
+
+        Args:
+            embedding: The query embedding vector
+            query_string: The text query for keyword search
+            k: Number of results to return
+            score_threshold: Minimum similarity score threshold
+            reranker_type: Type of reranker to use ("rrf" or "weighted")
+            reranker_params: Parameters for the reranker
+
+        Returns:
+            QueryChunksResponse with combined results
+        """
+        if reranker_params is None:
+            reranker_params = {}
+
+        # Get results from both search methods
+        vector_response = await self.query_vector(embedding, k, score_threshold)
+        keyword_response = await self.query_keyword(query_string, k, score_threshold)
+
+        # Convert responses to score dictionaries using chunk_id
+        vector_scores = {
+            chunk.chunk_id: score for chunk, score in zip(vector_response.chunks, vector_response.scores, strict=False)
+        }
+        keyword_scores = {
+            chunk.chunk_id: score
+            for chunk, score in zip(keyword_response.chunks, keyword_response.scores, strict=False)
+        }
+
+        # Combine scores using the reranking utility
+        combined_scores = WeightedInMemoryAggregator.combine_search_results(
+            vector_scores, keyword_scores, reranker_type, reranker_params
+        )
+
+        # Efficient top-k selection because it only tracks the k best candidates it's seen so far
+        top_k_items = heapq.nlargest(k, combined_scores.items(), key=lambda x: x[1])
+
+        # Filter by score threshold
+        filtered_items = [(doc_id, score) for doc_id, score in top_k_items if score >= score_threshold]
+
+        # Create a map of chunk_id to chunk for both responses
+        chunk_map = {c.chunk_id: c for c in vector_response.chunks + keyword_response.chunks}
+
+        # Use the map to look up chunks by their IDs
+        chunks = []
+        scores = []
+        for doc_id, score in filtered_items:
+            if doc_id in chunk_map:
+                chunks.append(chunk_map[doc_id])
+                scores.append(score)
+
+        return QueryChunksResponse(chunks=chunks, scores=scores)


 class ChromaVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate):
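The score combination above is delegated to `WeightedInMemoryAggregator.combine_search_results`, which accepts the `"rrf"` and `"weighted"` reranker types named in the docstring. As a rough standalone illustration only, and not the library's implementation, a reciprocal-rank-fusion (RRF) combination of the two score maps could look like the sketch below; the `impact_factor` default of 60 is an assumption:

```python
def rrf_combine(
    vector_scores: dict[str, float],
    keyword_scores: dict[str, float],
    impact_factor: float = 60.0,
) -> dict[str, float]:
    """RRF sketch: rank each result list by score, then sum 1 / (impact_factor + rank)."""
    combined: dict[str, float] = {}
    for scores in (vector_scores, keyword_scores):
        ranked = sorted(scores, key=scores.get, reverse=True)
        for rank, chunk_id in enumerate(ranked, start=1):
            combined[chunk_id] = combined.get(chunk_id, 0.0) + 1.0 / (impact_factor + rank)
    return combined

# "c2" appears in both result lists, so it outranks chunks found by only one search.
print(rrf_combine({"c1": 0.9, "c2": 0.7}, {"c2": 0.8, "c3": 0.6}))
```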
uv.lock (generated, 2 lines changed)

@@ -1767,6 +1767,7 @@ dependencies = [
     { name = "opentelemetry-exporter-otlp-proto-http" },
     { name = "opentelemetry-sdk" },
     { name = "pillow" },
+    { name = "pre-commit" },
     { name = "prompt-toolkit" },
     { name = "pydantic" },
     { name = "python-dotenv" },

@@ -1892,6 +1893,7 @@ requires-dist = [
     { name = "opentelemetry-sdk", specifier = ">=1.30.0" },
     { name = "pandas", marker = "extra == 'ui'" },
     { name = "pillow" },
+    { name = "pre-commit", specifier = ">=4.2.0" },
     { name = "prompt-toolkit" },
     { name = "pydantic", specifier = ">=2.11.9" },
     { name = "python-dotenv" },