Merge remote-tracking branch 'origin/main' into openai_v1

Commit: 35546386a2
Author: Ashwin Bharambe
Date: 2025-09-29 13:41:11 -07:00
52 changed files with 580 additions and 802 deletions

View file

@ -29,8 +29,8 @@ runs:
# Install llama-stack-client-python based on the client-version input
if [ "${{ inputs.client-version }}" = "latest" ]; then
echo "Installing latest llama-stack-client-python from next branch"
uv pip install git+https://github.com/llamastack/llama-stack-client-python.git@next
echo "Installing latest llama-stack-client-python from main branch"
uv pip install git+https://github.com/llamastack/llama-stack-client-python.git@main
elif [ "${{ inputs.client-version }}" = "published" ]; then
echo "Installing published llama-stack-client-python from PyPI"
uv pip install llama-stack-client

View file

@ -44,8 +44,8 @@ runs:
run: |
# Install llama-stack-client-python based on the client-version input
if [ "${{ inputs.client-version }}" = "latest" ]; then
echo "Installing latest llama-stack-client-python from next branch"
export LLAMA_STACK_CLIENT_DIR=git+https://github.com/llamastack/llama-stack-client-python.git@next
echo "Installing latest llama-stack-client-python from main branch"
export LLAMA_STACK_CLIENT_DIR=git+https://github.com/llamastack/llama-stack-client-python.git@main
elif [ "${{ inputs.client-version }}" = "published" ]; then
echo "Installing published llama-stack-client-python from PyPI"
unset LLAMA_STACK_CLIENT_DIR

View file

@ -43,7 +43,7 @@ jobs:
# Cache oasdiff to avoid checksum failures and speed up builds
- name: Cache oasdiff
id: cache-oasdiff
-uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809
+uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830
with:
path: ~/oasdiff
key: oasdiff-${{ runner.os }}

View file

@ -261,7 +261,7 @@ You can even run `llama model prompt-format` to see all of the templates and their
```
llama model prompt-format -m Llama3.2-3B-Instruct
```
-![alt text](../../../resources/prompt-format.png)
+![alt text](/img/prompt-format.png)
You will be shown a Markdown formatted description of the model interface and how prompts / messages are formatted for various scenarios.

View file

@ -217,7 +217,6 @@ from llama_stack_client.types import (
Methods:
- <code title="post /v1/inference/chat-completion">client.inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/inference.py">chat_completion</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_chat_completion_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_chat_completion_response.py">InferenceChatCompletionResponse</a></code>
- <code title="post /v1/inference/completion">client.inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/inference.py">completion</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_completion_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_completion_response.py">InferenceCompletionResponse</a></code>
- <code title="post /v1/inference/embeddings">client.inference.<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/resources/inference.py">embeddings</a>(\*\*<a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/inference_embeddings_params.py">params</a>) -> <a href="https://github.com/meta-llama/llama-stack-client-python/tree/main/src/llama_stack_client/types/embeddings_response.py">EmbeddingsResponse</a></code>
## VectorIo

View file

@ -824,16 +824,10 @@
"\n",
"\n",
"user_input = \"Michael Jordan was born in 1963. He played basketball for the Chicago Bulls. He retired in 2003. Extract this information into JSON for me. \"\n",
"response = client.inference.completion(\n",
" model_id=\"meta-llama/Llama-3.1-8B-Instruct\",\n",
" content=user_input,\n",
" stream=False,\n",
" sampling_params={\n",
" \"strategy\": {\n",
" \"type\": \"greedy\",\n",
" },\n",
" \"max_tokens\": 50,\n",
" },\n",
"response = client.chat.completions.create(\n",
" model=\"meta-llama/Llama-3.1-8B-Instruct\",\n",
" messages=[{\"role\": \"user\", \"content\": user_input}],\n",
" max_tokens=50,\n",
" response_format={\n",
" \"type\": \"json_schema\",\n",
" \"json_schema\": Output.model_json_schema(),\n",
@ -1013,7 +1007,7 @@
"\n",
"\n",
"\n",
"<img src=\"https://github.com/meta-llama/llama-stack/blob/main/docs/resources/agentic-system.png?raw=true\" alt=\"drawing\" width=\"800\"/>\n",
"<img src=\"https://github.com/meta-llama/llama-stack/blob/main/docs/static/img/agentic-system.png?raw=true\" alt=\"drawing\" width=\"800\"/>\n",
"\n",
"\n",
"Agents are characterized by having access to\n",

View file

@ -706,20 +706,15 @@
" provider_id=\"nvidia\",\n",
")\n",
"\n",
"response = client.inference.completion(\n",
" content=\"Complete the sentence using one word: Roses are red, violets are \",\n",
"response = client.completions.create(\n",
" prompt=\"Complete the sentence using one word: Roses are red, violets are \",\n",
" stream=False,\n",
" model_id=CUSTOMIZED_MODEL_DIR,\n",
" sampling_params={\n",
" \"strategy\": {\n",
" \"type\": \"top_p\",\n",
" \"temperature\": 0.7,\n",
" \"top_p\": 0.9\n",
" },\n",
" \"max_tokens\": 20,\n",
" },\n",
" model=CUSTOMIZED_MODEL_DIR,\n",
" temperature=0.7,\n",
" top_p=0.9,\n",
" max_tokens=20,\n",
")\n",
"print(f\"Inference response: {response.content}\")"
"print(f\"Inference response: {response.choices[0].text}\")"
]
},
{
@ -1233,20 +1228,15 @@
" provider_id=\"nvidia\",\n",
")\n",
"\n",
"response = client.inference.completion(\n",
" content=\"Complete the sentence using one word: Roses are red, violets are \",\n",
"response = client.completions.create(\n",
" prompt=\"Complete the sentence using one word: Roses are red, violets are \",\n",
" stream=False,\n",
" model_id=customized_chat_model_dir,\n",
" sampling_params={\n",
" \"strategy\": {\n",
" \"type\": \"top_p\",\n",
" \"temperature\": 0.7,\n",
" \"top_p\": 0.9\n",
" },\n",
" \"max_tokens\": 20,\n",
" },\n",
" model=customized_chat_model_dir,\n",
" temperature=0.7,\n",
" top_p=0.9,\n",
" max_tokens=20,\n",
")\n",
"print(f\"Inference response: {response.content}\")"
"print(f\"Inference response: {response.choices[0].text}\")"
]
},
{
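Both cells above make the same migration for plain text completions: sampling options move from a nested `sampling_params` dict to top-level OpenAI-style keyword arguments, and the generated text moves from `response.content` to `response.choices[0].text`. A minimal sketch of the new shape, assuming `client` is an initialized LlamaStackClient; the model id is a placeholder:

```python
# Hedged sketch of the new OpenAI-style completions call.
response = client.completions.create(
    model="customized-model@v1",  # placeholder for CUSTOMIZED_MODEL_DIR
    prompt="Complete the sentence using one word: Roses are red, violets are ",
    stream=False,
    temperature=0.7,
    top_p=0.9,
    max_tokens=20,
)
print(f"Inference response: {response.choices[0].text}")
```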

View file

Binary image file changed (128 KiB before, 128 KiB after).

View file

Binary image file changed (220 KiB before, 220 KiB after).

View file

Binary image file changed (71 KiB before, 71 KiB after).

View file

Binary image file changed (17 KiB before, 17 KiB after).

View file

Binary image file changed (170 KiB before, 170 KiB after).

View file

@ -1239,50 +1239,6 @@
]
}
},
"/v1/inference/embeddings": {
"post": {
"responses": {
"200": {
"description": "An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/EmbeddingsResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Inference"
],
"summary": "Generate embeddings for content pieces using the specified model.",
"description": "Generate embeddings for content pieces using the specified model.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/EmbeddingsRequest"
}
}
},
"required": true
}
}
},
"/v1alpha/eval/benchmarks/{benchmark_id}/evaluations": {
"post": {
"responses": {
@ -6965,7 +6921,7 @@
}
}
},
"/v1/inference/rerank": {
"/v1alpha/inference/rerank": {
"post": {
"responses": {
"200": {
@ -12081,80 +12037,6 @@
"title": "OpenAIDeleteResponseObject",
"description": "Response object confirming deletion of an OpenAI response."
},
"EmbeddingsRequest": {
"type": "object",
"properties": {
"model_id": {
"type": "string",
"description": "The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint."
},
"contents": {
"oneOf": [
{
"type": "array",
"items": {
"type": "string"
}
},
{
"type": "array",
"items": {
"$ref": "#/components/schemas/InterleavedContentItem"
}
}
],
"description": "List of contents to generate embeddings for. Each content can be a string or an InterleavedContentItem (and hence can be multimodal). The behavior depends on the model and provider. Some models may only support text."
},
"text_truncation": {
"type": "string",
"enum": [
"none",
"start",
"end"
],
"description": "(Optional) Config for how to truncate text for embedding when text is longer than the model's max sequence length."
},
"output_dimension": {
"type": "integer",
"description": "(Optional) Output dimensionality for the embeddings. Only supported by Matryoshka models."
},
"task_type": {
"type": "string",
"enum": [
"query",
"document"
],
"description": "(Optional) How is the embedding being used? This is only supported by asymmetric embedding models."
}
},
"additionalProperties": false,
"required": [
"model_id",
"contents"
],
"title": "EmbeddingsRequest"
},
"EmbeddingsResponse": {
"type": "object",
"properties": {
"embeddings": {
"type": "array",
"items": {
"type": "array",
"items": {
"type": "number"
}
},
"description": "List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}"
}
},
"additionalProperties": false,
"required": [
"embeddings"
],
"title": "EmbeddingsResponse",
"description": "Response containing generated embeddings."
},
"AgentCandidate": {
"type": "object",
"properties": {

View file

@ -861,41 +861,6 @@ paths:
required: true
schema:
type: string
/v1/inference/embeddings:
post:
responses:
'200':
description: >-
An array of embeddings, one for each content. Each embedding is a list
of floats. The dimensionality of the embedding is model-specific; you
can check model metadata using /models/{model_id}.
content:
application/json:
schema:
$ref: '#/components/schemas/EmbeddingsResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: >-
Generate embeddings for content pieces using the specified model.
description: >-
Generate embeddings for content pieces using the specified model.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/EmbeddingsRequest'
required: true
/v1alpha/eval/benchmarks/{benchmark_id}/evaluations:
post:
responses:
@ -5040,7 +5005,7 @@ paths:
schema:
$ref: '#/components/schemas/QueryTracesRequest'
required: true
-/v1/inference/rerank:
+/v1alpha/inference/rerank:
post:
responses:
'200':
@ -8937,72 +8902,6 @@ components:
title: OpenAIDeleteResponseObject
description: >-
Response object confirming deletion of an OpenAI response.
EmbeddingsRequest:
type: object
properties:
model_id:
type: string
description: >-
The identifier of the model to use. The model must be an embedding model
registered with Llama Stack and available via the /models endpoint.
contents:
oneOf:
- type: array
items:
type: string
- type: array
items:
$ref: '#/components/schemas/InterleavedContentItem'
description: >-
List of contents to generate embeddings for. Each content can be a string
or an InterleavedContentItem (and hence can be multimodal). The behavior
depends on the model and provider. Some models may only support text.
text_truncation:
type: string
enum:
- none
- start
- end
description: >-
(Optional) Config for how to truncate text for embedding when text is
longer than the model's max sequence length.
output_dimension:
type: integer
description: >-
(Optional) Output dimensionality for the embeddings. Only supported by
Matryoshka models.
task_type:
type: string
enum:
- query
- document
description: >-
(Optional) How is the embedding being used? This is only supported by
asymmetric embedding models.
additionalProperties: false
required:
- model_id
- contents
title: EmbeddingsRequest
EmbeddingsResponse:
type: object
properties:
embeddings:
type: array
items:
type: array
items:
type: number
description: >-
List of embedding vectors, one per input content. Each embedding is a
list of floats. The dimensionality of the embedding is model-specific;
you can check model metadata using /models/{model_id}
additionalProperties: false
required:
- embeddings
title: EmbeddingsResponse
description: >-
Response containing generated embeddings.
AgentCandidate:
type: object
properties:

View file

@ -17,11 +17,11 @@ from typing import (
from pydantic import BaseModel, Field, field_validator
from typing_extensions import TypedDict
-from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent, InterleavedContentItem
+from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent
from llama_stack.apis.common.responses import Order
from llama_stack.apis.models import Model
from llama_stack.apis.telemetry import MetricResponseMixin
-from llama_stack.apis.version import LLAMA_STACK_API_V1
+from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
from llama_stack.models.llama.datatypes import (
BuiltinTool,
StopReason,
@ -1070,27 +1070,7 @@ class InferenceProvider(Protocol):
"""
...
@webmethod(route="/inference/embeddings", method="POST", level=LLAMA_STACK_API_V1)
async def embeddings(
self,
model_id: str,
contents: list[str] | list[InterleavedContentItem],
text_truncation: TextTruncation | None = TextTruncation.none,
output_dimension: int | None = None,
task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse:
"""Generate embeddings for content pieces using the specified model.
:param model_id: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint.
:param contents: List of contents to generate embeddings for. Each content can be a string or an InterleavedContentItem (and hence can be multimodal). The behavior depends on the model and provider. Some models may only support text.
:param output_dimension: (Optional) Output dimensionality for the embeddings. Only supported by Matryoshka models.
:param text_truncation: (Optional) Config for how to truncate text for embedding when text is longer than the model's max sequence length.
:param task_type: (Optional) How is the embedding being used? This is only supported by asymmetric embedding models.
:returns: An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}.
"""
...
@webmethod(route="/inference/rerank", method="POST", experimental=True, level=LLAMA_STACK_API_V1)
@webmethod(route="/inference/rerank", method="POST", level=LLAMA_STACK_API_V1ALPHA)
async def rerank(
self,
model: str,
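Dropping `experimental=True` in favor of `level=LLAMA_STACK_API_V1ALPHA` moves the route from `/v1/inference/rerank` to `/v1alpha/inference/rerank`, matching the OpenAPI changes above. A hedged raw-HTTP sketch against the relocated route; the payload fields beyond `model` are assumptions, since the signature is truncated above:

```python
import requests

resp = requests.post(
    "http://localhost:8321/v1alpha/inference/rerank",  # assumed local server
    json={
        "model": "example-reranker",                  # placeholder model id
        "query": "best pizza in town",                # assumed field name
        "items": ["pizza place A", "pizza place B"],  # assumed field name
    },
)
print(resp.status_code, resp.text)
```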

View file

@ -433,6 +433,12 @@ class InferenceStoreConfig(BaseModel):
num_writers: int = Field(default=4, description="Number of concurrent background writers")
class ResponsesStoreConfig(BaseModel):
sql_store_config: SqlStoreConfig
max_write_queue_size: int = Field(default=10000, description="Max queued writes for responses store")
num_writers: int = Field(default=4, description="Number of concurrent background writers")
class StackRunConfig(BaseModel):
version: int = LLAMA_STACK_RUN_CONFIG_VERSION
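The new `ResponsesStoreConfig` mirrors `InferenceStoreConfig`: a bounded write queue plus a pool of background writers. A minimal construction sketch, assuming the import paths shown elsewhere in this commit:

```python
from llama_stack.core.datatypes import ResponsesStoreConfig
from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig  # assumed path

config = ResponsesStoreConfig(
    sql_store_config=SqliteSqlStoreConfig(db_path="/tmp/responses.db"),
    max_write_queue_size=10000,  # default shown above
    num_writers=4,               # default shown above
)
```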

View file

@ -29,6 +29,7 @@ from llama_stack.apis.telemetry import Telemetry
from llama_stack.apis.tools import ToolGroups, ToolRuntime
from llama_stack.apis.vector_dbs import VectorDBs
from llama_stack.apis.vector_io import VectorIO
from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
from llama_stack.core.client import get_client_impl
from llama_stack.core.datatypes import (
AccessRule,
@ -412,8 +413,14 @@ def check_protocol_compliance(obj: Any, protocol: Any) -> None:
mro = type(obj).__mro__
for name, value in inspect.getmembers(protocol):
-if inspect.isfunction(value) and hasattr(value, "__webmethod__"):
-if value.__webmethod__.experimental:
+if inspect.isfunction(value) and hasattr(value, "__webmethods__"):
+has_alpha_api = False
+for webmethod in value.__webmethods__:
+if webmethod.level == LLAMA_STACK_API_V1ALPHA:
+has_alpha_api = True
+break
+# if this API has multiple webmethods, and one of them is an alpha API, this API should be skipped when checking for missing or not callable routes
+if has_alpha_api:
continue
if not hasattr(obj, name):
missing_methods.append((name, "missing"))

View file

@ -16,7 +16,6 @@ from pydantic import Field, TypeAdapter
from llama_stack.apis.common.content_types import (
InterleavedContent,
InterleavedContentItem,
)
from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError
from llama_stack.apis.inference import (
@ -26,8 +25,6 @@ from llama_stack.apis.inference import (
CompletionMessage,
CompletionResponse,
CompletionResponseStreamChunk,
EmbeddingsResponse,
EmbeddingTaskType,
Inference,
ListOpenAIChatCompletionResponse,
LogProbConfig,
@ -48,7 +45,6 @@ from llama_stack.apis.inference import (
ResponseFormat,
SamplingParams,
StopReason,
TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@ -312,25 +308,6 @@ class InferenceRouter(Inference):
return response
async def embeddings(
self,
model_id: str,
contents: list[str] | list[InterleavedContentItem],
text_truncation: TextTruncation | None = TextTruncation.none,
output_dimension: int | None = None,
task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse:
logger.debug(f"InferenceRouter.embeddings: {model_id}")
await self._get_model(model_id, ModelType.embedding)
provider = await self.routing_table.get_provider_impl(model_id)
return await provider.embeddings(
model_id=model_id,
contents=contents,
text_truncation=text_truncation,
output_dimension=output_dimension,
task_type=task_type,
)
async def openai_completion(
self,
model: str,

View file

@ -924,7 +924,7 @@ async def get_raw_document_text(document: Document) -> str:
DeprecationWarning,
stacklevel=2,
)
-elif not (document.mime_type.startswith("text/") or document.mime_type == "application/yaml"):
+elif not (document.mime_type.startswith("text/") or document.mime_type in ("application/yaml", "application/json")):
raise ValueError(f"Unexpected document mime type: {document.mime_type}")
if isinstance(document.content, URL):
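The guard now admits `application/json` alongside `text/*` and `application/yaml`. A hedged sketch of a document that previously raised `ValueError` and now passes; both import paths are assumptions:

```python
import asyncio

from llama_stack.apis.agents import Document  # assumed import path

async def main() -> None:
    doc = Document(
        content='{"question": "What is 2 + 2?", "answer": "4"}',
        mime_type="application/json",  # accepted after this change
    )
    text = await get_raw_document_text(doc)  # assumed to be importable from the patched module
    print(text)

asyncio.run(main())
```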

View file

@ -12,7 +12,7 @@ from llama_stack.apis.agents import Agents, StepType
from llama_stack.apis.benchmarks import Benchmark
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
-from llama_stack.apis.inference import Inference, SystemMessage, UserMessage
+from llama_stack.apis.inference import Inference, OpenAISystemMessageParam, OpenAIUserMessageParam, UserMessage
from llama_stack.apis.scoring import Scoring
from llama_stack.providers.datatypes import BenchmarksProtocolPrivate
from llama_stack.providers.inline.agents.meta_reference.agent_instance import (
@ -159,31 +159,40 @@ class MetaReferenceEvalImpl(
) -> list[dict[str, Any]]:
candidate = benchmark_config.eval_candidate
assert candidate.sampling_params.max_tokens is not None, "SamplingParams.max_tokens must be provided"
sampling_params = {"max_tokens": candidate.sampling_params.max_tokens}
generations = []
for x in tqdm(input_rows):
if ColumnName.completion_input.value in x:
if candidate.sampling_params.stop:
sampling_params["stop"] = candidate.sampling_params.stop
input_content = json.loads(x[ColumnName.completion_input.value])
response = await self.inference_api.completion(
response = await self.inference_api.openai_completion(
model=candidate.model,
content=input_content,
sampling_params=candidate.sampling_params,
prompt=input_content,
**sampling_params,
)
generations.append({ColumnName.generated_answer.value: response.completion_message.content})
generations.append({ColumnName.generated_answer.value: response.choices[0].text})
elif ColumnName.chat_completion_input.value in x:
chat_completion_input_json = json.loads(x[ColumnName.chat_completion_input.value])
input_messages = [UserMessage(**x) for x in chat_completion_input_json if x["role"] == "user"]
input_messages = [
OpenAIUserMessageParam(**x) for x in chat_completion_input_json if x["role"] == "user"
]
messages = []
if candidate.system_message:
messages.append(candidate.system_message)
messages += [SystemMessage(**x) for x in chat_completion_input_json if x["role"] == "system"]
messages += [OpenAISystemMessageParam(**x) for x in chat_completion_input_json if x["role"] == "system"]
messages += input_messages
response = await self.inference_api.chat_completion(
model_id=candidate.model,
response = await self.inference_api.openai_chat_completion(
model=candidate.model,
messages=messages,
sampling_params=candidate.sampling_params,
**sampling_params,
)
generations.append({ColumnName.generated_answer.value: response.completion_message.content})
generations.append({ColumnName.generated_answer.value: response.choices[0].message.content})
else:
raise ValueError("Invalid input row")

View file

@ -11,21 +11,17 @@ from botocore.client import BaseClient
from llama_stack.apis.common.content_types import (
InterleavedContent,
InterleavedContentItem,
)
from llama_stack.apis.inference import (
ChatCompletionRequest,
ChatCompletionResponse,
ChatCompletionResponseStreamChunk,
EmbeddingsResponse,
EmbeddingTaskType,
Inference,
LogProbConfig,
Message,
OpenAIEmbeddingsResponse,
ResponseFormat,
SamplingParams,
TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@ -47,8 +43,6 @@ from llama_stack.providers.utils.inference.openai_compat import (
)
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_prompt,
content_has_media,
interleaved_content_as_str,
)
from .models import MODEL_ENTRIES
@ -218,36 +212,6 @@ class BedrockInferenceAdapter(
),
}
async def embeddings(
self,
model_id: str,
contents: list[str] | list[InterleavedContentItem],
text_truncation: TextTruncation | None = TextTruncation.none,
output_dimension: int | None = None,
task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse:
model = await self.model_store.get_model(model_id)
# Convert foundation model ID to inference profile ID
region_name = self.client.meta.region_name
inference_profile_id = _to_inference_profile_id(model.provider_resource_id, region_name)
embeddings = []
for content in contents:
assert not content_has_media(content), "Bedrock does not support media for embeddings"
input_text = interleaved_content_as_str(content)
input_body = {"inputText": input_text}
body = json.dumps(input_body)
response = self.client.invoke_model(
body=body,
modelId=inference_profile_id,
accept="application/json",
contentType="application/json",
)
response_body = json.loads(response.get("body").read())
embeddings.append(response_body.get("embedding"))
return EmbeddingsResponse(embeddings=embeddings)
async def openai_embeddings(
self,
model: str,

View file

@ -11,21 +11,17 @@ from cerebras.cloud.sdk import AsyncCerebras
from llama_stack.apis.common.content_types import (
InterleavedContent,
InterleavedContentItem,
)
from llama_stack.apis.inference import (
ChatCompletionRequest,
CompletionRequest,
CompletionResponse,
EmbeddingsResponse,
EmbeddingTaskType,
Inference,
LogProbConfig,
Message,
OpenAIEmbeddingsResponse,
ResponseFormat,
SamplingParams,
TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@ -187,16 +183,6 @@ class CerebrasInferenceAdapter(
**get_sampling_options(request.sampling_params),
}
async def embeddings(
self,
model_id: str,
contents: list[str] | list[InterleavedContentItem],
text_truncation: TextTruncation | None = TextTruncation.none,
output_dimension: int | None = None,
task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse:
raise NotImplementedError()
async def openai_embeddings(
self,
model: str,

View file

@ -11,15 +11,12 @@ from databricks.sdk import WorkspaceClient
from llama_stack.apis.common.content_types import (
InterleavedContent,
InterleavedContentItem,
)
from llama_stack.apis.inference import (
ChatCompletionResponse,
ChatCompletionResponseStreamChunk,
CompletionResponse,
CompletionResponseStreamChunk,
EmbeddingsResponse,
EmbeddingTaskType,
Inference,
LogProbConfig,
Message,
@ -27,7 +24,6 @@ from llama_stack.apis.inference import (
OpenAICompletion,
ResponseFormat,
SamplingParams,
TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@ -118,16 +114,6 @@ class DatabricksInferenceAdapter(
) -> ChatCompletionResponse | AsyncIterator[ChatCompletionResponseStreamChunk]:
raise NotImplementedError()
async def embeddings(
self,
model_id: str,
contents: list[str] | list[InterleavedContentItem],
text_truncation: TextTruncation | None = TextTruncation.none,
output_dimension: int | None = None,
task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse:
raise NotImplementedError()
async def list_models(self) -> list[Model] | None:
self._model_cache = {} # from OpenAIMixin
ws_client = WorkspaceClient(host=self.config.url, token=self.get_api_key()) # TODO: this is not async

View file

@ -10,22 +10,18 @@ from fireworks.client import Fireworks
from llama_stack.apis.common.content_types import (
InterleavedContent,
InterleavedContentItem,
)
from llama_stack.apis.inference import (
ChatCompletionRequest,
ChatCompletionResponse,
CompletionRequest,
CompletionResponse,
EmbeddingsResponse,
EmbeddingTaskType,
Inference,
LogProbConfig,
Message,
ResponseFormat,
ResponseFormatType,
SamplingParams,
TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@ -48,8 +44,6 @@ from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_prompt,
completion_request_to_prompt,
content_has_media,
interleaved_content_as_str,
request_has_media,
)
@ -259,28 +253,3 @@ class FireworksInferenceAdapter(OpenAIMixin, ModelRegistryHelper, Inference, Nee
logger.debug(f"params to fireworks: {params}")
return params
async def embeddings(
self,
model_id: str,
contents: list[str] | list[InterleavedContentItem],
text_truncation: TextTruncation | None = TextTruncation.none,
output_dimension: int | None = None,
task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse:
model = await self.model_store.get_model(model_id)
kwargs = {}
if model.metadata.get("embedding_dimension"):
kwargs["dimensions"] = model.metadata.get("embedding_dimension")
assert all(not content_has_media(content) for content in contents), (
"Fireworks does not support media for embeddings"
)
response = self._get_client().embeddings.create(
model=model.provider_resource_id,
input=[interleaved_content_as_str(content) for content in contents],
**kwargs,
)
embeddings = [data.embedding for data in response.data]
return EmbeddingsResponse(embeddings=embeddings)

View file

@ -39,25 +39,6 @@ client = LlamaStackAsLibraryClient("nvidia")
client.initialize()
```
-### Create Completion
-The following example shows how to create a completion for an NVIDIA NIM.
-> [!NOTE]
-> The hosted NVIDIA Llama NIMs (for example ```meta-llama/Llama-3.1-8B-Instruct```) that have ```NVIDIA_BASE_URL="https://integrate.api.nvidia.com"``` do not support the ```completion``` method, while locally deployed NIMs do.
-```python
-response = client.inference.completion(
-model_id="meta-llama/Llama-3.1-8B-Instruct",
-content="Complete the sentence using one word: Roses are red, violets are :",
-stream=False,
-sampling_params={
-"max_tokens": 50,
-},
-)
-print(f"Response: {response.content}")
-```
### Create Chat Completion
The following example shows how to create a chat completion for an NVIDIA NIM.
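The chat-completion example promised by this sentence is not visible in the diff; a hedged sketch of the OpenAI-compatible call it would correspond to, reusing the `client` initialized above (the model id and prompt are illustrative):

```python
response = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "Write a limerick about GPUs."}],
    max_tokens=50,
    stream=False,
)
print(f"Response: {response.choices[0].message.content}")
```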

View file

@ -11,8 +11,6 @@ from openai import NOT_GIVEN, APIConnectionError
from llama_stack.apis.common.content_types import (
InterleavedContent,
InterleavedContentItem,
TextContentItem,
)
from llama_stack.apis.inference import (
ChatCompletionRequest,
@ -21,8 +19,6 @@ from llama_stack.apis.inference import (
CompletionRequest,
CompletionResponse,
CompletionResponseStreamChunk,
EmbeddingsResponse,
EmbeddingTaskType,
Inference,
LogProbConfig,
Message,
@ -31,7 +27,6 @@ from llama_stack.apis.inference import (
OpenAIEmbeddingUsage,
ResponseFormat,
SamplingParams,
TextTruncation,
ToolChoice,
ToolConfig,
)
@ -156,60 +151,6 @@ class NVIDIAInferenceAdapter(OpenAIMixin, Inference):
# we pass n=1 to get only one completion
return convert_openai_completion_choice(response.choices[0])
async def embeddings(
self,
model_id: str,
contents: list[str] | list[InterleavedContentItem],
text_truncation: TextTruncation | None = TextTruncation.none,
output_dimension: int | None = None,
task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse:
if any(content_has_media(content) for content in contents):
raise NotImplementedError("Media is not supported")
#
# Llama Stack: contents = list[str] | list[InterleavedContentItem]
# ->
# OpenAI: input = str | list[str]
#
# we can ignore str and always pass list[str] to OpenAI
#
flat_contents = [content.text if isinstance(content, TextContentItem) else content for content in contents]
input = [content.text if isinstance(content, TextContentItem) else content for content in flat_contents]
provider_model_id = await self._get_provider_model_id(model_id)
extra_body = {}
if text_truncation is not None:
text_truncation_options = {
TextTruncation.none: "NONE",
TextTruncation.end: "END",
TextTruncation.start: "START",
}
extra_body["truncate"] = text_truncation_options[text_truncation]
if output_dimension is not None:
extra_body["dimensions"] = output_dimension
if task_type is not None:
task_type_options = {
EmbeddingTaskType.document: "passage",
EmbeddingTaskType.query: "query",
}
extra_body["input_type"] = task_type_options[task_type]
response = await self.client.embeddings.create(
model=provider_model_id,
input=input,
extra_body=extra_body,
)
#
# OpenAI: CreateEmbeddingResponse(data=[Embedding(embedding=list[float], ...)], ...)
# ->
# Llama Stack: EmbeddingsResponse(embeddings=list[list[float]])
#
return EmbeddingsResponse(embeddings=[embedding.embedding for embedding in response.data])
async def openai_embeddings(
self,
model: str,

View file

@ -14,7 +14,6 @@ from ollama import AsyncClient as AsyncOllamaClient
from llama_stack.apis.common.content_types import (
ImageContentItem,
InterleavedContent,
InterleavedContentItem,
TextContentItem,
)
from llama_stack.apis.common.errors import UnsupportedModelError
@ -25,8 +24,6 @@ from llama_stack.apis.inference import (
CompletionRequest,
CompletionResponse,
CompletionResponseStreamChunk,
EmbeddingsResponse,
EmbeddingTaskType,
GrammarResponseFormat,
InferenceProvider,
JsonSchemaResponseFormat,
@ -34,7 +31,6 @@ from llama_stack.apis.inference import (
Message,
ResponseFormat,
SamplingParams,
TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@ -66,9 +62,7 @@ from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_prompt,
completion_request_to_prompt,
content_has_media,
convert_image_content_to_url,
interleaved_content_as_str,
request_has_media,
)
@ -363,27 +357,6 @@ class OllamaInferenceAdapter(
async for chunk in process_chat_completion_stream_response(stream, request):
yield chunk
async def embeddings(
self,
model_id: str,
contents: list[str] | list[InterleavedContentItem],
text_truncation: TextTruncation | None = TextTruncation.none,
output_dimension: int | None = None,
task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse:
model = await self._get_model(model_id)
assert all(not content_has_media(content) for content in contents), (
"Ollama does not support media for embeddings"
)
response = await self.ollama_client.embed(
model=model.provider_resource_id,
input=[interleaved_content_as_str(content) for content in contents],
)
embeddings = response["embeddings"]
return EmbeddingsResponse(embeddings=embeddings)
async def register_model(self, model: Model) -> Model:
if await self.check_model_availability(model.provider_model_id):
return model

View file

@ -14,8 +14,6 @@ from llama_stack.apis.inference import (
ChatCompletionResponse,
ChatCompletionResponseStreamChunk,
CompletionMessage,
EmbeddingsResponse,
EmbeddingTaskType,
Inference,
LogProbConfig,
Message,
@ -27,7 +25,6 @@ from llama_stack.apis.inference import (
OpenAIResponseFormatParam,
ResponseFormat,
SamplingParams,
TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@ -190,25 +187,6 @@ class PassthroughInferenceAdapter(Inference):
chunk = convert_to_pydantic(ChatCompletionResponseStreamChunk, chunk)
yield chunk
async def embeddings(
self,
model_id: str,
contents: list[InterleavedContent],
text_truncation: TextTruncation | None = TextTruncation.none,
output_dimension: int | None = None,
task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse:
client = self._get_client()
model = await self.model_store.get_model(model_id)
return await client.inference.embeddings(
model_id=model.provider_resource_id,
contents=contents,
text_truncation=text_truncation,
output_dimension=output_dimension,
task_type=task_type,
)
async def openai_embeddings(
self,
model: str,

View file

@ -136,16 +136,6 @@ class RunpodInferenceAdapter(
**get_sampling_options(request.sampling_params),
}
async def embeddings(
self,
model: str,
contents: list[str] | list[InterleavedContentItem],
text_truncation: TextTruncation | None = TextTruncation.none,
output_dimension: int | None = None,
task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse:
raise NotImplementedError()
async def openai_embeddings(
self,
model: str,

View file

@ -12,14 +12,11 @@ from pydantic import SecretStr
from llama_stack.apis.common.content_types import (
InterleavedContent,
InterleavedContentItem,
)
from llama_stack.apis.inference import (
ChatCompletionRequest,
ChatCompletionResponse,
CompletionRequest,
EmbeddingsResponse,
EmbeddingTaskType,
Inference,
LogProbConfig,
Message,
@ -27,7 +24,6 @@ from llama_stack.apis.inference import (
ResponseFormat,
ResponseFormatType,
SamplingParams,
TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@ -306,16 +302,6 @@ class _HfAdapter(
**self._build_options(request.sampling_params, request.response_format),
)
async def embeddings(
self,
model_id: str,
contents: list[str] | list[InterleavedContentItem],
text_truncation: TextTruncation | None = TextTruncation.none,
output_dimension: int | None = None,
task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse:
raise NotImplementedError()
async def openai_embeddings(
self,
model: str,

View file

@ -12,14 +12,11 @@ from together.constants import BASE_URL
from llama_stack.apis.common.content_types import (
InterleavedContent,
InterleavedContentItem,
)
from llama_stack.apis.inference import (
ChatCompletionRequest,
ChatCompletionResponse,
CompletionRequest,
EmbeddingsResponse,
EmbeddingTaskType,
Inference,
LogProbConfig,
Message,
@ -27,7 +24,6 @@ from llama_stack.apis.inference import (
ResponseFormat,
ResponseFormatType,
SamplingParams,
TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@ -50,8 +46,6 @@ from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_prompt,
completion_request_to_prompt,
content_has_media,
interleaved_content_as_str,
request_has_media,
)
@ -247,26 +241,6 @@ class TogetherInferenceAdapter(OpenAIMixin, ModelRegistryHelper, Inference, Need
logger.debug(f"params to together: {params}")
return params
async def embeddings(
self,
model_id: str,
contents: list[str] | list[InterleavedContentItem],
text_truncation: TextTruncation | None = TextTruncation.none,
output_dimension: int | None = None,
task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse:
model = await self.model_store.get_model(model_id)
assert all(not content_has_media(content) for content in contents), (
"Together does not support media for embeddings"
)
client = self._get_client()
r = await client.embeddings.create(
model=model.provider_resource_id,
input=[interleaved_content_as_str(content) for content in contents],
)
embeddings = [item.embedding for item in r.data]
return EmbeddingsResponse(embeddings=embeddings)
async def list_models(self) -> list[Model] | None:
self._model_cache = {}
# Together's /v1/models is not compatible with OpenAI's /v1/models. Together support ticket #13355 -> will not fix, use Together's own client

View file

@ -16,7 +16,6 @@ from openai.types.chat.chat_completion_chunk import (
from llama_stack.apis.common.content_types import (
InterleavedContent,
InterleavedContentItem,
TextDelta,
ToolCallDelta,
ToolCallParseStatus,
@ -31,8 +30,6 @@ from llama_stack.apis.inference import (
CompletionRequest,
CompletionResponse,
CompletionResponseStreamChunk,
EmbeddingsResponse,
EmbeddingTaskType,
GrammarResponseFormat,
Inference,
JsonSchemaResponseFormat,
@ -41,7 +38,6 @@ from llama_stack.apis.inference import (
ModelStore,
ResponseFormat,
SamplingParams,
TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@ -74,8 +70,6 @@ from llama_stack.providers.utils.inference.openai_compat import (
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import (
completion_request_to_prompt,
content_has_media,
interleaved_content_as_str,
request_has_media,
)
@ -550,27 +544,3 @@ class VLLMInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin, Inference, ModelsPro
"stream": request.stream,
**options,
}
async def embeddings(
self,
model_id: str,
contents: list[str] | list[InterleavedContentItem],
text_truncation: TextTruncation | None = TextTruncation.none,
output_dimension: int | None = None,
task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse:
model = await self._get_model(model_id)
kwargs = {}
assert model.model_type == ModelType.embedding
assert model.metadata.get("embedding_dimension")
kwargs["dimensions"] = model.metadata.get("embedding_dimension")
assert all(not content_has_media(content) for content in contents), "VLLM does not support media for embeddings"
response = await self.client.embeddings.create(
model=model.provider_resource_id,
input=[interleaved_content_as_str(content) for content in contents],
**kwargs,
)
embeddings = [data.embedding for data in response.data]
return EmbeddingsResponse(embeddings=embeddings)

View file

@ -11,13 +11,11 @@ from ibm_watsonx_ai.foundation_models import Model
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
from openai import AsyncOpenAI
from llama_stack.apis.common.content_types import InterleavedContent, InterleavedContentItem
from llama_stack.apis.common.content_types import InterleavedContent
from llama_stack.apis.inference import (
ChatCompletionRequest,
ChatCompletionResponse,
CompletionRequest,
EmbeddingsResponse,
EmbeddingTaskType,
GreedySamplingStrategy,
Inference,
LogProbConfig,
@ -30,7 +28,6 @@ from llama_stack.apis.inference import (
OpenAIResponseFormatParam,
ResponseFormat,
SamplingParams,
TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@ -265,16 +262,6 @@ class WatsonXInferenceAdapter(Inference, ModelRegistryHelper):
}
return params
async def embeddings(
self,
model_id: str,
contents: list[str] | list[InterleavedContentItem],
text_truncation: TextTruncation | None = TextTruncation.none,
output_dimension: int | None = None,
task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse:
raise NotImplementedError("embedding is not supported for watsonx")
async def openai_embeddings(
self,
model: str,

View file

@ -140,13 +140,11 @@ client.models.register(
#### 2. Inference with the fine-tuned model
```python
-response = client.inference.completion(
-content="Complete the sentence using one word: Roses are red, violets are ",
+response = client.completions.create(
+prompt="Complete the sentence using one word: Roses are red, violets are ",
stream=False,
-model_id="test-example-model@v1",
-sampling_params={
-"max_tokens": 50,
-},
+model="test-example-model@v1",
+max_tokens=50,
)
-print(response.content)
+print(response.choices[0].text)
```

View file

@ -15,16 +15,11 @@ if TYPE_CHECKING:
from sentence_transformers import SentenceTransformer
from llama_stack.apis.inference import (
EmbeddingsResponse,
EmbeddingTaskType,
InterleavedContentItem,
ModelStore,
OpenAIEmbeddingData,
OpenAIEmbeddingsResponse,
OpenAIEmbeddingUsage,
TextTruncation,
)
from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
EMBEDDING_MODELS = {}
@ -35,23 +30,6 @@ log = get_logger(name=__name__, category="providers::utils")
class SentenceTransformerEmbeddingMixin:
model_store: ModelStore
async def embeddings(
self,
model_id: str,
contents: list[str] | list[InterleavedContentItem],
text_truncation: TextTruncation | None = TextTruncation.none,
output_dimension: int | None = None,
task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse:
model = await self.model_store.get_model(model_id)
embedding_model = await self._load_sentence_transformer_model(model.provider_resource_id)
embeddings = await asyncio.to_thread(
embedding_model.encode,
[interleaved_content_as_str(content) for content in contents],
show_progress_bar=False,
)
return EmbeddingsResponse(embeddings=embeddings)
async def openai_embeddings(
self,
model: str,

View file

@ -11,14 +11,11 @@ import litellm
from llama_stack.apis.common.content_types import (
InterleavedContent,
InterleavedContentItem,
)
from llama_stack.apis.inference import (
ChatCompletionRequest,
ChatCompletionResponse,
ChatCompletionResponseStreamChunk,
EmbeddingsResponse,
EmbeddingTaskType,
InferenceProvider,
JsonSchemaResponseFormat,
LogProbConfig,
@ -32,7 +29,6 @@ from llama_stack.apis.inference import (
OpenAIResponseFormatParam,
ResponseFormat,
SamplingParams,
TextTruncation,
ToolChoice,
ToolConfig,
ToolDefinition,
@ -50,9 +46,6 @@ from llama_stack.providers.utils.inference.openai_compat import (
get_sampling_options,
prepare_openai_completion_params,
)
from llama_stack.providers.utils.inference.prompt_adapter import (
interleaved_content_as_str,
)
logger = get_logger(name=__name__, category="providers::utils")
@ -269,24 +262,6 @@ class LiteLLMOpenAIMixin(
)
return api_key
async def embeddings(
self,
model_id: str,
contents: list[str] | list[InterleavedContentItem],
text_truncation: TextTruncation | None = TextTruncation.none,
output_dimension: int | None = None,
task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse:
model = await self.model_store.get_model(model_id)
response = litellm.embedding(
model=self.get_litellm_model_name(model.provider_resource_id),
input=[interleaved_content_as_str(content) for content in contents],
)
embeddings = [data["embedding"] for data in response["data"]]
return EmbeddingsResponse(embeddings=embeddings)
async def openai_embeddings(
self,
model: str,

View file

@ -3,6 +3,9 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import asyncio
from typing import Any
from llama_stack.apis.agents import (
Order,
)
@ -14,24 +17,51 @@ from llama_stack.apis.agents.openai_responses import (
OpenAIResponseObject,
OpenAIResponseObjectWithInput,
)
-from llama_stack.core.datatypes import AccessRule
+from llama_stack.core.datatypes import AccessRule, ResponsesStoreConfig
from llama_stack.core.utils.config_dirs import RUNTIME_BASE_DIR
from llama_stack.log import get_logger
from ..sqlstore.api import ColumnDefinition, ColumnType
from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore
-from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, sqlstore_impl
+from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, SqlStoreType, sqlstore_impl
logger = get_logger(name=__name__, category="responses_store")
class ResponsesStore:
-def __init__(self, sql_store_config: SqlStoreConfig, policy: list[AccessRule]):
-if not sql_store_config:
-sql_store_config = SqliteSqlStoreConfig(
+def __init__(
+self,
+config: ResponsesStoreConfig | SqlStoreConfig,
+policy: list[AccessRule],
+):
+# Handle backward compatibility
+if not isinstance(config, ResponsesStoreConfig):
+# Legacy: SqlStoreConfig passed directly as config
+config = ResponsesStoreConfig(
+sql_store_config=config,
+)
+self.config = config
+self.sql_store_config = config.sql_store_config
+if not self.sql_store_config:
+self.sql_store_config = SqliteSqlStoreConfig(
db_path=(RUNTIME_BASE_DIR / "sqlstore.db").as_posix(),
)
-self.sql_store = AuthorizedSqlStore(sqlstore_impl(sql_store_config), policy)
+self.sql_store = None
+self.policy = policy
+# Disable write queue for SQLite to avoid concurrency issues
+self.enable_write_queue = self.sql_store_config.type != SqlStoreType.sqlite
+# Async write queue and worker control
+self._queue: asyncio.Queue[tuple[OpenAIResponseObject, list[OpenAIResponseInput]]] | None = None
+self._worker_tasks: list[asyncio.Task[Any]] = []
+self._max_write_queue_size: int = config.max_write_queue_size
+self._num_writers: int = max(1, config.num_writers)
async def initialize(self):
"""Create the necessary tables if they don't exist."""
+self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.sql_store_config), self.policy)
await self.sql_store.create_table(
"openai_responses",
{
@ -42,9 +72,68 @@ class ResponsesStore:
},
)
if self.enable_write_queue:
self._queue = asyncio.Queue(maxsize=self._max_write_queue_size)
for _ in range(self._num_writers):
self._worker_tasks.append(asyncio.create_task(self._worker_loop()))
else:
logger.info("Write queue disabled for SQLite to avoid concurrency issues")
async def shutdown(self) -> None:
if not self._worker_tasks:
return
if self._queue is not None:
await self._queue.join()
for t in self._worker_tasks:
if not t.done():
t.cancel()
for t in self._worker_tasks:
try:
await t
except asyncio.CancelledError:
pass
self._worker_tasks.clear()
async def flush(self) -> None:
"""Wait for all queued writes to complete. Useful for testing."""
if self.enable_write_queue and self._queue is not None:
await self._queue.join()
async def store_response_object(
self, response_object: OpenAIResponseObject, input: list[OpenAIResponseInput]
) -> None:
if self.enable_write_queue:
if self._queue is None:
raise ValueError("Responses store is not initialized")
try:
self._queue.put_nowait((response_object, input))
except asyncio.QueueFull:
logger.warning(f"Write queue full; adding response id={getattr(response_object, 'id', '<unknown>')}")
await self._queue.put((response_object, input))
else:
await self._write_response_object(response_object, input)
async def _worker_loop(self) -> None:
assert self._queue is not None
while True:
try:
item = await self._queue.get()
except asyncio.CancelledError:
break
response_object, input = item
try:
await self._write_response_object(response_object, input)
except Exception as e: # noqa: BLE001
logger.error(f"Error writing response object: {e}")
finally:
self._queue.task_done()
async def _write_response_object(
self, response_object: OpenAIResponseObject, input: list[OpenAIResponseInput]
) -> None:
if self.sql_store is None:
raise ValueError("Responses store is not initialized")
data = response_object.model_dump()
data["input"] = [input_item.model_dump() for input_item in input]
@ -73,6 +162,9 @@ class ResponsesStore:
:param model: The model to filter by.
:param order: The order to sort the responses by.
"""
if not self.sql_store:
raise ValueError("Responses store is not initialized")
if not order:
order = Order.desc
@ -100,6 +192,9 @@ class ResponsesStore:
"""
Get a response object with automatic access control checking.
"""
if not self.sql_store:
raise ValueError("Responses store is not initialized")
row = await self.sql_store.fetch_one(
"openai_responses",
where={"id": response_id},
@ -113,6 +208,9 @@ class ResponsesStore:
return OpenAIResponseObjectWithInput(**row["response_object"])
async def delete_response_object(self, response_id: str) -> OpenAIDeleteResponseObject:
if not self.sql_store:
raise ValueError("Responses store is not initialized")
row = await self.sql_store.fetch_one("openai_responses", where={"id": response_id})
if not row:
raise ValueError(f"Response with id {response_id} not found")

View file

@ -22,7 +22,6 @@ class WebMethod:
raw_bytes_request_body: bool | None = False
# A descriptive name of the corresponding span created by tracing
descriptive_name: str | None = None
experimental: bool | None = False
required_scope: str | None = None
deprecated: bool | None = False
@ -39,7 +38,6 @@ def webmethod(
response_examples: list[Any] | None = None,
raw_bytes_request_body: bool | None = False,
descriptive_name: str | None = None,
experimental: bool | None = False,
required_scope: str | None = None,
deprecated: bool | None = False,
) -> Callable[[T], T]:
@ -50,7 +48,6 @@ def webmethod(
:param public: True if the operation can be invoked without prior authentication.
:param request_examples: Sample requests that the operation might take. Pass a list of objects, not JSON.
:param response_examples: Sample responses that the operation might produce. Pass a list of objects, not JSON.
:param experimental: True if the operation is experimental and subject to change.
:param required_scope: Required scope for this endpoint (e.g., 'monitoring.viewer').
"""
@ -64,7 +61,6 @@ def webmethod(
response_examples=response_examples,
raw_bytes_request_body=raw_bytes_request_body,
descriptive_name=descriptive_name,
experimental=experimental,
required_scope=required_scope,
deprecated=deprecated,
)
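With the `experimental` flag removed, route maturity is expressed solely through `level`, as the rerank change earlier in this commit does. A minimal declaration sketch; the decorator's import path is an assumption:

```python
from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
from llama_stack.schema_utils import webmethod  # assumed import path

class ExampleAPI:
    @webmethod(route="/inference/rerank", method="POST", level=LLAMA_STACK_API_V1ALPHA)
    async def rerank(self, model: str) -> None:  # illustrative signature
        ...
```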

View file

@ -28,7 +28,7 @@
"react-markdown": "^10.1.0",
"remark-gfm": "^4.0.1",
"remeda": "^2.32.0",
"shiki": "^1.29.2",
"shiki": "^3.13.0",
"sonner": "^2.0.7",
"tailwind-merge": "^3.3.1"
},
@ -51,7 +51,7 @@
"prettier": "3.6.2",
"tailwindcss": "^4",
"ts-node": "^10.9.2",
"tw-animate-css": "^1.2.9",
"tw-animate-css": "^1.4.0",
"typescript": "^5"
}
},
@ -3250,65 +3250,63 @@
"license": "MIT"
},
"node_modules/@shikijs/core": {
"version": "1.29.2",
"resolved": "https://registry.npmjs.org/@shikijs/core/-/core-1.29.2.tgz",
"integrity": "sha512-vju0lY9r27jJfOY4Z7+Rt/nIOjzJpZ3y+nYpqtUZInVoXQ/TJZcfGnNOGnKjFdVZb8qexiCuSlZRKcGfhhTTZQ==",
"version": "3.13.0",
"resolved": "https://registry.npmjs.org/@shikijs/core/-/core-3.13.0.tgz",
"integrity": "sha512-3P8rGsg2Eh2qIHekwuQjzWhKI4jV97PhvYjYUzGqjvJfqdQPz+nMlfWahU24GZAyW1FxFI1sYjyhfh5CoLmIUA==",
"license": "MIT",
"dependencies": {
"@shikijs/engine-javascript": "1.29.2",
"@shikijs/engine-oniguruma": "1.29.2",
"@shikijs/types": "1.29.2",
"@shikijs/vscode-textmate": "^10.0.1",
"@shikijs/types": "3.13.0",
"@shikijs/vscode-textmate": "^10.0.2",
"@types/hast": "^3.0.4",
"hast-util-to-html": "^9.0.4"
"hast-util-to-html": "^9.0.5"
}
},
"node_modules/@shikijs/engine-javascript": {
"version": "1.29.2",
"resolved": "https://registry.npmjs.org/@shikijs/engine-javascript/-/engine-javascript-1.29.2.tgz",
"integrity": "sha512-iNEZv4IrLYPv64Q6k7EPpOCE/nuvGiKl7zxdq0WFuRPF5PAE9PRo2JGq/d8crLusM59BRemJ4eOqrFrC4wiQ+A==",
"version": "3.13.0",
"resolved": "https://registry.npmjs.org/@shikijs/engine-javascript/-/engine-javascript-3.13.0.tgz",
"integrity": "sha512-Ty7xv32XCp8u0eQt8rItpMs6rU9Ki6LJ1dQOW3V/56PKDcpvfHPnYFbsx5FFUP2Yim34m/UkazidamMNVR4vKg==",
"license": "MIT",
"dependencies": {
"@shikijs/types": "1.29.2",
"@shikijs/vscode-textmate": "^10.0.1",
"oniguruma-to-es": "^2.2.0"
"@shikijs/types": "3.13.0",
"@shikijs/vscode-textmate": "^10.0.2",
"oniguruma-to-es": "^4.3.3"
}
},
"node_modules/@shikijs/engine-oniguruma": {
"version": "1.29.2",
"resolved": "https://registry.npmjs.org/@shikijs/engine-oniguruma/-/engine-oniguruma-1.29.2.tgz",
"integrity": "sha512-7iiOx3SG8+g1MnlzZVDYiaeHe7Ez2Kf2HrJzdmGwkRisT7r4rak0e655AcM/tF9JG/kg5fMNYlLLKglbN7gBqA==",
"version": "3.13.0",
"resolved": "https://registry.npmjs.org/@shikijs/engine-oniguruma/-/engine-oniguruma-3.13.0.tgz",
"integrity": "sha512-O42rBGr4UDSlhT2ZFMxqM7QzIU+IcpoTMzb3W7AlziI1ZF7R8eS2M0yt5Ry35nnnTX/LTLXFPUjRFCIW+Operg==",
"license": "MIT",
"dependencies": {
"@shikijs/types": "1.29.2",
"@shikijs/vscode-textmate": "^10.0.1"
"@shikijs/types": "3.13.0",
"@shikijs/vscode-textmate": "^10.0.2"
}
},
"node_modules/@shikijs/langs": {
"version": "1.29.2",
"resolved": "https://registry.npmjs.org/@shikijs/langs/-/langs-1.29.2.tgz",
"integrity": "sha512-FIBA7N3LZ+223U7cJDUYd5shmciFQlYkFXlkKVaHsCPgfVLiO+e12FmQE6Tf9vuyEsFe3dIl8qGWKXgEHL9wmQ==",
"version": "3.13.0",
"resolved": "https://registry.npmjs.org/@shikijs/langs/-/langs-3.13.0.tgz",
"integrity": "sha512-672c3WAETDYHwrRP0yLy3W1QYB89Hbpj+pO4KhxK6FzIrDI2FoEXNiNCut6BQmEApYLfuYfpgOZaqbY+E9b8wQ==",
"license": "MIT",
"dependencies": {
"@shikijs/types": "1.29.2"
"@shikijs/types": "3.13.0"
}
},
"node_modules/@shikijs/themes": {
"version": "1.29.2",
"resolved": "https://registry.npmjs.org/@shikijs/themes/-/themes-1.29.2.tgz",
"integrity": "sha512-i9TNZlsq4uoyqSbluIcZkmPL9Bfi3djVxRnofUHwvx/h6SRW3cwgBC5SML7vsDcWyukY0eCzVN980rqP6qNl9g==",
"version": "3.13.0",
"resolved": "https://registry.npmjs.org/@shikijs/themes/-/themes-3.13.0.tgz",
"integrity": "sha512-Vxw1Nm1/Od8jyA7QuAenaV78BG2nSr3/gCGdBkLpfLscddCkzkL36Q5b67SrLLfvAJTOUzW39x4FHVCFriPVgg==",
"license": "MIT",
"dependencies": {
"@shikijs/types": "1.29.2"
"@shikijs/types": "3.13.0"
}
},
"node_modules/@shikijs/types": {
"version": "1.29.2",
"resolved": "https://registry.npmjs.org/@shikijs/types/-/types-1.29.2.tgz",
"integrity": "sha512-VJjK0eIijTZf0QSTODEXCqinjBn0joAHQ+aPSBzrv4O2d/QSbsMw+ZeSRx03kV34Hy7NzUvV/7NqfYGRLrASmw==",
"version": "3.13.0",
"resolved": "https://registry.npmjs.org/@shikijs/types/-/types-3.13.0.tgz",
"integrity": "sha512-oM9P+NCFri/mmQ8LoFGVfVyemm5Hi27330zuOBp0annwJdKH1kOLndw3zCtAVDehPLg9fKqoEx3Ht/wNZxolfw==",
"license": "MIT",
"dependencies": {
"@shikijs/vscode-textmate": "^10.0.1",
"@shikijs/vscode-textmate": "^10.0.2",
"@types/hast": "^3.0.4"
}
},
@ -6084,12 +6082,6 @@
"dev": true,
"license": "MIT"
},
"node_modules/emoji-regex-xs": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/emoji-regex-xs/-/emoji-regex-xs-1.0.0.tgz",
"integrity": "sha512-LRlerrMYoIDrT6jgpeZ2YYl/L8EulRTt5hQcYjy5AInh7HWXKimpqx68aknBFpGL2+/IcogTcaydJEgaTmOpDg==",
"license": "MIT"
},
"node_modules/encodeurl": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz",
@ -11813,15 +11805,21 @@
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/oniguruma-parser": {
"version": "0.12.1",
"resolved": "https://registry.npmjs.org/oniguruma-parser/-/oniguruma-parser-0.12.1.tgz",
"integrity": "sha512-8Unqkvk1RYc6yq2WBYRj4hdnsAxVze8i7iPfQr8e4uSP3tRv0rpZcbGUDvxfQQcdwHt/e9PrMvGCsa8OqG9X3w==",
"license": "MIT"
},
"node_modules/oniguruma-to-es": {
"version": "2.3.0",
"resolved": "https://registry.npmjs.org/oniguruma-to-es/-/oniguruma-to-es-2.3.0.tgz",
"integrity": "sha512-bwALDxriqfKGfUufKGGepCzu9x7nJQuoRoAFp4AnwehhC2crqrDIAP/uN2qdlsAvSMpeRC3+Yzhqc7hLmle5+g==",
"version": "4.3.3",
"resolved": "https://registry.npmjs.org/oniguruma-to-es/-/oniguruma-to-es-4.3.3.tgz",
"integrity": "sha512-rPiZhzC3wXwE59YQMRDodUwwT9FZ9nNBwQQfsd1wfdtlKEyCdRV0avrTcSZ5xlIvGRVPd/cx6ZN45ECmS39xvg==",
"license": "MIT",
"dependencies": {
"emoji-regex-xs": "^1.0.0",
"regex": "^5.1.1",
"regex-recursion": "^5.1.1"
"oniguruma-parser": "^0.12.1",
"regex": "^6.0.1",
"regex-recursion": "^6.0.2"
}
},
"node_modules/openid-client": {
@ -12613,21 +12611,20 @@
}
},
"node_modules/regex": {
"version": "5.1.1",
"resolved": "https://registry.npmjs.org/regex/-/regex-5.1.1.tgz",
"integrity": "sha512-dN5I359AVGPnwzJm2jN1k0W9LPZ+ePvoOeVMMfqIMFz53sSwXkxaJoxr50ptnsC771lK95BnTrVSZxq0b9yCGw==",
"version": "6.0.1",
"resolved": "https://registry.npmjs.org/regex/-/regex-6.0.1.tgz",
"integrity": "sha512-uorlqlzAKjKQZ5P+kTJr3eeJGSVroLKoHmquUj4zHWuR+hEyNqlXsSKlYYF5F4NI6nl7tWCs0apKJ0lmfsXAPA==",
"license": "MIT",
"dependencies": {
"regex-utilities": "^2.3.0"
}
},
"node_modules/regex-recursion": {
"version": "5.1.1",
"resolved": "https://registry.npmjs.org/regex-recursion/-/regex-recursion-5.1.1.tgz",
"integrity": "sha512-ae7SBCbzVNrIjgSbh7wMznPcQel1DNlDtzensnFxpiNpXt1U2ju/bHugH422r+4LAVS1FpW1YCwilmnNsjum9w==",
"version": "6.0.2",
"resolved": "https://registry.npmjs.org/regex-recursion/-/regex-recursion-6.0.2.tgz",
"integrity": "sha512-0YCaSCq2VRIebiaUviZNs0cBz1kg5kVS2UKUfNIx8YVs1cN3AV7NTctO5FOKBA+UT2BPJIWZauYHPqJODG50cg==",
"license": "MIT",
"dependencies": {
"regex": "^5.1.1",
"regex-utilities": "^2.3.0"
}
},
@@ -13165,18 +13162,18 @@
}
},
"node_modules/shiki": {
"version": "1.29.2",
"resolved": "https://registry.npmjs.org/shiki/-/shiki-1.29.2.tgz",
"integrity": "sha512-njXuliz/cP+67jU2hukkxCNuH1yUi4QfdZZY+sMr5PPrIyXSu5iTb/qYC4BiWWB0vZ+7TbdvYUCeL23zpwCfbg==",
"version": "3.13.0",
"resolved": "https://registry.npmjs.org/shiki/-/shiki-3.13.0.tgz",
"integrity": "sha512-aZW4l8Og16CokuCLf8CF8kq+KK2yOygapU5m3+hoGw0Mdosc6fPitjM+ujYarppj5ZIKGyPDPP1vqmQhr+5/0g==",
"license": "MIT",
"dependencies": {
"@shikijs/core": "1.29.2",
"@shikijs/engine-javascript": "1.29.2",
"@shikijs/engine-oniguruma": "1.29.2",
"@shikijs/langs": "1.29.2",
"@shikijs/themes": "1.29.2",
"@shikijs/types": "1.29.2",
"@shikijs/vscode-textmate": "^10.0.1",
"@shikijs/core": "3.13.0",
"@shikijs/engine-javascript": "3.13.0",
"@shikijs/engine-oniguruma": "3.13.0",
"@shikijs/langs": "3.13.0",
"@shikijs/themes": "3.13.0",
"@shikijs/types": "3.13.0",
"@shikijs/vscode-textmate": "^10.0.2",
"@types/hast": "^3.0.4"
}
},
@@ -13970,9 +13967,9 @@
"license": "0BSD"
},
"node_modules/tw-animate-css": {
"version": "1.2.9",
"resolved": "https://registry.npmjs.org/tw-animate-css/-/tw-animate-css-1.2.9.tgz",
"integrity": "sha512-9O4k1at9pMQff9EAcCEuy1UNO43JmaPQvq+0lwza9Y0BQ6LB38NiMj+qHqjoQf40355MX+gs6wtlR6H9WsSXFg==",
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/tw-animate-css/-/tw-animate-css-1.4.0.tgz",
"integrity": "sha512-7bziOlRqH0hJx80h/3mbicLW7o8qLsH5+RaLR2t+OHM3D0JlWGODQKQ4cxbK7WlvmUxpcj6Kgu6EKqjrGFe3QQ==",
"dev": true,
"license": "MIT",
"funding": {

View file

@@ -33,7 +33,7 @@
"react-markdown": "^10.1.0",
"remark-gfm": "^4.0.1",
"remeda": "^2.32.0",
"shiki": "^1.29.2",
"shiki": "^3.13.0",
"sonner": "^2.0.7",
"tailwind-merge": "^3.3.1"
},
@@ -56,7 +56,7 @@
"prettier": "3.6.2",
"tailwindcss": "^4",
"ts-node": "^10.9.2",
"tw-animate-css": "^1.2.9",
"tw-animate-css": "^1.4.0",
"typescript": "^5"
}
}

View file

@@ -178,10 +178,10 @@ Note that when re-recording tests, you must use a Stack pointing to a server (i.e.
### Basic Test Pattern
```python
def test_basic_completion(llama_stack_client, text_model_id):
response = llama_stack_client.inference.completion(
def test_basic_chat_completion(llama_stack_client, text_model_id):
response = llama_stack_client.inference.chat_completion(
model_id=text_model_id,
content=CompletionMessage(role="user", content="Hello"),
messages=[{"role": "user", "content": "Hello"}],
)
    # Test structure, not AI output quality
```
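For reference, a fleshed-out version of the rewritten pattern might look like the sketch below; the structural assertions on `completion_message` are an assumption about the client's response shape, not lines from the snippet above.

```python
def test_basic_chat_completion(llama_stack_client, text_model_id):
    response = llama_stack_client.inference.chat_completion(
        model_id=text_model_id,
        messages=[{"role": "user", "content": "Hello"}],
    )
    # Assert on structure only -- the generated text itself is nondeterministic.
    assert response.completion_message is not None
    assert response.completion_message.role == "assistant"
    assert isinstance(response.completion_message.content, str)
```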

View file

@@ -166,7 +166,7 @@ def model_providers(llama_stack_client):
@pytest.fixture(autouse=True)
def skip_if_no_model(request):
model_fixtures = ["text_model_id", "vision_model_id", "embedding_model_id", "judge_model_id"]
model_fixtures = ["text_model_id", "vision_model_id", "embedding_model_id", "judge_model_id", "shield_id"]
test_func = request.node.function
actual_params = inspect.signature(test_func).parameters.keys()
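The hunk truncates before the fixture's skip logic; a minimal sketch of how such an autouse guard typically finishes is below. The loop body and skip message are assumptions, not lines from this file.

```python
import inspect

import pytest


@pytest.fixture(autouse=True)
def skip_if_no_model(request):
    # Includes the newly added shield_id fixture.
    model_fixtures = ["text_model_id", "vision_model_id", "embedding_model_id", "judge_model_id", "shield_id"]
    actual_params = inspect.signature(request.node.function).parameters.keys()
    for fixture_name in model_fixtures:
        # Skip when the test requests a model fixture that resolved to nothing.
        if fixture_name in actual_params and not request.getfixturevalue(fixture_name):
            pytest.skip(f"{fixture_name} empty - skipping test")
```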

View file

@@ -14,6 +14,13 @@ from . import skip_in_github_actions
# LLAMA_STACK_CONFIG="nvidia" pytest -v tests/integration/providers/nvidia/test_datastore.py
@pytest.fixture(autouse=True)
def skip_if_no_nvidia_provider(llama_stack_client):
provider_types = {p.provider_type for p in llama_stack_client.providers.list() if p.api == "datasetio"}
if "remote::nvidia" not in provider_types:
pytest.skip("datasetio=remote::nvidia provider not configured, skipping")
# nvidia provider only
@skip_in_github_actions
@pytest.mark.parametrize(

View file

@@ -0,0 +1,56 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"messages": [
{
"role": "user",
"content": "Who is the CEO of Meta?"
}
],
"max_tokens": 0
},
"endpoint": "/v1/chat/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-708",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004.",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 1759012142,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 24,
"prompt_tokens": 32,
"total_tokens": 56,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}
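Each of these new recording files pairs a request envelope with a serialized response, where `__type__` names the pydantic class and `__data__` carries its payload. A minimal sketch of rehydrating one such file follows; the loader function and its name are illustrative, not part of this change.

```python
import json

from openai.types.chat.chat_completion import ChatCompletion


def load_recorded_chat_completion(path: str) -> ChatCompletion:
    # Rehydrate the pydantic model named by __type__ from its __data__ payload.
    with open(path) as f:
        recording = json.load(f)
    body = recording["response"]["body"]
    assert body["__type__"] == "openai.types.chat.chat_completion.ChatCompletion"
    return ChatCompletion.model_validate(body["__data__"])
```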

View file

@@ -0,0 +1,56 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"messages": [
{
"role": "user",
"content": "What is the currency of Japan?"
}
],
"max_tokens": 0
},
"endpoint": "/v1/chat/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-343",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "The currency of Japan is the Japanese yen (, ry\u014d) and its symbol, \u00a5.",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 1759012146,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 20,
"prompt_tokens": 32,
"total_tokens": 52,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

View file

@@ -0,0 +1,56 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"messages": [
{
"role": "user",
"content": "What is the smallest country in the world?"
}
],
"max_tokens": 0
},
"endpoint": "/v1/chat/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-842",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "The smallest country in the world is the Vatican City, an independent city-state located within Rome, Italy. It has a total area of approximately 0.44 km\u00b2 (0.17 sq mi) and a population of around 800 people.\n\nDespite its tiny size, the Vatican City is a sovereign state with its own government, currency, postal system, and even a small army (the Gendarmeria Romana). It's also home to numerous iconic landmarks, including St. Peter's Basilica, the Sistine Chapel, and the Vatican Museums.\n\nThe Vatican City is so small that it can fit entirely within an average American city park!",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 1759012145,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 133,
"prompt_tokens": 34,
"total_tokens": 167,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

View file

@@ -0,0 +1,56 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"messages": [
{
"role": "user",
"content": "What is the capital of France?"
}
],
"max_tokens": 0
},
"endpoint": "/v1/chat/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-808",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "The capital of France is Paris.",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 1759012142,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 8,
"prompt_tokens": 32,
"total_tokens": 40,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

View file

@@ -0,0 +1,56 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"messages": [
{
"role": "user",
"content": "What is the largest planet in our solar system?"
}
],
"max_tokens": 0
},
"endpoint": "/v1/chat/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-282",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "The largest planet in our solar system is Jupiter. It is a gas giant, with a diameter of approximately 142,984 kilometers (88,846 miles). This makes it more than 11 times the diameter of the Earth and more than 2.5 times the mass of all the other planets in our solar system combined.",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 1759012143,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 67,
"prompt_tokens": 35,
"total_tokens": 102,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

View file

@@ -107,14 +107,34 @@ async def test_get_raw_document_text_deprecated_text_yaml_with_text_content_item
assert "text/yaml" in str(w[0].message)
async def test_get_raw_document_text_supports_json_mime_type():
"""Test that the function accepts application/json mime type."""
json_content = '{"name": "test", "version": "1.0", "items": ["item1", "item2"]}'
document = Document(content=json_content, mime_type="application/json")
result = await get_raw_document_text(document)
assert result == json_content
async def test_get_raw_document_text_with_json_text_content_item():
"""Test that the function handles JSON TextContentItem correctly."""
json_content = '{"key": "value", "nested": {"array": [1, 2, 3]}}'
document = Document(content=TextContentItem(text=json_content), mime_type="application/json")
result = await get_raw_document_text(document)
assert result == json_content
async def test_get_raw_document_text_rejects_unsupported_mime_types():
"""Test that the function rejects unsupported mime types."""
document = Document(
content="Some content",
mime_type="application/json", # Not supported
mime_type="application/pdf", # Not supported
)
with pytest.raises(ValueError, match="Unexpected document mime type: application/json"):
with pytest.raises(ValueError, match="Unexpected document mime type: application/pdf"):
await get_raw_document_text(document)
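Read together, these tests pin down the mime-type gate: application/json is accepted whether the content is a raw string or a TextContentItem, while non-text types such as application/pdf raise. Below is a sketch of gating logic consistent with those assertions; the exact accepted set beyond what the tests exercise is an assumption.

```python
import warnings


async def get_raw_document_text(document) -> str:
    # Sketch only: accept text/* plus JSON/YAML, reject everything else.
    mime_type = document.mime_type or ""
    if mime_type == "text/yaml":
        warnings.warn("text/yaml is deprecated, use application/yaml instead", DeprecationWarning, stacklevel=2)
    elif not (mime_type.startswith("text/") or mime_type in ("application/json", "application/yaml")):
        raise ValueError(f"Unexpected document mime type: {document.mime_type}")
    content = document.content
    # Content may be a plain string or a TextContentItem-like object exposing .text.
    return content if isinstance(content, str) else content.text
```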

View file

@@ -42,10 +42,12 @@ from llama_stack.apis.inference import (
)
from llama_stack.apis.tools.tools import Tool, ToolGroups, ToolInvocationResult, ToolParameter, ToolRuntime
from llama_stack.core.access_control.access_control import default_policy
from llama_stack.core.datatypes import ResponsesStoreConfig
from llama_stack.providers.inline.agents.meta_reference.responses.openai_responses import (
OpenAIResponsesImpl,
)
from llama_stack.providers.utils.responses.responses_store import ResponsesStore
from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig
from tests.unit.providers.agents.meta_reference.fixtures import load_chat_completion_fixture
@@ -677,7 +679,9 @@ async def test_responses_store_list_input_items_logic():
# Create mock store and response store
mock_sql_store = AsyncMock()
responses_store = ResponsesStore(sql_store_config=None, policy=default_policy())
responses_store = ResponsesStore(
ResponsesStoreConfig(sql_store_config=SqliteSqlStoreConfig(db_path="mock_db_path")), policy=default_policy()
)
responses_store.sql_store = mock_sql_store
# Setup test data - multiple input items
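In isolation, the new construction pattern looks like the sketch below; only the db_path value is illustrative, the rest is taken from the imports and call in this diff.

```python
from llama_stack.core.access_control.access_control import default_policy
from llama_stack.core.datatypes import ResponsesStoreConfig
from llama_stack.providers.utils.responses.responses_store import ResponsesStore
from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig

# Stores are now built from a ResponsesStoreConfig wrapping the SQL store config,
# rather than passing sql_store_config directly.
store = ResponsesStore(
    ResponsesStoreConfig(sql_store_config=SqliteSqlStoreConfig(db_path="/tmp/responses.db")),
    policy=default_policy(),
)
```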

View file

@@ -5,13 +5,12 @@
# the root directory of this source tree.
import asyncio
from unittest.mock import AsyncMock, MagicMock, patch
from unittest.mock import MagicMock, patch
import numpy as np
import pytest
from llama_stack.apis.files import Files
from llama_stack.apis.inference import EmbeddingsResponse, Inference
from llama_stack.apis.vector_dbs import VectorDB
from llama_stack.apis.vector_io import Chunk, QueryChunksResponse
from llama_stack.providers.datatypes import HealthStatus
@@ -70,13 +69,6 @@ def mock_vector_db(vector_db_id, embedding_dimension) -> MagicMock:
return mock_vector_db
@pytest.fixture
def mock_inference_api(sample_embeddings):
mock_api = MagicMock(spec=Inference)
mock_api.embeddings = AsyncMock(return_value=EmbeddingsResponse(embeddings=sample_embeddings))
return mock_api
@pytest.fixture
def mock_files_api():
mock_api = MagicMock(spec=Files)
@@ -96,22 +88,6 @@ async def faiss_index(embedding_dimension):
yield index
@pytest.fixture
async def faiss_adapter(faiss_config, mock_inference_api, mock_files_api) -> FaissVectorIOAdapter:
# Create the adapter
adapter = FaissVectorIOAdapter(config=faiss_config, inference_api=mock_inference_api, files_api=mock_files_api)
# Create a mock KVStore
mock_kvstore = MagicMock()
mock_kvstore.values_in_range = AsyncMock(return_value=[])
# Patch the initialize method to avoid the kvstore_impl call
with patch.object(FaissVectorIOAdapter, "initialize"):
# Set the kvstore directly
adapter.kvstore = mock_kvstore
yield adapter
async def test_faiss_query_vector_returns_infinity_when_query_and_embedding_are_identical(
faiss_index, sample_chunks, sample_embeddings, embedding_dimension
):

View file

@@ -67,6 +67,9 @@ async def test_responses_store_pagination_basic():
input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
await store.store_response_object(response, input_list)
# Wait for all queued writes to complete
await store.flush()
# Test 1: First page with limit=2, descending order (default)
result = await store.list_responses(limit=2, order=Order.desc)
assert len(result.data) == 2
@@ -110,6 +113,9 @@ async def test_responses_store_pagination_ascending():
input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
await store.store_response_object(response, input_list)
# Wait for all queued writes to complete
await store.flush()
# Test ascending order pagination
result = await store.list_responses(limit=1, order=Order.asc)
assert len(result.data) == 1
@@ -145,6 +151,9 @@ async def test_responses_store_pagination_with_model_filter():
input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
await store.store_response_object(response, input_list)
# Wait for all queued writes to complete
await store.flush()
# Test pagination with model filter
result = await store.list_responses(limit=1, model="model-a", order=Order.desc)
assert len(result.data) == 1
@@ -192,6 +201,9 @@ async def test_responses_store_pagination_no_limit():
input_list = [create_test_response_input(f"Input for {response_id}", f"input-{response_id}")]
await store.store_response_object(response, input_list)
# Wait for all queued writes to complete
await store.flush()
# Test without limit (should use default of 50)
result = await store.list_responses(order=Order.desc)
assert len(result.data) == 2
@@ -212,6 +224,9 @@ async def test_responses_store_get_response_object():
input_list = [create_test_response_input("Test input content", "input-test-resp")]
await store.store_response_object(response, input_list)
# Wait for all queued writes to complete
await store.flush()
# Retrieve the response
retrieved = await store.get_response_object("test-resp")
assert retrieved.id == "test-resp"
@@ -242,6 +257,9 @@ async def test_responses_store_input_items_pagination():
]
await store.store_response_object(response, input_list)
# Wait for all queued writes to complete
await store.flush()
# Verify all items are stored correctly with explicit IDs
all_items = await store.list_response_input_items("test-resp", order=Order.desc)
assert len(all_items.data) == 5
@@ -319,6 +337,9 @@ async def test_responses_store_input_items_before_pagination():
]
await store.store_response_object(response, input_list)
# Wait for all queued writes to complete
await store.flush()
# Test before pagination with descending order
# In desc order: [Fifth, Fourth, Third, Second, First]
# before="before-3" should return [Fifth, Fourth]