Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-07-31 16:01:46 +00:00)

Commit 7a701d5020: Merge branch 'main' into providers

20 changed files with 3099 additions and 2594 deletions
.github/actions/setup-runner/action.yml (vendored, 2 changes)

@@ -13,7 +13,7 @@ runs:
     - name: Install dependencies
       shell: bash
       run: |
-        uv sync --all-extras
+        uv sync --all-groups
         uv pip install ollama faiss-cpu
         # always test against the latest version of the client
         # TODO: this is not necessarily a good idea. we need to test against both published and latest

.pre-commit-config.yaml

@@ -53,7 +53,7 @@ repos:
           - black==24.3.0

   - repo: https://github.com/astral-sh/uv-pre-commit
-    rev: 0.6.3
+    rev: 0.7.8
     hooks:
       - id: uv-lock
       - id: uv-export
@@ -61,6 +61,7 @@ repos:
             "--frozen",
             "--no-hashes",
             "--no-emit-project",
+            "--no-default-groups",
            "--output-file=requirements.txt"
          ]

@@ -88,8 +89,8 @@ repos:
       - id: distro-codegen
         name: Distribution Template Codegen
         additional_dependencies:
-          - uv==0.6.0
-        entry: uv run --extra codegen ./scripts/distro_codegen.py
+          - uv==0.7.8
+        entry: uv run --group codegen ./scripts/distro_codegen.py
         language: python
         pass_filenames: false
         require_serial: true
@@ -97,8 +98,8 @@ repos:
       - id: openapi-codegen
         name: API Spec Codegen
         additional_dependencies:
-          - uv==0.6.2
-        entry: sh -c 'uv run --with ".[dev]" ./docs/openapi_generator/run_openapi_generator.sh > /dev/null'
+          - uv==0.7.8
+        entry: sh -c 'uv run ./docs/openapi_generator/run_openapi_generator.sh > /dev/null'
         language: python
         pass_filenames: false
         require_serial: true
.readthedocs.yaml

@@ -5,28 +5,21 @@
 # Required
 version: 2

+# Build documentation in the "docs/" directory with Sphinx
+sphinx:
+  configuration: docs/source/conf.py
+
 # Set the OS, Python version and other tools you might need
 build:
   os: ubuntu-22.04
   tools:
     python: "3.12"
-    # You can also specify other tool versions:
-    # nodejs: "19"
-    # rust: "1.64"
-    # golang: "1.19"
-
-# Build documentation in the "docs/" directory with Sphinx
-sphinx:
-  configuration: docs/source/conf.py
-
-# Optionally build your docs in additional formats such as PDF and ePub
-# formats:
-#   - pdf
-#   - epub
-
-# Optional but recommended, declare the Python requirements required
-# to build your documentation
-# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
-python:
-  install:
-    - requirements: docs/requirements.txt
+  jobs:
+    pre_create_environment:
+      - asdf plugin add uv
+      - asdf install uv latest
+      - asdf global uv latest
+    create_environment:
+      - uv venv "${READTHEDOCS_VIRTUALENV_PATH}"
+    install:
+      - UV_PROJECT_ENVIRONMENT="${READTHEDOCS_VIRTUALENV_PATH}" uv sync --frozen --group docs
docs/_static/llama-stack-spec.html (vendored, 39 changes)

@@ -7540,6 +7540,9 @@
             {
               "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseCreated"
             },
+            {
+              "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseOutputTextDelta"
+            },
             {
               "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted"
             }
@@ -7548,6 +7551,7 @@
             "propertyName": "type",
             "mapping": {
               "response.created": "#/components/schemas/OpenAIResponseObjectStreamResponseCreated",
+              "response.output_text.delta": "#/components/schemas/OpenAIResponseObjectStreamResponseOutputTextDelta",
               "response.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted"
             }
           }
@@ -7590,6 +7594,41 @@
         ],
         "title": "OpenAIResponseObjectStreamResponseCreated"
       },
+      "OpenAIResponseObjectStreamResponseOutputTextDelta": {
+        "type": "object",
+        "properties": {
+          "content_index": {
+            "type": "integer"
+          },
+          "delta": {
+            "type": "string"
+          },
+          "item_id": {
+            "type": "string"
+          },
+          "output_index": {
+            "type": "integer"
+          },
+          "sequence_number": {
+            "type": "integer"
+          },
+          "type": {
+            "type": "string",
+            "const": "response.output_text.delta",
+            "default": "response.output_text.delta"
+          }
+        },
+        "additionalProperties": false,
+        "required": [
+          "content_index",
+          "delta",
+          "item_id",
+          "output_index",
+          "sequence_number",
+          "type"
+        ],
+        "title": "OpenAIResponseObjectStreamResponseOutputTextDelta"
+      },
       "CreateUploadSessionRequest": {
         "type": "object",
         "properties": {
docs/_static/llama-stack-spec.yaml (vendored, 29 changes)

@@ -5294,11 +5294,13 @@ components:
     OpenAIResponseObjectStream:
       oneOf:
         - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated'
+        - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseOutputTextDelta'
         - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
       discriminator:
         propertyName: type
         mapping:
           response.created: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated'
+          response.output_text.delta: '#/components/schemas/OpenAIResponseObjectStreamResponseOutputTextDelta'
           response.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
     "OpenAIResponseObjectStreamResponseCompleted":
       type: object
@@ -5330,6 +5332,33 @@ components:
         - type
       title: >-
         OpenAIResponseObjectStreamResponseCreated
+    "OpenAIResponseObjectStreamResponseOutputTextDelta":
+      type: object
+      properties:
+        content_index:
+          type: integer
+        delta:
+          type: string
+        item_id:
+          type: string
+        output_index:
+          type: integer
+        sequence_number:
+          type: integer
+        type:
+          type: string
+          const: response.output_text.delta
+          default: response.output_text.delta
+      additionalProperties: false
+      required:
+        - content_index
+        - delta
+        - item_id
+        - output_index
+        - sequence_number
+        - type
+      title: >-
+        OpenAIResponseObjectStreamResponseOutputTextDelta
     CreateUploadSessionRequest:
       type: object
       properties:
@@ -149,6 +149,16 @@ class OpenAIResponseObjectStreamResponseCreated(BaseModel):
     type: Literal["response.created"] = "response.created"


+@json_schema_type
+class OpenAIResponseObjectStreamResponseOutputTextDelta(BaseModel):
+    content_index: int
+    delta: str
+    item_id: str
+    output_index: int
+    sequence_number: int
+    type: Literal["response.output_text.delta"] = "response.output_text.delta"
+
+
 @json_schema_type
 class OpenAIResponseObjectStreamResponseCompleted(BaseModel):
     response: OpenAIResponseObject
@@ -156,7 +166,9 @@ class OpenAIResponseObjectStreamResponseCompleted(BaseModel):


 OpenAIResponseObjectStream = Annotated[
-    OpenAIResponseObjectStreamResponseCreated | OpenAIResponseObjectStreamResponseCompleted,
+    OpenAIResponseObjectStreamResponseCreated
+    | OpenAIResponseObjectStreamResponseOutputTextDelta
+    | OpenAIResponseObjectStreamResponseCompleted,
     Field(discriminator="type"),
 ]
 register_schema(OpenAIResponseObjectStream, name="OpenAIResponseObjectStream")
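A minimal sketch of how a consumer might fold this discriminated stream back into plain text, assuming the models above are importable from llama_stack.apis.agents.openai_responses (the handle_stream helper itself is illustrative, not part of the API):

from collections.abc import AsyncIterator

from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseObjectStream,
    OpenAIResponseObjectStreamResponseCompleted,
    OpenAIResponseObjectStreamResponseOutputTextDelta,
)


async def handle_stream(stream: AsyncIterator[OpenAIResponseObjectStream]) -> str:
    # response.created arrives first with an empty output, each delta carries an
    # incremental slice of text, and response.completed carries the final response object.
    pieces: list[str] = []
    async for event in stream:
        if isinstance(event, OpenAIResponseObjectStreamResponseOutputTextDelta):
            pieces.append(event.delta)
        elif isinstance(event, OpenAIResponseObjectStreamResponseCompleted):
            break
    return "".join(pieces)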
@@ -5,6 +5,7 @@
 # the root directory of this source tree.

 import json
+import time
 import uuid
 from collections.abc import AsyncIterator
 from typing import Any, cast
@@ -29,10 +30,12 @@ from llama_stack.apis.agents.openai_responses import (
     OpenAIResponseObjectStream,
     OpenAIResponseObjectStreamResponseCompleted,
     OpenAIResponseObjectStreamResponseCreated,
+    OpenAIResponseObjectStreamResponseOutputTextDelta,
     OpenAIResponseOutput,
     OpenAIResponseOutputMessageContent,
     OpenAIResponseOutputMessageContentOutputText,
     OpenAIResponseOutputMessageFunctionToolCall,
+    OpenAIResponseOutputMessageMCPListTools,
     OpenAIResponseOutputMessageWebSearchToolCall,
 )
 from llama_stack.apis.inference.inference import (
@@ -255,110 +258,14 @@ class OpenAIResponsesImpl:
         """
         return await self.responses_store.list_response_input_items(response_id, after, before, include, limit, order)

-    async def create_openai_response(
+    async def _process_response_choices(
         self,
-        input: str | list[OpenAIResponseInput],
-        model: str,
-        instructions: str | None = None,
-        previous_response_id: str | None = None,
-        store: bool | None = True,
-        stream: bool | None = False,
-        temperature: float | None = None,
-        tools: list[OpenAIResponseInputTool] | None = None,
-    ):
+        chat_response: OpenAIChatCompletion,
+        ctx: ChatCompletionContext,
+        tools: list[OpenAIResponseInputTool] | None,
+    ) -> list[OpenAIResponseOutput]:
+        """Handle tool execution and response message creation."""
         output_messages: list[OpenAIResponseOutput] = []
-
-        stream = False if stream is None else stream
-
-        # Huge TODO: we need to run this in a loop, until morale improves
-
-        # Create context to run "chat completion"
-        input = await self._prepend_previous_response(input, previous_response_id)
-        messages = await _convert_response_input_to_chat_messages(input)
-        await self._prepend_instructions(messages, instructions)
-        chat_tools, mcp_tool_to_server, mcp_list_message = (
-            await self._convert_response_tools_to_chat_tools(tools) if tools else (None, {}, None)
-        )
-        if mcp_list_message:
-            output_messages.append(mcp_list_message)
-
-        ctx = ChatCompletionContext(
-            model=model,
-            messages=messages,
-            tools=chat_tools,
-            mcp_tool_to_server=mcp_tool_to_server,
-            stream=stream,
-            temperature=temperature,
-        )
-
-        # Run inference
-        chat_response = await self.inference_api.openai_chat_completion(
-            model=model,
-            messages=messages,
-            tools=chat_tools,
-            stream=stream,
-            temperature=temperature,
-        )
-
-        # Collect output
-        if stream:
-            # TODO: refactor this into a separate method that handles streaming
-            chat_response_id = ""
-            chat_response_content = []
-            chat_response_tool_calls: dict[int, OpenAIChatCompletionToolCall] = {}
-            # TODO: these chunk_ fields are hacky and only take the last chunk into account
-            chunk_created = 0
-            chunk_model = ""
-            chunk_finish_reason = ""
-            async for chunk in chat_response:
-                chat_response_id = chunk.id
-                chunk_created = chunk.created
-                chunk_model = chunk.model
-                for chunk_choice in chunk.choices:
-                    # TODO: this only works for text content
-                    chat_response_content.append(chunk_choice.delta.content or "")
-                    if chunk_choice.finish_reason:
-                        chunk_finish_reason = chunk_choice.finish_reason
-
-                    # Aggregate tool call arguments across chunks, using their index as the aggregation key
-                    if chunk_choice.delta.tool_calls:
-                        for tool_call in chunk_choice.delta.tool_calls:
-                            response_tool_call = chat_response_tool_calls.get(tool_call.index, None)
-                            if response_tool_call:
-                                response_tool_call.function.arguments += tool_call.function.arguments
-                            else:
-                                tool_call_dict: dict[str, Any] = tool_call.model_dump()
-                                # Ensure we don't have any empty type field in the tool call dict.
-                                # The OpenAI client used by providers often returns a type=None here.
-                                tool_call_dict.pop("type", None)
-                                response_tool_call = OpenAIChatCompletionToolCall(**tool_call_dict)
-                            chat_response_tool_calls[tool_call.index] = response_tool_call
-
-            # Convert the dict of tool calls by index to a list of tool calls to pass back in our response
-            if chat_response_tool_calls:
-                tool_calls = [chat_response_tool_calls[i] for i in sorted(chat_response_tool_calls.keys())]
-            else:
-                tool_calls = None
-            assistant_message = OpenAIAssistantMessageParam(
-                content="".join(chat_response_content),
-                tool_calls=tool_calls,
-            )
-            chat_response = OpenAIChatCompletion(
-                id=chat_response_id,
-                choices=[
-                    OpenAIChoice(
-                        message=assistant_message,
-                        finish_reason=chunk_finish_reason,
-                        index=0,
-                    )
-                ],
-                created=chunk_created,
-                model=chunk_model,
-            )
-        else:
-            # dump and reload to map to our pydantic types
-            chat_response = OpenAIChatCompletion(**chat_response.model_dump())

         # Execute tool calls if any
         for choice in chat_response.choices:
             if choice.message.tool_calls and tools:
@@ -380,7 +287,128 @@ class OpenAIResponsesImpl:
             else:
                 output_messages.append(await _convert_chat_choice_to_response_message(choice))

-        # Create response object
+        return output_messages
+
+    async def _store_response(
+        self,
+        response: OpenAIResponseObject,
+        original_input: str | list[OpenAIResponseInput],
+    ) -> None:
+        new_input_id = f"msg_{uuid.uuid4()}"
+        if isinstance(original_input, str):
+            # synthesize a message from the input string
+            input_content = OpenAIResponseInputMessageContentText(text=original_input)
+            input_content_item = OpenAIResponseMessage(
+                role="user",
+                content=[input_content],
+                id=new_input_id,
+            )
+            input_items_data = [input_content_item]
+        else:
+            # we already have a list of messages
+            input_items_data = []
+            for input_item in original_input:
+                if isinstance(input_item, OpenAIResponseMessage):
+                    # These may or may not already have an id, so dump to dict, check for id, and add if missing
+                    input_item_dict = input_item.model_dump()
+                    if "id" not in input_item_dict:
+                        input_item_dict["id"] = new_input_id
+                    input_items_data.append(OpenAIResponseMessage(**input_item_dict))
+                else:
+                    input_items_data.append(input_item)
+
+        await self.responses_store.store_response_object(
+            response_object=response,
+            input=input_items_data,
+        )
+
+    async def create_openai_response(
+        self,
+        input: str | list[OpenAIResponseInput],
+        model: str,
+        instructions: str | None = None,
+        previous_response_id: str | None = None,
+        store: bool | None = True,
+        stream: bool | None = False,
+        temperature: float | None = None,
+        tools: list[OpenAIResponseInputTool] | None = None,
+    ):
+        stream = False if stream is None else stream
+        original_input = input  # Keep reference for storage
+
+        output_messages: list[OpenAIResponseOutput] = []
+
+        # Input preprocessing
+        input = await self._prepend_previous_response(input, previous_response_id)
+        messages = await _convert_response_input_to_chat_messages(input)
+        await self._prepend_instructions(messages, instructions)
+
+        # Tool setup
+        chat_tools, mcp_tool_to_server, mcp_list_message = (
+            await self._convert_response_tools_to_chat_tools(tools) if tools else (None, {}, None)
+        )
+        if mcp_list_message:
+            output_messages.append(mcp_list_message)
+
+        ctx = ChatCompletionContext(
+            model=model,
+            messages=messages,
+            tools=chat_tools,
+            mcp_tool_to_server=mcp_tool_to_server,
+            stream=stream,
+            temperature=temperature,
+        )
+
+        inference_result = await self.inference_api.openai_chat_completion(
+            model=model,
+            messages=messages,
+            tools=chat_tools,
+            stream=stream,
+            temperature=temperature,
+        )
+
+        if stream:
+            return self._create_streaming_response(
+                inference_result=inference_result,
+                ctx=ctx,
+                output_messages=output_messages,
+                original_input=original_input,
+                model=model,
+                store=store,
+                tools=tools,
+            )
+        else:
+            return await self._create_non_streaming_response(
+                inference_result=inference_result,
+                ctx=ctx,
+                output_messages=output_messages,
+                original_input=original_input,
+                model=model,
+                store=store,
+                tools=tools,
+            )
+
+    async def _create_non_streaming_response(
+        self,
+        inference_result: Any,
+        ctx: ChatCompletionContext,
+        output_messages: list[OpenAIResponseOutput],
+        original_input: str | list[OpenAIResponseInput],
+        model: str,
+        store: bool | None,
+        tools: list[OpenAIResponseInputTool] | None,
+    ) -> OpenAIResponseObject:
+        chat_response = OpenAIChatCompletion(**inference_result.model_dump())
+
+        # Process response choices (tool execution and message creation)
+        output_messages.extend(
+            await self._process_response_choices(
+                chat_response=chat_response,
+                ctx=ctx,
+                tools=tools,
+            )
+        )
+
         response = OpenAIResponseObject(
             created_at=chat_response.created,
             id=f"resp-{uuid.uuid4()}",
@@ -393,45 +421,135 @@ class OpenAIResponsesImpl:

         # Store response if requested
         if store:
-            new_input_id = f"msg_{uuid.uuid4()}"
-            if isinstance(input, str):
-                # synthesize a message from the input string
-                input_content = OpenAIResponseInputMessageContentText(text=input)
-                input_content_item = OpenAIResponseMessage(
-                    role="user",
-                    content=[input_content],
-                    id=new_input_id,
-                )
-                input_items_data = [input_content_item]
-            else:
-                # we already have a list of messages
-                input_items_data = []
-                for input_item in input:
-                    if isinstance(input_item, OpenAIResponseMessage):
-                        # These may or may not already have an id, so dump to dict, check for id, and add if missing
-                        input_item_dict = input_item.model_dump()
-                        if "id" not in input_item_dict:
-                            input_item_dict["id"] = new_input_id
-                        input_items_data.append(OpenAIResponseMessage(**input_item_dict))
-                    else:
-                        input_items_data.append(input_item)
-
-            await self.responses_store.store_response_object(
-                response_object=response,
-                input=input_items_data,
+            await self._store_response(
+                response=response,
+                original_input=original_input,
             )

-        if stream:
-
-            async def async_response() -> AsyncIterator[OpenAIResponseObjectStream]:
-                # TODO: response created should actually get emitted much earlier in the process
-                yield OpenAIResponseObjectStreamResponseCreated(response=response)
-                yield OpenAIResponseObjectStreamResponseCompleted(response=response)
-
-            return async_response()
-
         return response

+    async def _create_streaming_response(
+        self,
+        inference_result: Any,
+        ctx: ChatCompletionContext,
+        output_messages: list[OpenAIResponseOutput],
+        original_input: str | list[OpenAIResponseInput],
+        model: str,
+        store: bool | None,
+        tools: list[OpenAIResponseInputTool] | None,
+    ) -> AsyncIterator[OpenAIResponseObjectStream]:
+        # Create initial response and emit response.created immediately
+        response_id = f"resp-{uuid.uuid4()}"
+        created_at = int(time.time())
+
+        initial_response = OpenAIResponseObject(
+            created_at=created_at,
+            id=response_id,
+            model=model,
+            object="response",
+            status="in_progress",
+            output=output_messages.copy(),
+        )
+
+        # Emit response.created immediately
+        yield OpenAIResponseObjectStreamResponseCreated(response=initial_response)
+
+        # For streaming, inference_result is an async iterator of chunks
+        # Stream chunks and emit delta events as they arrive
+        chat_response_id = ""
+        chat_response_content = []
+        chat_response_tool_calls: dict[int, OpenAIChatCompletionToolCall] = {}
+        chunk_created = 0
+        chunk_model = ""
+        chunk_finish_reason = ""
+        sequence_number = 0
+
+        # Create a placeholder message item for delta events
+        message_item_id = f"msg_{uuid.uuid4()}"
+
+        async for chunk in inference_result:
+            chat_response_id = chunk.id
+            chunk_created = chunk.created
+            chunk_model = chunk.model
+            for chunk_choice in chunk.choices:
+                # Emit incremental text content as delta events
+                if chunk_choice.delta.content:
+                    sequence_number += 1
+                    yield OpenAIResponseObjectStreamResponseOutputTextDelta(
+                        content_index=0,
+                        delta=chunk_choice.delta.content,
+                        item_id=message_item_id,
+                        output_index=0,
+                        sequence_number=sequence_number,
+                    )
+
+                # Collect content for final response
+                chat_response_content.append(chunk_choice.delta.content or "")
+                if chunk_choice.finish_reason:
+                    chunk_finish_reason = chunk_choice.finish_reason
+
+                # Aggregate tool call arguments across chunks, using their index as the aggregation key
+                if chunk_choice.delta.tool_calls:
+                    for tool_call in chunk_choice.delta.tool_calls:
+                        response_tool_call = chat_response_tool_calls.get(tool_call.index, None)
+                        if response_tool_call:
+                            response_tool_call.function.arguments += tool_call.function.arguments
+                        else:
+                            tool_call_dict: dict[str, Any] = tool_call.model_dump()
+                            tool_call_dict.pop("type", None)
+                            response_tool_call = OpenAIChatCompletionToolCall(**tool_call_dict)
+                        chat_response_tool_calls[tool_call.index] = response_tool_call
+
+        # Convert collected chunks to complete response
+        if chat_response_tool_calls:
+            tool_calls = [chat_response_tool_calls[i] for i in sorted(chat_response_tool_calls.keys())]
+        else:
+            tool_calls = None
+        assistant_message = OpenAIAssistantMessageParam(
+            content="".join(chat_response_content),
+            tool_calls=tool_calls,
+        )
+        chat_response_obj = OpenAIChatCompletion(
+            id=chat_response_id,
+            choices=[
+                OpenAIChoice(
+                    message=assistant_message,
+                    finish_reason=chunk_finish_reason,
+                    index=0,
+                )
+            ],
+            created=chunk_created,
+            model=chunk_model,
+        )
+
+        # Process response choices (tool execution and message creation)
+        output_messages.extend(
+            await self._process_response_choices(
+                chat_response=chat_response_obj,
+                ctx=ctx,
+                tools=tools,
+            )
+        )
+
+        # Create final response
+        final_response = OpenAIResponseObject(
+            created_at=created_at,
+            id=response_id,
+            model=model,
+            object="response",
+            status="completed",
+            output=output_messages,
+        )
+
+        if store:
+            await self._store_response(
+                response=final_response,
+                original_input=original_input,
+            )
+
+        # Emit response.completed
+        yield OpenAIResponseObjectStreamResponseCompleted(response=final_response)
+
     async def _convert_response_tools_to_chat_tools(
         self, tools: list[OpenAIResponseInputTool]
     ) -> tuple[
@@ -441,7 +559,6 @@ class OpenAIResponsesImpl:
     ]:
         from llama_stack.apis.agents.openai_responses import (
             MCPListToolsTool,
-            OpenAIResponseOutputMessageMCPListTools,
         )
         from llama_stack.apis.tools.tools import Tool
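A sketch of what the new streaming path looks like from a caller's side, assuming an OpenAI-compatible client pointed at a Llama Stack server (the base_url, api_key, and model name below are placeholders; the event types mirror the verification tests later in this diff):

from openai import OpenAI

# Placeholder endpoint and model; any OpenAI-compatible Responses client pointed
# at a Llama Stack server should observe the same event sequence.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

stream = client.responses.create(model="my-model", input="Say hello", stream=True)
text = ""
for chunk in stream:
    if chunk.type == "response.created":
        # emitted immediately, before any model output
        assert chunk.response.status == "in_progress"
    elif chunk.type == "response.output_text.delta":
        # incremental output text
        text += chunk.delta
    elif chunk.type == "response.completed":
        assert chunk.response.status == "completed"
print(text)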
@@ -34,11 +34,16 @@ class VLLMInferenceAdapterConfig(BaseModel):
     @classmethod
     def validate_tls_verify(cls, v):
         if isinstance(v, str):
-            cert_path = Path(v)
+            # Check if it's a boolean string
+            if v.lower() in ("true", "false"):
+                return v.lower() == "true"
+            # Otherwise, treat it as a cert path
+            cert_path = Path(v).expanduser().resolve()
             if not cert_path.exists():
                 raise ValueError(f"TLS certificate file does not exist: {v}")
             if not cert_path.is_file():
                 raise ValueError(f"TLS certificate path is not a file: {v}")
+            return v
         return v

     @classmethod
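A small standalone sketch of the values the revised validator accepts, assuming tls_verify may arrive as a bool, a boolean-like string, or a certificate path (the parse_tls_verify name is illustrative; the real check lives in the Pydantic validator above):

from pathlib import Path


def parse_tls_verify(v: bool | str) -> bool | str:
    # Booleans pass through, "true"/"false" strings become booleans,
    # and any other string must point at an existing certificate file.
    if isinstance(v, str):
        if v.lower() in ("true", "false"):
            return v.lower() == "true"
        cert_path = Path(v).expanduser().resolve()
        if not cert_path.exists():
            raise ValueError(f"TLS certificate file does not exist: {v}")
        if not cert_path.is_file():
            raise ValueError(f"TLS certificate path is not a file: {v}")
        return v
    return v


# e.g. a tls_verify value taken from an env file arrives as the string "false"
assert parse_tls_verify("false") is False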
@@ -1402,9 +1402,8 @@ class OpenAIChatCompletionToLlamaStackMixin:
         outstanding_responses: list[Awaitable[AsyncIterator[ChatCompletionResponseStreamChunk]]],
     ):
         id = f"chatcmpl-{uuid.uuid4()}"
-        for outstanding_response in outstanding_responses:
+        for i, outstanding_response in enumerate(outstanding_responses):
             response = await outstanding_response
-            i = 0
             async for chunk in response:
                 event = chunk.event
                 finish_reason = _convert_stop_reason_to_openai_finish_reason(event.stop_reason)
@@ -1459,7 +1458,6 @@ class OpenAIChatCompletionToLlamaStackMixin:
                     model=model,
                     object="chat.completion.chunk",
                 )
-                i = i + 1

     async def _process_non_stream_response(
         self, model: str, outstanding_responses: list[Awaitable[ChatCompletionResponse]]
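The counter fix above swaps a manually maintained index for enumerate, so the index now identifies which outstanding response a chunk belongs to instead of being reset and bumped inside the chunk loop. A tiny self-contained illustration of the resulting numbering (toy strings stand in for the awaited responses):

outstanding_responses = ["first response", "second response", "third response"]

for i, response in enumerate(outstanding_responses):
    # i stays fixed for everything derived from this response:
    # 0 for the first, 1 for the second, 2 for the third.
    print(i, response)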
@@ -19,7 +19,7 @@
     "@radix-ui/react-tooltip": "^1.2.6",
     "class-variance-authority": "^0.7.1",
     "clsx": "^2.1.1",
-    "llama-stack-client": "github:stainless-sdks/llama-stack-node#ehhuang/dev",
+    "llama-stack-client": "0.2.8",
     "lucide-react": "^0.510.0",
     "next": "15.3.2",
     "next-themes": "^0.4.6",
pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "llama_stack"
-version = "0.2.7"
+version = "0.2.8"
 authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }]
 description = "Llama Stack"
 readme = "README.md"
@@ -26,7 +26,7 @@ dependencies = [
     "huggingface-hub",
     "jinja2>=3.1.6",
     "jsonschema",
-    "llama-stack-client>=0.2.7",
+    "llama-stack-client>=0.2.8",
     "openai>=1.66",
     "prompt-toolkit",
     "python-dotenv",
@@ -42,6 +42,14 @@ dependencies = [
 ]

 [project.optional-dependencies]
+ui = [
+    "streamlit",
+    "pandas",
+    "llama-stack-client>=0.2.8",
+    "streamlit-option-menu",
+]
+
+[dependency-groups]
 dev = [
     "pytest",
     "pytest-timeout",
@@ -112,12 +120,6 @@ docs = [
     "sphinxcontrib.openapi",
 ]
 codegen = ["rich", "pydantic", "jinja2>=3.1.6"]
-ui = [
-    "streamlit",
-    "pandas",
-    "llama-stack-client>=0.2.7",
-    "streamlit-option-menu",
-]

 [project.urls]
 Homepage = "https://github.com/meta-llama/llama-stack"
requirements.txt (119 changes)

@@ -1,60 +1,175 @@
 # This file was autogenerated by uv via the following command:
-#    uv export --frozen --no-hashes --no-emit-project --output-file=requirements.txt
+#    uv export --frozen --no-hashes --no-emit-project --no-default-groups --output-file=requirements.txt
 annotated-types==0.7.0
+    # via pydantic
 anyio==4.8.0
+    # via httpx, llama-stack-client, openai
 attrs==25.1.0
+    # via jsonschema, referencing
 certifi==2025.1.31
+    # via httpcore, httpx, requests
 charset-normalizer==3.4.1
+    # via requests
 click==8.1.8
+    # via llama-stack-client
 colorama==0.4.6 ; sys_platform == 'win32'
+    # via click, tqdm
 distro==1.9.0
+    # via llama-stack-client, openai
 ecdsa==0.19.1
+    # via python-jose
 exceptiongroup==1.2.2 ; python_full_version < '3.11'
+    # via anyio
 filelock==3.17.0
+    # via huggingface-hub
 fire==0.7.0
+    # via llama-stack
 fsspec==2024.12.0
+    # via huggingface-hub
 h11==0.16.0
+    # via httpcore, llama-stack
 httpcore==1.0.9
+    # via httpx
 httpx==0.28.1
+    # via llama-stack, llama-stack-client, openai
 huggingface-hub==0.29.0
+    # via llama-stack
 idna==3.10
+    # via anyio, httpx, requests
 jinja2==3.1.6
+    # via llama-stack
 jiter==0.8.2
+    # via openai
 jsonschema==4.23.0
+    # via llama-stack
 jsonschema-specifications==2024.10.1
+    # via jsonschema
-llama-stack-client==0.2.7
+llama-stack-client==0.2.8
+    # via llama-stack
 markdown-it-py==3.0.0
+    # via rich
 markupsafe==3.0.2
+    # via jinja2
 mdurl==0.1.2
+    # via markdown-it-py
 numpy==2.2.3
+    # via pandas
 openai==1.71.0
+    # via llama-stack
 packaging==24.2
+    # via huggingface-hub
 pandas==2.2.3
+    # via llama-stack-client
 pillow==11.1.0
+    # via llama-stack
 prompt-toolkit==3.0.50
+    # via llama-stack, llama-stack-client
 pyaml==25.1.0
+    # via llama-stack-client
 pyasn1==0.4.8
+    # via python-jose, rsa
 pydantic==2.10.6
+    # via llama-stack, llama-stack-client, openai
 pydantic-core==2.27.2
+    # via pydantic
 pygments==2.19.1
+    # via rich
 python-dateutil==2.9.0.post0
+    # via pandas
 python-dotenv==1.0.1
+    # via llama-stack
 python-jose==3.4.0
+    # via llama-stack
 pytz==2025.1
+    # via pandas
 pyyaml==6.0.2
+    # via huggingface-hub, pyaml
 referencing==0.36.2
+    # via jsonschema, jsonschema-specifications
 regex==2024.11.6
+    # via tiktoken
 requests==2.32.3
+    # via huggingface-hub, llama-stack, tiktoken
 rich==13.9.4
+    # via llama-stack, llama-stack-client
 rpds-py==0.22.3
+    # via jsonschema, referencing
 rsa==4.9
+    # via python-jose
 setuptools==80.8.0
+    # via llama-stack
 six==1.17.0
+    # via ecdsa, python-dateutil
 sniffio==1.3.1
+    # via anyio, llama-stack-client, openai
 termcolor==2.5.0
+    # via fire, llama-stack, llama-stack-client
 tiktoken==0.9.0
+    # via llama-stack
 tqdm==4.67.1
+    # via huggingface-hub, llama-stack-client, openai
 typing-extensions==4.12.2
+    # via anyio, huggingface-hub, llama-stack-client, openai, pydantic, pydantic-core, referencing, rich
 tzdata==2025.1
+    # via pandas
 urllib3==2.3.0
+    # via requests
 wcwidth==0.2.13
+    # via prompt-toolkit
@@ -10,10 +10,10 @@ PYTHON_VERSION=${PYTHON_VERSION:-3.10}

 command -v uv >/dev/null 2>&1 || { echo >&2 "uv is required but it's not installed. Exiting."; exit 1; }

-uv python find $PYTHON_VERSION
+uv python find "$PYTHON_VERSION"
 FOUND_PYTHON=$?
 if [ $FOUND_PYTHON -ne 0 ]; then
-    uv python install $PYTHON_VERSION
+    uv python install "$PYTHON_VERSION"
 fi

-uv run --python $PYTHON_VERSION --with-editable . --with-editable ".[dev]" --with-editable ".[unit]" pytest --asyncio-mode=auto -s -v tests/unit/ $@
+uv run --python "$PYTHON_VERSION" --with-editable . --group unit pytest --asyncio-mode=auto -s -v tests/unit/ $@
@@ -41,7 +41,6 @@ def openai_client(client_with_models):
         ],
     ],
 )
-@pytest.mark.skip(reason="Very flaky, sometimes there is a message not a function call, standard tool calling issues")
 def test_responses_store(openai_client, client_with_models, text_model_id, stream, tools):
     if isinstance(client_with_models, LlamaStackAsLibraryClient):
         pytest.skip("OpenAI responses are not supported when testing with library client yet.")
@@ -68,13 +67,15 @@ def test_responses_store(openai_client, client_with_models, text_model_id, stream, tools):
         for chunk in response:
             if response_id is None:
                 response_id = chunk.response.id
-            if not tools:
-                if chunk.type == "response.completed":
-                    response_id = chunk.response.id
-                    content = chunk.response.output[0].content[0].text
+            if chunk.type == "response.completed":
+                response_id = chunk.response.id
+                output_type = chunk.response.output[0].type
+                if output_type == "message":
+                    content = chunk.response.output[0].content[0].text
     else:
         response_id = response.id
-        if not tools:
+        output_type = response.output[0].type
+        if output_type == "message":
             content = response.output[0].content[0].text

     # list responses - use the underlying HTTP client for endpoints not in SDK
@@ -87,9 +88,8 @@ def test_responses_store(openai_client, client_with_models, text_model_id, stream, tools):
     retrieved_response = client.responses.retrieve(response_id)
     assert retrieved_response.id == response_id
     assert retrieved_response.model == text_model_id
-    if tools:
-        assert retrieved_response.output[0].type == "function_call"
-    else:
+    assert retrieved_response.output[0].type == output_type, retrieved_response
+    if output_type == "message":
         assert retrieved_response.output[0].content[0].text == content
@@ -224,6 +224,43 @@ def test_openai_chat_completion_streaming(...)
     assert expected.lower() in "".join(streamed_content)


+@pytest.mark.parametrize(
+    "test_case",
+    [
+        "inference:chat_completion:streaming_01",
+        "inference:chat_completion:streaming_02",
+    ],
+)
+def test_openai_chat_completion_streaming_with_n(compat_client, client_with_models, text_model_id, test_case):
+    skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
+
+    provider = provider_from_model(client_with_models, text_model_id)
+    if provider.provider_type == "remote::ollama":
+        pytest.skip(f"Model {text_model_id} hosted by {provider.provider_type} doesn't support n > 1.")
+
+    tc = TestCase(test_case)
+    question = tc["question"]
+    expected = tc["expected"]
+
+    response = compat_client.chat.completions.create(
+        model=text_model_id,
+        messages=[{"role": "user", "content": question}],
+        stream=True,
+        timeout=120,  # Increase timeout to 2 minutes for large conversation history
+        n=2,
+    )
+    streamed_content = {}
+    for chunk in response:
+        for choice in chunk.choices:
+            if choice.delta.content:
+                streamed_content[choice.index] = (
+                    streamed_content.get(choice.index, "") + choice.delta.content.lower().strip()
+                )
+    assert len(streamed_content) == 2
+    for i, content in streamed_content.items():
+        assert expected.lower() in content, f"Choice {i}: Expected {expected.lower()} in {content}"
+
+
 @pytest.mark.parametrize(
     "stream",
     [
@@ -231,7 +268,6 @@ def test_openai_chat_completion_streaming(...)
         False,
     ],
 )
-@pytest.mark.skip(reason="Very flaky, keeps failing on CI")
 def test_inference_store(openai_client, client_with_models, text_model_id, stream):
     skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
     client = openai_client
@@ -254,7 +290,8 @@ def test_inference_store(openai_client, client_with_models, text_model_id, stream):
         for chunk in response:
             if response_id is None:
                 response_id = chunk.id
-            content += chunk.choices[0].delta.content
+            if chunk.choices[0].delta.content:
+                content += chunk.choices[0].delta.content
     else:
         response_id = response.id
         content = response.choices[0].message.content
@@ -264,8 +301,8 @@ def test_inference_store(openai_client, client_with_models, text_model_id, stream):

     retrieved_response = client.chat.completions.retrieve(response_id)
     assert retrieved_response.id == response_id
-    assert retrieved_response.input_messages[0]["content"] == message
-    assert retrieved_response.choices[0].message.content == content
+    assert retrieved_response.input_messages[0]["content"] == message, retrieved_response
+    assert retrieved_response.choices[0].message.content == content, retrieved_response


 @pytest.mark.parametrize(
@@ -275,7 +312,6 @@ def test_inference_store(openai_client, client_with_models, text_model_id, stream):
         False,
     ],
 )
-@pytest.mark.skip(reason="Very flaky, tool calling really wacky on CI")
 def test_inference_store_tool_calls(openai_client, client_with_models, text_model_id, stream):
     skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
     client = openai_client
@@ -313,7 +349,9 @@ def test_inference_store_tool_calls(openai_client, client_with_models, text_model_id, stream):
         for chunk in response:
             if response_id is None:
                 response_id = chunk.id
-            content += chunk.choices[0].delta.content
+            if delta := chunk.choices[0].delta:
+                if delta.content:
+                    content += delta.content
     else:
         response_id = response.id
         content = response.choices[0].message.content
@@ -324,5 +362,11 @@ def test_inference_store_tool_calls(openai_client, client_with_models, text_model_id, stream):
     retrieved_response = client.chat.completions.retrieve(response_id)
     assert retrieved_response.id == response_id
     assert retrieved_response.input_messages[0]["content"] == message
-    assert retrieved_response.choices[0].message.tool_calls[0].function.name == "get_weather"
-    assert retrieved_response.choices[0].message.tool_calls[0].function.arguments == '{"city":"Tokyo"}'
+    tool_calls = retrieved_response.choices[0].message.tool_calls
+    # sometimes the model doesn't output tool calls, but we still want to test that the tool was called
+    if tool_calls:
+        assert len(tool_calls) == 1
+        assert tool_calls[0].function.name == "get_weather"
+        assert "tokyo" in tool_calls[0].function.arguments.lower()
+    else:
+        assert retrieved_response.choices[0].message.content == content
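The new n-greater-than-1 streaming test accumulates deltas per choice index; the same aggregation pattern in isolation, using small dataclasses as stand-ins for the chunks an OpenAI-compatible client would yield (the toy content values are made up):

from collections import defaultdict
from dataclasses import dataclass


@dataclass
class Delta:
    content: str | None


@dataclass
class Choice:
    index: int
    delta: Delta


@dataclass
class Chunk:
    choices: list[Choice]


# Toy chunks standing in for a chat-completions stream created with n=2:
# the server interleaves deltas for choice 0 and choice 1.
chunks = [
    Chunk([Choice(0, Delta("Hel")), Choice(1, Delta("Bon"))]),
    Chunk([Choice(0, Delta("lo")), Choice(1, Delta("jour"))]),
    Chunk([Choice(0, Delta(None)), Choice(1, Delta(None))]),
]

per_choice: dict[int, str] = defaultdict(str)
for chunk in chunks:
    for choice in chunk.choices:
        if choice.delta.content:
            per_choice[choice.index] += choice.delta.content

assert per_choice == {0: "Hello", 1: "Bonjour"}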
@@ -232,9 +232,17 @@ async def test_create_openai_response_with_tool_call_type_none(...)

     # Check that we got the content from our mocked tool execution result
     chunks = [chunk async for chunk in result]
-    assert len(chunks) > 0
-    assert chunks[0].response.output[0].type == "function_call"
-    assert chunks[0].response.output[0].name == "get_weather"
+    assert len(chunks) == 2  # Should have response.created and response.completed
+
+    # Check response.created event (should have empty output)
+    assert chunks[0].type == "response.created"
+    assert len(chunks[0].response.output) == 0
+
+    # Check response.completed event (should have the tool call)
+    assert chunks[1].type == "response.completed"
+    assert len(chunks[1].response.output) == 1
+    assert chunks[1].response.output[0].type == "function_call"
+    assert chunks[1].response.output[0].name == "get_weather"


 @pytest.mark.asyncio
@@ -10,17 +10,17 @@ from tests.verifications.openai_api.fixtures.fixtures import _load_all_verification_configs
 def pytest_generate_tests(metafunc):
     """Dynamically parametrize tests based on the selected provider and config."""
     if "model" in metafunc.fixturenames:
+        model = metafunc.config.getoption("model")
+        if model:
+            metafunc.parametrize("model", [model])
+            return
+
         provider = metafunc.config.getoption("provider")
         if not provider:
             print("Warning: --provider not specified. Skipping model parametrization.")
             metafunc.parametrize("model", [])
             return

-        model = metafunc.config.getoption("model")
-        if model:
-            metafunc.parametrize("model", [model])
-            return
-
         try:
             config_data = _load_all_verification_configs()
         except (OSError, FileNotFoundError) as e:
@@ -77,11 +77,12 @@ test_response_image:
           image_url: "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg"
       output: "llama"

+# the models are really poor at tool calling after seeing images :/
 test_response_multi_turn_image:
   test_name: test_response_multi_turn_image
   test_params:
     case:
-      - case_id: "llama_image_search"
+      - case_id: "llama_image_understanding"
         turns:
           - input:
               - role: user
@@ -91,7 +92,5 @@ test_response_multi_turn_image:
               - type: input_image
                 image_url: "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg"
             output: "llama"
-          - input: "Search the web using the search tool for the animal from the previous response. Your search query should be a single phrase that includes the animal's name and the words 'maverick', 'scout' and 'llm'"
-            tools:
-              - type: web_search
-            output: "model"
+          - input: "What country do you find this animal primarily in? What continent?"
+            output: "peru"
@ -7,6 +7,7 @@
|
||||||
import json
|
import json
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
import openai
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from llama_stack import LlamaStackAsLibraryClient
|
from llama_stack import LlamaStackAsLibraryClient
|
||||||
|
@@ -61,23 +62,151 @@ def test_response_streaming_basic(request, openai_client, model, provider, verif
     if should_skip_test(verification_config, provider, model, test_name_base):
         pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
 
+    import time
+
     response = openai_client.responses.create(
         model=model,
         input=case["input"],
         stream=True,
     )
-    streamed_content = []
+
+    # Track events and timing to verify proper streaming
+    events = []
+    event_times = []
     response_id = ""
+
+    start_time = time.time()
+
     for chunk in response:
-        if chunk.type == "response.completed":
+        current_time = time.time()
+        event_times.append(current_time - start_time)
+        events.append(chunk)
+
+        if chunk.type == "response.created":
+            # Verify response.created is emitted first and immediately
+            assert len(events) == 1, "response.created should be the first event"
+            assert event_times[0] < 0.1, "response.created should be emitted immediately"
+            assert chunk.response.status == "in_progress"
             response_id = chunk.response.id
-            streamed_content.append(chunk.response.output_text.strip())
 
-    assert len(streamed_content) > 0
-    assert case["output"].lower() in "".join(streamed_content).lower()
+        elif chunk.type == "response.completed":
+            # Verify response.completed comes after response.created
+            assert len(events) >= 2, "response.completed should come after response.created"
+            assert chunk.response.status == "completed"
+            assert chunk.response.id == response_id, "Response ID should be consistent"
+
+            # Verify content quality
+            output_text = chunk.response.output_text.lower().strip()
+            assert len(output_text) > 0, "Response should have content"
+            assert case["output"].lower() in output_text, f"Expected '{case['output']}' in response"
+
+    # Verify we got both required events
+    event_types = [event.type for event in events]
+    assert "response.created" in event_types, "Missing response.created event"
+    assert "response.completed" in event_types, "Missing response.completed event"
+
+    # Verify event order
+    created_index = event_types.index("response.created")
+    completed_index = event_types.index("response.completed")
+    assert created_index < completed_index, "response.created should come before response.completed"
+
+    # Verify stored response matches streamed response
     retrieved_response = openai_client.responses.retrieve(response_id=response_id)
-    assert retrieved_response.output_text == "".join(streamed_content)
+    final_event = events[-1]
+    assert retrieved_response.output_text == final_event.response.output_text
+
+
+@pytest.mark.parametrize(
+    "case",
+    responses_test_cases["test_response_basic"]["test_params"]["case"],
+    ids=case_id_generator,
+)
+def test_response_streaming_incremental_content(request, openai_client, model, provider, verification_config, case):
+    """Test that streaming actually delivers content incrementally, not just at the end."""
+    test_name_base = get_base_test_name(request)
+    if should_skip_test(verification_config, provider, model, test_name_base):
+        pytest.skip(f"Skipping {test_name_base} for model {model} on provider {provider} based on config.")
+
+    import time
+
+    response = openai_client.responses.create(
+        model=model,
+        input=case["input"],
+        stream=True,
+    )
+
+    # Track all events and their content to verify incremental streaming
+    events = []
+    content_snapshots = []
+    event_times = []
+
+    start_time = time.time()
+
+    for chunk in response:
+        current_time = time.time()
+        event_times.append(current_time - start_time)
+        events.append(chunk)
+
+        # Track content at each event based on event type
+        if chunk.type == "response.output_text.delta":
+            # For delta events, track the delta content
+            content_snapshots.append(chunk.delta)
+        elif hasattr(chunk, "response") and hasattr(chunk.response, "output_text"):
+            # For response.created/completed events, track the full output_text
+            content_snapshots.append(chunk.response.output_text)
+        else:
+            content_snapshots.append("")
+
+    # Verify we have the expected events
+    event_types = [event.type for event in events]
+    assert "response.created" in event_types, "Missing response.created event"
+    assert "response.completed" in event_types, "Missing response.completed event"
+
+    # Check if we have incremental content updates
+    created_index = event_types.index("response.created")
+    completed_index = event_types.index("response.completed")
+
+    # The key test: verify content progression
+    created_content = content_snapshots[created_index]
+    completed_content = content_snapshots[completed_index]
+
+    # Verify that response.created has empty or minimal content
+    assert len(created_content) == 0, f"response.created should have empty content, got: {repr(created_content[:100])}"
+
+    # Verify that response.completed has the full content
+    assert len(completed_content) > 0, "response.completed should have content"
+    assert case["output"].lower() in completed_content.lower(), f"Expected '{case['output']}' in final content"
+
+    # Check for true incremental streaming by looking for delta events
+    delta_events = [i for i, event_type in enumerate(event_types) if event_type == "response.output_text.delta"]
+
+    # Assert that we have delta events (true incremental streaming)
+    assert len(delta_events) > 0, "Expected delta events for true incremental streaming, but found none"
+
+    # Verify delta events have content and accumulate to final content
+    delta_content_total = ""
+    non_empty_deltas = 0
+
+    for delta_idx in delta_events:
+        delta_content = content_snapshots[delta_idx]
+        if delta_content:
+            delta_content_total += delta_content
+            non_empty_deltas += 1
+
+    # Assert that we have meaningful delta content
+    assert non_empty_deltas > 0, "Delta events found but none contain content"
+    assert len(delta_content_total) > 0, "Delta events found but total delta content is empty"
+
+    # Verify that the accumulated delta content matches the final content
+    assert delta_content_total.strip() == completed_content.strip(), (
+        f"Delta content '{delta_content_total}' should match final content '{completed_content}'"
+    )
+
+    # Verify timing: delta events should come between created and completed
+    for delta_idx in delta_events:
+        assert created_index < delta_idx < completed_index, (
+            f"Delta event at index {delta_idx} should be between created ({created_index}) and completed ({completed_index})"
+        )
 
 
 @pytest.mark.parametrize(
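For reference, a minimal sketch of the event flow these tests enforce when consuming the Responses API with stream=True. The base_url, api_key, and model below are placeholders, not values from this diff:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")  # placeholder endpoint

deltas = []
final_text = None
for chunk in client.responses.create(model="example-model", input="Say hello", stream=True):
    if chunk.type == "response.created":
        response_id = chunk.response.id            # status is "in_progress" at this point
    elif chunk.type == "response.output_text.delta":
        deltas.append(chunk.delta)                 # incremental text arrives in delta events
    elif chunk.type == "response.completed":
        final_text = chunk.response.output_text    # full text; should match the joined deltas

assert "".join(deltas) == final_text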
@@ -178,7 +307,7 @@ def test_response_non_streaming_mcp_tool(request, openai_client, model, provider
     exc_type = (
         AuthenticationRequiredError
         if isinstance(openai_client, LlamaStackAsLibraryClient)
-        else httpx.HTTPStatusError
+        else (httpx.HTTPStatusError, openai.AuthenticationError)
     )
     with pytest.raises(exc_type):
         openai_client.responses.create(
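The change above works because pytest.raises accepts either a single exception type or a tuple of types, so the HTTP-level error and the OpenAI client's auth error both satisfy the check. A tiny standalone sketch (the names are illustrative, not from the diff):

import pytest

def call_protected_tool():
    # stand-in for a request that is rejected before authentication
    raise PermissionError("401 Unauthorized")

# Either exception type in the tuple satisfies the assertion.
with pytest.raises((PermissionError, ConnectionError)):
    call_protected_tool()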