mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-06-27 18:50:41 +00:00
feat(responses): add more streaming response types (#2375)
Some checks failed
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 6s
Integration Tests / test-matrix (http, agents) (push) Failing after 9s
Integration Tests / test-matrix (http, scoring) (push) Failing after 8s
Integration Tests / test-matrix (http, inspect) (push) Failing after 9s
Integration Tests / test-matrix (http, post_training) (push) Failing after 10s
Integration Tests / test-matrix (library, datasets) (push) Failing after 9s
Integration Tests / test-matrix (http, datasets) (push) Failing after 11s
Integration Tests / test-matrix (library, agents) (push) Failing after 9s
Integration Tests / test-matrix (http, inference) (push) Failing after 11s
Integration Tests / test-matrix (http, providers) (push) Failing after 10s
Integration Tests / test-matrix (http, tool_runtime) (push) Failing after 9s
Integration Tests / test-matrix (library, inference) (push) Failing after 7s
Integration Tests / test-matrix (library, inspect) (push) Failing after 7s
Test External Providers / test-external-providers (venv) (push) Failing after 7s
Integration Tests / test-matrix (library, providers) (push) Failing after 7s
Integration Tests / test-matrix (library, post_training) (push) Failing after 9s
Unit Tests / unit-tests (3.10) (push) Failing after 7s
Integration Tests / test-matrix (library, scoring) (push) Failing after 9s
Unit Tests / unit-tests (3.13) (push) Failing after 7s
Integration Tests / test-matrix (library, tool_runtime) (push) Failing after 10s
Update ReadTheDocs / update-readthedocs (push) Failing after 6s
Unit Tests / unit-tests (3.11) (push) Failing after 9s
Unit Tests / unit-tests (3.12) (push) Failing after 34s
Pre-commit / pre-commit (push) Successful in 1m21s
Some checks failed
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 6s
Integration Tests / test-matrix (http, agents) (push) Failing after 9s
Integration Tests / test-matrix (http, scoring) (push) Failing after 8s
Integration Tests / test-matrix (http, inspect) (push) Failing after 9s
Integration Tests / test-matrix (http, post_training) (push) Failing after 10s
Integration Tests / test-matrix (library, datasets) (push) Failing after 9s
Integration Tests / test-matrix (http, datasets) (push) Failing after 11s
Integration Tests / test-matrix (library, agents) (push) Failing after 9s
Integration Tests / test-matrix (http, inference) (push) Failing after 11s
Integration Tests / test-matrix (http, providers) (push) Failing after 10s
Integration Tests / test-matrix (http, tool_runtime) (push) Failing after 9s
Integration Tests / test-matrix (library, inference) (push) Failing after 7s
Integration Tests / test-matrix (library, inspect) (push) Failing after 7s
Test External Providers / test-external-providers (venv) (push) Failing after 7s
Integration Tests / test-matrix (library, providers) (push) Failing after 7s
Integration Tests / test-matrix (library, post_training) (push) Failing after 9s
Unit Tests / unit-tests (3.10) (push) Failing after 7s
Integration Tests / test-matrix (library, scoring) (push) Failing after 9s
Unit Tests / unit-tests (3.13) (push) Failing after 7s
Integration Tests / test-matrix (library, tool_runtime) (push) Failing after 10s
Update ReadTheDocs / update-readthedocs (push) Failing after 6s
Unit Tests / unit-tests (3.11) (push) Failing after 9s
Unit Tests / unit-tests (3.12) (push) Failing after 34s
Pre-commit / pre-commit (push) Successful in 1m21s
This commit is contained in:
parent
d96f6ec763
commit
ed69c1b3cc
4 changed files with 1003 additions and 14 deletions
|
@ -179,6 +179,30 @@ class OpenAIResponseObjectStreamResponseCreated(BaseModel):
|
|||
type: Literal["response.created"] = "response.created"
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class OpenAIResponseObjectStreamResponseCompleted(BaseModel):
|
||||
response: OpenAIResponseObject
|
||||
type: Literal["response.completed"] = "response.completed"
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class OpenAIResponseObjectStreamResponseOutputItemAdded(BaseModel):
|
||||
response_id: str
|
||||
item: OpenAIResponseOutput
|
||||
output_index: int
|
||||
sequence_number: int
|
||||
type: Literal["response.output_item.added"] = "response.output_item.added"
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class OpenAIResponseObjectStreamResponseOutputItemDone(BaseModel):
|
||||
response_id: str
|
||||
item: OpenAIResponseOutput
|
||||
output_index: int
|
||||
sequence_number: int
|
||||
type: Literal["response.output_item.done"] = "response.output_item.done"
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class OpenAIResponseObjectStreamResponseOutputTextDelta(BaseModel):
|
||||
content_index: int
|
||||
|
@ -190,14 +214,132 @@ class OpenAIResponseObjectStreamResponseOutputTextDelta(BaseModel):
|
|||
|
||||
|
||||
@json_schema_type
|
||||
class OpenAIResponseObjectStreamResponseCompleted(BaseModel):
|
||||
response: OpenAIResponseObject
|
||||
type: Literal["response.completed"] = "response.completed"
|
||||
class OpenAIResponseObjectStreamResponseOutputTextDone(BaseModel):
|
||||
content_index: int
|
||||
text: str # final text of the output item
|
||||
item_id: str
|
||||
output_index: int
|
||||
sequence_number: int
|
||||
type: Literal["response.output_text.done"] = "response.output_text.done"
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class OpenAIResponseObjectStreamResponseFunctionCallArgumentsDelta(BaseModel):
|
||||
delta: str
|
||||
item_id: str
|
||||
output_index: int
|
||||
sequence_number: int
|
||||
type: Literal["response.function_call_arguments.delta"] = "response.function_call_arguments.delta"
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class OpenAIResponseObjectStreamResponseFunctionCallArgumentsDone(BaseModel):
|
||||
arguments: str # final arguments of the function call
|
||||
item_id: str
|
||||
output_index: int
|
||||
sequence_number: int
|
||||
type: Literal["response.function_call_arguments.done"] = "response.function_call_arguments.done"
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class OpenAIResponseObjectStreamResponseWebSearchCallInProgress(BaseModel):
|
||||
item_id: str
|
||||
output_index: int
|
||||
sequence_number: int
|
||||
type: Literal["response.web_search_call.in_progress"] = "response.web_search_call.in_progress"
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class OpenAIResponseObjectStreamResponseWebSearchCallSearching(BaseModel):
|
||||
item_id: str
|
||||
output_index: int
|
||||
sequence_number: int
|
||||
type: Literal["response.web_search_call.searching"] = "response.web_search_call.searching"
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class OpenAIResponseObjectStreamResponseWebSearchCallCompleted(BaseModel):
|
||||
item_id: str
|
||||
output_index: int
|
||||
sequence_number: int
|
||||
type: Literal["response.web_search_call.completed"] = "response.web_search_call.completed"
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class OpenAIResponseObjectStreamResponseMcpListToolsInProgress(BaseModel):
|
||||
sequence_number: int
|
||||
type: Literal["response.mcp_list_tools.in_progress"] = "response.mcp_list_tools.in_progress"
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class OpenAIResponseObjectStreamResponseMcpListToolsFailed(BaseModel):
|
||||
sequence_number: int
|
||||
type: Literal["response.mcp_list_tools.failed"] = "response.mcp_list_tools.failed"
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class OpenAIResponseObjectStreamResponseMcpListToolsCompleted(BaseModel):
|
||||
sequence_number: int
|
||||
type: Literal["response.mcp_list_tools.completed"] = "response.mcp_list_tools.completed"
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class OpenAIResponseObjectStreamResponseMcpCallArgumentsDelta(BaseModel):
|
||||
delta: str
|
||||
item_id: str
|
||||
output_index: int
|
||||
sequence_number: int
|
||||
type: Literal["response.mcp_call.arguments.delta"] = "response.mcp_call.arguments.delta"
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class OpenAIResponseObjectStreamResponseMcpCallArgumentsDone(BaseModel):
|
||||
arguments: str # final arguments of the MCP call
|
||||
item_id: str
|
||||
output_index: int
|
||||
sequence_number: int
|
||||
type: Literal["response.mcp_call.arguments.done"] = "response.mcp_call.arguments.done"
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class OpenAIResponseObjectStreamResponseMcpCallInProgress(BaseModel):
|
||||
item_id: str
|
||||
output_index: int
|
||||
sequence_number: int
|
||||
type: Literal["response.mcp_call.in_progress"] = "response.mcp_call.in_progress"
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class OpenAIResponseObjectStreamResponseMcpCallFailed(BaseModel):
|
||||
sequence_number: int
|
||||
type: Literal["response.mcp_call.failed"] = "response.mcp_call.failed"
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class OpenAIResponseObjectStreamResponseMcpCallCompleted(BaseModel):
|
||||
sequence_number: int
|
||||
type: Literal["response.mcp_call.completed"] = "response.mcp_call.completed"
|
||||
|
||||
|
||||
OpenAIResponseObjectStream = Annotated[
|
||||
OpenAIResponseObjectStreamResponseCreated
|
||||
| OpenAIResponseObjectStreamResponseOutputItemAdded
|
||||
| OpenAIResponseObjectStreamResponseOutputItemDone
|
||||
| OpenAIResponseObjectStreamResponseOutputTextDelta
|
||||
| OpenAIResponseObjectStreamResponseOutputTextDone
|
||||
| OpenAIResponseObjectStreamResponseFunctionCallArgumentsDelta
|
||||
| OpenAIResponseObjectStreamResponseFunctionCallArgumentsDone
|
||||
| OpenAIResponseObjectStreamResponseWebSearchCallInProgress
|
||||
| OpenAIResponseObjectStreamResponseWebSearchCallSearching
|
||||
| OpenAIResponseObjectStreamResponseWebSearchCallCompleted
|
||||
| OpenAIResponseObjectStreamResponseMcpListToolsInProgress
|
||||
| OpenAIResponseObjectStreamResponseMcpListToolsFailed
|
||||
| OpenAIResponseObjectStreamResponseMcpListToolsCompleted
|
||||
| OpenAIResponseObjectStreamResponseMcpCallArgumentsDelta
|
||||
| OpenAIResponseObjectStreamResponseMcpCallArgumentsDone
|
||||
| OpenAIResponseObjectStreamResponseMcpCallInProgress
|
||||
| OpenAIResponseObjectStreamResponseMcpCallFailed
|
||||
| OpenAIResponseObjectStreamResponseMcpCallCompleted
|
||||
| OpenAIResponseObjectStreamResponseCompleted,
|
||||
Field(discriminator="type"),
|
||||
]
|
||||
|
|
|
@ -433,12 +433,10 @@ class OpenAIResponsesImpl:
|
|||
store: bool | None,
|
||||
text: OpenAIResponseText,
|
||||
tools: list[OpenAIResponseInputTool] | None,
|
||||
max_infer_iters: int | None,
|
||||
max_infer_iters: int,
|
||||
) -> OpenAIResponseObject:
|
||||
# Implement tool execution loop - handle ALL inference rounds including the first
|
||||
n_iter = 0
|
||||
messages = ctx.messages.copy()
|
||||
current_response = None
|
||||
|
||||
while True:
|
||||
# Do inference (including the first one)
|
||||
|
@ -450,13 +448,13 @@ class OpenAIResponsesImpl:
|
|||
temperature=ctx.temperature,
|
||||
response_format=ctx.response_format,
|
||||
)
|
||||
current_response = OpenAIChatCompletion(**inference_result.model_dump())
|
||||
completion = OpenAIChatCompletion(**inference_result.model_dump())
|
||||
|
||||
# Separate function vs non-function tool calls
|
||||
function_tool_calls = []
|
||||
non_function_tool_calls = []
|
||||
|
||||
for choice in current_response.choices:
|
||||
for choice in completion.choices:
|
||||
if choice.message.tool_calls and tools:
|
||||
for tool_call in choice.message.tool_calls:
|
||||
if self._is_function_tool_call(tool_call, tools):
|
||||
|
@ -468,7 +466,7 @@ class OpenAIResponsesImpl:
|
|||
if function_tool_calls:
|
||||
# For function tool calls, use existing logic and return immediately
|
||||
current_output_messages = await self._process_response_choices(
|
||||
chat_response=current_response,
|
||||
chat_response=completion,
|
||||
ctx=ctx,
|
||||
tools=tools,
|
||||
)
|
||||
|
@ -476,7 +474,7 @@ class OpenAIResponsesImpl:
|
|||
break
|
||||
elif non_function_tool_calls:
|
||||
# For non-function tool calls, execute them and continue loop
|
||||
for choice in current_response.choices:
|
||||
for choice in completion.choices:
|
||||
tool_outputs, tool_response_messages = await self._execute_tool_calls_only(choice, ctx)
|
||||
output_messages.extend(tool_outputs)
|
||||
|
||||
|
@ -485,19 +483,19 @@ class OpenAIResponsesImpl:
|
|||
messages.extend(tool_response_messages)
|
||||
|
||||
n_iter += 1
|
||||
if n_iter >= (max_infer_iters or 10):
|
||||
if n_iter >= max_infer_iters:
|
||||
break
|
||||
|
||||
# Continue with next iteration of the loop
|
||||
continue
|
||||
else:
|
||||
# No tool calls - convert response to message and we're done
|
||||
for choice in current_response.choices:
|
||||
for choice in completion.choices:
|
||||
output_messages.append(await _convert_chat_choice_to_response_message(choice))
|
||||
break
|
||||
|
||||
response = OpenAIResponseObject(
|
||||
created_at=current_response.created,
|
||||
created_at=completion.created,
|
||||
id=f"resp-{uuid.uuid4()}",
|
||||
model=model,
|
||||
object="response",
|
||||
|
@ -549,7 +547,6 @@ class OpenAIResponsesImpl:
|
|||
messages = ctx.messages.copy()
|
||||
|
||||
while True:
|
||||
# Do inference (including the first one) - streaming
|
||||
current_inference_result = await self.inference_api.openai_chat_completion(
|
||||
model=ctx.model,
|
||||
messages=messages,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue