Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-08-03 01:03:59 +00:00)
OpenAI Responses - streaming handling for text chat responses
Signed-off-by: Ben Browning <bbrownin@redhat.com>
parent d523c8692a
commit 591e6a3972
1 changed file with 33 additions and 5 deletions
@@ -152,17 +152,45 @@ class OpenAIResponsesImpl(OpenAIResponses):
         messages.append(OpenAIUserMessageParam(content=user_content))

         chat_tools = await self._convert_response_tools_to_chat_tools(tools) if tools else None

-        # TODO: the code below doesn't handle streaming
         chat_response = await self.inference_api.openai_chat_completion(
             model=model_obj.identifier,
             messages=messages,
             tools=chat_tools,
             stream=stream,
         )
-        # type cast to appease mypy
-        chat_response = cast(OpenAIChatCompletion, chat_response)
-        # dump and reload to map to our pydantic types
-        chat_response = OpenAIChatCompletion.model_validate_json(chat_response.model_dump_json())
+        if isinstance(chat_response, AsyncIterator):
+            # TODO: refactor this into a separate method that handles streaming
+            chat_response_id = ""
+            chat_response_content = []
+            # TODO: these chunk_ fields are hacky and only take the last chunk into account
+            chunk_created = 0
+            chunk_model = ""
+            chunk_finish_reason = ""
+            async for chunk in chat_response:
+                chat_response_id = chunk.id
+                chunk_created = chunk.created
+                chunk_model = chunk.model
+                for chunk_choice in chunk.choices:
+                    # TODO: this only works for text content
+                    chat_response_content.append(chunk_choice.delta.content or "")
+                    if chunk_choice.finish_reason:
+                        chunk_finish_reason = chunk_choice.finish_reason
+            assistant_message = OpenAIAssistantMessageParam(content="".join(chat_response_content))
+            chat_response = OpenAIChatCompletion(
+                id=chat_response_id,
+                choices=[
+                    OpenAIChoice(
+                        message=assistant_message,
+                        finish_reason=chunk_finish_reason,
+                        index=0,
+                    )
+                ],
+                created=chunk_created,
+                model=chunk_model,
+            )
+        else:
+            # dump and reload to map to our pydantic types
+            chat_response = OpenAIChatCompletion.model_validate_json(chat_response.model_dump_json())

         output_messages: List[OpenAIResponseOutput] = []
         if chat_response.choices[0].finish_reason == "tool_calls":
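Below is a minimal, self-contained sketch of the accumulation pattern the streaming branch implements: drain an AsyncIterator of chunks, join the text deltas, and keep the last finish_reason a chunk reported. The Delta/ChunkChoice/Chunk dataclasses and fake_stream here are simplified illustrative stand-ins, not the llama-stack pydantic models.

import asyncio
from dataclasses import dataclass
from typing import AsyncIterator, List, Optional


@dataclass
class Delta:
    content: Optional[str]


@dataclass
class ChunkChoice:
    delta: Delta
    finish_reason: Optional[str]


@dataclass
class Chunk:
    id: str
    created: int
    model: str
    choices: List[ChunkChoice]


async def fake_stream() -> AsyncIterator[Chunk]:
    # Simulate a provider streaming a text response in three chunks;
    # only the final chunk carries a finish_reason.
    parts = ["Hello", ", world", "!"]
    for i, part in enumerate(parts):
        yield Chunk(
            id="chatcmpl-123",
            created=1700000000,
            model="example-model",
            choices=[
                ChunkChoice(
                    delta=Delta(content=part),
                    finish_reason="stop" if i == len(parts) - 1 else None,
                )
            ],
        )


async def accumulate(stream: AsyncIterator[Chunk]) -> tuple[str, str]:
    # Same shape as the diff: append each delta's text and remember the
    # last finish_reason seen across chunks.
    content: List[str] = []
    finish_reason = ""
    async for chunk in stream:
        for choice in chunk.choices:
            content.append(choice.delta.content or "")
            if choice.finish_reason:
                finish_reason = choice.finish_reason
    return "".join(content), finish_reason


if __name__ == "__main__":
    text, reason = asyncio.run(accumulate(fake_stream()))
    print(text, reason)  # -> Hello, world! stop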
Loading…
Add table
Add a link
Reference in a new issue
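The dump-and-reload line in the else branch is a common pydantic v2 trick for coercing a structurally compatible object from a provider's client library into the project's own model class without per-field conversion code, at the cost of one serialize/parse round trip. A sketch of the idea, with illustrative class names rather than the real llama-stack types:

from pydantic import BaseModel


class ProviderMessage(BaseModel):
    # Stand-in for the type returned by the provider's client library.
    role: str
    content: str


class LocalMessage(BaseModel):
    # Stand-in for the project's own pydantic type with the same fields.
    role: str
    content: str


provider_msg = ProviderMessage(role="assistant", content="hi")
# Round-trip through JSON: serialize the provider's object, then
# re-validate the payload as the local type.
local_msg = LocalMessage.model_validate_json(provider_msg.model_dump_json())
assert isinstance(local_msg, LocalMessage) and local_msg.content == "hi"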