Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-06-27 18:50:41 +00:00)
fix: ollama chat completion needs unique ids (#2344)
# What does this PR do?

The chat completion ids generated by Ollama are not unique enough to use with stored chat completions, as they rely on only 3 digits of randomness to produce unique values, e.g. `chatcmpl-373`. This causes frequent collisions in the id values of Ollama chat completions, which creates issues in our SQL storage of chat completions by id, where ids are expected to actually be unique. So, this adjusts Ollama responses to use uuids as unique ids.

This does mean we're replacing the ids generated natively by Ollama. If we don't wish to do that, we'll either need to relax the unique constraint on our chat completion id field in the inference storage or convince Ollama upstream to use something closer to uuid values here.

Closes #2315

## Test Plan

I tested by running the OpenAI completion / chat completion integration tests in a loop. Without this change, I regularly get unique id collisions. With this change, I do not. We sometimes see flakes from these unique id collisions in our CI tests, and this will resolve those.

```
INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" \
  llama stack run llama_stack/templates/ollama/run.yaml

while true; do \
  INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" \
  pytest -s -v \
    tests/integration/inference/test_openai_completion.py \
    --stack-config=http://localhost:8321 \
    --text-model="meta-llama/Llama-3.2-3B-Instruct"; \
done
```

Signed-off-by: Ben Browning <bbrownin@redhat.com>
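For context on why such low-entropy ids collide so quickly, here is a rough birthday-problem sketch (not part of the change). The assumption that ids like `chatcmpl-373` are drawn from roughly 1,000 possible values is mine, inferred from the 3-digit suffix:

```python
import math
import uuid

# Approximate probability of at least one collision after n draws
# from a space of d equally likely ids (birthday-problem estimate).
def collision_probability(n: int, d: int) -> float:
    return 1 - math.exp(-n * (n - 1) / (2 * d))

# Assumption: Ollama-style ids such as "chatcmpl-373" span ~1,000 values,
# so even a modest test run is very likely to collide.
print(f"50 completions, ~1e3 ids:   {collision_probability(50, 10**3):.2%}")

# uuid4 carries 122 bits of randomness, so collisions are negligible.
print(f"50 completions, 2**122 ids: {collision_probability(50, 2**122):.2e}")
print(f"example replacement id: chatcmpl-{uuid.uuid4()}")
```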
This commit is contained in:
parent 4540c9b3e5
commit e92f571f47

1 changed file with 20 additions and 1 deletion
```diff
@@ -5,6 +5,7 @@
 # the root directory of this source tree.
 
+import uuid
 from collections.abc import AsyncGenerator, AsyncIterator
 from typing import Any
 
@@ -480,7 +481,25 @@ class OllamaInferenceAdapter(
             top_p=top_p,
             user=user,
         )
-        return await self.openai_client.chat.completions.create(**params)  # type: ignore
+        response = await self.openai_client.chat.completions.create(**params)
+        return await self._adjust_ollama_chat_completion_response_ids(response)
+
+    async def _adjust_ollama_chat_completion_response_ids(
+        self,
+        response: OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk],
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
+        id = f"chatcmpl-{uuid.uuid4()}"
+        if isinstance(response, AsyncIterator):
+
+            async def stream_with_chunk_ids() -> AsyncIterator[OpenAIChatCompletionChunk]:
+                async for chunk in response:
+                    chunk.id = id
+                    yield chunk
+
+            return stream_with_chunk_ids()
+        else:
+            response.id = id
+            return response
+
     async def batch_completion(
         self,
```
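The pattern in the diff can be illustrated with a minimal, self-contained sketch. The `FakeChunk` class and `fake_ollama_stream` generator below are hypothetical stand-ins for the real OpenAI client types, used only to show that every chunk of a stream ends up stamped with one shared uuid-based id:

```python
import asyncio
import uuid
from collections.abc import AsyncIterator
from dataclasses import dataclass


@dataclass
class FakeChunk:
    # Hypothetical stand-in for an OpenAI chat completion chunk.
    id: str
    delta: str


async def fake_ollama_stream() -> AsyncIterator[FakeChunk]:
    # Simulates Ollama reusing a low-entropy id across a whole stream.
    for piece in ["Hello", ", ", "world"]:
        yield FakeChunk(id="chatcmpl-373", delta=piece)


async def with_unique_ids(stream: AsyncIterator[FakeChunk]) -> AsyncIterator[FakeChunk]:
    # Same idea as the adapter change: generate one uuid per completion
    # and stamp it onto every chunk of the stream.
    new_id = f"chatcmpl-{uuid.uuid4()}"
    async for chunk in stream:
        chunk.id = new_id
        yield chunk


async def main() -> None:
    async for chunk in with_unique_ids(fake_ollama_stream()):
        print(chunk.id, repr(chunk.delta))


asyncio.run(main())
```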