feat: implement get chat completions APIs (#2200)

# What does this PR do? * Provide sqlite implementation of the APIs introduced in https://github.com/meta-llama/llama-stack/pull/2145. * Introduced a SqlStore API: llama_stack/providers/utils/sqlstore/api.py and the first Sqlite implementation * Pagination support will be added in a future PR. ## Test Plan Unit test on sql store: <img width="1005" alt="image" src="https://github.com/user-attachments/assets/9b8b7ec8-632b-4667-8127-5583426b2e29" /> Integration test: ``` INFERENCE_MODEL="llama3.2:3b-instruct-fp16" llama stack build --template ollama --image-type conda --run ``` ``` LLAMA_STACK_CONFIG=http://localhost:5001 INFERENCE_MODEL="llama3.2:3b-instruct-fp16" python -m pytest -v tests/integration/inference/test_openai_completion.py --text-model "llama3.2:3b-instruct-fp16" -k 'inference_store and openai' ```
2025-05-21 22:21:52 -07:00 · 2025-05-21 22:21:52 -07:00 · 549812f51e
commit 549812f51e
parent 633bb9c5b3
71 changed files with 1111 additions and 10 deletions
--- a/llama_stack/providers/utils/inference/inference_store.py
+++ b/llama_stack/providers/utils/inference/inference_store.py
@ -0,0 +1,123 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from llama_stack.apis.inference import (
+    ListOpenAIChatCompletionResponse,
+    OpenAIChatCompletion,
+    OpenAICompletionWithInputMessages,
+    OpenAIMessageParam,
+    Order,
+)
+from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR
+
+from ..sqlstore.api import ColumnDefinition, ColumnType
+from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, sqlstore_impl
+
+
+class InferenceStore:
+    def __init__(self, sql_store_config: SqlStoreConfig):
+        if not sql_store_config:
+            sql_store_config = SqliteSqlStoreConfig(
+                db_path=(RUNTIME_BASE_DIR / "sqlstore.db").as_posix(),
+            )
+        self.sql_store_config = sql_store_config
+        self.sql_store = None
+
+    async def initialize(self):
+        """Create the necessary tables if they don't exist."""
+        self.sql_store = sqlstore_impl(self.sql_store_config)
+        await self.sql_store.create_table(
+            "chat_completions",
+            {
+                "id": ColumnDefinition(type=ColumnType.STRING, primary_key=True),
+                "created": ColumnType.INTEGER,
+                "model": ColumnType.STRING,
+                "choices": ColumnType.JSON,
+                "input_messages": ColumnType.JSON,
+            },
+        )
+
+    async def store_chat_completion(
+        self, chat_completion: OpenAIChatCompletion, input_messages: list[OpenAIMessageParam]
+    ) -> None:
+        if not self.sql_store:
+            raise ValueError("Inference store is not initialized")
+
+        data = chat_completion.model_dump()
+
+        await self.sql_store.insert(
+            "chat_completions",
+            {
+                "id": data["id"],
+                "created": data["created"],
+                "model": data["model"],
+                "choices": data["choices"],
+                "input_messages": [message.model_dump() for message in input_messages],
+            },
+        )
+
+    async def list_chat_completions(
+        self,
+        after: str | None = None,
+        limit: int | None = 50,
+        model: str | None = None,
+        order: Order | None = Order.desc,
+    ) -> ListOpenAIChatCompletionResponse:
+        """
+        List chat completions from the database.
+
+        :param after: The ID of the last chat completion to return.
+        :param limit: The maximum number of chat completions to return.
+        :param model: The model to filter by.
+        :param order: The order to sort the chat completions by.
+        """
+        if not self.sql_store:
+            raise ValueError("Inference store is not initialized")
+
+        # TODO: support after
+        if after:
+            raise NotImplementedError("After is not supported for SQLite")
+        if not order:
+            order = Order.desc
+
+        rows = await self.sql_store.fetch_all(
+            "chat_completions",
+            where={"model": model} if model else None,
+            order_by=[("created", order.value)],
+            limit=limit,
+        )
+
+        data = [
+            OpenAICompletionWithInputMessages(
+                id=row["id"],
+                created=row["created"],
+                model=row["model"],
+                choices=row["choices"],
+                input_messages=row["input_messages"],
+            )
+            for row in rows
+        ]
+        return ListOpenAIChatCompletionResponse(
+            data=data,
+            # TODO: implement has_more
+            has_more=False,
+            first_id=data[0].id if data else "",
+            last_id=data[-1].id if data else "",
+        )
+
+    async def get_chat_completion(self, completion_id: str) -> OpenAICompletionWithInputMessages:
+        if not self.sql_store:
+            raise ValueError("Inference store is not initialized")
+
+        row = await self.sql_store.fetch_one("chat_completions", where={"id": completion_id})
+        if not row:
+            raise ValueError(f"Chat completion with id {completion_id} not found") from None
+        return OpenAICompletionWithInputMessages(
+            id=row["id"],
+            created=row["created"],
+            model=row["model"],
+            choices=row["choices"],
+            input_messages=row["input_messages"],
+        )
--- a/llama_stack/providers/utils/inference/stream_utils.py
+++ b/llama_stack/providers/utils/inference/stream_utils.py
@ -0,0 +1,129 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from collections.abc import AsyncIterator
+from datetime import datetime, timezone
+from typing import Any
+
+from llama_stack.apis.inference import (
+    OpenAIAssistantMessageParam,
+    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
+    OpenAIChatCompletionToolCall,
+    OpenAIChatCompletionToolCallFunction,
+    OpenAIChoice,
+    OpenAIChoiceLogprobs,
+    OpenAIMessageParam,
+)
+from llama_stack.providers.utils.inference.inference_store import InferenceStore
+
+
+async def stream_and_store_openai_completion(
+    provider_stream: AsyncIterator[OpenAIChatCompletionChunk],
+    model: str,
+    store: InferenceStore,
+    input_messages: list[OpenAIMessageParam],
+) -> AsyncIterator[OpenAIChatCompletionChunk]:
+    """
+    Wraps a provider's stream, yields chunks, and stores the full completion at the end.
+    """
+    id = None
+    created = None
+    choices_data: dict[int, dict[str, Any]] = {}
+
+    try:
+        async for chunk in provider_stream:
+            if id is None and chunk.id:
+                id = chunk.id
+            if created is None and chunk.created:
+                created = chunk.created
+
+            if chunk.choices:
+                for choice_delta in chunk.choices:
+                    idx = choice_delta.index
+                    if idx not in choices_data:
+                        choices_data[idx] = {
+                            "content_parts": [],
+                            "tool_calls_builder": {},
+                            "finish_reason": None,
+                            "logprobs_content_parts": [],
+                        }
+                    current_choice_data = choices_data[idx]
+
+                    if choice_delta.delta:
+                        delta = choice_delta.delta
+                        if delta.content:
+                            current_choice_data["content_parts"].append(delta.content)
+                        if delta.tool_calls:
+                            for tool_call_delta in delta.tool_calls:
+                                tc_idx = tool_call_delta.index
+                                if tc_idx not in current_choice_data["tool_calls_builder"]:
+                                    # Initialize with correct structure for _ToolCallBuilderData
+                                    current_choice_data["tool_calls_builder"][tc_idx] = {
+                                        "id": None,
+                                        "type": "function",
+                                        "function_name_parts": [],
+                                        "function_arguments_parts": [],
+                                    }
+                                builder = current_choice_data["tool_calls_builder"][tc_idx]
+                                if tool_call_delta.id:
+                                    builder["id"] = tool_call_delta.id
+                                if tool_call_delta.type:
+                                    builder["type"] = tool_call_delta.type
+                                if tool_call_delta.function:
+                                    if tool_call_delta.function.name:
+                                        builder["function_name_parts"].append(tool_call_delta.function.name)
+                                    if tool_call_delta.function.arguments:
+                                        builder["function_arguments_parts"].append(tool_call_delta.function.arguments)
+                    if choice_delta.finish_reason:
+                        current_choice_data["finish_reason"] = choice_delta.finish_reason
+                    if choice_delta.logprobs and choice_delta.logprobs.content:
+                        # Ensure that we are extending with the correct type
+                        current_choice_data["logprobs_content_parts"].extend(choice_delta.logprobs.content)
+            yield chunk
+    finally:
+        if id:
+            assembled_choices: list[OpenAIChoice] = []
+            for choice_idx, choice_data in choices_data.items():
+                content_str = "".join(choice_data["content_parts"])
+                assembled_tool_calls: list[OpenAIChatCompletionToolCall] = []
+                if choice_data["tool_calls_builder"]:
+                    for tc_build_data in choice_data["tool_calls_builder"].values():
+                        if tc_build_data["id"]:
+                            func_name = "".join(tc_build_data["function_name_parts"])
+                            func_args = "".join(tc_build_data["function_arguments_parts"])
+                            assembled_tool_calls.append(
+                                OpenAIChatCompletionToolCall(
+                                    id=tc_build_data["id"],
+                                    type=tc_build_data["type"],  # No or "function" needed, already set
+                                    function=OpenAIChatCompletionToolCallFunction(name=func_name, arguments=func_args),
+                                )
+                            )
+                message = OpenAIAssistantMessageParam(
+                    role="assistant",
+                    content=content_str if content_str else None,
+                    tool_calls=assembled_tool_calls if assembled_tool_calls else None,
+                )
+                logprobs_content = choice_data["logprobs_content_parts"]
+                final_logprobs = OpenAIChoiceLogprobs(content=logprobs_content) if logprobs_content else None
+
+                assembled_choices.append(
+                    OpenAIChoice(
+                        finish_reason=choice_data["finish_reason"],
+                        index=choice_idx,
+                        message=message,
+                        logprobs=final_logprobs,
+                    )
+                )
+
+            final_response = OpenAIChatCompletion(
+                id=id,
+                choices=assembled_choices,
+                created=created or int(datetime.now(timezone.utc).timestamp()),
+                model=model,
+                object="chat.completion",
+            )
+            await store.store_chat_completion(final_response, input_messages)