enable streaming support, use openai-python instead of httpx

2025-12-16 19:02:38 +00:00 · 2024-11-04 10:22:29 -05:00 · 2024-11-04 10:22:29 -05:00 · dbe665ed19
commit dbe665ed19
parent 2dd8c4bcb6
7 changed files with 1037 additions and 341 deletions
--- a/llama_stack/providers/adapters/inference/nvidia/_nvidia.py
+++ b/llama_stack/providers/adapters/inference/nvidia/_nvidia.py
@ -5,9 +5,8 @@
 # the root directory of this source tree.
 import warnings
-from typing import Dict, List, Optional, Union
+from typing import AsyncIterator, Dict, List, Optional, Union
 import httpx
 from llama_models.datatypes import SamplingParams
 from llama_models.llama3.api.datatypes import (
    InterleavedTextMedia,
@ -17,6 +16,7 @@ from llama_models.llama3.api.datatypes import (
    ToolPromptFormat,
 )
 from llama_models.sku_list import CoreModelId
 from openai import APIConnectionError, AsyncOpenAI
 from llama_stack.apis.inference import (
    ChatCompletionRequest,
@ -32,7 +32,12 @@ from llama_stack.apis.inference import (
 )
 from ._config import NVIDIAConfig
-from ._utils import check_health, convert_chat_completion_request, parse_completion
+from ._openai_utils import (
    convert_chat_completion_request,
    convert_openai_chat_completion_choice,
    convert_openai_chat_completion_stream,
 )
 from ._utils import check_health
 SUPPORTED_MODELS: Dict[CoreModelId, str] = {
    CoreModelId.llama3_8b_instruct: "meta/llama3-8b-instruct",
@ -71,17 +76,12 @@ class NVIDIAInferenceAdapter(Inference):
        #     )
        self._config = config
-
+        # make sure the client lives longer than any async calls
-    @property
+        self._client = AsyncOpenAI(
-    def _headers(self) -> dict:
+            base_url=f"{self._config.base_url}/v1",
-        return {
+            api_key=self._config.api_key or "NO KEY",
-            b"User-Agent": b"llama-stack: nvidia-inference-adapter",
+            timeout=self._config.timeout,
-            **(
+        )
                {b"Authorization": f"Bearer {self._config.api_key}"}
                if self._config.api_key
                else {}
            ),
        }
    async def list_models(self) -> List[ModelDef]:
        # TODO(mf): filter by available models
@ -98,7 +98,7 @@ class NVIDIAInferenceAdapter(Inference):
        response_format: Optional[ResponseFormat] = None,
        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
-    ) -> Union[CompletionResponse, CompletionResponseStreamChunk]:
+    ) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]:
        raise NotImplementedError()
    async def embeddings(
@ -121,56 +121,37 @@ class NVIDIAInferenceAdapter(Inference):
        ] = None,  # API default is ToolPromptFormat.json, we default to None to detect user input
        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
-    ) -> Union[ChatCompletionResponse, ChatCompletionResponseStreamChunk]:
+    ) -> Union[
        ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]
    ]:
        if tool_prompt_format:
            warnings.warn("tool_prompt_format is not supported by NVIDIA NIM, ignoring")
        if stream:
            raise ValueError("Streamed completions are not supported")
        await check_health(self._config)  # this raises errors
-        request = ChatCompletionRequest(
+        request = convert_chat_completion_request(
-            model=SUPPORTED_MODELS[CoreModelId(model)],
+            request=ChatCompletionRequest(
-            messages=messages,
+                model=SUPPORTED_MODELS[CoreModelId(model)],
-            sampling_params=sampling_params,
+                messages=messages,
-            tools=tools,
+                sampling_params=sampling_params,
-            tool_choice=tool_choice,
+                tools=tools,
-            tool_prompt_format=tool_prompt_format,
+                tool_choice=tool_choice,
-            stream=stream,
+                tool_prompt_format=tool_prompt_format,
-            logprobs=logprobs,
+                stream=stream,
                logprobs=logprobs,
            ),
            n=1,
        )
-        async with httpx.AsyncClient(timeout=self._config.timeout) as client:
+        try:
-            try:
+            response = await self._client.chat.completions.create(**request)
-                response = await client.post(
+        except APIConnectionError as e:
-                    f"{self._config.base_url}/v1/chat/completions",
+            raise ConnectionError(
-                    headers=self._headers,
+                f"Failed to connect to NVIDIA NIM at {self._config.base_url}: {e}"
-                    json=convert_chat_completion_request(request, n=1),
+            ) from e
                )
            except httpx.ReadTimeout as e:
                raise TimeoutError(
                    f"Request timed out. timeout set to {self._config.timeout}. Use `llama stack configure ...` to adjust it."
                ) from e
            if response.status_code == 401:
                raise PermissionError(
                    "Unauthorized. Please check your API key, reconfigure, and try again."
                )
            if response.status_code == 400:
                raise ValueError(
                    f"Bad request. Please check the request and try again. Detail: {response.text}"
                )
            if response.status_code == 404:
                raise ValueError(
                    "Model not found. Please check the model name and try again."
                )
            assert (
                response.status_code == 200
            ), f"Failed to get completion: {response.text}"
        if stream:
            return convert_openai_chat_completion_stream(response)
        else:
            # we pass n=1 to get only one completion
-            return parse_completion(response.json()["choices"][0])
+            return convert_openai_chat_completion_choice(response.choices[0])
--- a/llama_stack/providers/adapters/inference/nvidia/_openai_utils.py
+++ b/llama_stack/providers/adapters/inference/nvidia/_openai_utils.py
@ -0,0 +1,430 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import json
 import warnings
 from typing import Any, AsyncGenerator, Dict, Generator, List, Optional
 from llama_models.llama3.api.datatypes import (
    CompletionMessage,
    StopReason,
    TokenLogProbs,
    ToolCall,
 )
 from openai import AsyncStream
 from openai.types.chat import ChatCompletionChunk as OpenAIChatCompletionChunk
 from openai.types.chat.chat_completion import (
    Choice as OpenAIChoice,
    ChoiceLogprobs as OpenAIChoiceLogprobs,  # same as chat_completion_chunk ChoiceLogprobs
 )
 from openai.types.chat.chat_completion_message_tool_call import (
    ChatCompletionMessageToolCall as OpenAIChatCompletionMessageToolCall,
 )
 from llama_stack.apis.inference import (
    ChatCompletionRequest,
    ChatCompletionResponse,
    ChatCompletionResponseEvent,
    ChatCompletionResponseEventType,
    ChatCompletionResponseStreamChunk,
    Message,
    ToolCallDelta,
    ToolCallParseStatus,
 )
 def _convert_message(message: Message) -> Dict:
    """
    Convert a Message to an OpenAI API-compatible dictionary.
    """
    out_dict = message.dict()
    # Llama Stack uses role="ipython" for tool call messages, OpenAI uses "tool"
    if out_dict["role"] == "ipython":
        out_dict.update(role="tool")
    if "stop_reason" in out_dict:
        out_dict.update(stop_reason=out_dict["stop_reason"].value)
    # TODO(mf): tool_calls
    return out_dict
 def convert_chat_completion_request(
    request: ChatCompletionRequest,
    n: int = 1,
 ) -> dict:
    """
    Convert a ChatCompletionRequest to an OpenAI API-compatible dictionary.
    """
    # model -> model
    # messages -> messages
    # sampling_params  TODO(mattf): review strategy
    #  strategy=greedy -> nvext.top_k = -1, temperature = temperature
    #  strategy=top_p -> nvext.top_k = -1, top_p = top_p
    #  strategy=top_k -> nvext.top_k = top_k
    #  temperature -> temperature
    #  top_p -> top_p
    #  top_k -> nvext.top_k
    #  max_tokens -> max_tokens
    #  repetition_penalty -> nvext.repetition_penalty
    # tools -> tools
    # tool_choice ("auto", "required") -> tool_choice
    # tool_prompt_format -> TBD
    # stream -> stream
    # logprobs -> logprobs
    nvext = {}
    payload: Dict[str, Any] = dict(
        model=request.model,
        messages=[_convert_message(message) for message in request.messages],
        stream=request.stream,
        n=n,
        extra_body=dict(nvext=nvext),
        extra_headers={
            b"User-Agent": b"llama-stack: nvidia-inference-adapter",
        },
    )
    if request.tools:
        payload.update(tools=request.tools)
        if request.tool_choice:
            payload.update(
                tool_choice=request.tool_choice.value
            )  # we cannot include tool_choice w/o tools, server will complain
    if request.logprobs:
        payload.update(logprobs=True)
        payload.update(top_logprobs=request.logprobs.top_k)
    if request.sampling_params:
        nvext.update(repetition_penalty=request.sampling_params.repetition_penalty)
        if request.sampling_params.max_tokens:
            payload.update(max_tokens=request.sampling_params.max_tokens)
        if request.sampling_params.strategy == "top_p":
            nvext.update(top_k=-1)
            payload.update(top_p=request.sampling_params.top_p)
        elif request.sampling_params.strategy == "top_k":
            if (
                request.sampling_params.top_k != -1
                and request.sampling_params.top_k < 1
            ):
                warnings.warn("top_k must be -1 or >= 1")
            nvext.update(top_k=request.sampling_params.top_k)
        elif request.sampling_params.strategy == "greedy":
            nvext.update(top_k=-1)
            payload.update(temperature=request.sampling_params.temperature)
    return payload
 def _convert_openai_finish_reason(finish_reason: str) -> StopReason:
    """
    Convert an OpenAI chat completion finish_reason to a StopReason.
    finish_reason: Literal["stop", "length", "tool_calls", ...]
        - stop: model hit a natural stop point or a provided stop sequence
        - length: maximum number of tokens specified in the request was reached
        - tool_calls: model called a tool
    ->
    class StopReason(Enum):
        end_of_turn = "end_of_turn"
        end_of_message = "end_of_message"
        out_of_tokens = "out_of_tokens"
    """
    # TODO(mf): are end_of_turn and end_of_message semantics correct?
    return {
        "stop": StopReason.end_of_turn,
        "length": StopReason.out_of_tokens,
        "tool_calls": StopReason.end_of_message,
    }.get(finish_reason, StopReason.end_of_turn)
 def _convert_openai_tool_calls(
    tool_calls: List[OpenAIChatCompletionMessageToolCall],
 ) -> List[ToolCall]:
    """
    Convert an OpenAI ChatCompletionMessageToolCall list into a list of ToolCall.
    OpenAI ChatCompletionMessageToolCall:
        id: str
        function: Function
        type: Literal["function"]
    OpenAI Function:
        arguments: str
        name: str
    ->
    ToolCall:
        call_id: str
        tool_name: str
        arguments: Dict[str, ...]
    """
    if not tool_calls:
        return []  # CompletionMessage tool_calls is not optional
    return [
        ToolCall(
            call_id=call.id,
            tool_name=call.function.name,
            arguments=json.loads(call.function.arguments),
        )
        for call in tool_calls
    ]
 def _convert_openai_logprobs(
    logprobs: OpenAIChoiceLogprobs,
 ) -> Optional[List[TokenLogProbs]]:
    """
    Convert an OpenAI ChoiceLogprobs into a list of TokenLogProbs.
    OpenAI ChoiceLogprobs:
        content: Optional[List[ChatCompletionTokenLogprob]]
    OpenAI ChatCompletionTokenLogprob:
        token: str
        logprob: float
        top_logprobs: List[TopLogprob]
    OpenAI TopLogprob:
        token: str
        logprob: float
    ->
    TokenLogProbs:
        logprobs_by_token: Dict[str, float]
         - token, logprob
    """
    if not logprobs:
        return None
    return [
        TokenLogProbs(
            logprobs_by_token={
                logprobs.token: logprobs.logprob for logprobs in content.top_logprobs
            }
        )
        for content in logprobs.content
    ]
 def convert_openai_chat_completion_choice(
    choice: OpenAIChoice,
 ) -> ChatCompletionResponse:
    """
    Convert an OpenAI Choice into a ChatCompletionResponse.
    OpenAI Choice:
        message: ChatCompletionMessage
        finish_reason: str
        logprobs: Optional[ChoiceLogprobs]
    OpenAI ChatCompletionMessage:
        role: Literal["assistant"]
        content: Optional[str]
        tool_calls: Optional[List[ChatCompletionMessageToolCall]]
    ->
    ChatCompletionResponse:
        completion_message: CompletionMessage
        logprobs: Optional[List[TokenLogProbs]]
    CompletionMessage:
        role: Literal["assistant"]
        content: str | ImageMedia | List[str | ImageMedia]
        stop_reason: StopReason
        tool_calls: List[ToolCall]
    class StopReason(Enum):
        end_of_turn = "end_of_turn"
        end_of_message = "end_of_message"
        out_of_tokens = "out_of_tokens"
    """
    assert (
        hasattr(choice, "message") and choice.message
    ), "error in server response: message not found"
    assert (
        hasattr(choice, "finish_reason") and choice.finish_reason
    ), "error in server response: finish_reason not found"
    return ChatCompletionResponse(
        completion_message=CompletionMessage(
            content=choice.message.content
            or "",  # CompletionMessage content is not optional
            stop_reason=_convert_openai_finish_reason(choice.finish_reason),
            tool_calls=_convert_openai_tool_calls(choice.message.tool_calls),
        ),
        logprobs=_convert_openai_logprobs(choice.logprobs),
    )
 async def convert_openai_chat_completion_stream(
    stream: AsyncStream[OpenAIChatCompletionChunk],
 ) -> AsyncGenerator[ChatCompletionResponseStreamChunk, None]:
    """
    Convert a stream of OpenAI chat completion chunks into a stream
    of ChatCompletionResponseStreamChunk.
    OpenAI ChatCompletionChunk:
        choices: List[Choice]
    OpenAI Choice:  # different from the non-streamed Choice
        delta: ChoiceDelta
        finish_reason: Optional[Literal["stop", "length", "tool_calls", "content_filter", "function_call"]]
        logprobs: Optional[ChoiceLogprobs]
    OpenAI ChoiceDelta:
        content: Optional[str]
        role: Optional[Literal["system", "user", "assistant", "tool"]]
        tool_calls: Optional[List[ChoiceDeltaToolCall]]
    OpenAI ChoiceDeltaToolCall:
        index: int
        id: Optional[str]
        function: Optional[ChoiceDeltaToolCallFunction]
        type: Optional[Literal["function"]]
    OpenAI ChoiceDeltaToolCallFunction:
        name: Optional[str]
        arguments: Optional[str]
    ->
    ChatCompletionResponseStreamChunk:
        event: ChatCompletionResponseEvent
    ChatCompletionResponseEvent:
        event_type: ChatCompletionResponseEventType
        delta: Union[str, ToolCallDelta]
        logprobs: Optional[List[TokenLogProbs]]
        stop_reason: Optional[StopReason]
    ChatCompletionResponseEventType:
        start = "start"
        progress = "progress"
        complete = "complete"
    ToolCallDelta:
        content: Union[str, ToolCall]
        parse_status: ToolCallParseStatus
    ToolCall:
        call_id: str
        tool_name: str
        arguments: str
    ToolCallParseStatus:
        started = "started"
        in_progress = "in_progress"
        failure = "failure"
        success = "success"
    TokenLogProbs:
        logprobs_by_token: Dict[str, float]
         - token, logprob
    StopReason:
        end_of_turn = "end_of_turn"
        end_of_message = "end_of_message"
        out_of_tokens = "out_of_tokens"
    """
    # generate a stream of ChatCompletionResponseEventType: start -> progress -> progress -> ...
    def _event_type_generator() -> (
        Generator[ChatCompletionResponseEventType, None, None]
    ):
        yield ChatCompletionResponseEventType.start
        while True:
            yield ChatCompletionResponseEventType.progress
    event_type = _event_type_generator()
    # we implement NIM specific semantics, the main difference from OpenAI
    # is that tool_calls are always produced as a complete call. there is no
    # intermediate / partial tool call streamed. because of this, we can
    # simplify the logic and not concern outselves with parse_status of
    # started/in_progress/failed. we can always assume success.
    #
    # a stream of ChatCompletionResponseStreamChunk consists of
    #  0. a start event
    #  1. zero or more progress events
    #   - each progress event has a delta
    #   - each progress event may have a stop_reason
    #   - each progress event may have logprobs
    #   - each progress event may have tool_calls
    #     if a progress event has tool_calls,
    #      it is fully formed and
    #      can be emitted with a parse_status of success
    #  2. a complete event
    stop_reason = None
    async for chunk in stream:
        choice = chunk.choices[0]  # assuming only one choice per chunk
        # we assume there's only one finish_reason in the stream
        stop_reason = _convert_openai_finish_reason(choice.finish_reason) or stop_reason
        # if there's a tool call, emit an event for each tool in the list
        # if tool call and content, emit both separately
        if choice.delta.tool_calls:
            # the call may have content and a tool call. ChatCompletionResponseEvent
            # does not support both, so we emit the content first
            if choice.delta.content:
                yield ChatCompletionResponseStreamChunk(
                    event=ChatCompletionResponseEvent(
                        event_type=next(event_type),
                        delta=choice.delta.content,
                        logprobs=_convert_openai_logprobs(choice.logprobs),
                    )
                )
            # it is possible to have parallel tool calls in stream, but
            # ChatCompletionResponseEvent only supports one per stream
            if len(choice.delta.tool_calls) > 1:
                warnings.warn(
                    "multiple tool calls found in a single delta, using the first, ignoring the rest"
                )
            # NIM only produces fully formed tool calls, so we can assume success
            yield ChatCompletionResponseStreamChunk(
                event=ChatCompletionResponseEvent(
                    event_type=next(event_type),
                    delta=ToolCallDelta(
                        content=_convert_openai_tool_calls(choice.delta.tool_calls)[0],
                        parse_status=ToolCallParseStatus.success,
                    ),
                    logprobs=_convert_openai_logprobs(choice.logprobs),
                )
            )
        else:
            yield ChatCompletionResponseStreamChunk(
                event=ChatCompletionResponseEvent(
                    event_type=next(event_type),
                    delta=choice.delta.content or "",  # content is not optional
                    logprobs=_convert_openai_logprobs(choice.logprobs),
                )
            )
    yield ChatCompletionResponseStreamChunk(
        event=ChatCompletionResponseEvent(
            event_type=ChatCompletionResponseEventType.complete,
            delta="",
            stop_reason=stop_reason,
        )
    )
--- a/llama_stack/providers/adapters/inference/nvidia/_utils.py
+++ b/llama_stack/providers/adapters/inference/nvidia/_utils.py
@ -4,43 +4,13 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-import warnings
+from typing import Tuple
 from typing import Any, Dict, List, Optional, Tuple
 import httpx
 from llama_models.llama3.api.datatypes import (
    CompletionMessage,
    StopReason,
    TokenLogProbs,
    ToolCall,
 )
 from llama_stack.apis.inference import (
    ChatCompletionRequest,
    ChatCompletionResponse,
    Message,
 )
 from ._config import NVIDIAConfig
 def convert_message(message: Message) -> dict:
    """
    Convert a Message to an OpenAI API-compatible dictionary.
    """
    out_dict = message.dict()
    # Llama Stack uses role="ipython" for tool call messages, OpenAI uses "tool"
    if out_dict["role"] == "ipython":
        out_dict.update(role="tool")
    if "stop_reason" in out_dict:
        out_dict.update(stop_reason=out_dict["stop_reason"].value)
    # TODO(mf): tool_calls
    return out_dict
 async def _get_health(url: str) -> Tuple[bool, bool]:
    """
    Query {url}/v1/health/{live,ready} to check if the server is running and ready
@ -78,251 +48,3 @@ async def check_health(config: NVIDIAConfig) -> None:
            # TODO(mf): should we wait for the server to be ready?
        except httpx.ConnectError as e:
            raise ConnectionError(f"Failed to connect to NVIDIA NIM: {e}") from e
 def convert_chat_completion_request(
    request: ChatCompletionRequest,
    n: int = 1,
 ) -> dict:
    """
    Convert a ChatCompletionRequest to an OpenAI API-compatible dictionary.
    """
    # model -> model
    # messages -> messages
    # sampling_params  TODO(mattf): review strategy
    #  strategy=greedy -> nvext.top_k = -1, temperature = temperature
    #  strategy=top_p -> nvext.top_k = -1, top_p = top_p
    #  strategy=top_k -> nvext.top_k = top_k
    #  temperature -> temperature
    #  top_p -> top_p
    #  top_k -> nvext.top_k
    #  max_tokens -> max_tokens
    #  repetition_penalty -> nvext.repetition_penalty
    # tools -> tools
    # tool_choice ("auto", "required") -> tool_choice
    # tool_prompt_format -> TBD
    # stream -> stream
    # logprobs -> logprobs
    print(f"sampling_params: {request.sampling_params}")
    payload: Dict[str, Any] = dict(
        model=request.model,
        messages=[convert_message(message) for message in request.messages],
        stream=request.stream,
        nvext={},
        n=n,
    )
    nvext = payload["nvext"]
    if request.tools:
        payload.update(tools=request.tools)
        if request.tool_choice:
            payload.update(
                tool_choice=request.tool_choice.value
            )  # we cannot include tool_choice w/o tools, server will complain
    if request.logprobs:
        payload.update(logprobs=True)
        payload.update(top_logprobs=request.logprobs.top_k)
    if request.sampling_params:
        nvext.update(repetition_penalty=request.sampling_params.repetition_penalty)
        if request.sampling_params.max_tokens:
            payload.update(max_tokens=request.sampling_params.max_tokens)
        if request.sampling_params.strategy == "top_p":
            nvext.update(top_k=-1)
            payload.update(top_p=request.sampling_params.top_p)
        elif request.sampling_params.strategy == "top_k":
            if (
                request.sampling_params.top_k != -1
                and request.sampling_params.top_k < 1
            ):
                warnings.warn("top_k must be -1 or >= 1")
            nvext.update(top_k=request.sampling_params.top_k)
        elif request.sampling_params.strategy == "greedy":
            nvext.update(top_k=-1)
            payload.update(temperature=request.sampling_params.temperature)
    return payload
 def _parse_content(completion: dict) -> str:
    """
    Get the content from an OpenAI completion response.
    OpenAI completion response format -
        {
            ...
            "message": {"role": "assistant", "content": ..., ...},
            ...
        }
    """
    # content is nullable in the OpenAI response, common for tool calls
    return completion["message"]["content"] or ""
 def _parse_stop_reason(completion: dict) -> StopReason:
    """
    Get the StopReason from an OpenAI completion response.
    OpenAI completion response format -
        {
            ...
            "finish_reason": "length" or "stop" or "tool_calls",
            ...
        }
    """
    # StopReason options are end_of_turn, end_of_message, out_of_tokens
    # TODO(mf): is end_of_turn and end_of_message usage correct?
    stop_reason = StopReason.end_of_turn
    if completion["finish_reason"] == "length":
        stop_reason = StopReason.out_of_tokens
    elif completion["finish_reason"] == "stop":
        stop_reason = StopReason.end_of_message
    elif completion["finish_reason"] == "tool_calls":
        stop_reason = StopReason.end_of_turn
    return stop_reason
 def _parse_tool_calls(completion: dict) -> List[ToolCall]:
    """
    Get the tool calls from an OpenAI completion response.
    OpenAI completion response format -
        {
            ...,
            "message": {
                ...,
                "tool_calls": [
                    {
                        "id": X,
                        "type": "function",
                        "function": {
                            "name": Y,
                            "arguments": Z,
                        },
                    }*
                ],
            },
        }
    ->
        [
            ToolCall(call_id=X, tool_name=Y, arguments=Z),
            ...
        ]
    """
    tool_calls = []
    if "tool_calls" in completion["message"]:
        assert isinstance(
            completion["message"]["tool_calls"], list
        ), "error in server response: tool_calls not a list"
        for call in completion["message"]["tool_calls"]:
            assert "id" in call, "error in server response: tool call id not found"
            assert (
                "function" in call
            ), "error in server response: tool call function not found"
            assert (
                "name" in call["function"]
            ), "error in server response: tool call function name not found"
            assert (
                "arguments" in call["function"]
            ), "error in server response: tool call function arguments not found"
            tool_calls.append(
                ToolCall(
                    call_id=call["id"],
                    tool_name=call["function"]["name"],
                    arguments=call["function"]["arguments"],
                )
            )
    return tool_calls
 def _parse_logprobs(completion: dict) -> Optional[List[TokenLogProbs]]:
    """
    Extract logprobs from OpenAI as a list of TokenLogProbs.
    OpenAI completion response format -
        {
            ...
            "logprobs": {
                content: [
                    {
                        ...,
                        top_logprobs: [{token: X, logprob: Y, bytes: [...]}+]
                    }+
                ]
            },
            ...
        }
    ->
        [
            TokenLogProbs(
                logprobs_by_token={X: Y, ...}
            ),
            ...
        ]
    """
    if not (logprobs := completion.get("logprobs")):
        return None
    return [
        TokenLogProbs(
            logprobs_by_token={
                logprobs["token"]: logprobs["logprob"]
                for logprobs in content["top_logprobs"]
            }
        )
        for content in logprobs["content"]
    ]
 def parse_completion(
    completion: dict,
 ) -> ChatCompletionResponse:
    """
    Parse an OpenAI completion response into a CompletionMessage and logprobs.
    OpenAI completion response format -
        {
            "message": {
                "role": "assistant",
                "content": ...,
                "tool_calls": [
                    {
                        ...
                        "id": ...,
                        "function": {
                            "name": ...,
                            "arguments": ...,
                        },
                    }*
                ]?,
            "finish_reason": ...,
            "logprobs": {
                "content": [
                    {
                        ...,
                        "top_logprobs": [{"token": ..., "logprob": ..., ...}+]
                    }+
                ]
            }?
        }
    """
    assert "message" in completion, "error in server response: message not found"
    assert (
        "finish_reason" in completion
    ), "error in server response: finish_reason not found"
    return ChatCompletionResponse(
        completion_message=CompletionMessage(
            content=_parse_content(completion),
            stop_reason=_parse_stop_reason(completion),
            tool_calls=_parse_tool_calls(completion),
        ),
        logprobs=_parse_logprobs(completion),
    )
--- a/llama_stack/providers/registry/inference.py
+++ b/llama_stack/providers/registry/inference.py
@ -144,7 +144,9 @@ def available_providers() -> List[ProviderSpec]:
            api=Api.inference,
            adapter=AdapterSpec(
                adapter_type="nvidia",
-                pip_packages=[],  # TODO(mf): need to specify httpx if it's already a llama-stack dep?
+                pip_packages=[
                    "openai",
                ],
                module="llama_stack.providers.adapters.inference.nvidia",
                config_class="llama_stack.providers.adapters.inference.nvidia.NVIDIAConfig",
            ),
--- a/tests/nvidia/integration/test_inference.py
+++ b/tests/nvidia/integration/test_inference.py
@ -8,11 +8,15 @@ import itertools
 from typing import Generator, List, Tuple
 import pytest
 from llama_models.datatypes import SamplingParams
 from llama_stack.apis.inference import (
    ChatCompletionResponse,
    ChatCompletionResponseEventType,
    ChatCompletionResponseStreamChunk,
    CompletionMessage,
    Inference,
    # LogProbConfig,
    Message,
    StopReason,
    SystemMessage,
@ -96,6 +100,70 @@ async def test_chat_completion_messages(
    assert response.completion_message.tool_calls == []
 async def test_chat_completion_basic(
    client: Inference,
    model: str,
 ):
    """
    Test the chat completion endpoint with basic messages, with and without streaming.
    """
    client = await client
    messages = [
        UserMessage(content="How are you?"),
    ]
    response = await client.chat_completion(
        model=model,
        messages=messages,
        stream=False,
    )
    assert isinstance(response, ChatCompletionResponse)
    assert isinstance(response.completion_message.content, str)
    # we're not testing accuracy, so no assertions on the result.completion_message.content
    assert response.completion_message.role == "assistant"
    assert isinstance(response.completion_message.stop_reason, StopReason)
    assert response.completion_message.tool_calls == []
 async def test_chat_completion_stream_basic(
    client: Inference,
    model: str,
 ):
    """
    Test the chat completion endpoint with basic messages, with and without streaming.
    """
    client = await client
    messages = [
        UserMessage(content="How are you?"),
    ]
    response = await client.chat_completion(
        model=model,
        messages=messages,
        stream=True,
        sampling_params=SamplingParams(max_tokens=5),
        # logprobs=LogProbConfig(top_k=3),
    )
    chunks = [chunk async for chunk in response]
    assert all(isinstance(chunk, ChatCompletionResponseStreamChunk) for chunk in chunks)
    assert all(isinstance(chunk.event.delta, str) for chunk in chunks)
    assert chunks[0].event.event_type == ChatCompletionResponseEventType.start
    assert chunks[-1].event.event_type == ChatCompletionResponseEventType.complete
    if len(chunks) > 2:
        assert all(
            chunk.event.event_type == ChatCompletionResponseEventType.progress
            for chunk in chunks[1:-1]
        )
    # we're not testing accuracy, so no assertions on the result.completion_message.content
    assert all(
        chunk.event.stop_reason is None
        or isinstance(chunk.event.stop_reason, StopReason)
        for chunk in chunks
    )
 async def test_bad_base_url(
    model: str,
 ):
--- a/tests/nvidia/unit/test_chat_completion.py
+++ b/tests/nvidia/unit/test_chat_completion.py
@ -157,7 +157,7 @@ async def test_tools(
                                "type": "function",
                                "function": {
                                    "name": "magic",
-                                    "arguments": {"input": 3},
+                                    "arguments": '{"input": 3}',
                                },
                            },
                            {
@ -165,7 +165,7 @@ async def test_tools(
                                "type": "function",
                                "function": {
                                    "name": "magic!",
-                                    "arguments": {"input": 42},
+                                    "arguments": '{"input": 42}',
                                },
                            },
                        ],
--- a/tests/nvidia/unit/test_openai_utils.py
+++ b/tests/nvidia/unit/test_openai_utils.py
@ -0,0 +1,493 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 from typing import AsyncGenerator, List
 import pytest
 from llama_models.llama3.api.datatypes import StopReason
 from llama_stack.apis.inference import (
    ChatCompletionResponse,
    ChatCompletionResponseEventType,
    ChatCompletionResponseStreamChunk,
 )
 from llama_stack.providers.adapters.inference.nvidia._openai_utils import (
    convert_openai_chat_completion_choice,
    convert_openai_chat_completion_stream,
 )
 from openai.types.chat import (
    ChatCompletionChunk as OpenAIChatCompletionChunk,
    ChatCompletionMessage,
    ChatCompletionMessageToolCall,
    ChatCompletionTokenLogprob,
 )
 from openai.types.chat.chat_completion import Choice, ChoiceLogprobs
 from openai.types.chat.chat_completion_chunk import (
    Choice as ChoiceChunk,
    ChoiceDelta,
    ChoiceDeltaToolCall,
    ChoiceDeltaToolCallFunction,
 )
 from openai.types.chat.chat_completion_token_logprob import TopLogprob
 def test_convert_openai_chat_completion_choice_basic():
    response = Choice(
        index=0,
        message=ChatCompletionMessage(
            role="assistant",
            content="Hello, world!",
        ),
        finish_reason="stop",
    )
    result = convert_openai_chat_completion_choice(response)
    assert isinstance(result, ChatCompletionResponse)
    assert result.completion_message.content == "Hello, world!"
    assert result.completion_message.stop_reason == StopReason.end_of_turn
    assert result.completion_message.tool_calls == []
    assert result.logprobs is None
 def test_convert_openai_chat_completion_choice_basic_with_tool_calls():
    response = Choice(
        index=0,
        message=ChatCompletionMessage(
            role="assistant",
            content="Hello, world!",
            tool_calls=[
                ChatCompletionMessageToolCall(
                    id="tool_call_id",
                    type="function",
                    function={
                        "name": "test_function",
                        "arguments": '{"test_args": "test_value"}',
                    },
                )
            ],
        ),
        finish_reason="tool_calls",
    )
    result = convert_openai_chat_completion_choice(response)
    assert isinstance(result, ChatCompletionResponse)
    assert result.completion_message.content == "Hello, world!"
    assert result.completion_message.stop_reason == StopReason.end_of_message
    assert len(result.completion_message.tool_calls) == 1
    assert result.completion_message.tool_calls[0].tool_name == "test_function"
    assert result.completion_message.tool_calls[0].arguments == {
        "test_args": "test_value"
    }
    assert result.logprobs is None
 def test_convert_openai_chat_completion_choice_basic_with_logprobs():
    response = Choice(
        index=0,
        message=ChatCompletionMessage(
            role="assistant",
            content="Hello world",
        ),
        finish_reason="stop",
        logprobs=ChoiceLogprobs(
            content=[
                ChatCompletionTokenLogprob(
                    token="Hello",
                    logprob=-1.0,
                    bytes=[72, 101, 108, 108, 111],
                    top_logprobs=[
                        TopLogprob(
                            token="Hello", logprob=-1.0, bytes=[72, 101, 108, 108, 111]
                        ),
                        TopLogprob(
                            token="Greetings",
                            logprob=-1.5,
                            bytes=[71, 114, 101, 101, 116, 105, 110, 103, 115],
                        ),
                    ],
                ),
                ChatCompletionTokenLogprob(
                    token="world",
                    logprob=-1.5,
                    bytes=[119, 111, 114, 108, 100],
                    top_logprobs=[
                        TopLogprob(
                            token="world", logprob=-1.5, bytes=[119, 111, 114, 108, 100]
                        ),
                        TopLogprob(
                            token="planet",
                            logprob=-2.0,
                            bytes=[112, 108, 97, 110, 101, 116],
                        ),
                    ],
                ),
            ]
        ),
    )
    result = convert_openai_chat_completion_choice(response)
    assert isinstance(result, ChatCompletionResponse)
    assert result.completion_message.content == "Hello world"
    assert result.completion_message.stop_reason == StopReason.end_of_turn
    assert result.completion_message.tool_calls == []
    assert result.logprobs is not None
    assert len(result.logprobs) == 2
    assert len(result.logprobs[0].logprobs_by_token) == 2
    assert result.logprobs[0].logprobs_by_token["Hello"] == -1.0
    assert result.logprobs[0].logprobs_by_token["Greetings"] == -1.5
    assert len(result.logprobs[1].logprobs_by_token) == 2
    assert result.logprobs[1].logprobs_by_token["world"] == -1.5
    assert result.logprobs[1].logprobs_by_token["planet"] == -2.0
 def test_convert_openai_chat_completion_choice_missing_message():
    response = Choice(
        index=0,
        message=ChatCompletionMessage(
            role="assistant",
            content="Hello, world!",
        ),
        finish_reason="stop",
    )
    response.message = None
    with pytest.raises(
        AssertionError, match="error in server response: message not found"
    ):
        convert_openai_chat_completion_choice(response)
    del response.message
    with pytest.raises(
        AssertionError, match="error in server response: message not found"
    ):
        convert_openai_chat_completion_choice(response)
 def test_convert_openai_chat_completion_choice_missing_finish_reason():
    response = Choice(
        index=0,
        message=ChatCompletionMessage(
            role="assistant",
            content="Hello, world!",
        ),
        finish_reason="stop",
    )
    response.finish_reason = None
    with pytest.raises(
        AssertionError, match="error in server response: finish_reason not found"
    ):
        convert_openai_chat_completion_choice(response)
    del response.finish_reason
    with pytest.raises(
        AssertionError, match="error in server response: finish_reason not found"
    ):
        convert_openai_chat_completion_choice(response)
 # we want to test convert_openai_chat_completion_stream
 # we need to produce a stream of OpenAIChatCompletionChunk
 # streams to produce -
 #  0. basic stream with one chunk, should produce 3 (start, progress, complete)
 #  1. stream with 3 chunks, should produce 5 events (start, progress, progress, progress, complete)
 #  2. stream with a tool call, should produce 4 events (start, progress w/ tool_call, complete)
@pytest.mark.asyncio
 async def test_convert_openai_chat_completion_stream_basic():
    chunks = [
        OpenAIChatCompletionChunk(
            id="1",
            created=1234567890,
            model="mock-model",
            object="chat.completion.chunk",
            choices=[
                ChoiceChunk(
                    index=0,
                    delta=ChoiceDelta(
                        role="assistant",
                        content="Hello, world!",
                    ),
                    finish_reason="stop",
                )
            ],
        )
    ]
    async def async_generator_from_list(items: List) -> AsyncGenerator:
        for item in items:
            yield item
    results = [
        result
        async for result in convert_openai_chat_completion_stream(
            async_generator_from_list(chunks)
        )
    ]
    assert len(results) == 2
    assert all(
        isinstance(result, ChatCompletionResponseStreamChunk) for result in results
    )
    assert results[0].event.event_type == ChatCompletionResponseEventType.start
    assert results[0].event.delta == "Hello, world!"
    assert results[1].event.event_type == ChatCompletionResponseEventType.complete
    assert results[1].event.stop_reason == StopReason.end_of_turn
@pytest.mark.asyncio
 async def test_convert_openai_chat_completion_stream_basic_empty():
    chunks = [
        OpenAIChatCompletionChunk(
            id="1",
            created=1234567890,
            model="mock-model",
            object="chat.completion.chunk",
            choices=[
                ChoiceChunk(
                    index=0,
                    delta=ChoiceDelta(
                        role="assistant",
                    ),
                    finish_reason="stop",
                )
            ],
        ),
        OpenAIChatCompletionChunk(
            id="1",
            created=1234567890,
            model="mock-model",
            object="chat.completion.chunk",
            choices=[
                ChoiceChunk(
                    index=0,
                    delta=ChoiceDelta(
                        role="assistant",
                        content="Hello, world!",
                    ),
                    finish_reason="stop",
                )
            ],
        ),
    ]
    async def async_generator_from_list(items: List) -> AsyncGenerator:
        for item in items:
            yield item
    results = [
        result
        async for result in convert_openai_chat_completion_stream(
            async_generator_from_list(chunks)
        )
    ]
    print(results)
    assert len(results) == 3
    assert all(
        isinstance(result, ChatCompletionResponseStreamChunk) for result in results
    )
    assert results[0].event.event_type == ChatCompletionResponseEventType.start
    assert results[1].event.event_type == ChatCompletionResponseEventType.progress
    assert results[1].event.delta == "Hello, world!"
    assert results[2].event.event_type == ChatCompletionResponseEventType.complete
    assert results[2].event.stop_reason == StopReason.end_of_turn
@pytest.mark.asyncio
 async def test_convert_openai_chat_completion_stream_multiple_chunks():
    chunks = [
        OpenAIChatCompletionChunk(
            id="1",
            created=1234567890,
            model="mock-model",
            object="chat.completion.chunk",
            choices=[
                ChoiceChunk(
                    index=0,
                    delta=ChoiceDelta(
                        role="assistant",
                        content="Hello, world!",
                    ),
                    # finish_reason="continue",
                )
            ],
        ),
        OpenAIChatCompletionChunk(
            id="2",
            created=1234567891,
            model="mock-model",
            object="chat.completion.chunk",
            choices=[
                ChoiceChunk(
                    index=0,
                    delta=ChoiceDelta(
                        role="assistant",
                        content="How are you?",
                    ),
                    # finish_reason="continue",
                )
            ],
        ),
        OpenAIChatCompletionChunk(
            id="3",
            created=1234567892,
            model="mock-model",
            object="chat.completion.chunk",
            choices=[
                ChoiceChunk(
                    index=0,
                    delta=ChoiceDelta(
                        role="assistant",
                        content="I'm good, thanks!",
                    ),
                    finish_reason="stop",
                )
            ],
        ),
    ]
    async def async_generator_from_list(items: List) -> AsyncGenerator:
        for item in items:
            yield item
    results = [
        result
        async for result in convert_openai_chat_completion_stream(
            async_generator_from_list(chunks)
        )
    ]
    assert len(results) == 4
    assert all(
        isinstance(result, ChatCompletionResponseStreamChunk) for result in results
    )
    assert results[0].event.event_type == ChatCompletionResponseEventType.start
    assert results[0].event.delta == "Hello, world!"
    assert not results[0].event.stop_reason
    assert results[1].event.event_type == ChatCompletionResponseEventType.progress
    assert results[1].event.delta == "How are you?"
    assert not results[1].event.stop_reason
    assert results[2].event.event_type == ChatCompletionResponseEventType.progress
    assert results[2].event.delta == "I'm good, thanks!"
    assert not results[2].event.stop_reason
    assert results[3].event.event_type == ChatCompletionResponseEventType.complete
    assert results[3].event.stop_reason == StopReason.end_of_turn
@pytest.mark.asyncio
 async def test_convert_openai_chat_completion_stream_with_tool_call_and_content():
    chunks = [
        OpenAIChatCompletionChunk(
            id="1",
            created=1234567890,
            model="mock-model",
            object="chat.completion.chunk",
            choices=[
                ChoiceChunk(
                    index=0,
                    delta=ChoiceDelta(
                        role="assistant",
                        content="Hello, world!",
                        tool_calls=[
                            ChoiceDeltaToolCall(
                                index=0,
                                id="tool_call_id",
                                type="function",
                                function=ChoiceDeltaToolCallFunction(
                                    name="test_function",
                                    arguments='{"test_args": "test_value"}',
                                ),
                            )
                        ],
                    ),
                    finish_reason="tool_calls",
                )
            ],
        )
    ]
    async def async_generator_from_list(items: List) -> AsyncGenerator:
        for item in items:
            yield item
    results = [
        result
        async for result in convert_openai_chat_completion_stream(
            async_generator_from_list(chunks)
        )
    ]
    assert len(results) == 3
    assert all(
        isinstance(result, ChatCompletionResponseStreamChunk) for result in results
    )
    assert results[0].event.event_type == ChatCompletionResponseEventType.start
    assert results[0].event.delta == "Hello, world!"
    assert not results[0].event.stop_reason
    assert results[1].event.event_type == ChatCompletionResponseEventType.progress
    assert not isinstance(results[1].event.delta, str)
    assert results[1].event.delta.content.tool_name == "test_function"
    assert results[1].event.delta.content.arguments == {"test_args": "test_value"}
    assert not results[1].event.stop_reason
    assert results[2].event.event_type == ChatCompletionResponseEventType.complete
    assert results[2].event.stop_reason == StopReason.end_of_message
@pytest.mark.asyncio
 async def test_convert_openai_chat_completion_stream_with_tool_call_and_no_content():
    chunks = [
        OpenAIChatCompletionChunk(
            id="1",
            created=1234567890,
            model="mock-model",
            object="chat.completion.chunk",
            choices=[
                ChoiceChunk(
                    index=0,
                    delta=ChoiceDelta(
                        role="assistant",
                        tool_calls=[
                            ChoiceDeltaToolCall(
                                index=0,
                                id="tool_call_id",
                                type="function",
                                function=ChoiceDeltaToolCallFunction(
                                    name="test_function",
                                    arguments='{"test_args": "test_value"}',
                                ),
                            )
                        ],
                    ),
                    finish_reason="tool_calls",
                )
            ],
        )
    ]
    async def async_generator_from_list(items: List) -> AsyncGenerator:
        for item in items:
            yield item
    results = [
        result
        async for result in convert_openai_chat_completion_stream(
            async_generator_from_list(chunks)
        )
    ]
    assert len(results) == 2
    assert all(
        isinstance(result, ChatCompletionResponseStreamChunk) for result in results
    )
    assert results[0].event.event_type == ChatCompletionResponseEventType.start
    assert not isinstance(results[0].event.delta, str)
    assert results[0].event.delta.content.tool_name == "test_function"
    assert results[0].event.delta.content.arguments == {"test_args": "test_value"}
    assert not results[0].event.stop_reason
    assert results[1].event.event_type == ChatCompletionResponseEventType.complete
    assert results[1].event.stop_reason == StopReason.end_of_message