Merge branch 'main' into use-openai-for-ollama

2025-10-10 05:24:39 +00:00 · 2025-09-15 15:31:03 -04:00 · 2025-09-15 15:31:03 -04:00 · 91fb6f42cb
commit 91fb6f42cb
parent 7b5685b1d9 01bdcce4d2
74 changed files with 8761 additions and 971 deletions
--- a/tests/integration/inference/test_openai_completion.py
+++ b/tests/integration/inference/test_openai_completion.py
@ -6,12 +6,25 @@


 import time
+import unicodedata

 import pytest

 from ..test_cases.test_case import TestCase


+def _normalize_text(text: str) -> str:
+    """
+    Return a lowercase, ASCII-only form of ``text`` for tolerant substring comparison.
+
+    The text is decomposed with NFD, so an accented letter becomes its base letter followed by
+    combining marks. Encoding to ASCII with the "ignore" error handler then drops those marks,
+    and the result is lowercased. For example, "sōl" becomes "sol".
+
+    Motivation: the test case streaming_01 expects the answer "Sol" for the question "What's the
+    name of the Sun in latin?", but a model may return "sōl" (with a macron over the 'o'), which
+    is the correct Latin spelling. A simple case-insensitive string search for "sol" would not
+    match a response that contains the diacritical mark.
+
+    NOTE: any non-ASCII character without an ASCII decomposition (em dashes, emoji, non-Latin
+    letters, ...) is dropped as well, not only diacritical marks.
+    """
+    return unicodedata.normalize("NFD", text).encode("ascii", "ignore").decode("ascii").lower()
+
+
 def provider_from_model(client_with_models, model_id):
    models = {m.identifier: m for m in client_with_models.models.list()}
    models.update({m.provider_resource_id: m for m in client_with_models.models.list()})
@ -42,6 +55,10 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
        "remote::groq",
        "remote::gemini",  # https://generativelanguage.googleapis.com/v1beta/openai/completions -> 404
        "remote::anthropic",  # at least claude-3-{5,7}-{haiku,sonnet}-* / claude-{sonnet,opus}-4-* are not supported
+        "remote::azure",  # {'error': {'code': 'OperationNotSupported', 'message': 'The completion operation
+        #  does not work with the specified model, gpt-5-mini. Please choose different model and try
+        #  again. You can learn more about which models can be used with each operation here:
+        #  https://go.microsoft.com/fwlink/?linkid=2197993.'}}"}
    ):
        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")

@ -157,7 +174,8 @@ def test_openai_completion_non_streaming_suffix(llama_stack_client, client_with_
    assert len(response.choices) > 0
    choice = response.choices[0]
    assert len(choice.text) > 5
-    assert "france" in choice.text.lower()
+    normalized_text = _normalize_text(choice.text)
+    assert "france" in normalized_text


@pytest.mark.parametrize(
@ -248,7 +266,9 @@ def test_openai_chat_completion_non_streaming(compat_client, client_with_models,
    )
    message_content = response.choices[0].message.content.lower().strip()
    assert len(message_content) > 0
-    assert expected.lower() in message_content
+    normalized_expected = _normalize_text(expected)
+    normalized_content = _normalize_text(message_content)
+    assert normalized_expected in normalized_content


@pytest.mark.parametrize(
@ -272,10 +292,13 @@ def test_openai_chat_completion_streaming(compat_client, client_with_models, tex
    )
    streamed_content = []
    for chunk in response:
-        if chunk.choices[0].delta.content:
+        # On some providers like Azure, the choices are empty on the first chunk, so we need to check for that
+        if chunk.choices and len(chunk.choices) > 0 and chunk.choices[0].delta.content:
            streamed_content.append(chunk.choices[0].delta.content.lower().strip())
    assert len(streamed_content) > 0
-    assert expected.lower() in "".join(streamed_content)
+    normalized_expected = _normalize_text(expected)
+    normalized_content = _normalize_text("".join(streamed_content))
+    assert normalized_expected in normalized_content


@pytest.mark.parametrize(
@ -308,8 +331,12 @@ def test_openai_chat_completion_streaming_with_n(compat_client, client_with_mode
                    streamed_content.get(choice.index, "") + choice.delta.content.lower().strip()
                )
    assert len(streamed_content) == 2
+    normalized_expected = _normalize_text(expected)
    for i, content in streamed_content.items():
-        assert expected.lower() in content, f"Choice {i}: Expected {expected.lower()} in {content}"
+        normalized_content = _normalize_text(content)
+        assert normalized_expected in normalized_content, (
+            f"Choice {i}: Expected {normalized_expected} in {normalized_content}"
+        )


@pytest.mark.parametrize(
@ -339,9 +366,9 @@ def test_inference_store(compat_client, client_with_models, text_model_id, strea
        content = ""
        response_id = None
        for chunk in response:
-            if response_id is None:
+            if response_id is None and chunk.id:
                response_id = chunk.id
-            if chunk.choices[0].delta.content:
+            if chunk.choices and len(chunk.choices) > 0 and chunk.choices[0].delta.content:
                content += chunk.choices[0].delta.content
    else:
        response_id = response.id
@ -410,11 +437,12 @@ def test_inference_store_tool_calls(compat_client, client_with_models, text_mode
        content = ""
        response_id = None
        for chunk in response:
-            if response_id is None:
+            if response_id is None and chunk.id:
                response_id = chunk.id
-            if delta := chunk.choices[0].delta:
-                if delta.content:
-                    content += delta.content
+            if chunk.choices and len(chunk.choices) > 0:
+                if delta := chunk.choices[0].delta:
+                    if delta.content:
+                        content += delta.content
    else:
        response_id = response.id
        content = response.choices[0].message.content
@ -484,4 +512,5 @@ def test_openai_chat_completion_non_streaming_with_file(openai_client, client_wi
        stream=False,
    )
    message_content = response.choices[0].message.content.lower().strip()
-    assert "hello world" in message_content
+    normalized_content = _normalize_text(message_content)
+    assert "hello world" in normalized_content
--- a/tests/integration/inference/test_text_inference.py
+++ b/tests/integration/inference/test_text_inference.py
@ -32,6 +32,7 @@ def skip_if_model_doesnt_support_completion(client_with_models, model_id):
            "remote::vertexai",
            "remote::groq",
            "remote::sambanova",
+            "remote::azure",
        )
        or "openai-compat" in provider.provider_type
    ):
@ -44,7 +45,7 @@ def skip_if_model_doesnt_support_json_schema_structured_output(client_with_model
    provider_id = models[model_id].provider_id
    providers = {p.provider_id: p for p in client_with_models.providers.list()}
    provider = providers[provider_id]
-    if provider.provider_type in ("remote::sambanova",):
+    if provider.provider_type in ("remote::sambanova", "remote::azure"):
        pytest.skip(
            f"Model {model_id} hosted by {provider.provider_type} doesn't support json_schema structured output"
        )
--- a/tests/integration/recordings/responses/0fda25b9241c.json
+++ b/tests/integration/recordings/responses/0fda25b9241c.json
@ -0,0 +1,71 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "gpt-5-mini",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Which planet do humans live on?"
+        }
+      ],
+      "stream": false
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "gpt-5-mini"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "chatcmpl-CECIXqfvjuluKkZtG3q2QJoSQhBU0",
+        "choices": [
+          {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "Humans live on Earth \u2014 the third planet from the Sun. It's the only known planet that naturally supports life, with a breathable atmosphere, liquid water, and temperatures suitable for living organisms.",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": [],
+              "audio": null,
+              "function_call": null,
+              "tool_calls": null
+            },
+            "content_filter_results": {}
+          }
+        ],
+        "created": 1757499901,
+        "model": "gpt-5-mini-2025-08-07",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": null,
+        "usage": {
+          "completion_tokens": 112,
+          "prompt_tokens": 13,
+          "total_tokens": 125,
+          "completion_tokens_details": {
+            "accepted_prediction_tokens": 0,
+            "audio_tokens": 0,
+            "reasoning_tokens": 64,
+            "rejected_prediction_tokens": 0
+          },
+          "prompt_tokens_details": {
+            "audio_tokens": 0,
+            "cached_tokens": 0
+          }
+        },
+        "prompt_filter_results": [
+          {
+            "prompt_index": 0,
+            "content_filter_results": {}
+          }
+        ]
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/recordings/responses/2b2ad549510d.json
+++ b/tests/integration/recordings/responses/2b2ad549510d.json
@ -0,0 +1,448 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "gpt-5-mini",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Hello, world!"
+        }
+      ],
+      "stream": true
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "gpt-5-mini"
+  },
+  "response": {
+    "body": [
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "",
+          "choices": [],
+          "created": 0,
+          "model": "",
+          "object": "",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null,
+          "prompt_filter_results": [
+            {
+              "prompt_index": 0,
+              "content_filter_results": {}
+            }
+          ]
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
+          "choices": [
+            {
+              "delta": {
+                "content": "",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499910,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
+          "choices": [
+            {
+              "delta": {
+                "content": "Hello",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499910,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
+          "choices": [
+            {
+              "delta": {
+                "content": ",",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499910,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
+          "choices": [
+            {
+              "delta": {
+                "content": " world",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499910,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
+          "choices": [
+            {
+              "delta": {
+                "content": "!",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499910,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
+          "choices": [
+            {
+              "delta": {
+                "content": " Hi",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499910,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
+          "choices": [
+            {
+              "delta": {
+                "content": " \u2014",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499910,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
+          "choices": [
+            {
+              "delta": {
+                "content": " how",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499910,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
+          "choices": [
+            {
+              "delta": {
+                "content": " can",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499910,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
+          "choices": [
+            {
+              "delta": {
+                "content": " I",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499910,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
+          "choices": [
+            {
+              "delta": {
+                "content": " help",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499910,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
+          "choices": [
+            {
+              "delta": {
+                "content": " you",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499910,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
+          "choices": [
+            {
+              "delta": {
+                "content": " today",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499910,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
+          "choices": [
+            {
+              "delta": {
+                "content": "?",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499910,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
+          "choices": [
+            {
+              "delta": {
+                "content": null,
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": "stop",
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499910,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      }
+    ],
+    "is_streaming": true
+  }
+}
--- a/tests/integration/recordings/responses/57b67d1b1a36.json
+++ b/tests/integration/recordings/responses/57b67d1b1a36.json
@ -0,0 +1,71 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "gpt-5-mini",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Which planet has rings around it with a name starting with letter S?"
+        }
+      ],
+      "stream": false
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "gpt-5-mini"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "chatcmpl-CECIkT5cbqFazpungtewksVePcUNa",
+        "choices": [
+          {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "Saturn. It's the planet famous for its prominent ring system made of ice and rock.",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": [],
+              "audio": null,
+              "function_call": null,
+              "tool_calls": null
+            },
+            "content_filter_results": {}
+          }
+        ],
+        "created": 1757499914,
+        "model": "gpt-5-mini-2025-08-07",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": null,
+        "usage": {
+          "completion_tokens": 156,
+          "prompt_tokens": 20,
+          "total_tokens": 176,
+          "completion_tokens_details": {
+            "accepted_prediction_tokens": 0,
+            "audio_tokens": 0,
+            "reasoning_tokens": 128,
+            "rejected_prediction_tokens": 0
+          },
+          "prompt_tokens_details": {
+            "audio_tokens": 0,
+            "cached_tokens": 0
+          }
+        },
+        "prompt_filter_results": [
+          {
+            "prompt_index": 0,
+            "content_filter_results": {}
+          }
+        ]
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/recordings/responses/8752115f8d0c.json
+++ b/tests/integration/recordings/responses/8752115f8d0c.json
@ -0,0 +1,71 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "gpt-5-mini",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Hello, world!"
+        }
+      ],
+      "stream": false
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "gpt-5-mini"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "chatcmpl-CECIuyylsMNXspa83k8LrD8SQadNY",
+        "choices": [
+          {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "Hello! \ud83d\udc4b How can I help you today \u2014 answer a question, write or edit something, debug code, brainstorm ideas, or anything else?",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": [],
+              "audio": null,
+              "function_call": null,
+              "tool_calls": null
+            },
+            "content_filter_results": {}
+          }
+        ],
+        "created": 1757499924,
+        "model": "gpt-5-mini-2025-08-07",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": null,
+        "usage": {
+          "completion_tokens": 40,
+          "prompt_tokens": 10,
+          "total_tokens": 50,
+          "completion_tokens_details": {
+            "accepted_prediction_tokens": 0,
+            "audio_tokens": 0,
+            "reasoning_tokens": 0,
+            "rejected_prediction_tokens": 0
+          },
+          "prompt_tokens_details": {
+            "audio_tokens": 0,
+            "cached_tokens": 0
+          }
+        },
+        "prompt_filter_results": [
+          {
+            "prompt_index": 0,
+            "content_filter_results": {}
+          }
+        ]
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/recordings/responses/94d11daee205.json
+++ b/tests/integration/recordings/responses/94d11daee205.json
--- a/tests/integration/recordings/responses/9f3d749cc1c8.json
+++ b/tests/integration/recordings/responses/9f3d749cc1c8.json
--- a/tests/integration/recordings/responses/c791119e6359.json
+++ b/tests/integration/recordings/responses/c791119e6359.json
@ -0,0 +1,98 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "gpt-5-mini",
+      "messages": [
+        {
+          "role": "user",
+          "content": "What's the weather in Tokyo? Use the get_weather function to get the weather."
+        }
+      ],
+      "stream": false,
+      "tools": [
+        {
+          "type": "function",
+          "function": {
+            "name": "get_weather",
+            "description": "Get the weather in a given city",
+            "parameters": {
+              "type": "object",
+              "properties": {
+                "city": {
+                  "type": "string",
+                  "description": "The city to get the weather for"
+                }
+              }
+            }
+          }
+        }
+      ]
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "gpt-5-mini"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "chatcmpl-CECIwq9Odd0mOJMmw7ytv8iEazH4H",
+        "choices": [
+          {
+            "finish_reason": "tool_calls",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": null,
+              "refusal": null,
+              "role": "assistant",
+              "annotations": [],
+              "audio": null,
+              "function_call": null,
+              "tool_calls": [
+                {
+                  "id": "call_yw18spRc1jjUlEyabbXBhB33",
+                  "function": {
+                    "arguments": "{\"city\":\"Tokyo\"}",
+                    "name": "get_weather"
+                  },
+                  "type": "function"
+                }
+              ]
+            },
+            "content_filter_results": {}
+          }
+        ],
+        "created": 1757499926,
+        "model": "gpt-5-mini-2025-08-07",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": null,
+        "usage": {
+          "completion_tokens": 88,
+          "prompt_tokens": 151,
+          "total_tokens": 239,
+          "completion_tokens_details": {
+            "accepted_prediction_tokens": 0,
+            "audio_tokens": 0,
+            "reasoning_tokens": 64,
+            "rejected_prediction_tokens": 0
+          },
+          "prompt_tokens_details": {
+            "audio_tokens": 0,
+            "cached_tokens": 0
+          }
+        },
+        "prompt_filter_results": [
+          {
+            "prompt_index": 0,
+            "content_filter_results": {}
+          }
+        ]
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/recordings/responses/d3e27b7234e2.json
+++ b/tests/integration/recordings/responses/d3e27b7234e2.json
--- a/tests/integration/recordings/responses/fb785db7fafd.json
+++ b/tests/integration/recordings/responses/fb785db7fafd.json
@ -0,0 +1,310 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "gpt-5-mini",
+      "messages": [
+        {
+          "role": "user",
+          "content": "What's the weather in Tokyo? Use the get_weather function to get the weather."
+        }
+      ],
+      "stream": true,
+      "tools": [
+        {
+          "type": "function",
+          "function": {
+            "name": "get_weather",
+            "description": "Get the weather in a given city",
+            "parameters": {
+              "type": "object",
+              "properties": {
+                "city": {
+                  "type": "string",
+                  "description": "The city to get the weather for"
+                }
+              }
+            }
+          }
+        }
+      ]
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "gpt-5-mini"
+  },
+  "response": {
+    "body": [
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "",
+          "choices": [],
+          "created": 0,
+          "model": "",
+          "object": "",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null,
+          "prompt_filter_results": [
+            {
+              "prompt_index": 0,
+              "content_filter_results": {}
+            }
+          ]
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo",
+          "choices": [
+            {
+              "delta": {
+                "content": null,
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": [
+                  {
+                    "index": 0,
+                    "id": "call_TMbEoYn9q0ZKtoxav5LpD9Ts",
+                    "function": {
+                      "arguments": "",
+                      "name": "get_weather"
+                    },
+                    "type": "function"
+                  }
+                ]
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499912,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo",
+          "choices": [
+            {
+              "delta": {
+                "content": null,
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": [
+                  {
+                    "index": 0,
+                    "id": null,
+                    "function": {
+                      "arguments": "{\"",
+                      "name": null
+                    },
+                    "type": null
+                  }
+                ]
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499912,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo",
+          "choices": [
+            {
+              "delta": {
+                "content": null,
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": [
+                  {
+                    "index": 0,
+                    "id": null,
+                    "function": {
+                      "arguments": "city",
+                      "name": null
+                    },
+                    "type": null
+                  }
+                ]
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499912,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo",
+          "choices": [
+            {
+              "delta": {
+                "content": null,
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": [
+                  {
+                    "index": 0,
+                    "id": null,
+                    "function": {
+                      "arguments": "\":\"",
+                      "name": null
+                    },
+                    "type": null
+                  }
+                ]
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499912,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo",
+          "choices": [
+            {
+              "delta": {
+                "content": null,
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": [
+                  {
+                    "index": 0,
+                    "id": null,
+                    "function": {
+                      "arguments": "Tokyo",
+                      "name": null
+                    },
+                    "type": null
+                  }
+                ]
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499912,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo",
+          "choices": [
+            {
+              "delta": {
+                "content": null,
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": [
+                  {
+                    "index": 0,
+                    "id": null,
+                    "function": {
+                      "arguments": "\"}",
+                      "name": null
+                    },
+                    "type": null
+                  }
+                ]
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499912,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo",
+          "choices": [
+            {
+              "delta": {
+                "content": null,
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": "tool_calls",
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499912,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      }
+    ],
+    "is_streaming": true
+  }
+}
--- a/tests/integration/recordings/responses/ff3271401fb4.json
+++ b/tests/integration/recordings/responses/ff3271401fb4.json
@ -0,0 +1,556 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "gpt-5-mini",
+      "messages": [
+        {
+          "role": "user",
+          "content": "What is the name of the US captial?"
+        }
+      ],
+      "stream": true
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "gpt-5-mini"
+  },
+  "response": {
+    "body": [
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "",
+          "choices": [],
+          "created": 0,
+          "model": "",
+          "object": "",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null,
+          "prompt_filter_results": [
+            {
+              "prompt_index": 0,
+              "content_filter_results": {}
+            }
+          ]
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
+          "choices": [
+            {
+              "delta": {
+                "content": "",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499916,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
+          "choices": [
+            {
+              "delta": {
+                "content": "The",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499916,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
+          "choices": [
+            {
+              "delta": {
+                "content": " capital",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499916,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
+          "choices": [
+            {
+              "delta": {
+                "content": " of",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499916,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
+          "choices": [
+            {
+              "delta": {
+                "content": " the",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499916,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
+          "choices": [
+            {
+              "delta": {
+                "content": " United",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499916,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
+          "choices": [
+            {
+              "delta": {
+                "content": " States",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499916,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
+          "choices": [
+            {
+              "delta": {
+                "content": " is",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499916,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
+          "choices": [
+            {
+              "delta": {
+                "content": " Washington",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499916,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
+          "choices": [
+            {
+              "delta": {
+                "content": ",",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499916,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
+          "choices": [
+            {
+              "delta": {
+                "content": " D",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499916,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
+          "choices": [
+            {
+              "delta": {
+                "content": ".C",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499916,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
+          "choices": [
+            {
+              "delta": {
+                "content": ".",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499916,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
+          "choices": [
+            {
+              "delta": {
+                "content": " (",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499916,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
+          "choices": [
+            {
+              "delta": {
+                "content": "District",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499916,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
+          "choices": [
+            {
+              "delta": {
+                "content": " of",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499916,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
+          "choices": [
+            {
+              "delta": {
+                "content": " Columbia",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499916,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
+          "choices": [
+            {
+              "delta": {
+                "content": ").",
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499916,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
+          "choices": [
+            {
+              "delta": {
+                "content": null,
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": null
+              },
+              "finish_reason": "stop",
+              "index": 0,
+              "logprobs": null,
+              "content_filter_results": {}
+            }
+          ],
+          "created": 1757499916,
+          "model": "gpt-5-mini-2025-08-07",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      }
+    ],
+    "is_streaming": true
+  }
+}
--- a/tests/integration/telemetry/test_openai_telemetry.py
+++ b/tests/integration/telemetry/test_openai_telemetry.py
@ -49,16 +49,13 @@ def setup_openai_telemetry_data(llama_stack_client, text_model_id):
        traces = llama_stack_client.telemetry.query_traces(limit=10)
        if len(traces) >= 5:  # 5 OpenAI completion traces
            break
-        time.sleep(1)
+        time.sleep(0.1)

    if len(traces) < 5:
        pytest.fail(
            f"Failed to create sufficient OpenAI completion telemetry data after 30s. Got {len(traces)} traces."
        )

-    # Wait for 5 seconds to ensure traces has completed logging
-    time.sleep(5)
-
    yield


@ -185,11 +182,13 @@ def test_openai_completion_creates_telemetry(llama_stack_client, text_model_id):
    assert len(response.choices) > 0, "Response should have at least one choice"

    # Wait for telemetry to be recorded
-    time.sleep(3)
-
-    # Check that we have more traces now
-    final_traces = llama_stack_client.telemetry.query_traces(limit=20)
-    final_count = len(final_traces)
+    start_time = time.time()
+    while time.time() - start_time < 30:
+        final_traces = llama_stack_client.telemetry.query_traces(limit=20)
+        final_count = len(final_traces)
+        if final_count > initial_count:
+            break
+        time.sleep(0.1)

    # Should have at least as many traces as before (might have more due to other activity)
    assert final_count >= initial_count, "Should have at least as many traces after OpenAI call"
--- a/tests/integration/telemetry/test_telemetry.py
+++ b/tests/integration/telemetry/test_telemetry.py
@ -42,14 +42,11 @@ def setup_telemetry_data(llama_stack_client, text_model_id):
        traces = llama_stack_client.telemetry.query_traces(limit=10)
        if len(traces) >= 4:
            break
-        time.sleep(1)
+        time.sleep(0.1)

    if len(traces) < 4:
        pytest.fail(f"Failed to create sufficient telemetry data after 30s. Got {len(traces)} traces.")

-    # Wait for 5 seconds to ensure traces has completed logging
-    time.sleep(5)
-
    yield


--- a/tests/integration/telemetry/test_telemetry_metrics.py
+++ b/tests/integration/telemetry/test_telemetry_metrics.py
@ -46,10 +46,7 @@ def setup_telemetry_metrics_data(openai_client, client_with_models, text_model_i
                break
        except Exception:
            pass
-        time.sleep(1)
-
-    # Wait additional time to ensure all metrics are processed
-    time.sleep(5)
+        time.sleep(0.1)

    # Return the token lists for use in tests
    return {"prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens, "total_tokens": total_tokens}
--- a/tests/integration/tool_runtime/test_rag_tool.py
+++ b/tests/integration/tool_runtime/test_rag_tool.py
@ -183,6 +183,110 @@ def test_vector_db_insert_from_url_and_query(
    assert any("llama2" in chunk.content.lower() for chunk in response2.chunks)


+def test_rag_tool_openai_apis(client_with_empty_registry, embedding_model_id, embedding_dimension):
+    vector_db_id = "test_openai_vector_db"
+
+    client_with_empty_registry.vector_dbs.register(
+        vector_db_id=vector_db_id,
+        embedding_model=embedding_model_id,
+        embedding_dimension=embedding_dimension,
+    )
+
+    available_vector_dbs = [vector_db.identifier for vector_db in client_with_empty_registry.vector_dbs.list()]
+    actual_vector_db_id = available_vector_dbs[0]
+
+    # different document formats that should work with OpenAI APIs
+    documents = [
+        Document(
+            document_id="text-doc",
+            content="This is a plain text document about machine learning algorithms.",
+            metadata={"type": "text", "category": "AI"},
+        ),
+        Document(
+            document_id="url-doc",
+            content="https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/chat.rst",
+            mime_type="text/plain",
+            metadata={"type": "url", "source": "pytorch"},
+        ),
+        Document(
+            document_id="data-url-doc",
+            content="data:text/plain;base64,VGhpcyBpcyBhIGRhdGEgVVJMIGRvY3VtZW50IGFib3V0IGRlZXAgbGVhcm5pbmcu",  # "This is a data URL document about deep learning."
+            metadata={"type": "data_url", "encoding": "base64"},
+        ),
+    ]
+
+    client_with_empty_registry.tool_runtime.rag_tool.insert(
+        documents=documents,
+        vector_db_id=actual_vector_db_id,
+        chunk_size_in_tokens=256,
+    )
+
+    files_list = client_with_empty_registry.files.list()
+    assert len(files_list.data) >= len(documents), (
+        f"Expected at least {len(documents)} files, got {len(files_list.data)}"
+    )
+
+    vector_store_files = client_with_empty_registry.vector_io.openai_list_files_in_vector_store(
+        vector_store_id=actual_vector_db_id
+    )
+    assert len(vector_store_files.data) >= len(documents), f"Expected at least {len(documents)} files in vector store"
+
+    response = client_with_empty_registry.tool_runtime.rag_tool.query(
+        vector_db_ids=[actual_vector_db_id],
+        content="Tell me about machine learning and deep learning",
+    )
+
+    assert_valid_text_response(response)
+    content_text = " ".join([chunk.text for chunk in response.content]).lower()
+    assert "machine learning" in content_text or "deep learning" in content_text
+
+
+def test_rag_tool_exception_handling(client_with_empty_registry, embedding_model_id, embedding_dimension):
+    vector_db_id = "test_exception_handling"
+
+    client_with_empty_registry.vector_dbs.register(
+        vector_db_id=vector_db_id,
+        embedding_model=embedding_model_id,
+        embedding_dimension=embedding_dimension,
+    )
+
+    available_vector_dbs = [vector_db.identifier for vector_db in client_with_empty_registry.vector_dbs.list()]
+    actual_vector_db_id = available_vector_dbs[0]
+
+    documents = [
+        Document(
+            document_id="valid-doc",
+            content="This is a valid document that should be processed successfully.",
+            metadata={"status": "valid"},
+        ),
+        Document(
+            document_id="invalid-url-doc",
+            content="https://nonexistent-domain-12345.com/invalid.txt",
+            metadata={"status": "invalid_url"},
+        ),
+        Document(
+            document_id="another-valid-doc",
+            content="This is another valid document for testing resilience.",
+            metadata={"status": "valid"},
+        ),
+    ]
+
+    client_with_empty_registry.tool_runtime.rag_tool.insert(
+        documents=documents,
+        vector_db_id=actual_vector_db_id,
+        chunk_size_in_tokens=256,
+    )
+
+    response = client_with_empty_registry.tool_runtime.rag_tool.query(
+        vector_db_ids=[actual_vector_db_id],
+        content="valid document",
+    )
+
+    assert_valid_text_response(response)
+    content_text = " ".join([chunk.text for chunk in response.content]).lower()
+    assert "valid document" in content_text
+
+
 def test_rag_tool_insert_and_query(client_with_empty_registry, embedding_model_id, embedding_dimension):
    providers = [p for p in client_with_empty_registry.providers.list() if p.api == "vector_io"]
    assert len(providers) > 0
@ -249,3 +353,107 @@ def test_rag_tool_insert_and_query(client_with_empty_registry, embedding_model_i
                "chunk_template": "This should raise a ValueError because it is missing the proper template variables",
            },
        )
+
+
+def test_rag_tool_query_generation(client_with_empty_registry, embedding_model_id, embedding_dimension):
+    """Query a two-document store with "Tell me about AI" and expect the AI document's text back.
+
+    One document is about artificial intelligence / machine learning and the other is
+    unrelated; the assertion passes if either AI phrase appears in the returned text.
+    """
+    vector_db_id = "test_query_generation_db"
+
+    client_with_empty_registry.vector_dbs.register(
+        vector_db_id=vector_db_id,
+        embedding_model=embedding_model_id,
+        embedding_dimension=embedding_dimension,
+    )
+
+    # Use the identifier the server lists first rather than the requested id.
+    # NOTE(review): presumably the registry starts empty, so index 0 is the DB just
+    # registered - confirm against the fixture.
+    available_vector_dbs = [vector_db.identifier for vector_db in client_with_empty_registry.vector_dbs.list()]
+    actual_vector_db_id = available_vector_dbs[0]
+
+    documents = [
+        Document(
+            document_id="ai-doc",
+            content="Artificial intelligence and machine learning are transforming technology.",
+            metadata={"category": "AI"},
+        ),
+        # Unrelated distractor document.
+        Document(
+            document_id="banana-doc",
+            content="Don't bring a banana to a knife fight.",
+            metadata={"category": "wisdom"},
+        ),
+    ]
+
+    client_with_empty_registry.tool_runtime.rag_tool.insert(
+        documents=documents,
+        vector_db_id=actual_vector_db_id,
+        chunk_size_in_tokens=256,
+    )
+
+    response = client_with_empty_registry.tool_runtime.rag_tool.query(
+        vector_db_ids=[actual_vector_db_id],
+        content="Tell me about AI",
+    )
+
+    assert_valid_text_response(response)
+    # Join all returned chunks and compare case-insensitively.
+    content_text = " ".join([chunk.text for chunk in response.content]).lower()
+    assert "artificial intelligence" in content_text or "machine learning" in content_text
+
+
+def test_rag_tool_pdf_data_url_handling(client_with_empty_registry, embedding_model_id, embedding_dimension):
+    """Insert a PDF supplied as a base64 ``data:`` URL and verify it is stored and queryable.
+
+    After the insert, the test checks that a file whose name contains the document id
+    exists in the Files API with the original byte size, that its content starts with
+    the ``%PDF-`` marker, that the vector store lists at least one file, and that a
+    query returns text containing "title".
+    """
+    vector_db_id = "test_pdf_data_url_db"
+
+    client_with_empty_registry.vector_dbs.register(
+        vector_db_id=vector_db_id,
+        embedding_model=embedding_model_id,
+        embedding_dimension=embedding_dimension,
+    )
+
+    # Use the identifier the server lists first rather than the requested id.
+    # NOTE(review): presumably the registry starts empty, so index 0 is the DB just
+    # registered - confirm against the fixture.
+    available_vector_dbs = [vector_db.identifier for vector_db in client_with_empty_registry.vector_dbs.list()]
+    actual_vector_db_id = available_vector_dbs[0]
+
+    # Inline single-page PDF fixture; its /Title metadata is "This is a sample title.".
+    # The page content stream is Flate-compressed, so its text is not readable here.
+    sample_pdf = b"%PDF-1.3\n3 0 obj\n<</Type /Page\n/Parent 1 0 R\n/Resources 2 0 R\n/Contents 4 0 R>>\nendobj\n4 0 obj\n<</Filter /FlateDecode /Length 115>>\nstream\nx\x9c\x15\xcc1\x0e\x820\x18@\xe1\x9dS\xbcM]jk$\xd5\xd5(\x83!\x86\xa1\x17\xf8\xa3\xa5`LIh+\xd7W\xc6\xf7\r\xef\xc0\xbd\xd2\xaa\xb6,\xd5\xc5\xb1o\x0c\xa6VZ\xe3znn%\xf3o\xab\xb1\xe7\xa3:Y\xdc\x8bm\xeb\xf3&1\xc8\xd7\xd3\x97\xc82\xe6\x81\x87\xe42\xcb\x87Vb(\x12<\xdd<=}Jc\x0cL\x91\xee\xda$\xb5\xc3\xbd\xd7\xe9\x0f\x8d\x97 $\nendstream\nendobj\n1 0 obj\n<</Type /Pages\n/Kids [3 0 R ]\n/Count 1\n/MediaBox [0 0 595.28 841.89]\n>>\nendobj\n5 0 obj\n<</Type /Font\n/BaseFont /Helvetica\n/Subtype /Type1\n/Encoding /WinAnsiEncoding\n>>\nendobj\n2 0 obj\n<<\n/ProcSet [/PDF /Text /ImageB /ImageC /ImageI]\n/Font <<\n/F1 5 0 R\n>>\n/XObject <<\n>>\n>>\nendobj\n6 0 obj\n<<\n/Producer (PyFPDF 1.7.2 http://pyfpdf.googlecode.com/)\n/Title (This is a sample title.)\n/Author (Llama Stack Developers)\n/CreationDate (D:20250312165548)\n>>\nendobj\n7 0 obj\n<<\n/Type /Catalog\n/Pages 1 0 R\n/OpenAction [3 0 R /FitH null]\n/PageLayout /OneColumn\n>>\nendobj\nxref\n0 8\n0000000000 65535 f \n0000000272 00000 n \n0000000455 00000 n \n0000000009 00000 n \n0000000087 00000 n \n0000000359 00000 n \n0000000559 00000 n \n0000000734 00000 n \ntrailer\n<<\n/Size 8\n/Root 7 0 R\n/Info 6 0 R\n>>\nstartxref\n837\n%%EOF\n"
+
+    import base64
+
+    # Wrap the raw bytes as an RFC 2397 style data URL with an application/pdf media type.
+    pdf_base64 = base64.b64encode(sample_pdf).decode("utf-8")
+    pdf_data_url = f"data:application/pdf;base64,{pdf_base64}"
+
+    documents = [
+        Document(
+            document_id="test-pdf-data-url",
+            content=pdf_data_url,
+            metadata={"type": "pdf", "source": "data_url"},
+        ),
+    ]
+
+    client_with_empty_registry.tool_runtime.rag_tool.insert(
+        documents=documents,
+        vector_db_id=actual_vector_db_id,
+        chunk_size_in_tokens=256,
+    )
+
+    files_list = client_with_empty_registry.files.list()
+    assert len(files_list.data) >= 1, "PDF should have been uploaded to Files API"
+
+    # Locate the uploaded file by name.
+    # NOTE(review): presumably the stored filename is derived from document_id - confirm.
+    pdf_file = None
+    for file in files_list.data:
+        if file.filename and "test-pdf-data-url" in file.filename:
+            pdf_file = file
+            break
+
+    assert pdf_file is not None, "PDF file should be found in Files API"
+    # The stored size must equal the decoded payload, i.e. the base64 layer was removed.
+    assert pdf_file.bytes == len(sample_pdf), f"File size should match original PDF ({len(sample_pdf)} bytes)"
+
+    file_content = client_with_empty_registry.files.retrieve_content(pdf_file.id)
+    assert file_content.startswith(b"%PDF-"), "Retrieved file should be a valid PDF"
+
+    vector_store_files = client_with_empty_registry.vector_io.openai_list_files_in_vector_store(
+        vector_store_id=actual_vector_db_id
+    )
+    assert len(vector_store_files.data) >= 1, "PDF should be attached to vector store"
+
+    response = client_with_empty_registry.tool_runtime.rag_tool.query(
+        vector_db_ids=[actual_vector_db_id],
+        content="sample title",
+    )
+
+    assert_valid_text_response(response)
+    content_text = " ".join([chunk.text for chunk in response.content]).lower()
+    # NOTE(review): any text containing "sample title" also contains "title", so this
+    # assertion effectively only requires the word "title" to be present.
+    assert "sample title" in content_text or "title" in content_text
--- a/tests/unit/distribution/test_inference_recordings.py
+++ b/tests/unit/distribution/test_inference_recordings.py
@ -6,16 +6,18 @@

 import tempfile
 from pathlib import Path
-from unittest.mock import patch
+from unittest.mock import AsyncMock, Mock, patch

 import pytest
-from openai import AsyncOpenAI
+from openai import NOT_GIVEN, AsyncOpenAI
+from openai.types.model import Model as OpenAIModel

 # Import the real Pydantic response types instead of using Mocks
 from llama_stack.apis.inference import (
    OpenAIAssistantMessageParam,
    OpenAIChatCompletion,
    OpenAIChoice,
+    OpenAICompletion,
    OpenAIEmbeddingData,
    OpenAIEmbeddingsResponse,
    OpenAIEmbeddingUsage,
@ -153,24 +155,22 @@ class TestInferenceRecording:

    async def test_recording_mode(self, temp_storage_dir, real_openai_chat_response):
        """Test that recording mode captures and stores responses."""
-
-        async def mock_create(*args, **kwargs):
-            return real_openai_chat_response
-
        temp_storage_dir = temp_storage_dir / "test_recording_mode"
-        with patch("openai.resources.chat.completions.AsyncCompletions.create", side_effect=mock_create):
-            with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)):
-                client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
+        with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)):
+            client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
+            client.chat.completions._post = AsyncMock(return_value=real_openai_chat_response)

-                response = await client.chat.completions.create(
-                    model="llama3.2:3b",
-                    messages=[{"role": "user", "content": "Hello, how are you?"}],
-                    temperature=0.7,
-                    max_tokens=50,
-                )
+            response = await client.chat.completions.create(
+                model="llama3.2:3b",
+                messages=[{"role": "user", "content": "Hello, how are you?"}],
+                temperature=0.7,
+                max_tokens=50,
+                user=NOT_GIVEN,
+            )

-                # Verify the response was returned correctly
-                assert response.choices[0].message.content == "Hello! I'm doing well, thank you for asking."
+            # Verify the response was returned correctly
+            assert response.choices[0].message.content == "Hello! I'm doing well, thank you for asking."
+            client.chat.completions._post.assert_called_once()

        # Verify recording was stored
        storage = ResponseStorage(temp_storage_dir)
@ -178,40 +178,74 @@ class TestInferenceRecording:

    async def test_replay_mode(self, temp_storage_dir, real_openai_chat_response):
        """Test that replay mode returns stored responses without making real calls."""
-
-        async def mock_create(*args, **kwargs):
-            return real_openai_chat_response
-
        temp_storage_dir = temp_storage_dir / "test_replay_mode"
        # First, record a response
-        with patch("openai.resources.chat.completions.AsyncCompletions.create", side_effect=mock_create):
-            with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)):
-                client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
+        with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)):
+            client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
+            client.chat.completions._post = AsyncMock(return_value=real_openai_chat_response)

-                response = await client.chat.completions.create(
-                    model="llama3.2:3b",
-                    messages=[{"role": "user", "content": "Hello, how are you?"}],
-                    temperature=0.7,
-                    max_tokens=50,
-                )
+            response = await client.chat.completions.create(
+                model="llama3.2:3b",
+                messages=[{"role": "user", "content": "Hello, how are you?"}],
+                temperature=0.7,
+                max_tokens=50,
+                user=NOT_GIVEN,
+            )
+            client.chat.completions._post.assert_called_once()

        # Now test replay mode - should not call the original method
-        with patch("openai.resources.chat.completions.AsyncCompletions.create") as mock_create_patch:
-            with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)):
-                client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
+        with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)):
+            client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
+            client.chat.completions._post = AsyncMock(return_value=real_openai_chat_response)

-                response = await client.chat.completions.create(
-                    model="llama3.2:3b",
-                    messages=[{"role": "user", "content": "Hello, how are you?"}],
-                    temperature=0.7,
-                    max_tokens=50,
-                )
+            response = await client.chat.completions.create(
+                model="llama3.2:3b",
+                messages=[{"role": "user", "content": "Hello, how are you?"}],
+                temperature=0.7,
+                max_tokens=50,
+            )

-                # Verify we got the recorded response
-                assert response.choices[0].message.content == "Hello! I'm doing well, thank you for asking."
+            # Verify we got the recorded response
+            assert response.choices[0].message.content == "Hello! I'm doing well, thank you for asking."

-                # Verify the original method was NOT called
-                mock_create_patch.assert_not_called()
+            # Verify the original method was NOT called
+            client.chat.completions._post.assert_not_called()
+
+    async def test_replay_mode_models(self, temp_storage_dir):
+        """Test that replay mode returns stored responses without making real model listing calls."""
+
+        # Async generator standing in for the paginated result of the models endpoint.
+        # A generator can be consumed only once, so each phase below builds a new one.
+        async def _async_iterator(models):
+            for model in models:
+                yield model
+
+        models = [
+            OpenAIModel(id="foo", created=1, object="model", owned_by="test"),
+            OpenAIModel(id="bar", created=2, object="model", owned_by="test"),
+        ]
+
+        expected_ids = {m.id for m in models}
+
+        temp_storage_dir = temp_storage_dir / "test_replay_mode_models"
+
+        # baseline - mock works without recording
+        # A plain Mock (not AsyncMock) is used so the call returns the async iterator
+        # directly instead of a coroutine.
+        client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
+        client.models._get_api_list = Mock(return_value=_async_iterator(models))
+        assert {m.id async for m in client.models.list()} == expected_ids
+        client.models._get_api_list.assert_called_once()
+
+        # record the call
+        # NOTE(review): storage_dir is passed as a Path here, whereas the other tests in
+        # this class pass str(temp_storage_dir) - confirm both forms are accepted.
+        with inference_recording(mode=InferenceMode.RECORD, storage_dir=temp_storage_dir):
+            client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
+            client.models._get_api_list = Mock(return_value=_async_iterator(models))
+            assert {m.id async for m in client.models.list()} == expected_ids
+            client.models._get_api_list.assert_called_once()
+
+        # replay the call
+        # The mock is still installed, but it must not be invoked: the ids have to come
+        # from the recording made above.
+        with inference_recording(mode=InferenceMode.REPLAY, storage_dir=temp_storage_dir):
+            client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
+            client.models._get_api_list = Mock(return_value=_async_iterator(models))
+            assert {m.id async for m in client.models.list()} == expected_ids
+            client.models._get_api_list.assert_not_called()

    async def test_replay_missing_recording(self, temp_storage_dir):
        """Test that replay mode fails when no recording is found."""
@ -228,36 +262,110 @@ class TestInferenceRecording:
    async def test_embeddings_recording(self, temp_storage_dir, real_embeddings_response):
        """Test recording and replay of embeddings calls."""

-        async def mock_create(*args, **kwargs):
-            return real_embeddings_response
+        # baseline - mock works without recording
+        client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
+        client.embeddings._post = AsyncMock(return_value=real_embeddings_response)
+        response = await client.embeddings.create(
+            model=real_embeddings_response.model,
+            input=["Hello world", "Test embedding"],
+            encoding_format=NOT_GIVEN,
+        )
+        assert len(response.data) == 2
+        assert response.data[0].embedding == [0.1, 0.2, 0.3]
+        client.embeddings._post.assert_called_once()

        temp_storage_dir = temp_storage_dir / "test_embeddings_recording"
        # Record
-        with patch("openai.resources.embeddings.AsyncEmbeddings.create", side_effect=mock_create):
-            with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)):
-                client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
+        with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)):
+            client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
+            client.embeddings._post = AsyncMock(return_value=real_embeddings_response)

-                response = await client.embeddings.create(
-                    model="nomic-embed-text", input=["Hello world", "Test embedding"]
-                )
+            response = await client.embeddings.create(
+                model=real_embeddings_response.model,
+                input=["Hello world", "Test embedding"],
+                encoding_format=NOT_GIVEN,
+                dimensions=NOT_GIVEN,
+                user=NOT_GIVEN,
+            )

-                assert len(response.data) == 2
+            assert len(response.data) == 2

        # Replay
-        with patch("openai.resources.embeddings.AsyncEmbeddings.create") as mock_create_patch:
-            with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)):
-                client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
+        with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)):
+            client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
+            client.embeddings._post = AsyncMock(return_value=real_embeddings_response)

-                response = await client.embeddings.create(
-                    model="nomic-embed-text", input=["Hello world", "Test embedding"]
-                )
+            response = await client.embeddings.create(
+                model=real_embeddings_response.model,
+                input=["Hello world", "Test embedding"],
+            )

-                # Verify we got the recorded response
-                assert len(response.data) == 2
-                assert response.data[0].embedding == [0.1, 0.2, 0.3]
+            # Verify we got the recorded response
+            assert len(response.data) == 2
+            assert response.data[0].embedding == [0.1, 0.2, 0.3]

-                # Verify original method was not called
-                mock_create_patch.assert_not_called()
+            # Verify original method was not called
+            client.embeddings._post.assert_not_called()
+
+    async def test_completions_recording(self, temp_storage_dir):
+        """Test recording and replay of text completion calls.
+
+        Runs the same ``completions.create`` request three times: without recording
+        (baseline), in RECORD mode, and in REPLAY mode. The mocked ``_post`` must be
+        called in the first two phases and must not be called during replay.
+        """
+        real_completions_response = OpenAICompletion(
+            id="test_completion",
+            object="text_completion",
+            created=1234567890,
+            model="llama3.2:3b",
+            choices=[
+                {
+                    "text": "Hello! I'm doing well, thank you for asking.",
+                    "index": 0,
+                    "logprobs": None,
+                    "finish_reason": "stop",
+                }
+            ],
+        )
+
+        temp_storage_dir = temp_storage_dir / "test_completions_recording"
+
+        # baseline - mock works without recording
+        client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
+        client.completions._post = AsyncMock(return_value=real_completions_response)
+        response = await client.completions.create(
+            model=real_completions_response.model,
+            prompt="Hello, how are you?",
+            temperature=0.7,
+            max_tokens=50,
+            user=NOT_GIVEN,
+        )
+        assert response.choices[0].text == real_completions_response.choices[0].text
+        client.completions._post.assert_called_once()
+
+        # Record
+        with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)):
+            client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
+            client.completions._post = AsyncMock(return_value=real_completions_response)
+
+            response = await client.completions.create(
+                model=real_completions_response.model,
+                prompt="Hello, how are you?",
+                temperature=0.7,
+                max_tokens=50,
+                user=NOT_GIVEN,
+            )
+
+            assert response.choices[0].text == real_completions_response.choices[0].text
+            client.completions._post.assert_called_once()
+
+        # Replay
+        # The request below omits user=NOT_GIVEN, unlike the recorded one, yet is
+        # expected to hit the same recording.
+        # NOTE(review): presumably NOT_GIVEN arguments are ignored when matching
+        # requests - confirm in the recorder.
+        with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)):
+            client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
+            client.completions._post = AsyncMock(return_value=real_completions_response)
+            response = await client.completions.create(
+                model=real_completions_response.model,
+                prompt="Hello, how are you?",
+                temperature=0.7,
+                max_tokens=50,
+            )
+            assert response.choices[0].text == real_completions_response.choices[0].text
+            client.completions._post.assert_not_called()

    async def test_live_mode(self, real_openai_chat_response):
        """Test that live mode passes through to original methods."""
--- a/tests/unit/providers/inference/test_remote_vllm.py
+++ b/tests/unit/providers/inference/test_remote_vllm.py
@ -6,19 +6,15 @@

 import asyncio
 import json
-import logging  # allow-direct-logging
-import threading
 import time
-from http.server import BaseHTTPRequestHandler, HTTPServer
-from typing import Any
-from unittest.mock import AsyncMock, MagicMock, patch
+from unittest.mock import AsyncMock, MagicMock, PropertyMock, patch

 import pytest
 from openai.types.chat.chat_completion_chunk import (
    ChatCompletionChunk as OpenAIChatCompletionChunk,
 )
 from openai.types.chat.chat_completion_chunk import (
-    Choice as OpenAIChoice,
+    Choice as OpenAIChoiceChunk,
 )
 from openai.types.chat.chat_completion_chunk import (
    ChoiceDelta as OpenAIChoiceDelta,
@ -35,6 +31,9 @@ from llama_stack.apis.inference import (
    ChatCompletionRequest,
    ChatCompletionResponseEventType,
    CompletionMessage,
+    OpenAIAssistantMessageParam,
+    OpenAIChatCompletion,
+    OpenAIChoice,
    SystemMessage,
    ToolChoice,
    ToolConfig,
@ -61,41 +60,6 @@ from llama_stack.providers.remote.inference.vllm.vllm import (
 # -v -s --tb=short --disable-warnings


-class MockInferenceAdapterWithSleep:
-    def __init__(self, sleep_time: int, response: dict[str, Any]):
-        self.httpd = None
-
-        class DelayedRequestHandler(BaseHTTPRequestHandler):
-            # ruff: noqa: N802
-            def do_POST(self):
-                time.sleep(sleep_time)
-                response_body = json.dumps(response).encode("utf-8")
-                self.send_response(code=200)
-                self.send_header("Content-Type", "application/json")
-                self.send_header("Content-Length", len(response_body))
-                self.end_headers()
-                self.wfile.write(response_body)
-
-        self.request_handler = DelayedRequestHandler
-
-    def __enter__(self):
-        httpd = HTTPServer(("", 0), self.request_handler)
-        self.httpd = httpd
-        host, port = httpd.server_address
-        httpd_thread = threading.Thread(target=httpd.serve_forever)
-        httpd_thread.daemon = True  # stop server if this thread terminates
-        httpd_thread.start()
-
-        config = VLLMInferenceAdapterConfig(url=f"http://{host}:{port}")
-        inference_adapter = VLLMInferenceAdapter(config)
-        return inference_adapter
-
-    def __exit__(self, _exc_type, _exc_value, _traceback):
-        if self.httpd:
-            self.httpd.shutdown()
-            self.httpd.server_close()
-
-
@pytest.fixture(scope="module")
 def mock_openai_models_list():
    with patch("openai.resources.models.AsyncModels.list", new_callable=AsyncMock) as mock_list:
@ -150,10 +114,12 @@ async def test_tool_call_response(vllm_inference_adapter):
    """Verify that tool call arguments from a CompletionMessage are correctly converted
    into the expected JSON format."""

-    # Patch the call to vllm so we can inspect the arguments sent were correct
-    with patch.object(
-        vllm_inference_adapter.client.chat.completions, "create", new_callable=AsyncMock
-    ) as mock_nonstream_completion:
+    # Patch the client property to avoid instantiating a real AsyncOpenAI client
+    with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_create_client:
+        mock_client = MagicMock()
+        mock_client.chat.completions.create = AsyncMock()
+        mock_create_client.return_value = mock_client
+
        messages = [
            SystemMessage(content="You are a helpful assistant"),
            UserMessage(content="How many?"),
@ -179,7 +145,7 @@ async def test_tool_call_response(vllm_inference_adapter):
            tool_config=ToolConfig(tool_choice=ToolChoice.auto),
        )

-        assert mock_nonstream_completion.call_args.kwargs["messages"][2]["tool_calls"] == [
+        assert mock_client.chat.completions.create.call_args.kwargs["messages"][2]["tool_calls"] == [
            {
                "id": "foo",
                "type": "function",
@ -199,7 +165,7 @@ async def test_tool_call_delta_empty_tool_call_buf():

    async def mock_stream():
        delta = OpenAIChoiceDelta(content="", tool_calls=None)
-        choices = [OpenAIChoice(delta=delta, finish_reason="stop", index=0)]
+        choices = [OpenAIChoiceChunk(delta=delta, finish_reason="stop", index=0)]
        mock_chunk = OpenAIChatCompletionChunk(
            id="chunk-1",
            created=1,
@ -225,7 +191,7 @@ async def test_tool_call_delta_streaming_arguments_dict():
            model="foo",
            object="chat.completion.chunk",
            choices=[
-                OpenAIChoice(
+                OpenAIChoiceChunk(
                    delta=OpenAIChoiceDelta(
                        content="",
                        tool_calls=[
@ -250,7 +216,7 @@ async def test_tool_call_delta_streaming_arguments_dict():
            model="foo",
            object="chat.completion.chunk",
            choices=[
-                OpenAIChoice(
+                OpenAIChoiceChunk(
                    delta=OpenAIChoiceDelta(
                        content="",
                        tool_calls=[
@ -275,7 +241,9 @@ async def test_tool_call_delta_streaming_arguments_dict():
            model="foo",
            object="chat.completion.chunk",
            choices=[
-                OpenAIChoice(delta=OpenAIChoiceDelta(content="", tool_calls=None), finish_reason="tool_calls", index=0)
+                OpenAIChoiceChunk(
+                    delta=OpenAIChoiceDelta(content="", tool_calls=None), finish_reason="tool_calls", index=0
+                )
            ],
        )
        for chunk in [mock_chunk_1, mock_chunk_2, mock_chunk_3]:
@ -299,7 +267,7 @@ async def test_multiple_tool_calls():
            model="foo",
            object="chat.completion.chunk",
            choices=[
-                OpenAIChoice(
+                OpenAIChoiceChunk(
                    delta=OpenAIChoiceDelta(
                        content="",
                        tool_calls=[
@ -324,7 +292,7 @@ async def test_multiple_tool_calls():
            model="foo",
            object="chat.completion.chunk",
            choices=[
-                OpenAIChoice(
+                OpenAIChoiceChunk(
                    delta=OpenAIChoiceDelta(
                        content="",
                        tool_calls=[
@ -349,7 +317,9 @@ async def test_multiple_tool_calls():
            model="foo",
            object="chat.completion.chunk",
            choices=[
-                OpenAIChoice(delta=OpenAIChoiceDelta(content="", tool_calls=None), finish_reason="tool_calls", index=0)
+                OpenAIChoiceChunk(
+                    delta=OpenAIChoiceDelta(content="", tool_calls=None), finish_reason="tool_calls", index=0
+                )
            ],
        )
        for chunk in [mock_chunk_1, mock_chunk_2, mock_chunk_3]:
@ -393,59 +363,6 @@ async def test_process_vllm_chat_completion_stream_response_no_choices():
    assert chunks[0].event.event_type.value == "start"


-@pytest.mark.allow_network
-def test_chat_completion_doesnt_block_event_loop(caplog):
-    loop = asyncio.new_event_loop()
-    loop.set_debug(True)
-    caplog.set_level(logging.WARNING)
-
-    # Log when event loop is blocked for more than 200ms
-    loop.slow_callback_duration = 0.5
-    # Sleep for 500ms in our delayed http response
-    sleep_time = 0.5
-
-    mock_model = Model(identifier="mock-model", provider_resource_id="mock-model", provider_id="vllm-inference")
-    mock_response = {
-        "id": "chatcmpl-abc123",
-        "object": "chat.completion",
-        "created": 1,
-        "modle": "mock-model",
-        "choices": [
-            {
-                "message": {"content": ""},
-                "logprobs": None,
-                "finish_reason": "stop",
-                "index": 0,
-            }
-        ],
-    }
-
-    async def do_chat_completion():
-        await inference_adapter.chat_completion(
-            "mock-model",
-            [],
-            stream=False,
-            tools=None,
-            tool_config=ToolConfig(tool_choice=ToolChoice.auto),
-        )
-
-    with MockInferenceAdapterWithSleep(sleep_time, mock_response) as inference_adapter:
-        inference_adapter.model_store = AsyncMock()
-        inference_adapter.model_store.get_model.return_value = mock_model
-        loop.run_until_complete(inference_adapter.initialize())
-
-        # Clear the logs so far and run the actual chat completion we care about
-        caplog.clear()
-        loop.run_until_complete(do_chat_completion())
-
-    # Ensure we don't have any asyncio warnings in the captured log
-    # records from our chat completion call. A message gets logged
-    # here any time we exceed the slow_callback_duration configured
-    # above.
-    asyncio_warnings = [record.message for record in caplog.records if record.name == "asyncio"]
-    assert not asyncio_warnings
-
-
 async def test_get_params_empty_tools(vllm_inference_adapter):
    request = ChatCompletionRequest(
        tools=[],
@ -641,9 +558,7 @@ async def test_health_status_success(vllm_inference_adapter):
    This test verifies that the health method returns a HealthResponse with status OK, only
    when the connection to the vLLM server is successful.
    """
-    # Set vllm_inference_adapter.client to None to ensure _create_client is called
-    vllm_inference_adapter.client = None
-    with patch.object(vllm_inference_adapter, "_create_client") as mock_create_client:
+    with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_create_client:
        # Create mock client and models
        mock_client = MagicMock()
        mock_models = MagicMock()
@ -674,8 +589,7 @@ async def test_health_status_failure(vllm_inference_adapter):
    This test verifies that the health method returns a HealthResponse with status ERROR
    and an appropriate error message when the connection to the vLLM server fails.
    """
-    vllm_inference_adapter.client = None
-    with patch.object(vllm_inference_adapter, "_create_client") as mock_create_client:
+    with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_create_client:
        # Create mock client and models
        mock_client = MagicMock()
        mock_models = MagicMock()
@ -697,3 +611,48 @@ async def test_health_status_failure(vllm_inference_adapter):
        assert "Health check failed: Connection failed" in health_response["message"]

        mock_models.list.assert_called_once()
+
+
+async def test_openai_chat_completion_is_async(vllm_inference_adapter):
+    """
+    Verify that openai_chat_completion is async and doesn't block the event loop.
+
+    To do this we mock the underlying inference with a sleep, start multiple
+    inference calls in parallel, and ensure the total time taken is less
+    than the sum of the individual sleep times.
+    """
+    sleep_time = 0.5
+
+    # Stand-in for the client's chat.completions.create: yields to the event loop for
+    # sleep_time seconds, then returns a minimal canned completion.
+    async def mock_create(*args, **kwargs):
+        await asyncio.sleep(sleep_time)
+        return OpenAIChatCompletion(
+            id="chatcmpl-abc123",
+            created=1,
+            model="mock-model",
+            choices=[
+                OpenAIChoice(
+                    message=OpenAIAssistantMessageParam(
+                        content="nothing interesting",
+                    ),
+                    finish_reason="stop",
+                    index=0,
+                )
+            ],
+        )
+
+    async def do_inference():
+        await vllm_inference_adapter.openai_chat_completion(
+            "mock-model", messages=["one fish", "two fish"], stream=False
+        )
+
+    # Replace the adapter's `client` property at class level so no real client is built.
+    with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_create_client:
+        mock_client = MagicMock()
+        mock_client.chat.completions.create = AsyncMock(side_effect=mock_create)
+        mock_create_client.return_value = mock_client
+
+        start_time = time.time()
+        await asyncio.gather(do_inference(), do_inference(), do_inference(), do_inference())
+        total_time = time.time() - start_time
+
+        # call_count on the PropertyMock counts reads of the `client` property;
+        # presumably one read per openai_chat_completion call, so 4 means every
+        # inference went through the mocked client.
+        assert mock_create_client.call_count == 4  # no cheating
+        # Four sequential 0.5s sleeps would take about 2.0s; overlapping calls should
+        # finish in well under the 1.0s bound used here.
+        assert total_time < (sleep_time * 2), f"Total time taken: {total_time}s exceeded expected max"
--- a/tests/unit/providers/test_bedrock.py
+++ b/tests/unit/providers/test_bedrock.py
@ -0,0 +1,53 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.providers.remote.inference.bedrock.bedrock import (
+    _get_region_prefix,
+    _to_inference_profile_id,
+)
+
+
+def test_region_prefixes():
+    """Check the prefix returned for several region names, mixed-case input, and a missing region."""
+    expected_by_region = [
+        ("us-east-1", "us."),
+        ("eu-west-1", "eu."),
+        ("ap-south-1", "ap."),
+        ("ca-central-1", "us."),
+        # Region names are expected to match regardless of case
+        ("US-EAST-1", "us."),
+        ("EU-WEST-1", "eu."),
+        ("Ap-South-1", "ap."),
+        # A missing region is expected to yield the "us." prefix
+        (None, "us."),
+    ]
+    for region, expected_prefix in expected_by_region:
+        assert _get_region_prefix(region) == expected_prefix
+
+
+def test_model_id_conversion():
+    """Check conversion of bare model IDs, already-prefixed IDs, and ARNs to inference profile IDs."""
+    model_id = "meta.llama3-1-70b-instruct-v1:0"
+    us_profile_id = "us.meta.llama3-1-70b-instruct-v1:0"
+    eu_profile_id = "eu.meta.llama3-1-70b-instruct-v1:0"
+    profile_arn = "arn:aws:bedrock:us-east-1:123456789012:inference-profile/us.meta.llama3-1-70b-instruct-v1:0"
+
+    # A bare model ID gains the prefix for the supplied region
+    assert _to_inference_profile_id(model_id, "us-east-1") == us_profile_id
+
+    # An ID that already carries the prefix comes back as-is
+    assert _to_inference_profile_id(us_profile_id, "us-east-1") == us_profile_id
+
+    # An ARN is passed through untouched, with or without a region argument
+    assert _to_inference_profile_id(profile_arn, "us-east-1") == profile_arn
+    assert _to_inference_profile_id(profile_arn) == profile_arn
+
+    # Omitting the region produces the "us." prefix
+    assert _to_inference_profile_id(model_id) == us_profile_id
+
+    # A non-US region selects its own prefix
+    assert _to_inference_profile_id(model_id, "eu-west-1") == eu_profile_id
--- a/tests/unit/providers/utils/memory/test_vector_store.py
+++ b/tests/unit/providers/utils/memory/test_vector_store.py
@ -178,3 +178,41 @@ def test_content_from_data_and_mime_type_both_encodings_fail():
        # Should raise an exception instead of returning empty string
        with pytest.raises(UnicodeDecodeError):
            content_from_data_and_mime_type(data, mime_type)
+
+
+async def test_memory_tool_error_handling():
+    """Inserting a batch that contains one unreachable URL document must still handle the other documents."""
+    from llama_stack.providers.inline.tool_runtime.rag.config import RagToolRuntimeConfig
+    from llama_stack.providers.inline.tool_runtime.rag.memory import MemoryToolRuntimeImpl
+
+    memory_tool = MemoryToolRuntimeImpl(
+        config=RagToolRuntimeConfig(),
+        vector_io_api=AsyncMock(),
+        inference_api=AsyncMock(),
+        files_api=AsyncMock(),
+    )
+
+    # One fake upload result for each document that is expected to succeed
+    uploaded_files = []
+    for file_id in ("file_good1", "file_good2"):
+        uploaded = MagicMock()
+        uploaded.id = file_id
+        uploaded_files.append(uploaded)
+    memory_tool.files_api.openai_upload_file.side_effect = uploaded_files
+
+    documents = [
+        RAGDocument(document_id="good_doc", content="Good content", metadata={}),
+        RAGDocument(document_id="bad_url_doc", content=URL(uri="https://bad.url"), metadata={}),
+        RAGDocument(document_id="another_good_doc", content="Another good content", metadata={}),
+    ]
+
+    with patch("httpx.AsyncClient") as async_client_cls:
+        # Any GET issued through the patched client raises
+        failing_client = AsyncMock()
+        failing_client.get.side_effect = Exception("Bad URL")
+        async_client_cls.return_value.__aenter__.return_value = failing_client
+
+        # the single failing document must not make insert() raise
+        await memory_tool.insert(documents, "vector_store_123")
+
+    # two documents went through upload + attach; the failing one was skipped
+    assert memory_tool.files_api.openai_upload_file.call_count == 2
+    assert memory_tool.vector_io_api.openai_attach_file_to_vector_store.call_count == 2
--- a/tests/unit/providers/vector_io/test_vector_utils.py
+++ b/tests/unit/providers/vector_io/test_vector_utils.py
@ -26,9 +26,9 @@ def test_generate_chunk_id():

    chunk_ids = sorted([chunk.chunk_id for chunk in chunks])
    assert chunk_ids == [
-        "177a1368-f6a8-0c50-6e92-18677f2c3de3",
-        "bc744db3-1b25-0a9c-cdff-b6ba3df73c36",
-        "f68df25d-d9aa-ab4d-5684-64a233add20d",
+        "31d1f9a3-c8d2-66e7-3c37-af2acd329778",
+        "d07dade7-29c0-cda7-df29-0249a1dcbc3e",
+        "d14f75a1-5855-7f72-2c78-d9fc4275a346",
    ]


@ -36,14 +36,14 @@ def test_generate_chunk_id_with_window():
    chunk = Chunk(content="test", metadata={"document_id": "doc-1"})
    chunk_id1 = generate_chunk_id("doc-1", chunk, chunk_window="0-1")
    chunk_id2 = generate_chunk_id("doc-1", chunk, chunk_window="1-2")
-    assert chunk_id1 == "149018fe-d0eb-0f8d-5f7f-726bdd2aeedb"
-    assert chunk_id2 == "4562c1ee-9971-1f3b-51a6-7d05e5211154"
+    assert chunk_id1 == "8630321a-d9cb-2bb6-cd28-ebf68dafd866"
+    assert chunk_id2 == "13a1c09a-cbda-b61a-2d1a-7baa90888685"


 def test_chunk_id():
    # Test with existing chunk ID
    chunk_with_id = Chunk(content="test", metadata={"document_id": "existing-id"})
-    assert chunk_with_id.chunk_id == "84ededcc-b80b-a83e-1a20-ca6515a11350"
+    assert chunk_with_id.chunk_id == "11704f92-42b6-61df-bf85-6473e7708fbd"

    # Test with document ID in metadata
    chunk_with_doc_id = Chunk(content="test", metadata={"document_id": "doc-1"})
--- a/tests/unit/utils/inference/test_inference_store.py
+++ b/tests/unit/utils/inference/test_inference_store.py
@ -65,6 +65,9 @@ async def test_inference_store_pagination_basic():
            input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")]
            await store.store_chat_completion(completion, input_messages)

+        # Wait for all queued writes to complete
+        await store.flush()
+
        # Test 1: First page with limit=2, descending order (default)
        result = await store.list_chat_completions(limit=2, order=Order.desc)
        assert len(result.data) == 2
@ -108,6 +111,9 @@ async def test_inference_store_pagination_ascending():
            input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")]
            await store.store_chat_completion(completion, input_messages)

+        # Wait for all queued writes to complete
+        await store.flush()
+
        # Test ascending order pagination
        result = await store.list_chat_completions(limit=1, order=Order.asc)
        assert len(result.data) == 1
@ -143,6 +149,9 @@ async def test_inference_store_pagination_with_model_filter():
            input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")]
            await store.store_chat_completion(completion, input_messages)

+        # Wait for all queued writes to complete
+        await store.flush()
+
        # Test pagination with model filter
        result = await store.list_chat_completions(limit=1, model="model-a", order=Order.desc)
        assert len(result.data) == 1
@ -190,6 +199,9 @@ async def test_inference_store_pagination_no_limit():
            input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")]
            await store.store_chat_completion(completion, input_messages)

+        # Wait for all queued writes to complete
+        await store.flush()
+
        # Test without limit
        result = await store.list_chat_completions(order=Order.desc)
        assert len(result.data) == 2