feat(tests): implement test isolation for inference recordings

Uses test_id in request hashes and test-scoped subdirectories to prevent cross-test contamination. Model list endpoints exclude test_id to enable merging recordings from different servers.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-14 08:52:37 +00:00 · 2025-10-03 20:05:24 -07:00 · 2025-10-03 20:05:24 -07:00 · 7008a70a6e
commit 7008a70a6e
parent f176196fba
303 changed files with 40627 additions and 105140 deletions
--- a/llama_stack/testing/inference_recorder.py
+++ b/llama_stack/testing/inference_recorder.py
@ -22,10 +22,18 @@ from llama_stack.log import get_logger
 logger = get_logger(__name__, category="testing")

 # Global state for the recording system
+# Note: Using module globals instead of ContextVars because the session-scoped
+# client initialization happens in one async context, but tests run in different
+# contexts, and we need the mode/storage to persist across all contexts.
 _current_mode: str | None = None
 _current_storage: ResponseStorage | None = None
 _original_methods: dict[str, Any] = {}

+# Test context uses ContextVar since it changes per-test and needs async isolation
+from contextvars import ContextVar
+
+_test_context: ContextVar[str | None] = ContextVar("_test_context", default=None)
+
 from openai.types.completion_choice import CompletionChoice

 # Update the "finish_reason" field, since its type definition is wrong (None is not accepted)
@ -33,22 +41,38 @@ CompletionChoice.model_fields["finish_reason"].annotation = Literal["stop", "len
 CompletionChoice.model_rebuild()

 REPO_ROOT = Path(__file__).parent.parent.parent
-DEFAULT_STORAGE_DIR = REPO_ROOT / "tests/integration/recordings"
+DEFAULT_STORAGE_DIR = REPO_ROOT / "tests/integration/common"


 class InferenceMode(StrEnum):
    LIVE = "live"
    RECORD = "record"
    REPLAY = "replay"
+    RECORD_IF_MISSING = "record-if-missing"


 def normalize_request(method: str, url: str, headers: dict[str, Any], body: dict[str, Any]) -> str:
-    """Create a normalized hash of the request for consistent matching."""
+    """Create a normalized hash of the request for consistent matching.
+
+    Includes test_id from context to ensure test isolation - identical requests
+    from different tests will have different hashes.
+
+    Exception: Model list endpoints (/v1/models, /api/tags) exclude test_id since
+    they are infrastructure/shared and need to work across session setup and tests.
+    """
    # Extract just the endpoint path
    from urllib.parse import urlparse

    parsed = urlparse(url)
-    normalized = {"method": method.upper(), "endpoint": parsed.path, "body": body}
+    normalized = {
+        "method": method.upper(),
+        "endpoint": parsed.path,
+        "body": body,
+    }
+
+    # Include test_id for isolation, except for shared infrastructure endpoints
+    if parsed.path not in ("/api/tags", "/v1/models"):
+        normalized["test_id"] = _test_context.get()

    # Create hash - sort_keys=True ensures deterministic ordering
    normalized_json = json.dumps(normalized, sort_keys=True)
@ -67,7 +91,11 @@ def setup_inference_recording():
    Currently, this is only supported for OpenAI and Ollama clients. These should cover the vast majority of use cases.

    Two environment variables are supported:
-    - LLAMA_STACK_TEST_INFERENCE_MODE: The mode to run in. Must be 'live', 'record', or 'replay'. Default is 'replay'.
+    - LLAMA_STACK_TEST_INFERENCE_MODE: The mode to run in. Must be 'live', 'record', 'replay', or 'record-if-missing'. Default is 'replay'.
+      - 'live': Make all requests live without recording
+      - 'record': Record all requests (overwrites existing recordings)
+      - 'replay': Use only recorded responses (fails if recording not found)
+      - 'record-if-missing': Use recorded responses when available, record new ones when not found
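    - LLAMA_STACK_TEST_RECORDING_DIR: The base directory for recordings; files are written to a 'recordings' subdirectory (of the test file's directory when a test context is set, otherwise of this base directory). Default is 'tests/integration/common'.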

    The recordings are stored as JSON files.
@ -154,21 +182,43 @@ def _deserialize_response(data: dict[str, Any]) -> Any:
 class ResponseStorage:
    """Handles SQLite index + JSON file storage/retrieval for inference recordings."""

-    def __init__(self, test_dir: Path):
-        self.test_dir = test_dir
-        self.responses_dir = self.test_dir / "responses"
+    def __init__(self, base_dir: Path):
+        self.base_dir = base_dir
+        # Don't create responses_dir here - determine it per-test at runtime

-        self._ensure_directories()
+    def _get_test_dir(self) -> Path:
+        """Get the recordings directory in the test file's parent directory.
+
+        For test at "tests/integration/inference/test_foo.py::test_bar",
+        returns "tests/integration/inference/recordings/".
+        """
+        test_id = _test_context.get()
+        if test_id:
+            # Extract the directory path from the test nodeid
+            # e.g., "tests/integration/inference/test_basic.py::test_foo[params]"
+            # -> get "tests/integration/inference"
+            test_file = test_id.split("::")[0]  # Remove test function part
+            test_dir = Path(test_file).parent  # Get parent directory
+
+            # Put recordings in a "recordings" subdirectory of the test's parent dir
+            # e.g., "tests/integration/inference" -> "tests/integration/inference/recordings"
+            return test_dir / "recordings"
+        else:
+            # Fallback for non-test contexts
+            return self.base_dir / "recordings"

    def _ensure_directories(self):
-        self.test_dir.mkdir(parents=True, exist_ok=True)
-        self.responses_dir.mkdir(exist_ok=True)
+        """Ensure test-specific directories exist."""
+        test_dir = self._get_test_dir()
+        test_dir.mkdir(parents=True, exist_ok=True)
+        return test_dir

    def store_recording(self, request_hash: str, request: dict[str, Any], response: dict[str, Any]):
        """Store a request/response pair."""
-        # Generate unique response filename
-        short_hash = request_hash[:12]
-        response_file = f"{short_hash}.json"
+        responses_dir = self._ensure_directories()
+
+        # Use FULL hash (not truncated)
+        response_file = f"{request_hash}.json"

        # Serialize response body if needed
        serialized_response = dict(response)
@ -182,35 +232,71 @@ class ResponseStorage:
                # Handle single response
                serialized_response["body"] = _serialize_response(serialized_response["body"], request_hash)

-        # If this is an Ollama /api/tags recording, include models digest in filename to distinguish variants
+        # For model-list endpoints, include digest in filename to distinguish different model sets
        endpoint = request.get("endpoint")
        if endpoint in ("/api/tags", "/v1/models"):
            digest = _model_identifiers_digest(endpoint, response)
-            response_file = f"models-{short_hash}-{digest}.json"
+            response_file = f"models-{request_hash}-{digest}.json"

-        response_path = self.responses_dir / response_file
+        response_path = responses_dir / response_file

-        # Save response to JSON file
+        # Save response to JSON file with metadata
        with open(response_path, "w") as f:
-            json.dump({"request": request, "response": serialized_response}, f, indent=2)
+            json.dump(
+                {
+                    "test_id": _test_context.get(),  # Include for debugging
+                    "request": request,
+                    "response": serialized_response,
+                },
+                f,
+                indent=2,
+            )
            f.write("\n")
            f.flush()

    def find_recording(self, request_hash: str) -> dict[str, Any] | None:
-        """Find a recorded response by request hash."""
-        response_file = f"{request_hash[:12]}.json"
-        response_path = self.responses_dir / response_file
+        """Find a recorded response by request hash.

-        if not response_path.exists():
-            return None
+        Uses fallback: first checks test-specific dir, then falls back to base recordings dir.
+        This handles cases where recordings happen during session setup (no test context) but
+        are requested during tests (with test context).
+        """
+        response_file = f"{request_hash}.json"

+        # Try test-specific directory first
+        test_dir = self._get_test_dir()
+        response_path = test_dir / response_file
+
+        if response_path.exists():
            return _recording_from_file(response_path)

-    def _model_list_responses(self, short_hash: str) -> list[dict[str, Any]]:
+        # Fallback to base recordings directory (for session-level recordings)
+        fallback_dir = self.base_dir / "recordings"
+        fallback_path = fallback_dir / response_file
+
+        if fallback_path.exists():
+            return _recording_from_file(fallback_path)
+
+        return None
+
+    def _model_list_responses(self, request_hash: str) -> list[dict[str, Any]]:
+        """Find all model-list recordings with the given hash (different digests)."""
        results: list[dict[str, Any]] = []
-        for path in self.responses_dir.glob(f"models-{short_hash}-*.json"):
+
+        # Check test-specific directory first
+        test_dir = self._get_test_dir()
+        if test_dir.exists():
+            for path in test_dir.glob(f"models-{request_hash}-*.json"):
                data = _recording_from_file(path)
                results.append(data)
+
+        # Also check fallback directory
+        fallback_dir = self.base_dir / "recordings"
+        if fallback_dir.exists():
+            for path in fallback_dir.glob(f"models-{request_hash}-*.json"):
+                data = _recording_from_file(path)
+                results.append(data)
+
        return results


@ -231,6 +317,8 @@ def _recording_from_file(response_path) -> dict[str, Any]:


 def _model_identifiers_digest(endpoint: str, response: dict[str, Any]) -> str:
+    """Generate a digest from model identifiers for distinguishing different model sets."""
+
    def _extract_model_identifiers():
        """Extract a stable set of identifiers for model-list endpoints.

@ -253,7 +341,14 @@ def _model_identifiers_digest(endpoint: str, response: dict[str, Any]) -> str:


 def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]]) -> dict[str, Any] | None:
-    """Return a single, unioned recording for supported model-list endpoints."""
+    """Return a single, unioned recording for supported model-list endpoints.
+
+    Merges multiple recordings with different model sets (from different servers) into
+    a single response containing all models.
+    """
+    if not records:
+        return None
+
    seen: dict[str, dict[str, Any]] = {}
    for rec in records:
        body = rec["response"]["body"]
@ -282,7 +377,10 @@ def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]])
 async def _patched_inference_method(original_method, self, client_type, endpoint, *args, **kwargs):
    global _current_mode, _current_storage

-    if _current_mode == InferenceMode.LIVE or _current_storage is None:
+    mode = _current_mode
+    storage = _current_storage
+
+    if mode == InferenceMode.LIVE or storage is None:
        if endpoint == "/v1/models":
            return original_method(self, *args, **kwargs)
        else:
@ -313,13 +411,16 @@ async def _patched_inference_method(original_method, self, client_type, endpoint

    request_hash = normalize_request(method, url, headers, body)

-    if _current_mode == InferenceMode.REPLAY:
-        # Special handling for model-list endpoints: return union of all responses
+    # Try to find existing recording for REPLAY or RECORD_IF_MISSING modes
+    recording = None
+    if mode == InferenceMode.REPLAY or mode == InferenceMode.RECORD_IF_MISSING:
+        # Special handling for model-list endpoints: merge all recordings with this hash
        if endpoint in ("/api/tags", "/v1/models"):
-            records = _current_storage._model_list_responses(request_hash[:12])
+            records = storage._model_list_responses(request_hash)
            recording = _combine_model_list_responses(endpoint, records)
        else:
-            recording = _current_storage.find_recording(request_hash)
+            recording = storage.find_recording(request_hash)
+
        if recording:
            response_body = recording["response"]["body"]

@ -332,7 +433,8 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
                return replay_stream()
            else:
                return response_body
-        else:
+        elif mode == InferenceMode.REPLAY:
+            # REPLAY mode requires recording to exist
            raise RuntimeError(
                f"No recorded response found for request hash: {request_hash}\n"
                f"Request: {method} {url} {body}\n"
@ -340,7 +442,7 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
                f"To record this response, run with LLAMA_STACK_TEST_INFERENCE_MODE=record"
            )

-    elif _current_mode == InferenceMode.RECORD:
+    if mode == InferenceMode.RECORD or (mode == InferenceMode.RECORD_IF_MISSING and not recording):
        if endpoint == "/v1/models":
            response = original_method(self, *args, **kwargs)
        else:
@ -371,7 +473,7 @@ async def _patched_inference_method(original_method, self, client_type, endpoint

            # Store the recording immediately
            response_data = {"body": chunks, "is_streaming": True}
-            _current_storage.store_recording(request_hash, request_data, response_data)
+            storage.store_recording(request_hash, request_data, response_data)

            # Return a generator that replays the stored chunks
            async def replay_recorded_stream():
@ -381,11 +483,11 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
            return replay_recorded_stream()
        else:
            response_data = {"body": response, "is_streaming": False}
-            _current_storage.store_recording(request_hash, request_data, response_data)
+            storage.store_recording(request_hash, request_data, response_data)
            return response

    else:
-        raise AssertionError(f"Invalid mode: {_current_mode}")
+        raise AssertionError(f"Invalid mode: {mode}")


 def patch_inference_clients():
@ -526,9 +628,9 @@ def inference_recording(mode: str, storage_dir: str | Path | None = None) -> Gen
    try:
        _current_mode = mode

-        if mode in ["record", "replay"]:
+        if mode in ["record", "replay", "record-if-missing"]:
            if storage_dir is None:
-                raise ValueError("storage_dir is required for record and replay modes")
+                raise ValueError("storage_dir is required for record, replay, and record-if-missing modes")
            _current_storage = ResponseStorage(Path(storage_dir))
            patch_inference_clients()

@ -536,7 +638,7 @@ def inference_recording(mode: str, storage_dir: str | Path | None = None) -> Gen

    finally:
        # Restore previous state
-        if mode in ["record", "replay"]:
+        if mode in ["record", "replay", "record-if-missing"]:
            unpatch_inference_clients()

        _current_mode = prev_mode
--- a/tests/integration/agents/recordings/00f8a71ccb939737ed72a289eede62998c6882727519858bbd5954307d10a673.json
+++ b/tests/integration/agents/recordings/00f8a71ccb939737ed72a289eede62998c6882727519858bbd5954307d10a673.json
--- a/tests/integration/agents/recordings/095b37e65a5a78904f225bdefc904d3e20145a4dab1be0cf07d17a416d85e58d.json
+++ b/tests/integration/agents/recordings/095b37e65a5a78904f225bdefc904d3e20145a4dab1be0cf07d17a416d85e58d.json
--- a/tests/integration/agents/recordings/13fac3724cd626a119153f60fa56bb54955fe0b10f5c4102b78e2d428b5ffc7a.json
+++ b/tests/integration/agents/recordings/13fac3724cd626a119153f60fa56bb54955fe0b10f5c4102b78e2d428b5ffc7a.json
--- a/tests/integration/agents/recordings/1a0d3109cf92111ed4cb061a857dee0b99fa1e0b27934de1e6c5d29c03026626.json
+++ b/tests/integration/agents/recordings/1a0d3109cf92111ed4cb061a857dee0b99fa1e0b27934de1e6c5d29c03026626.json
@ -0,0 +1,58 @@
+{
+  "test_id": "tests/integration/agents/test_agents.py::test_tool_choice_none[ollama/llama3.2:3b-instruct-fp16]",
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama-guard3:1b",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Task: Check if there is unsafe content in 'User' messages in conversations according our safety policy with the below categories.\n\n<BEGIN UNSAFE CONTENT CATEGORIES>\nS1: Violent Crimes.\nS2: Non-Violent Crimes.\nS3: Sex Crimes.\nS4: Child Exploitation.\nS5: Defamation.\nS6: Specialized Advice.\nS7: Privacy.\nS8: Intellectual Property.\nS9: Indiscriminate Weapons.\nS10: Hate.\nS11: Self-Harm.\nS12: Sexual Content.\nS13: Elections.\nS14: Code Interpreter Abuse.\n<END UNSAFE CONTENT CATEGORIES>\n\n<BEGIN CONVERSATION>\n\nUser: What is the boiling point of the liquid polyjuice in celsius?\n\n<END CONVERSATION>\n\nProvide your safety assessment for ONLY THE LAST User message in the above conversation:\n - First line must read 'safe' or 'unsafe'.\n - If unsafe, a second line must include a comma-separated list of violated categories."
+        }
+      ],
+      "stream": false,
+      "temperature": 0.0
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama-guard3:1b"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "rec-1a0d3109cf92",
+        "choices": [
+          {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "safe",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": null,
+              "audio": null,
+              "function_call": null,
+              "tool_calls": null
+            }
+          }
+        ],
+        "created": 0,
+        "model": "llama-guard3:1b",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": "fp_ollama",
+        "usage": {
+          "completion_tokens": 2,
+          "prompt_tokens": 398,
+          "total_tokens": 400,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null
+        }
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/recordings/responses/1d82e9439ae3.json
+++ b/tests/integration/recordings/responses/1d82e9439ae3.json
--- a/tests/integration/agents/recordings/1fd05fad28c2771c56ac8462a742f6004a37724e9c44b32bfc7fef5ac44d744c.json
+++ b/tests/integration/agents/recordings/1fd05fad28c2771c56ac8462a742f6004a37724e9c44b32bfc7fef5ac44d744c.json
--- a/tests/integration/agents/recordings/28cc2057662b6d13ab1becc3f92e5afbdaa19c2192b588a5795a5fa4c09fb588.json
+++ b/tests/integration/agents/recordings/28cc2057662b6d13ab1becc3f92e5afbdaa19c2192b588a5795a5fa4c09fb588.json
--- a/tests/integration/agents/recordings/292308724331c7172aaf91fe1373f2fbc9d626af08314bd7f5ba69d038ea7c1b.json
+++ b/tests/integration/agents/recordings/292308724331c7172aaf91fe1373f2fbc9d626af08314bd7f5ba69d038ea7c1b.json
--- a/tests/integration/agents/recordings/36e22908b34c0835037ba7b52477c5db69585e66f4fde18eaa8bfd4bb4e3d783.json
+++ b/tests/integration/agents/recordings/36e22908b34c0835037ba7b52477c5db69585e66f4fde18eaa8bfd4bb4e3d783.json
--- a/tests/integration/recordings/responses/3a1a67912f65.json
+++ b/tests/integration/recordings/responses/3a1a67912f65.json
--- a/tests/integration/agents/recordings/3ef67b5ce7dbb36bf47c16b906e32b5d5f18f7e46b6425690d680b6536fffcf0.json
+++ b/tests/integration/agents/recordings/3ef67b5ce7dbb36bf47c16b906e32b5d5f18f7e46b6425690d680b6536fffcf0.json
--- a/tests/integration/agents/recordings/45175e711385e24f62516c3982eaf8fb6bbab4f37691fadc08812ce223dfc628.json
+++ b/tests/integration/agents/recordings/45175e711385e24f62516c3982eaf8fb6bbab4f37691fadc08812ce223dfc628.json
--- a/tests/integration/agents/recordings/45f7bb25cd8184f361ab72c0ca3f9391c966d802d60df60d5b5b7c8354d0da6d.json
+++ b/tests/integration/agents/recordings/45f7bb25cd8184f361ab72c0ca3f9391c966d802d60df60d5b5b7c8354d0da6d.json
--- a/tests/integration/agents/recordings/55c7250c01acd7f27b62fa966eae24af54de6d7e0e393918c58ac7215c60a29f.json
+++ b/tests/integration/agents/recordings/55c7250c01acd7f27b62fa966eae24af54de6d7e0e393918c58ac7215c60a29f.json
--- a/tests/integration/recordings/responses/5cc7605f9950.json
+++ b/tests/integration/recordings/responses/5cc7605f9950.json
--- a/tests/integration/agents/recordings/610695da59ffcce0a03a4d4b1ffbf61cf698f39fd41628173fb864e14b3a71c5.json
+++ b/tests/integration/agents/recordings/610695da59ffcce0a03a4d4b1ffbf61cf698f39fd41628173fb864e14b3a71c5.json
--- a/tests/integration/agents/recordings/6446f5838fb216dcc72e56d44ad62350d66e4f850bc15abb328f1278d1204723.json
+++ b/tests/integration/agents/recordings/6446f5838fb216dcc72e56d44ad62350d66e4f850bc15abb328f1278d1204723.json
--- a/tests/integration/agents/recordings/697a25dd7f0ff515f567c883ad72ae9dca423726834aec8b38420dccb735c050.json
+++ b/tests/integration/agents/recordings/697a25dd7f0ff515f567c883ad72ae9dca423726834aec8b38420dccb735c050.json
--- a/tests/integration/agents/recordings/6da760645fe224ace4ab628e4f647259897598e28037fe5f7c09f6677edd08e9.json
+++ b/tests/integration/agents/recordings/6da760645fe224ace4ab628e4f647259897598e28037fe5f7c09f6677edd08e9.json
--- a/tests/integration/agents/recordings/7094319e038424fbec54338c397b487c7128fc28534351deb4662fba31043fa4.json
+++ b/tests/integration/agents/recordings/7094319e038424fbec54338c397b487c7128fc28534351deb4662fba31043fa4.json
@ -0,0 +1,58 @@
+{
+  "test_id": "tests/integration/agents/test_agents.py::test_tool_choice_required[ollama/llama3.2:3b-instruct-fp16]",
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama-guard3:1b",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Task: Check if there is unsafe content in 'User' messages in conversations according our safety policy with the below categories.\n\n<BEGIN UNSAFE CONTENT CATEGORIES>\nS1: Violent Crimes.\nS2: Non-Violent Crimes.\nS3: Sex Crimes.\nS4: Child Exploitation.\nS5: Defamation.\nS6: Specialized Advice.\nS7: Privacy.\nS8: Intellectual Property.\nS9: Indiscriminate Weapons.\nS10: Hate.\nS11: Self-Harm.\nS12: Sexual Content.\nS13: Elections.\nS14: Code Interpreter Abuse.\n<END UNSAFE CONTENT CATEGORIES>\n\n<BEGIN CONVERSATION>\n\nUser: What is the boiling point of the liquid polyjuice in celsius?\n\n<END CONVERSATION>\n\nProvide your safety assessment for ONLY THE LAST User message in the above conversation:\n - First line must read 'safe' or 'unsafe'.\n - If unsafe, a second line must include a comma-separated list of violated categories."
+        }
+      ],
+      "stream": false,
+      "temperature": 0.0
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama-guard3:1b"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "rec-7094319e0384",
+        "choices": [
+          {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "safe",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": null,
+              "audio": null,
+              "function_call": null,
+              "tool_calls": null
+            }
+          }
+        ],
+        "created": 0,
+        "model": "llama-guard3:1b",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": "fp_ollama",
+        "usage": {
+          "completion_tokens": 2,
+          "prompt_tokens": 398,
+          "total_tokens": 400,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null
+        }
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/agents/recordings/74c26f63592ceedb76eb9623fce41773965dd66b569506b5622b1a797c45f8e4.json
+++ b/tests/integration/agents/recordings/74c26f63592ceedb76eb9623fce41773965dd66b569506b5622b1a797c45f8e4.json
--- a/tests/integration/agents/recordings/7e0d8c4abe407611ceaa99beea2e9973e2a938cab3db3e1be017bbe8be10edc6.json
+++ b/tests/integration/agents/recordings/7e0d8c4abe407611ceaa99beea2e9973e2a938cab3db3e1be017bbe8be10edc6.json
--- a/tests/integration/agents/recordings/868820c8d798c0d16063d1750a65ae2632ecf543ee440e7d87ea16f8e83461a5.json
+++ b/tests/integration/agents/recordings/868820c8d798c0d16063d1750a65ae2632ecf543ee440e7d87ea16f8e83461a5.json
--- a/tests/integration/agents/recordings/868db6e8a427d63dc71b749257aa40343a2876e37cc0f695fee5f1ae6e1d6ba7.json
+++ b/tests/integration/agents/recordings/868db6e8a427d63dc71b749257aa40343a2876e37cc0f695fee5f1ae6e1d6ba7.json
--- a/tests/integration/agents/recordings/86e2b939aabb9dfe7ec712a6b20a5809d6fb56f8c9f92d93030f57cba51a1fe2.json
+++ b/tests/integration/agents/recordings/86e2b939aabb9dfe7ec712a6b20a5809d6fb56f8c9f92d93030f57cba51a1fe2.json
--- a/tests/integration/agents/recordings/8ed0947593196c2b2f68035e248c137813e8db50d0d46395ef9ba98636fa5819.json
+++ b/tests/integration/agents/recordings/8ed0947593196c2b2f68035e248c137813e8db50d0d46395ef9ba98636fa5819.json
--- a/tests/integration/agents/recordings/901956b3a51b792f2506d603489af51636b480db9cc520614ee4886418776237.json
+++ b/tests/integration/agents/recordings/901956b3a51b792f2506d603489af51636b480db9cc520614ee4886418776237.json
@ -0,0 +1,58 @@
+{
+  "test_id": "tests/integration/agents/test_agents.py::test_tool_choice_get_boiling_point[ollama/llama3.2:3b-instruct-fp16]",
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama-guard3:1b",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Task: Check if there is unsafe content in 'Assistant' messages in conversations according our safety policy with the below categories.\n\n<BEGIN UNSAFE CONTENT CATEGORIES>\nS1: Violent Crimes.\nS2: Non-Violent Crimes.\nS3: Sex Crimes.\nS4: Child Exploitation.\nS5: Defamation.\nS6: Specialized Advice.\nS7: Privacy.\nS8: Intellectual Property.\nS9: Indiscriminate Weapons.\nS10: Hate.\nS11: Self-Harm.\nS12: Sexual Content.\nS13: Elections.\nS14: Code Interpreter Abuse.\n<END UNSAFE CONTENT CATEGORIES>\n\n<BEGIN CONVERSATION>\n\nUser: What is the boiling point of the liquid polyjuice in celsius?\n\nAssistant: \n\n<END CONVERSATION>\n\nProvide your safety assessment for ONLY THE LAST Assistant message in the above conversation:\n - First line must read 'safe' or 'unsafe'.\n - If unsafe, a second line must include a comma-separated list of violated categories."
+        }
+      ],
+      "stream": false,
+      "temperature": 0.0
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama-guard3:1b"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "rec-901956b3a51b",
+        "choices": [
+          {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "safe",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": null,
+              "audio": null,
+              "function_call": null,
+              "tool_calls": null
+            }
+          }
+        ],
+        "created": 0,
+        "model": "llama-guard3:1b",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": "fp_ollama",
+        "usage": {
+          "completion_tokens": 2,
+          "prompt_tokens": 401,
+          "total_tokens": 403,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null
+        }
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/agents/recordings/9354fb7f3064fe08a633f7e86444f54a53aaa490958df34f881f114c355d2dc2.json
+++ b/tests/integration/agents/recordings/9354fb7f3064fe08a633f7e86444f54a53aaa490958df34f881f114c355d2dc2.json
@ -0,0 +1,415 @@
+{
+  "test_id": "tests/integration/agents/test_agents.py::test_custom_tool[ollama/llama3.2:3b-instruct-fp16]",
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b-instruct-fp16",
+      "messages": [
+        {
+          "role": "system",
+          "content": "You are a helpful assistant"
+        },
+        {
+          "role": "user",
+          "content": "What is the boiling point of the liquid polyjuice in celsius?"
+        },
+        {
+          "role": "assistant",
+          "content": "",
+          "tool_calls": [
+            {
+              "id": "call_4p1xbexa",
+              "type": "function",
+              "function": {
+                "name": "get_boiling_point",
+                "arguments": "{\"celcius\":true,\"liquid_name\":\"polyjuice\"}"
+              }
+            }
+          ]
+        },
+        {
+          "role": "tool",
+          "tool_call_id": "call_4p1xbexa",
+          "content": "-100"
+        }
+      ],
+      "max_tokens": 512,
+      "stream": true,
+      "temperature": 0.0001,
+      "tool_choice": "auto",
+      "tools": [
+        {
+          "type": "function",
+          "function": {
+            "name": "get_boiling_point",
+            "description": "Returns the boiling point of a liquid in Celcius or Fahrenheit.",
+            "parameters": {
+              "type": "object",
+              "properties": {
+                "liquid_name": {
+                  "type": "string",
+                  "description": "The name of the liquid"
+                },
+                "celcius": {
+                  "type": "boolean",
+                  "description": "Whether to return the boiling point in Celcius"
+                }
+              },
+              "required": [
+                "liquid_name"
+              ]
+            }
+          }
+        }
+      ],
+      "top_p": 0.9
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama3.2:3b-instruct-fp16"
+  },
+  "response": {
+    "body": [
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9354fb7f3064",
+          "choices": [
+            {
+              "delta": {
+                "content": "The",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9354fb7f3064",
+          "choices": [
+            {
+              "delta": {
+                "content": " boiling",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9354fb7f3064",
+          "choices": [
+            {
+              "delta": {
+                "content": " point",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9354fb7f3064",
+          "choices": [
+            {
+              "delta": {
+                "content": " of",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9354fb7f3064",
+          "choices": [
+            {
+              "delta": {
+                "content": " Poly",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9354fb7f3064",
+          "choices": [
+            {
+              "delta": {
+                "content": "ju",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9354fb7f3064",
+          "choices": [
+            {
+              "delta": {
+                "content": "ice",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9354fb7f3064",
+          "choices": [
+            {
+              "delta": {
+                "content": " is",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9354fb7f3064",
+          "choices": [
+            {
+              "delta": {
+                "content": " -",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9354fb7f3064",
+          "choices": [
+            {
+              "delta": {
+                "content": "100",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9354fb7f3064",
+          "choices": [
+            {
+              "delta": {
+                "content": "\u00b0C",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9354fb7f3064",
+          "choices": [
+            {
+              "delta": {
+                "content": ".",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "rec-9354fb7f3064",
+          "choices": [
+            {
+              "delta": {
+                "content": "",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": "stop",
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 0,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      }
+    ],
+    "is_streaming": true
+  }
+}
--- a/tests/integration/agents/recordings/958f9b74e98bcf41e4988db8ad15494b8fe0ff707261108305353e4ad980195f.json
+++ b/tests/integration/agents/recordings/958f9b74e98bcf41e4988db8ad15494b8fe0ff707261108305353e4ad980195f.json
--- a/tests/integration/agents/recordings/961052707b2d39c56d3ac2ee7c71d67fffc2c1398c7d64d8e18a7b28135c42c3.json
+++ b/tests/integration/agents/recordings/961052707b2d39c56d3ac2ee7c71d67fffc2c1398c7d64d8e18a7b28135c42c3.json
--- a/tests/integration/agents/recordings/96623a251d6e51ee6ba21c53ca111d4aa54882a124d783a8096fd88adf481065.json
+++ b/tests/integration/agents/recordings/96623a251d6e51ee6ba21c53ca111d4aa54882a124d783a8096fd88adf481065.json
--- a/tests/integration/agents/recordings/a702e4bf918e94acd0d76ed753c120a4704dde82acf5ae198982fd62bd103279.json
+++ b/tests/integration/agents/recordings/a702e4bf918e94acd0d76ed753c120a4704dde82acf5ae198982fd62bd103279.json
@ -0,0 +1,58 @@
+{
+  "test_id": "tests/integration/agents/test_agents.py::test_tool_choice_get_boiling_point[ollama/llama3.2:3b-instruct-fp16]",
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama-guard3:1b",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Task: Check if there is unsafe content in 'User' messages in conversations according our safety policy with the below categories.\n\n<BEGIN UNSAFE CONTENT CATEGORIES>\nS1: Violent Crimes.\nS2: Non-Violent Crimes.\nS3: Sex Crimes.\nS4: Child Exploitation.\nS5: Defamation.\nS6: Specialized Advice.\nS7: Privacy.\nS8: Intellectual Property.\nS9: Indiscriminate Weapons.\nS10: Hate.\nS11: Self-Harm.\nS12: Sexual Content.\nS13: Elections.\nS14: Code Interpreter Abuse.\n<END UNSAFE CONTENT CATEGORIES>\n\n<BEGIN CONVERSATION>\n\nUser: What is the boiling point of the liquid polyjuice in celsius?\n\n<END CONVERSATION>\n\nProvide your safety assessment for ONLY THE LAST User message in the above conversation:\n - First line must read 'safe' or 'unsafe'.\n - If unsafe, a second line must include a comma-separated list of violated categories."
+        }
+      ],
+      "stream": false,
+      "temperature": 0.0
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama-guard3:1b"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "rec-a702e4bf918e",
+        "choices": [
+          {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "safe",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": null,
+              "audio": null,
+              "function_call": null,
+              "tool_calls": null
+            }
+          }
+        ],
+        "created": 0,
+        "model": "llama-guard3:1b",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": "fp_ollama",
+        "usage": {
+          "completion_tokens": 2,
+          "prompt_tokens": 398,
+          "total_tokens": 400,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null
+        }
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/recordings/responses/accd741d4b74.json
+++ b/tests/integration/recordings/responses/accd741d4b74.json
--- a/tests/integration/agents/recordings/ad3f6a2b4031bcd38026c3c50617851f102c12946164a563584e6316bd1b6228.json
+++ b/tests/integration/agents/recordings/ad3f6a2b4031bcd38026c3c50617851f102c12946164a563584e6316bd1b6228.json
--- a/tests/integration/agents/recordings/ae230a6062d097d51a5a54cb41852f7f33f6b2c7a400039007c0822cac28d434.json
+++ b/tests/integration/agents/recordings/ae230a6062d097d51a5a54cb41852f7f33f6b2c7a400039007c0822cac28d434.json
--- a/tests/integration/agents/recordings/b62d824408e5d5f1e278fb6ed654b18a8c50eeed5cfdd125ec110752595649bc.json
+++ b/tests/integration/agents/recordings/b62d824408e5d5f1e278fb6ed654b18a8c50eeed5cfdd125ec110752595649bc.json
--- a/tests/integration/agents/recordings/bcb50763cac6527944653e77df1f2cc9e9481ffae9cff5693b2f669270c9c0a7.json
+++ b/tests/integration/agents/recordings/bcb50763cac6527944653e77df1f2cc9e9481ffae9cff5693b2f669270c9c0a7.json
--- a/tests/integration/agents/recordings/c97c102959ea8c64a43e4c752bf5e979c7a298fdbeedec153954ce817da7e3e7.json
+++ b/tests/integration/agents/recordings/c97c102959ea8c64a43e4c752bf5e979c7a298fdbeedec153954ce817da7e3e7.json
--- a/tests/integration/agents/recordings/ca95f47c2896a7ce1536a3cf1a78170e073e49e478658a4b098b3581a703e843.json
+++ b/tests/integration/agents/recordings/ca95f47c2896a7ce1536a3cf1a78170e073e49e478658a4b098b3581a703e843.json
--- a/tests/integration/agents/recordings/d35fc2ef48595f5d0afe8fb617c8df864a155017e8d4c5d2e2b2c51e9cfaed5e.json
+++ b/tests/integration/agents/recordings/d35fc2ef48595f5d0afe8fb617c8df864a155017e8d4c5d2e2b2c51e9cfaed5e.json
--- a/tests/integration/agents/recordings/da6ec32ed8fb5bf4d957058ddd391a9958c4cfd0ab301bbd7b466a889cfc631b.json
+++ b/tests/integration/agents/recordings/da6ec32ed8fb5bf4d957058ddd391a9958c4cfd0ab301bbd7b466a889cfc631b.json
--- a/tests/integration/agents/recordings/db5c89b87eba0d129ad9ed17306d4016aeeaf2bbeeaa5643d9620f5ea484430e.json
+++ b/tests/integration/agents/recordings/db5c89b87eba0d129ad9ed17306d4016aeeaf2bbeeaa5643d9620f5ea484430e.json
--- a/tests/integration/agents/recordings/df1ff58ea16f0cfb14c80acfee395b186a1feed8262ef40538f0a215954e4398.json
+++ b/tests/integration/agents/recordings/df1ff58ea16f0cfb14c80acfee395b186a1feed8262ef40538f0a215954e4398.json
--- a/tests/integration/agents/recordings/df8355d4d3267371f2a65159d80cfb4e34591e6222a5bc3a079c94a068bf8fd7.json
+++ b/tests/integration/agents/recordings/df8355d4d3267371f2a65159d80cfb4e34591e6222a5bc3a079c94a068bf8fd7.json
--- a/tests/integration/agents/recordings/ed76dd5fdf892c9cc959b2d301a256f81c43a906a0a56684ca97e848f8d6a94c.json
+++ b/tests/integration/agents/recordings/ed76dd5fdf892c9cc959b2d301a256f81c43a906a0a56684ca97e848f8d6a94c.json
--- a/tests/integration/agents/recordings/f85c3c14185386eecd4939eeb6b3a3cee734d69beb7cd6d13a3d3c2c64eca734.json
+++ b/tests/integration/agents/recordings/f85c3c14185386eecd4939eeb6b3a3cee734d69beb7cd6d13a3d3c2c64eca734.json
--- a/tests/integration/common/recordings/02c93bb3c314427bae2b7a7a6f054792b9f22d2cb4522eab802810be8672d3dc.json
+++ b/tests/integration/common/recordings/02c93bb3c314427bae2b7a7a6f054792b9f22d2cb4522eab802810be8672d3dc.json
@ -1,4 +1,5 @@
 {
+  "test_id": null,
  "request": {
    "method": "POST",
    "url": "http://localhost:11434/api/ps",
--- a/tests/integration/common/recordings/1098240ef53bbd378adf8dafbd5838b16eef7d6a7d6e75d24e3c120e25e73750.json
+++ b/tests/integration/common/recordings/1098240ef53bbd378adf8dafbd5838b16eef7d6a7d6e75d24e3c120e25e73750.json
@ -0,0 +1,57 @@
+{
+  "test_id": null,
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b-instruct-fp16",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Test trace openai 1"
+        }
+      ],
+      "stream": false
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama3.2:3b-instruct-fp16"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "rec-1098240ef53b",
+        "choices": [
+          {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "I'm happy to help you test the OpenAI API, but I need to clarify that I'm a large language model, not a direct interface to the OpenAI API. However, I can guide you through a simple testing process.\n\nOpenAI provides an API key that allows access to their models, including the powerful \"text-001\" model you mentioned. Here's how you can test it:\n\n**Step 1: Get your API Key**\n\nIf you haven't already, sign up for an OpenAI account at [openai.com](http://openai.com) and obtain a free API key.\n\n**Step 2: Access the API endpoint**\n\nOpenAI provides a few different endpoints for their models. For testing purposes, we'll use the \"API endpoint\" which is `https://api.openai.com/v1/models/text-001`. You can access this URL directly in your web browser or use a tool like `curl` to send an HTTP request.\n\n**Step 3: Test with a simple prompt**\n\nTo test the model, you can use a simple prompt, such as:\n\n`\"Write a short paragraph about AI.\"`\n\nYou can modify this prompt to suit your testing needs. You can also try using more complex prompts or even generating text based on user input.\n\nHere's an example of how you could send a request to the OpenAI endpoint using `curl`:\n```bash\ncurl -X POST \\\n  https://api.openai.com/v1/models/text-001 \\\n  -H 'Authorization: Bearer YOUR_API_KEY' \\\n  -H 'Content-Type: application/json' \\\n  -d '{\"prompt\": \"Write a short paragraph about AI.\"}'\n```\nReplace `YOUR_API_KEY` with your actual OpenAI API key.\n\n**Step 4: Verify the response**\n\nAfter sending the request, you should receive a JSON response from OpenAI. The response will contain the generated text based on your prompt.\n\nIf everything goes well, you should see some text in response to your prompt. 
This might look something like this:\n```json\n{\n  \"result\": [\n    {\n      \"text\": \"Artificial intelligence has been able to learn and adapt quickly, allowing it to take over various tasks automatically.\"\n    }\n  ]\n}\n```\nThis is just a simple example to demonstrate the process of testing the OpenAI API. Keep in mind that this is just a brief illustration and the actual experience may vary depending on your specific use case.\n\nDo you want me to try using python - gpt-3 via Hugging Face to run some tests ? I can give it a shot if you're not up to the task yourself!",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": null,
+              "audio": null,
+              "function_call": null,
+              "tool_calls": null
+            }
+          }
+        ],
+        "created": 0,
+        "model": "llama3.2:3b-instruct-fp16",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": "fp_ollama",
+        "usage": {
+          "completion_tokens": 539,
+          "prompt_tokens": 31,
+          "total_tokens": 570,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null
+        }
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/common/recordings/395c30078677826058a0cbe136dfd07c816854cfb7015ee4ece0e414d16e7e52.json
+++ b/tests/integration/common/recordings/395c30078677826058a0cbe136dfd07c816854cfb7015ee4ece0e414d16e7e52.json
@ -0,0 +1,57 @@
+{
+  "test_id": null,
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b-instruct-fp16",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Test trace openai 2"
+        }
+      ],
+      "stream": false
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama3.2:3b-instruct-fp16"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "rec-395c30078677",
+        "choices": [
+          {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "I'd be happy to help you test the OpenAI 2.0 model!\n\nPlease note that I need to clarify a few things:\n\n1. **I can only simulate the conversation**: I won't have direct access to the actual OpenAI 2.0 model.\n2. **The simulation might not be perfect**: The testing process will be based on my understanding of the model's capabilities and limitations.\n\nThat being said, here's how you can test the OpenAI 2.0 model with me:\n\n**Testing Scenarios**\n\nChoose one or more of the following scenarios to test the OpenAI 2.0 model:\n\n1. **Basic Conversation**: Test basic conversations, such as asking questions and receiving answers.\n2. **Dialogue Flow**: Test the model's ability to maintain a conversation flow by asking follow-up questions.\n3. **Creative Writing**: Test the model's creativity by asking it to generate paragraphs or short stories.\n4. **Fact Retrieval**: Test the model's knowledge retrieval capabilities by asking it factual questions.\n\n**Input Format**\n\nPlease format your input using plain text. You can use:\n\n* A single question or statement\n* Multiple statements separated by commas\n* A prompt with a specific format (e.g., \"Write a short story about [topic]\")\n\n**Example Input**\n```\nAsk me: What is the capital of France?\n```\n\nGo ahead and choose your testing scenario, and I'll simulate the conversation!",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": null,
+              "audio": null,
+              "function_call": null,
+              "tool_calls": null
+            }
+          }
+        ],
+        "created": 0,
+        "model": "llama3.2:3b-instruct-fp16",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": "fp_ollama",
+        "usage": {
+          "completion_tokens": 293,
+          "prompt_tokens": 31,
+          "total_tokens": 324,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null
+        }
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/common/recordings/41c28019c2c89e5962ae3043dc7015ee45aa5ee235645768a230a5fa5cd45ad9.json
+++ b/tests/integration/common/recordings/41c28019c2c89e5962ae3043dc7015ee45aa5ee235645768a230a5fa5cd45ad9.json
--- a/tests/integration/common/recordings/4df315784095a200b2d275c6f8dda8be845e250000208127d20cf8c4f0bc666c.json
+++ b/tests/integration/common/recordings/4df315784095a200b2d275c6f8dda8be845e250000208127d20cf8c4f0bc666c.json
--- a/tests/integration/common/recordings/5b03940f8f14616ba20bf3b695138b785ffc26aed814ef01db492f4a5674d6c5.json
+++ b/tests/integration/common/recordings/5b03940f8f14616ba20bf3b695138b785ffc26aed814ef01db492f4a5674d6c5.json
--- a/tests/integration/common/recordings/64a2277c90f0f42576f60c1030e3a020403d34a95f56931b792d5939f4cebc57.json
+++ b/tests/integration/common/recordings/64a2277c90f0f42576f60c1030e3a020403d34a95f56931b792d5939f4cebc57.json
@ -1,4 +1,5 @@
 {
+  "test_id": null,
  "request": {
    "method": "POST",
    "url": "http://0.0.0.0:11434/v1/v1/models",
--- a/tests/integration/common/recordings/b2c646582d0a4d9d8986789261c0d630d5b604ee6291cf8aa3d44ab761f2c676.json
+++ b/tests/integration/common/recordings/b2c646582d0a4d9d8986789261c0d630d5b604ee6291cf8aa3d44ab761f2c676.json
--- a/tests/integration/common/recordings/bfc8818f4ad237ba6c9649d47eaff8946e334ea6a2bcb564d74f4f14dbc3497b.json
+++ b/tests/integration/common/recordings/bfc8818f4ad237ba6c9649d47eaff8946e334ea6a2bcb564d74f4f14dbc3497b.json
@ -0,0 +1,56 @@
+{
+  "test_id": null,
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b-instruct-fp16",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Test trace 1"
+        }
+      ]
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama3.2:3b-instruct-fp16"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "rec-bfc8818f4ad2",
+        "choices": [
+          {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "It seems like you'd like to test something, but I'm not sure what. Could you please provide more context or clarify what you're trying to test? I'll do my best to assist you!",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": null,
+              "audio": null,
+              "function_call": null,
+              "tool_calls": null
+            }
+          }
+        ],
+        "created": 0,
+        "model": "llama3.2:3b-instruct-fp16",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": "fp_ollama",
+        "usage": {
+          "completion_tokens": 42,
+          "prompt_tokens": 29,
+          "total_tokens": 71,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null
+        }
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/common/recordings/c34cccb2af2fb9f02f7136b0dd350e75e7d2a77d222ef26a9bc419e10fa33c56.json
+++ b/tests/integration/common/recordings/c34cccb2af2fb9f02f7136b0dd350e75e7d2a77d222ef26a9bc419e10fa33c56.json
@ -0,0 +1,57 @@
+{
+  "test_id": null,
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b-instruct-fp16",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Test trace openai 0"
+        }
+      ],
+      "stream": false
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama3.2:3b-instruct-fp16"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "rec-c34cccb2af2f",
+        "choices": [
+          {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "I'm not capable of testing or interacting with the OpenAI API directly. However, I can provide some general information about the OpenAI model called \"Trace\" and how it can be used.\n\nThe Trace was a transformer-based language model developed by OpenAI in 2022. It was designed to generate text based on a given prompt, but it never gained widespread use.\n\nUnfortunately, the model is no longer available for public testing because OpenAI removed it from their model hub after some issues were raised about its quality and limitations.\n\nThat being said, there are various other models that you might be interested in using as an alternative to the Trace. Here's a list of popular models:\n\n1. **Text-Transformer**: This model is designed for text classification and generation tasks.\n2, **DALL-E 2**: A text-to-image model capable of generating images based on user-provided input prompts.\n3:**Diffusers**: An AI model that can generate raw pixel data in the form of an image.\n\nTo test these models or others available through OpenAI models hub you may need to complete a sign-up process.",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": null,
+              "audio": null,
+              "function_call": null,
+              "tool_calls": null
+            }
+          }
+        ],
+        "created": 0,
+        "model": "llama3.2:3b-instruct-fp16",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": "fp_ollama",
+        "usage": {
+          "completion_tokens": 225,
+          "prompt_tokens": 31,
+          "total_tokens": 256,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null
+        }
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/common/recordings/c4f314b202711805808eb75f1947cb6cca0bf8dbffb0dfabb814f9da0083b3c3.json
+++ b/tests/integration/common/recordings/c4f314b202711805808eb75f1947cb6cca0bf8dbffb0dfabb814f9da0083b3c3.json
--- a/tests/integration/common/recordings/models-64a2277c90f0f42576f60c1030e3a020403d34a95f56931b792d5939f4cebc57-52e8575f.json
+++ b/tests/integration/common/recordings/models-64a2277c90f0f42576f60c1030e3a020403d34a95f56931b792d5939f4cebc57-52e8575f.json
@ -1,4 +1,5 @@
 {
+  "test_id": null,
  "request": {
    "method": "POST",
    "url": "https://api.openai.com/v1/v1/models",
--- a/tests/integration/common/recordings/models-64a2277c90f0f42576f60c1030e3a020403d34a95f56931b792d5939f4cebc57-e660ee4a.json
+++ b/tests/integration/common/recordings/models-64a2277c90f0f42576f60c1030e3a020403d34a95f56931b792d5939f4cebc57-e660ee4a.json
@ -1,4 +1,5 @@
 {
+  "test_id": null,
  "request": {
    "method": "POST",
    "url": "http://0.0.0.0:11434/v1/v1/models",
@ -9,47 +10,11 @@
  },
  "response": {
    "body": [
-      {
-        "__type__": "openai.types.model.Model",
-        "__data__": {
-          "id": "qwen3:8b",
-          "created": 1758707188,
-          "object": "model",
-          "owned_by": "library"
-        }
-      },
-      {
-        "__type__": "openai.types.model.Model",
-        "__data__": {
-          "id": "nomic-embed-text:137m-v1.5-fp16",
-          "created": 1758640855,
-          "object": "model",
-          "owned_by": "library"
-        }
-      },
      {
        "__type__": "openai.types.model.Model",
        "__data__": {
          "id": "nomic-embed-text:latest",
-          "created": 1756727155,
-          "object": "model",
-          "owned_by": "library"
-        }
-      },
-      {
-        "__type__": "openai.types.model.Model",
-        "__data__": {
-          "id": "llama3.2-vision:11b",
-          "created": 1756722893,
-          "object": "model",
-          "owned_by": "library"
-        }
-      },
-      {
-        "__type__": "openai.types.model.Model",
-        "__data__": {
-          "id": "llama-guard3:1b",
-          "created": 1756671473,
+          "created": 1756922046,
          "object": "model",
          "owned_by": "library"
        }
@ -58,7 +23,43 @@
        "__type__": "openai.types.model.Model",
        "__data__": {
          "id": "all-minilm:l6-v2",
-          "created": 1756655274,
+          "created": 1756919946,
+          "object": "model",
+          "owned_by": "library"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "llama3.2-vision:11b",
+          "created": 1753926302,
+          "object": "model",
+          "owned_by": "library"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "llama3.2-vision:latest",
+          "created": 1753845527,
+          "object": "model",
+          "owned_by": "library"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "llama-guard3:1b",
+          "created": 1753479584,
+          "object": "model",
+          "owned_by": "library"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "llama3.2:1b",
+          "created": 1752814944,
          "object": "model",
          "owned_by": "library"
        }
@ -67,16 +68,7 @@
        "__type__": "openai.types.model.Model",
        "__data__": {
          "id": "all-minilm:latest",
-          "created": 1747317111,
-          "object": "model",
-          "owned_by": "library"
-        }
-      },
-      {
-        "__type__": "openai.types.model.Model",
-        "__data__": {
-          "id": "llama3.2:3b-instruct-fp16",
-          "created": 1744974677,
+          "created": 1748994610,
          "object": "model",
          "owned_by": "library"
        }
@ -85,7 +77,16 @@
        "__type__": "openai.types.model.Model",
        "__data__": {
          "id": "llama3.2:3b",
-          "created": 1743536220,
+          "created": 1746123323,
+          "object": "model",
+          "owned_by": "library"
+        }
+      },
+      {
+        "__type__": "openai.types.model.Model",
+        "__data__": {
+          "id": "llama3.2:3b-instruct-fp16",
+          "created": 1746052428,
          "object": "model",
          "owned_by": "library"
        }
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@ -36,6 +36,24 @@ def pytest_sessionstart(session):
        os.environ["LLAMA_STACK_TEST_INFERENCE_MODE"] = "replay"


+@pytest.fixture(autouse=True)
+def _track_test_context(request):
+    """Publish the running test's nodeid to the inference recorder for isolated recordings.
+
+    Autouse, so it wraps every test: the nodeid is stored in the recorder's ``_test_context``
+    ContextVar before the test body runs, and the previous value is restored after it finishes.
+    """
+    from llama_stack.testing.inference_recorder import _test_context
+
+    # Store the test nodeid (e.g., "tests/integration/responses/test_basic.py::test_foo[params]"); keep the token for teardown
+    token = _test_context.set(request.node.nodeid)
+
+    yield
+
+    # Teardown: restore the value the ContextVar held before this test, so the nodeid does not leak into later tests
+    _test_context.reset(token)
+
+
 def pytest_runtest_teardown(item):
    # Check if the test actually ran and passed or failed, but was not skipped or an expected failure (xfail)
    outcome = getattr(item, "execution_outcome", None)
@ -137,8 +155,8 @@ def pytest_addoption(parser):

    parser.addoption(
        "--inference-mode",
-        help="Inference mode: { record, replay, live } (default: replay)",
-        choices=["record", "replay", "live"],
+        help="Inference mode: { record, replay, live, record-if-missing } (default: replay)",
+        choices=["record", "replay", "live", "record-if-missing"],
        default="replay",
    )
    parser.addoption(
--- a/tests/integration/eval/recordings/171c4dcb3dc848196f5d7fd87efd4626e70673c405ae1cd72b8dd0617104263e.json
+++ b/tests/integration/eval/recordings/171c4dcb3dc848196f5d7fd87efd4626e70673c405ae1cd72b8dd0617104263e.json
--- a/tests/integration/eval/recordings/1b2720589d2a4273b5eb2c06b50ab45674040195c15013c9ea43bc6331e1a831.json
+++ b/tests/integration/eval/recordings/1b2720589d2a4273b5eb2c06b50ab45674040195c15013c9ea43bc6331e1a831.json
@ -0,0 +1,57 @@
+{
+  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_benchmark[txt=ollama/llama3.2:3b-instruct-fp16-basic::subset_of]",
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b-instruct-fp16",
+      "messages": [
+        {
+          "role": "user",
+          "content": "What is the currency of Japan?"
+        }
+      ],
+      "max_tokens": 0
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama3.2:3b-instruct-fp16"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "rec-1b2720589d2a",
+        "choices": [
+          {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "The official currency of Japan is the yen (JPY). It is denoted by the ISO code \"JPY\" and subdivided into 100 sen, although sen are no longer used in everyday transactions. The Japanese government currently plans to phase out cash payments for certain types of transactions and aims to move fully away from paper money by year-end 2031.",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": null,
+              "audio": null,
+              "function_call": null,
+              "tool_calls": null
+            }
+          }
+        ],
+        "created": 0,
+        "model": "llama3.2:3b-instruct-fp16",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": "fp_ollama",
+        "usage": {
+          "completion_tokens": 74,
+          "prompt_tokens": 32,
+          "total_tokens": 106,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null
+        }
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/eval/recordings/3e5ea35cb3dc92835d230456b6e2fc61593f964148d6c05df5c4a387a5389e6b.json
+++ b/tests/integration/eval/recordings/3e5ea35cb3dc92835d230456b6e2fc61593f964148d6c05df5c4a387a5389e6b.json
@ -0,0 +1,57 @@
+{
+  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_benchmark[txt=ollama/llama3.2:3b-instruct-fp16-basic::subset_of]",
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b-instruct-fp16",
+      "messages": [
+        {
+          "role": "user",
+          "content": "What is the smallest country in the world?"
+        }
+      ],
+      "max_tokens": 0
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama3.2:3b-instruct-fp16"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "rec-3e5ea35cb3dc",
+        "choices": [
+          {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "The smallest country in the world is officially Vatican City, with an area of approximately 0.44 km\u00b2 (0.17 sq mi). It is located within Rome, Italy, and is home to the Pope and the central government of the Catholic Church.\n\nVatican City has a population of around 800 people, making it the smallest internationally recognized sovereign state in the world, both by area and population. Despite its small size, Vatican City has its own government, currency, postal system, and even its own police force.\n\nInterestingly, Vatican City is also the headquarters of the Catholic Church and is home to numerous iconic landmarks, including St. Peter's Basilica, the Sistine Chapel, and the Vatican Museums.",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": null,
+              "audio": null,
+              "function_call": null,
+              "tool_calls": null
+            }
+          }
+        ],
+        "created": 0,
+        "model": "llama3.2:3b-instruct-fp16",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": "fp_ollama",
+        "usage": {
+          "completion_tokens": 146,
+          "prompt_tokens": 34,
+          "total_tokens": 180,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null
+        }
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/eval/recordings/6de6d1ebc3128dfaba1efe654ca1453f12cd31ce2e294d20868c0c498b7d9136.json
+++ b/tests/integration/eval/recordings/6de6d1ebc3128dfaba1efe654ca1453f12cd31ce2e294d20868c0c498b7d9136.json
@ -0,0 +1,57 @@
+{
+  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_benchmark[txt=ollama/llama3.2:3b-instruct-fp16-basic::subset_of]",
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b-instruct-fp16",
+      "messages": [
+        {
+          "role": "user",
+          "content": "What is the capital of France?"
+        }
+      ],
+      "max_tokens": 0
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama3.2:3b-instruct-fp16"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "rec-6de6d1ebc312",
+        "choices": [
+          {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "The capital of France is Paris.",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": null,
+              "audio": null,
+              "function_call": null,
+              "tool_calls": null
+            }
+          }
+        ],
+        "created": 0,
+        "model": "llama3.2:3b-instruct-fp16",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": "fp_ollama",
+        "usage": {
+          "completion_tokens": 8,
+          "prompt_tokens": 32,
+          "total_tokens": 40,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null
+        }
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/eval/recordings/9ebe1e04fc3a8d41f88992428a7c99669c7e19b3d551090eb6bec83b33de2a18.json
+++ b/tests/integration/eval/recordings/9ebe1e04fc3a8d41f88992428a7c99669c7e19b3d551090eb6bec83b33de2a18.json
--- a/tests/integration/eval/recordings/aa20023c358a0dc718355082cc244a231426700a772b8dc64abf05d8b126a736.json
+++ b/tests/integration/eval/recordings/aa20023c358a0dc718355082cc244a231426700a772b8dc64abf05d8b126a736.json
--- a/tests/integration/eval/recordings/b52a054b314c8b42634c4a9ef76280591f73cf26c00b7308dde7d19a1ced016c.json
+++ b/tests/integration/eval/recordings/b52a054b314c8b42634c4a9ef76280591f73cf26c00b7308dde7d19a1ced016c.json
--- a/tests/integration/eval/recordings/c07b01fe99467efcfa99f6ac9c60acc212cf2ac3bdd4192aabb5f98359236572.json
+++ b/tests/integration/eval/recordings/c07b01fe99467efcfa99f6ac9c60acc212cf2ac3bdd4192aabb5f98359236572.json
--- a/tests/integration/files/recordings/108e7edbe4a967fdb8befc0df3d3c10c336afca41aad421e653e4b02eab2a632.json
+++ b/tests/integration/files/recordings/108e7edbe4a967fdb8befc0df3d3c10c336afca41aad421e653e4b02eab2a632.json
@ -0,0 +1,59 @@
+{
+  "test_id": "tests/integration/files/test_files.py::test_files_authentication_shared_attributes",
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:11434/api/ps",
+    "headers": {},
+    "body": {},
+    "endpoint": "/api/ps",
+    "model": ""
+  },
+  "response": {
+    "body": {
+      "__type__": "ollama._types.ProcessResponse",
+      "__data__": {
+        "models": [
+          {
+            "model": "all-minilm:l6-v2",
+            "name": "all-minilm:l6-v2",
+            "digest": "1b226e2802dbb772b5fc32a58f103ca1804ef7501331012de126ab22f67475ef",
+            "expires_at": "2025-10-03T21:21:14.509335-07:00",
+            "size": 585846784,
+            "size_vram": 585846784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "bert",
+              "families": [
+                "bert"
+              ],
+              "parameter_size": "23M",
+              "quantization_level": "F16"
+            },
+            "context_length": 256
+          },
+          {
+            "model": "llama3.2:3b-instruct-fp16",
+            "name": "llama3.2:3b-instruct-fp16",
+            "digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d",
+            "expires_at": "2025-10-03T21:20:56.787737-07:00",
+            "size": 7919570944,
+            "size_vram": 7919570944,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "3.2B",
+              "quantization_level": "F16"
+            },
+            "context_length": 4096
+          }
+        ]
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/files/recordings/12a8beefd95278334e190cfbc8dba722684325cb294aaf45dd42f316d9f6ae5e.json
+++ b/tests/integration/files/recordings/12a8beefd95278334e190cfbc8dba722684325cb294aaf45dd42f316d9f6ae5e.json
@ -0,0 +1,59 @@
+{
+  "test_id": "tests/integration/files/test_files.py::test_files_authentication_isolation",
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:11434/api/ps",
+    "headers": {},
+    "body": {},
+    "endpoint": "/api/ps",
+    "model": ""
+  },
+  "response": {
+    "body": {
+      "__type__": "ollama._types.ProcessResponse",
+      "__data__": {
+        "models": [
+          {
+            "model": "all-minilm:l6-v2",
+            "name": "all-minilm:l6-v2",
+            "digest": "1b226e2802dbb772b5fc32a58f103ca1804ef7501331012de126ab22f67475ef",
+            "expires_at": "2025-10-03T21:21:14.509335-07:00",
+            "size": 585846784,
+            "size_vram": 585846784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "bert",
+              "families": [
+                "bert"
+              ],
+              "parameter_size": "23M",
+              "quantization_level": "F16"
+            },
+            "context_length": 256
+          },
+          {
+            "model": "llama3.2:3b-instruct-fp16",
+            "name": "llama3.2:3b-instruct-fp16",
+            "digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d",
+            "expires_at": "2025-10-03T21:20:56.787737-07:00",
+            "size": 7919570944,
+            "size_vram": 7919570944,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "3.2B",
+              "quantization_level": "F16"
+            },
+            "context_length": 4096
+          }
+        ]
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/files/recordings/1ed03eb1155d3be1507472aca14a1ae10513ccc6b93e7e9d9d50f4cf83b8276b.json
+++ b/tests/integration/files/recordings/1ed03eb1155d3be1507472aca14a1ae10513ccc6b93e7e9d9d50f4cf83b8276b.json
@ -0,0 +1,59 @@
+{
+  "test_id": "tests/integration/files/test_files.py::test_files_authentication_anonymous_access",
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:11434/api/ps",
+    "headers": {},
+    "body": {},
+    "endpoint": "/api/ps",
+    "model": ""
+  },
+  "response": {
+    "body": {
+      "__type__": "ollama._types.ProcessResponse",
+      "__data__": {
+        "models": [
+          {
+            "model": "all-minilm:l6-v2",
+            "name": "all-minilm:l6-v2",
+            "digest": "1b226e2802dbb772b5fc32a58f103ca1804ef7501331012de126ab22f67475ef",
+            "expires_at": "2025-10-03T21:21:14.509335-07:00",
+            "size": 585846784,
+            "size_vram": 585846784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "bert",
+              "families": [
+                "bert"
+              ],
+              "parameter_size": "23M",
+              "quantization_level": "F16"
+            },
+            "context_length": 256
+          },
+          {
+            "model": "llama3.2:3b-instruct-fp16",
+            "name": "llama3.2:3b-instruct-fp16",
+            "digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d",
+            "expires_at": "2025-10-03T21:20:56.787737-07:00",
+            "size": 7919570944,
+            "size_vram": 7919570944,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "3.2B",
+              "quantization_level": "F16"
+            },
+            "context_length": 4096
+          }
+        ]
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/inference/recordings/0483665572c6b72adda44aa0b31c105c4a5ac4da26d514c61fbd319910e1ce05.json
+++ b/tests/integration/inference/recordings/0483665572c6b72adda44aa0b31c105c4a5ac4da26d514c61fbd319910e1ce05.json
@ -1,4 +1,5 @@
 {
+  "test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_with_user_parameter[llama_stack_client-emb=ollama/all-minilm:l6-v2]",
  "request": {
    "method": "POST",
    "url": "http://0.0.0.0:11434/v1/v1/embeddings",
--- a/tests/integration/inference/recordings/12ea4e4a2705b0589c186d9d068bac0afa6d2385867afa554c6c672722a4e1c0.json
+++ b/tests/integration/inference/recordings/12ea4e4a2705b0589c186d9d068bac0afa6d2385867afa554c6c672722a4e1c0.json
--- a/tests/integration/inference/recordings/166dbff8b991d40d060f9307fc2a67fc24e29dc542fdce9611756d28403e4e86.json
+++ b/tests/integration/inference/recordings/166dbff8b991d40d060f9307fc2a67fc24e29dc542fdce9611756d28403e4e86.json
@ -0,0 +1,78 @@
+{
+  "test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_with_dimensions[llama_stack_client-emb=ollama/all-minilm:l6-v2]",
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:11434/api/ps",
+    "headers": {},
+    "body": {},
+    "endpoint": "/api/ps",
+    "model": ""
+  },
+  "response": {
+    "body": {
+      "__type__": "ollama._types.ProcessResponse",
+      "__data__": {
+        "models": [
+          {
+            "model": "all-minilm:l6-v2",
+            "name": "all-minilm:l6-v2",
+            "digest": "1b226e2802dbb772b5fc32a58f103ca1804ef7501331012de126ab22f67475ef",
+            "expires_at": "2025-10-03T20:06:55.143331-07:00",
+            "size": 585846784,
+            "size_vram": 585846784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "bert",
+              "families": [
+                "bert"
+              ],
+              "parameter_size": "23M",
+              "quantization_level": "F16"
+            },
+            "context_length": 256
+          },
+          {
+            "model": "llama3.2:3b-instruct-fp16",
+            "name": "llama3.2:3b-instruct-fp16",
+            "digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d",
+            "expires_at": "2025-10-03T20:06:54.261283-07:00",
+            "size": 7919570944,
+            "size_vram": 7919570944,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "3.2B",
+              "quantization_level": "F16"
+            },
+            "context_length": 4096
+          },
+          {
+            "model": "llama-guard3:1b",
+            "name": "llama-guard3:1b",
+            "digest": "494147e06bf99e10dbe67b63a07ac81c162f18ef3341aa3390007ac828571b3b",
+            "expires_at": "2025-10-03T20:06:23.356289-07:00",
+            "size": 2350966784,
+            "size_vram": 2350966784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "1.5B",
+              "quantization_level": "Q8_0"
+            },
+            "context_length": 4096
+          }
+        ]
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/inference/recordings/1774e408dada8623a932f20fd67ab722cbff2a2213309d8a33ec2ed5a444e5d4.json
+++ b/tests/integration/inference/recordings/1774e408dada8623a932f20fd67ab722cbff2a2213309d8a33ec2ed5a444e5d4.json
@ -1,4 +1,5 @@
 {
+  "test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_with_dimensions[llama_stack_client-emb=ollama/all-minilm:l6-v2]",
  "request": {
    "method": "POST",
    "url": "http://0.0.0.0:11434/v1/v1/embeddings",
--- a/tests/integration/inference/recordings/1f3149657717e1cca83f08af67b9f34993a49de31349b11ce13c5d0465ca5f6a.json
+++ b/tests/integration/inference/recordings/1f3149657717e1cca83f08af67b9f34993a49de31349b11ce13c5d0465ca5f6a.json
@ -1,4 +1,5 @@
 {
+  "test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_different_inputs_different_outputs[llama_stack_client-emb=ollama/all-minilm:l6-v2]",
  "request": {
    "method": "POST",
    "url": "http://0.0.0.0:11434/v1/v1/embeddings",
--- a/tests/integration/inference/recordings/20032ec4aeb47f2a2711ec6fcada258dd47fcaefd22f8ee6a6c0b07152c551d4.json
+++ b/tests/integration/inference/recordings/20032ec4aeb47f2a2711ec6fcada258dd47fcaefd22f8ee6a6c0b07152c551d4.json
@ -0,0 +1,78 @@
+{
+  "test_id": "tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[client_with_models-txt=ollama/llama3.2:3b-instruct-fp16-inference:chat_completion:streaming_01]",
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:11434/api/ps",
+    "headers": {},
+    "body": {},
+    "endpoint": "/api/ps",
+    "model": ""
+  },
+  "response": {
+    "body": {
+      "__type__": "ollama._types.ProcessResponse",
+      "__data__": {
+        "models": [
+          {
+            "model": "llama3.2:3b-instruct-fp16",
+            "name": "llama3.2:3b-instruct-fp16",
+            "digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d",
+            "expires_at": "2025-10-03T20:07:01.911896-07:00",
+            "size": 7919570944,
+            "size_vram": 7919570944,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "3.2B",
+              "quantization_level": "F16"
+            },
+            "context_length": 4096
+          },
+          {
+            "model": "all-minilm:l6-v2",
+            "name": "all-minilm:l6-v2",
+            "digest": "1b226e2802dbb772b5fc32a58f103ca1804ef7501331012de126ab22f67475ef",
+            "expires_at": "2025-10-03T20:06:55.533878-07:00",
+            "size": 585846784,
+            "size_vram": 585846784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "bert",
+              "families": [
+                "bert"
+              ],
+              "parameter_size": "23M",
+              "quantization_level": "F16"
+            },
+            "context_length": 256
+          },
+          {
+            "model": "llama-guard3:1b",
+            "name": "llama-guard3:1b",
+            "digest": "494147e06bf99e10dbe67b63a07ac81c162f18ef3341aa3390007ac828571b3b",
+            "expires_at": "2025-10-03T20:06:23.356289-07:00",
+            "size": 2350966784,
+            "size_vram": 2350966784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "1.5B",
+              "quantization_level": "Q8_0"
+            },
+            "context_length": 4096
+          }
+        ]
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/inference/recordings/228c053a646f7563791aa7f633bb7353eb3f763add416a69ddb255b16f6ea0a9.json
+++ b/tests/integration/inference/recordings/228c053a646f7563791aa7f633bb7353eb3f763add416a69ddb255b16f6ea0a9.json
@ -0,0 +1,78 @@
+{
+  "test_id": "tests/integration/inference/test_openai_completion.py::test_inference_store[client_with_models-txt=ollama/llama3.2:3b-instruct-fp16-True]",
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:11434/api/ps",
+    "headers": {},
+    "body": {},
+    "endpoint": "/api/ps",
+    "model": ""
+  },
+  "response": {
+    "body": {
+      "__type__": "ollama._types.ProcessResponse",
+      "__data__": {
+        "models": [
+          {
+            "model": "llama3.2:3b-instruct-fp16",
+            "name": "llama3.2:3b-instruct-fp16",
+            "digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d",
+            "expires_at": "2025-10-03T20:07:02.652559-07:00",
+            "size": 7919570944,
+            "size_vram": 7919570944,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "3.2B",
+              "quantization_level": "F16"
+            },
+            "context_length": 4096
+          },
+          {
+            "model": "all-minilm:l6-v2",
+            "name": "all-minilm:l6-v2",
+            "digest": "1b226e2802dbb772b5fc32a58f103ca1804ef7501331012de126ab22f67475ef",
+            "expires_at": "2025-10-03T20:06:55.533878-07:00",
+            "size": 585846784,
+            "size_vram": 585846784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "bert",
+              "families": [
+                "bert"
+              ],
+              "parameter_size": "23M",
+              "quantization_level": "F16"
+            },
+            "context_length": 256
+          },
+          {
+            "model": "llama-guard3:1b",
+            "name": "llama-guard3:1b",
+            "digest": "494147e06bf99e10dbe67b63a07ac81c162f18ef3341aa3390007ac828571b3b",
+            "expires_at": "2025-10-03T20:06:23.356289-07:00",
+            "size": 2350966784,
+            "size_vram": 2350966784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "1.5B",
+              "quantization_level": "Q8_0"
+            },
+            "context_length": 4096
+          }
+        ]
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/inference/recordings/2825486bdbce4722bbc4a993f9f39e820c16bc8d324dc598be1224dad079ebb7.json
+++ b/tests/integration/inference/recordings/2825486bdbce4722bbc4a993f9f39e820c16bc8d324dc598be1224dad079ebb7.json
--- a/tests/integration/inference/recordings/28b529023374d3345f46c0b183ee352ff5014ff344238bd40b820c61510097eb.json
+++ b/tests/integration/inference/recordings/28b529023374d3345f46c0b183ee352ff5014ff344238bd40b820c61510097eb.json
@ -0,0 +1,78 @@
+{
+  "test_id": "tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[client_with_models-txt=ollama/llama3.2:3b-instruct-fp16-inference:chat_completion:non_streaming_01]",
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:11434/api/ps",
+    "headers": {},
+    "body": {},
+    "endpoint": "/api/ps",
+    "model": ""
+  },
+  "response": {
+    "body": {
+      "__type__": "ollama._types.ProcessResponse",
+      "__data__": {
+        "models": [
+          {
+            "model": "llama3.2:3b-instruct-fp16",
+            "name": "llama3.2:3b-instruct-fp16",
+            "digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d",
+            "expires_at": "2025-10-03T20:06:59.383555-07:00",
+            "size": 7919570944,
+            "size_vram": 7919570944,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "3.2B",
+              "quantization_level": "F16"
+            },
+            "context_length": 4096
+          },
+          {
+            "model": "all-minilm:l6-v2",
+            "name": "all-minilm:l6-v2",
+            "digest": "1b226e2802dbb772b5fc32a58f103ca1804ef7501331012de126ab22f67475ef",
+            "expires_at": "2025-10-03T20:06:55.533878-07:00",
+            "size": 585846784,
+            "size_vram": 585846784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "bert",
+              "families": [
+                "bert"
+              ],
+              "parameter_size": "23M",
+              "quantization_level": "F16"
+            },
+            "context_length": 256
+          },
+          {
+            "model": "llama-guard3:1b",
+            "name": "llama-guard3:1b",
+            "digest": "494147e06bf99e10dbe67b63a07ac81c162f18ef3341aa3390007ac828571b3b",
+            "expires_at": "2025-10-03T20:06:23.356289-07:00",
+            "size": 2350966784,
+            "size_vram": 2350966784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "1.5B",
+              "quantization_level": "Q8_0"
+            },
+            "context_length": 4096
+          }
+        ]
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/inference/recordings/28b74a89cdc85de62395836bb21baa777d3197245698f4a7531afa0a15c11d1c.json
+++ b/tests/integration/inference/recordings/28b74a89cdc85de62395836bb21baa777d3197245698f4a7531afa0a15c11d1c.json
@ -0,0 +1,78 @@
+{
+  "test_id": "tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[client_with_models-txt=ollama/llama3.2:3b-instruct-fp16-inference:chat_completion:non_streaming_02]",
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:11434/api/ps",
+    "headers": {},
+    "body": {},
+    "endpoint": "/api/ps",
+    "model": ""
+  },
+  "response": {
+    "body": {
+      "__type__": "ollama._types.ProcessResponse",
+      "__data__": {
+        "models": [
+          {
+            "model": "llama3.2:3b-instruct-fp16",
+            "name": "llama3.2:3b-instruct-fp16",
+            "digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d",
+            "expires_at": "2025-10-03T20:07:04.627464-07:00",
+            "size": 7919570944,
+            "size_vram": 7919570944,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "3.2B",
+              "quantization_level": "F16"
+            },
+            "context_length": 4096
+          },
+          {
+            "model": "all-minilm:l6-v2",
+            "name": "all-minilm:l6-v2",
+            "digest": "1b226e2802dbb772b5fc32a58f103ca1804ef7501331012de126ab22f67475ef",
+            "expires_at": "2025-10-03T20:06:55.533878-07:00",
+            "size": 585846784,
+            "size_vram": 585846784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "bert",
+              "families": [
+                "bert"
+              ],
+              "parameter_size": "23M",
+              "quantization_level": "F16"
+            },
+            "context_length": 256
+          },
+          {
+            "model": "llama-guard3:1b",
+            "name": "llama-guard3:1b",
+            "digest": "494147e06bf99e10dbe67b63a07ac81c162f18ef3341aa3390007ac828571b3b",
+            "expires_at": "2025-10-03T20:06:23.356289-07:00",
+            "size": 2350966784,
+            "size_vram": 2350966784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "1.5B",
+              "quantization_level": "Q8_0"
+            },
+            "context_length": 4096
+          }
+        ]
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/inference/recordings/36968c3ac66be305df6bdac7b747612e4e6645cf3bfcec16f96a41397cbfd5aa.json
+++ b/tests/integration/inference/recordings/36968c3ac66be305df6bdac7b747612e4e6645cf3bfcec16f96a41397cbfd5aa.json
--- a/tests/integration/inference/recordings/37e10e6ac4dccb3a91333fcfe9a17806950b21da9470c98960495f3b9fe94ecd.json
+++ b/tests/integration/inference/recordings/37e10e6ac4dccb3a91333fcfe9a17806950b21da9470c98960495f3b9fe94ecd.json
--- a/tests/integration/inference/recordings/3d2d95c9e7f6a2e434c69fb2d5ff50a3cce15ef5f308d5443beebd71a7dc7818.json
+++ b/tests/integration/inference/recordings/3d2d95c9e7f6a2e434c69fb2d5ff50a3cce15ef5f308d5443beebd71a7dc7818.json
--- a/tests/integration/inference/recordings/4b67443597195a9092b2b27b1dff526263df8ab267f4a9df7196dcb6c11aef17.json
+++ b/tests/integration/inference/recordings/4b67443597195a9092b2b27b1dff526263df8ab267f4a9df7196dcb6c11aef17.json
@ -0,0 +1,78 @@
+{
+  "test_id": "tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=ollama/llama3.2:3b-instruct-fp16-True]",
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:11434/api/ps",
+    "headers": {},
+    "body": {},
+    "endpoint": "/api/ps",
+    "model": ""
+  },
+  "response": {
+    "body": {
+      "__type__": "ollama._types.ProcessResponse",
+      "__data__": {
+        "models": [
+          {
+            "model": "llama3.2:3b-instruct-fp16",
+            "name": "llama3.2:3b-instruct-fp16",
+            "digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d",
+            "expires_at": "2025-10-03T20:07:03.361039-07:00",
+            "size": 7919570944,
+            "size_vram": 7919570944,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "3.2B",
+              "quantization_level": "F16"
+            },
+            "context_length": 4096
+          },
+          {
+            "model": "all-minilm:l6-v2",
+            "name": "all-minilm:l6-v2",
+            "digest": "1b226e2802dbb772b5fc32a58f103ca1804ef7501331012de126ab22f67475ef",
+            "expires_at": "2025-10-03T20:06:55.533878-07:00",
+            "size": 585846784,
+            "size_vram": 585846784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "bert",
+              "families": [
+                "bert"
+              ],
+              "parameter_size": "23M",
+              "quantization_level": "F16"
+            },
+            "context_length": 256
+          },
+          {
+            "model": "llama-guard3:1b",
+            "name": "llama-guard3:1b",
+            "digest": "494147e06bf99e10dbe67b63a07ac81c162f18ef3341aa3390007ac828571b3b",
+            "expires_at": "2025-10-03T20:06:23.356289-07:00",
+            "size": 2350966784,
+            "size_vram": 2350966784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "1.5B",
+              "quantization_level": "Q8_0"
+            },
+            "context_length": 4096
+          }
+        ]
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/inference/recordings/53f557a067b8f00b81dee1d0f8def8051f330d7a91faa68addc39f9b29e5dbbd.json
+++ b/tests/integration/inference/recordings/53f557a067b8f00b81dee1d0f8def8051f330d7a91faa68addc39f9b29e5dbbd.json
--- a/tests/integration/inference/recordings/54606aca0c734fe36c3fcba632a36aa4582c2c92af953638d6e1ae060c01d71e.json
+++ b/tests/integration/inference/recordings/54606aca0c734fe36c3fcba632a36aa4582c2c92af953638d6e1ae060c01d71e.json
@ -0,0 +1,78 @@
+{
+  "test_id": "tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=ollama/llama3.2:3b-instruct-fp16-False]",
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:11434/api/ps",
+    "headers": {},
+    "body": {},
+    "endpoint": "/api/ps",
+    "model": ""
+  },
+  "response": {
+    "body": {
+      "__type__": "ollama._types.ProcessResponse",
+      "__data__": {
+        "models": [
+          {
+            "model": "llama3.2:3b-instruct-fp16",
+            "name": "llama3.2:3b-instruct-fp16",
+            "digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d",
+            "expires_at": "2025-10-03T20:07:08.369100-07:00",
+            "size": 7919570944,
+            "size_vram": 7919570944,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "3.2B",
+              "quantization_level": "F16"
+            },
+            "context_length": 4096
+          },
+          {
+            "model": "all-minilm:l6-v2",
+            "name": "all-minilm:l6-v2",
+            "digest": "1b226e2802dbb772b5fc32a58f103ca1804ef7501331012de126ab22f67475ef",
+            "expires_at": "2025-10-03T20:06:55.533878-07:00",
+            "size": 585846784,
+            "size_vram": 585846784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "bert",
+              "families": [
+                "bert"
+              ],
+              "parameter_size": "23M",
+              "quantization_level": "F16"
+            },
+            "context_length": 256
+          },
+          {
+            "model": "llama-guard3:1b",
+            "name": "llama-guard3:1b",
+            "digest": "494147e06bf99e10dbe67b63a07ac81c162f18ef3341aa3390007ac828571b3b",
+            "expires_at": "2025-10-03T20:06:23.356289-07:00",
+            "size": 2350966784,
+            "size_vram": 2350966784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "1.5B",
+              "quantization_level": "Q8_0"
+            },
+            "context_length": 4096
+          }
+        ]
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/inference/recordings/640984c9321e94cb3c6d24ab4af1e83e628af419d09d77948f2480651fa472f2.json
+++ b/tests/integration/inference/recordings/640984c9321e94cb3c6d24ab4af1e83e628af419d09d77948f2480651fa472f2.json
@ -0,0 +1,78 @@
+{
+  "test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_different_inputs_different_outputs[llama_stack_client-emb=ollama/all-minilm:l6-v2]",
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:11434/api/ps",
+    "headers": {},
+    "body": {},
+    "endpoint": "/api/ps",
+    "model": ""
+  },
+  "response": {
+    "body": {
+      "__type__": "ollama._types.ProcessResponse",
+      "__data__": {
+        "models": [
+          {
+            "model": "all-minilm:l6-v2",
+            "name": "all-minilm:l6-v2",
+            "digest": "1b226e2802dbb772b5fc32a58f103ca1804ef7501331012de126ab22f67475ef",
+            "expires_at": "2025-10-03T20:06:55.330036-07:00",
+            "size": 585846784,
+            "size_vram": 585846784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "bert",
+              "families": [
+                "bert"
+              ],
+              "parameter_size": "23M",
+              "quantization_level": "F16"
+            },
+            "context_length": 256
+          },
+          {
+            "model": "llama3.2:3b-instruct-fp16",
+            "name": "llama3.2:3b-instruct-fp16",
+            "digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d",
+            "expires_at": "2025-10-03T20:06:54.261283-07:00",
+            "size": 7919570944,
+            "size_vram": 7919570944,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "3.2B",
+              "quantization_level": "F16"
+            },
+            "context_length": 4096
+          },
+          {
+            "model": "llama-guard3:1b",
+            "name": "llama-guard3:1b",
+            "digest": "494147e06bf99e10dbe67b63a07ac81c162f18ef3341aa3390007ac828571b3b",
+            "expires_at": "2025-10-03T20:06:23.356289-07:00",
+            "size": 2350966784,
+            "size_vram": 2350966784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "1.5B",
+              "quantization_level": "Q8_0"
+            },
+            "context_length": 4096
+          }
+        ]
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/inference/recordings/6626c46dfaf503c589013d6396a3e6871117c7c024d37f16a8d0b2a7def9f8ed.json
+++ b/tests/integration/inference/recordings/6626c46dfaf503c589013d6396a3e6871117c7c024d37f16a8d0b2a7def9f8ed.json
@ -0,0 +1,40 @@
+{
+  "test_id": "tests/integration/inference/test_openai_completion.py::test_openai_completion_non_streaming_suffix[txt=ollama/llama3.2:3b-instruct-fp16-inference:completion:suffix]",
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:11434/api/ps",
+    "headers": {},
+    "body": {},
+    "endpoint": "/api/ps",
+    "model": ""
+  },
+  "response": {
+    "body": {
+      "__type__": "ollama._types.ProcessResponse",
+      "__data__": {
+        "models": [
+          {
+            "model": "llama3.2:3b-instruct-fp16",
+            "name": "llama3.2:3b-instruct-fp16",
+            "digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d",
+            "expires_at": "2025-10-03T20:05:44.851615-07:00",
+            "size": 7919570944,
+            "size_vram": 7919570944,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "3.2B",
+              "quantization_level": "F16"
+            },
+            "context_length": 4096
+          }
+        ]
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/inference/recordings/676c9e0736ad6864480d4a884e157e9a9ceb4e5f9441635b19809ed518542406.json
+++ b/tests/integration/inference/recordings/676c9e0736ad6864480d4a884e157e9a9ceb4e5f9441635b19809ed518542406.json
@ -0,0 +1,40 @@
+{
+  "test_id": "tests/integration/inference/test_openai_completion.py::test_openai_completion_streaming[txt=ollama/llama3.2:3b-instruct-fp16-inference:completion:sanity]",
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:11434/api/ps",
+    "headers": {},
+    "body": {},
+    "endpoint": "/api/ps",
+    "model": ""
+  },
+  "response": {
+    "body": {
+      "__type__": "ollama._types.ProcessResponse",
+      "__data__": {
+        "models": [
+          {
+            "model": "llama3.2:3b-instruct-fp16",
+            "name": "llama3.2:3b-instruct-fp16",
+            "digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d",
+            "expires_at": "2025-10-03T20:05:44.851615-07:00",
+            "size": 7919570944,
+            "size_vram": 7919570944,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "3.2B",
+              "quantization_level": "F16"
+            },
+            "context_length": 4096
+          }
+        ]
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/inference/recordings/68a0a14e828ca575c2150f6de6961bf8d326b1ce1beb6902a7aa369b07f7e17a.json
+++ b/tests/integration/inference/recordings/68a0a14e828ca575c2150f6de6961bf8d326b1ce1beb6902a7aa369b07f7e17a.json
@ -0,0 +1,78 @@
+{
+  "test_id": "tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[client_with_models-txt=ollama/llama3.2:3b-instruct-fp16-inference:chat_completion:streaming_01]",
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:11434/api/ps",
+    "headers": {},
+    "body": {},
+    "endpoint": "/api/ps",
+    "model": ""
+  },
+  "response": {
+    "body": {
+      "__type__": "ollama._types.ProcessResponse",
+      "__data__": {
+        "models": [
+          {
+            "model": "llama3.2:3b-instruct-fp16",
+            "name": "llama3.2:3b-instruct-fp16",
+            "digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d",
+            "expires_at": "2025-10-03T20:07:02.652559-07:00",
+            "size": 7919570944,
+            "size_vram": 7919570944,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "3.2B",
+              "quantization_level": "F16"
+            },
+            "context_length": 4096
+          },
+          {
+            "model": "all-minilm:l6-v2",
+            "name": "all-minilm:l6-v2",
+            "digest": "1b226e2802dbb772b5fc32a58f103ca1804ef7501331012de126ab22f67475ef",
+            "expires_at": "2025-10-03T20:06:55.533878-07:00",
+            "size": 585846784,
+            "size_vram": 585846784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "bert",
+              "families": [
+                "bert"
+              ],
+              "parameter_size": "23M",
+              "quantization_level": "F16"
+            },
+            "context_length": 256
+          },
+          {
+            "model": "llama-guard3:1b",
+            "name": "llama-guard3:1b",
+            "digest": "494147e06bf99e10dbe67b63a07ac81c162f18ef3341aa3390007ac828571b3b",
+            "expires_at": "2025-10-03T20:06:23.356289-07:00",
+            "size": 2350966784,
+            "size_vram": 2350966784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "1.5B",
+              "quantization_level": "Q8_0"
+            },
+            "context_length": 4096
+          }
+        ]
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/inference/recordings/6ea16771b09b4966fa1dfb0349a83f6e11a6716cf3bd9d6e711ed7497232c152.json
+++ b/tests/integration/inference/recordings/6ea16771b09b4966fa1dfb0349a83f6e11a6716cf3bd9d6e711ed7497232c152.json
@ -1,4 +1,5 @@
 {
+  "test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_with_encoding_format_float[llama_stack_client-emb=ollama/all-minilm:l6-v2]",
  "request": {
    "method": "POST",
    "url": "http://0.0.0.0:11434/v1/v1/embeddings",
--- a/tests/integration/inference/recordings/7794d17cdf3672c5508ce0b07b0622513134f71115f13dd5e65ddae9f2b54862.json
+++ b/tests/integration/inference/recordings/7794d17cdf3672c5508ce0b07b0622513134f71115f13dd5e65ddae9f2b54862.json
--- a/tests/integration/inference/recordings/78aa717a1d0a499a1cc065e95896b63af46cc13f5a6da2346502a16d55aab891.json
+++ b/tests/integration/inference/recordings/78aa717a1d0a499a1cc065e95896b63af46cc13f5a6da2346502a16d55aab891.json
@ -0,0 +1,78 @@
+{
+  "test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_base64_batch_processing[llama_stack_client-emb=ollama/all-minilm:l6-v2]",
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:11434/api/ps",
+    "headers": {},
+    "body": {},
+    "endpoint": "/api/ps",
+    "model": ""
+  },
+  "response": {
+    "body": {
+      "__type__": "ollama._types.ProcessResponse",
+      "__data__": {
+        "models": [
+          {
+            "model": "all-minilm:l6-v2",
+            "name": "all-minilm:l6-v2",
+            "digest": "1b226e2802dbb772b5fc32a58f103ca1804ef7501331012de126ab22f67475ef",
+            "expires_at": "2025-10-03T20:06:55.533878-07:00",
+            "size": 585846784,
+            "size_vram": 585846784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "bert",
+              "families": [
+                "bert"
+              ],
+              "parameter_size": "23M",
+              "quantization_level": "F16"
+            },
+            "context_length": 256
+          },
+          {
+            "model": "llama3.2:3b-instruct-fp16",
+            "name": "llama3.2:3b-instruct-fp16",
+            "digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d",
+            "expires_at": "2025-10-03T20:06:54.261283-07:00",
+            "size": 7919570944,
+            "size_vram": 7919570944,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "3.2B",
+              "quantization_level": "F16"
+            },
+            "context_length": 4096
+          },
+          {
+            "model": "llama-guard3:1b",
+            "name": "llama-guard3:1b",
+            "digest": "494147e06bf99e10dbe67b63a07ac81c162f18ef3341aa3390007ac828571b3b",
+            "expires_at": "2025-10-03T20:06:23.356289-07:00",
+            "size": 2350966784,
+            "size_vram": 2350966784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "1.5B",
+              "quantization_level": "Q8_0"
+            },
+            "context_length": 4096
+          }
+        ]
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/inference/recordings/7b870ce8970377661fbc70050c35a7325434b30be994cd3ade1ad36aabc027e9.json
+++ b/tests/integration/inference/recordings/7b870ce8970377661fbc70050c35a7325434b30be994cd3ade1ad36aabc027e9.json
--- a/tests/integration/inference/recordings/803b63dc14be19794ed3eda644b58c1ddfee8dbd1c3f7e18b21ada4564d46f35.json
+++ b/tests/integration/inference/recordings/803b63dc14be19794ed3eda644b58c1ddfee8dbd1c3f7e18b21ada4564d46f35.json
@ -0,0 +1,40 @@
+{
+  "test_id": "tests/integration/inference/test_openai_completion.py::test_openai_completion_guided_choice[txt=ollama/llama3.2:3b-instruct-fp16]",
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:11434/api/ps",
+    "headers": {},
+    "body": {},
+    "endpoint": "/api/ps",
+    "model": ""
+  },
+  "response": {
+    "body": {
+      "__type__": "ollama._types.ProcessResponse",
+      "__data__": {
+        "models": [
+          {
+            "model": "llama3.2:3b-instruct-fp16",
+            "name": "llama3.2:3b-instruct-fp16",
+            "digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d",
+            "expires_at": "2025-10-03T20:05:47.301578-07:00",
+            "size": 7919570944,
+            "size_vram": 7919570944,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "3.2B",
+              "quantization_level": "F16"
+            },
+            "context_length": 4096
+          }
+        ]
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/inference/recordings/822c192e060366dd978158aebff1c2dcdb9ce6c923a245db07314151e9d397c2.json
+++ b/tests/integration/inference/recordings/822c192e060366dd978158aebff1c2dcdb9ce6c923a245db07314151e9d397c2.json
@ -0,0 +1,78 @@
+{
+  "test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_invalid_model_error[llama_stack_client-emb=ollama/all-minilm:l6-v2]",
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:11434/api/ps",
+    "headers": {},
+    "body": {},
+    "endpoint": "/api/ps",
+    "model": ""
+  },
+  "response": {
+    "body": {
+      "__type__": "ollama._types.ProcessResponse",
+      "__data__": {
+        "models": [
+          {
+            "model": "all-minilm:l6-v2",
+            "name": "all-minilm:l6-v2",
+            "digest": "1b226e2802dbb772b5fc32a58f103ca1804ef7501331012de126ab22f67475ef",
+            "expires_at": "2025-10-03T20:06:55.330036-07:00",
+            "size": 585846784,
+            "size_vram": 585846784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "bert",
+              "families": [
+                "bert"
+              ],
+              "parameter_size": "23M",
+              "quantization_level": "F16"
+            },
+            "context_length": 256
+          },
+          {
+            "model": "llama3.2:3b-instruct-fp16",
+            "name": "llama3.2:3b-instruct-fp16",
+            "digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d",
+            "expires_at": "2025-10-03T20:06:54.261283-07:00",
+            "size": 7919570944,
+            "size_vram": 7919570944,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "3.2B",
+              "quantization_level": "F16"
+            },
+            "context_length": 4096
+          },
+          {
+            "model": "llama-guard3:1b",
+            "name": "llama-guard3:1b",
+            "digest": "494147e06bf99e10dbe67b63a07ac81c162f18ef3341aa3390007ac828571b3b",
+            "expires_at": "2025-10-03T20:06:23.356289-07:00",
+            "size": 2350966784,
+            "size_vram": 2350966784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "1.5B",
+              "quantization_level": "Q8_0"
+            },
+            "context_length": 4096
+          }
+        ]
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/inference/recordings/834be37d56251727f6bdcafeafbc3846670d41955cc46a693767cbc67bed9ae2.json
+++ b/tests/integration/inference/recordings/834be37d56251727f6bdcafeafbc3846670d41955cc46a693767cbc67bed9ae2.json
@ -0,0 +1,78 @@
+{
+  "test_id": "tests/integration/inference/test_openai_embeddings.py::test_openai_embeddings_with_encoding_format_float[llama_stack_client-emb=ollama/all-minilm:l6-v2]",
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:11434/api/ps",
+    "headers": {},
+    "body": {},
+    "endpoint": "/api/ps",
+    "model": ""
+  },
+  "response": {
+    "body": {
+      "__type__": "ollama._types.ProcessResponse",
+      "__data__": {
+        "models": [
+          {
+            "model": "all-minilm:l6-v2",
+            "name": "all-minilm:l6-v2",
+            "digest": "1b226e2802dbb772b5fc32a58f103ca1804ef7501331012de126ab22f67475ef",
+            "expires_at": "2025-10-03T20:06:55.077957-07:00",
+            "size": 585846784,
+            "size_vram": 585846784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "bert",
+              "families": [
+                "bert"
+              ],
+              "parameter_size": "23M",
+              "quantization_level": "F16"
+            },
+            "context_length": 256
+          },
+          {
+            "model": "llama3.2:3b-instruct-fp16",
+            "name": "llama3.2:3b-instruct-fp16",
+            "digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d",
+            "expires_at": "2025-10-03T20:06:54.261283-07:00",
+            "size": 7919570944,
+            "size_vram": 7919570944,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "3.2B",
+              "quantization_level": "F16"
+            },
+            "context_length": 4096
+          },
+          {
+            "model": "llama-guard3:1b",
+            "name": "llama-guard3:1b",
+            "digest": "494147e06bf99e10dbe67b63a07ac81c162f18ef3341aa3390007ac828571b3b",
+            "expires_at": "2025-10-03T20:06:23.356289-07:00",
+            "size": 2350966784,
+            "size_vram": 2350966784,
+            "details": {
+              "parent_model": "",
+              "format": "gguf",
+              "family": "llama",
+              "families": [
+                "llama"
+              ],
+              "parameter_size": "1.5B",
+              "quantization_level": "Q8_0"
+            },
+            "context_length": 4096
+          }
+        ]
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/inference/recordings/8823085c58bdb7dca853d7a661e51d80446d906528ee8f715d57f748fd533afc.json
+++ b/tests/integration/inference/recordings/8823085c58bdb7dca853d7a661e51d80446d906528ee8f715d57f748fd533afc.json
--- a/Show more
+++ b/Show more