From 03d23db910a97fd5387d25652c3a2a31170b58d4 Mon Sep 17 00:00:00 2001
From: Derek Higgins
Date: Thu, 6 Nov 2025 15:59:55 +0000
Subject: [PATCH 1/2] ci: vllm ci job update (#4088)

Add missing recording for vllm in library mode
Add Docker env (missed during rebase)

Signed-off-by: Derek Higgins
---
 scripts/integration-tests.sh                  |   4 +
 ...62a706ebe85e2a5fe637ddad558cbaafe92d8.json | 103 ++++++++++++++++++
 2 files changed, 107 insertions(+)
 create mode 100644 tests/integration/inference/recordings/99bf0054f11a9c58c13a44f9cf962a706ebe85e2a5fe637ddad558cbaafe92d8.json

diff --git a/scripts/integration-tests.sh b/scripts/integration-tests.sh
index 2d088f3df..e21f73f99 100755
--- a/scripts/integration-tests.sh
+++ b/scripts/integration-tests.sh
@@ -353,6 +353,10 @@ if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
   [ -n "${OLLAMA_URL:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OLLAMA_URL=$OLLAMA_URL"
   [ -n "${SAFETY_MODEL:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e SAFETY_MODEL=$SAFETY_MODEL"
 
+  if [[ "$TEST_SETUP" == "vllm" ]]; then
+    DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e VLLM_URL=http://localhost:8000/v1"
+  fi
+
   # Determine the actual image name (may have localhost/ prefix)
   IMAGE_NAME=$(docker images --format "{{.Repository}}:{{.Tag}}" | grep "distribution-$DISTRO:dev$" | head -1)
   if [[ -z "$IMAGE_NAME" ]]; then
diff --git a/tests/integration/inference/recordings/99bf0054f11a9c58c13a44f9cf962a706ebe85e2a5fe637ddad558cbaafe92d8.json b/tests/integration/inference/recordings/99bf0054f11a9c58c13a44f9cf962a706ebe85e2a5fe637ddad558cbaafe92d8.json
new file mode 100644
index 000000000..250e91c68
--- /dev/null
+++ b/tests/integration/inference/recordings/99bf0054f11a9c58c13a44f9cf962a706ebe85e2a5fe637ddad558cbaafe92d8.json
@@ -0,0 +1,103 @@
+{
+  "test_id": "tests/integration/inference/test_tools_with_schemas.py::TestMCPToolsInChatCompletion::test_mcp_tools_in_inference[txt=vllm/Qwen/Qwen3-0.6B]",
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:8000/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "Qwen/Qwen3-0.6B",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Calculate 5 + 3"
+        }
+      ],
+      "max_tokens": 4096,
+      "tools": [
+        {
+          "type": "function",
+          "function": {
+            "name": "calculate",
+            "description": "",
+            "parameters": {
+              "properties": {
+                "x": {
+                  "title": "X",
+                  "type": "number"
+                },
+                "y": {
+                  "title": "Y",
+                  "type": "number"
+                },
+                "operation": {
+                  "title": "Operation",
+                  "type": "string"
+                }
+              },
+              "required": [
+                "x",
+                "y",
+                "operation"
+              ],
+              "title": "calculateArguments",
+              "type": "object"
+            }
+          }
+        }
+      ]
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "Qwen/Qwen3-0.6B"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "rec-99bf0054f11a",
+        "choices": [
+          {
+            "finish_reason": "tool_calls",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "<think>\nOkay, the user wants to calculate 5 plus 3. Let me check the tools provided. The only function available is 'calculate', which requires x, y, and operation. The parameters are numbers and an operation. The user input is straightforward: 5 + 3. So I need to call the 'calculate' function with x=5, y=3, and operation='+'. That should give the correct result. I don't see any other parameters needed here. Just make sure the JSON is correctly formatted with the required fields.\n</think>\n\n",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": null,
+              "audio": null,
+              "function_call": null,
+              "tool_calls": [
+                {
+                  "id": "chatcmpl-tool-6d1a92899a8246bb8fae5682dc08590c",
+                  "function": {
+                    "arguments": "{\"x\": 5, \"y\": 3, \"operation\": \"+\"}",
+                    "name": "calculate"
+                  },
+                  "type": "function"
+                }
+              ],
+              "reasoning_content": null
+            },
+            "stop_reason": null
+          }
+        ],
+        "created": 0,
+        "model": "Qwen/Qwen3-0.6B",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": null,
+        "usage": {
+          "completion_tokens": 144,
+          "prompt_tokens": 193,
+          "total_tokens": 337,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null
+        },
+        "prompt_logprobs": null,
+        "kv_transfer_params": null
+      }
+    },
+    "is_streaming": false
+  },
+  "id_normalization_mapping": {}
+}

From dc9497a3b245769e3caf615e34d28b014ed7e0f0 Mon Sep 17 00:00:00 2001
From: Derek Higgins
Date: Thu, 6 Nov 2025 16:53:02 +0000
Subject: [PATCH 2/2] ci: Temporarily disable Telemetry during tests (#4090)

Closes: #4089

Signed-off-by: Derek Higgins
---
 scripts/integration-tests.sh                    | 6 ++++--
 tests/integration/telemetry/test_completions.py | 5 +++++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/scripts/integration-tests.sh b/scripts/integration-tests.sh
index e21f73f99..0951feb14 100755
--- a/scripts/integration-tests.sh
+++ b/scripts/integration-tests.sh
@@ -231,7 +231,8 @@ if [[ "$STACK_CONFIG" == *"server:"* && "$COLLECT_ONLY" == false ]]; then
   # Use a fixed port for the OTEL collector so the server can connect to it
   COLLECTOR_PORT=4317
   export LLAMA_STACK_TEST_COLLECTOR_PORT="${COLLECTOR_PORT}"
-  export OTEL_EXPORTER_OTLP_ENDPOINT="http://127.0.0.1:${COLLECTOR_PORT}"
+  # Disabled: https://github.com/llamastack/llama-stack/issues/4089
+  #export OTEL_EXPORTER_OTLP_ENDPOINT="http://127.0.0.1:${COLLECTOR_PORT}"
   export OTEL_EXPORTER_OTLP_PROTOCOL="http/protobuf"
   export OTEL_BSP_SCHEDULE_DELAY="200"
   export OTEL_BSP_EXPORT_TIMEOUT="2000"
@@ -337,7 +338,8 @@ if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
   DOCKER_ENV_VARS=""
   DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_INFERENCE_MODE=$INFERENCE_MODE"
   DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_STACK_CONFIG_TYPE=server"
-  DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:${COLLECTOR_PORT}"
+  # Disabled: https://github.com/llamastack/llama-stack/issues/4089
+  #DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:${COLLECTOR_PORT}"
   DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_METRIC_EXPORT_INTERVAL=200"
   DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_BSP_SCHEDULE_DELAY=200"
   DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_BSP_EXPORT_TIMEOUT=2000"
diff --git a/tests/integration/telemetry/test_completions.py b/tests/integration/telemetry/test_completions.py
index 2b8835f6c..af073d8bc 100644
--- a/tests/integration/telemetry/test_completions.py
+++ b/tests/integration/telemetry/test_completions.py
@@ -12,9 +12,13 @@ before and after each test, ensuring test isolation.
 
 import json
+import pytest
+
 
 
 def test_streaming_chunk_count(mock_otlp_collector, llama_stack_client, text_model_id):
     """Verify streaming adds chunk_count and __type__=async_generator."""
+
+    pytest.skip("Disabled: See https://github.com/llamastack/llama-stack/issues/4089")
     stream = llama_stack_client.chat.completions.create(
         model=text_model_id,
         messages=[{"role": "user", "content": "Test trace openai 1"}],
@@ -50,6 +54,7 @@ def test_streaming_chunk_count(mock_otlp_collector, llama_stack_client, text_mod
 
 def test_telemetry_format_completeness(mock_otlp_collector, llama_stack_client, text_model_id):
     """Comprehensive validation of telemetry data format including spans and metrics."""
+    pytest.skip("Disabled: See https://github.com/llamastack/llama-stack/issues/4089")
     response = llama_stack_client.chat.completions.create(
         model=text_model_id,
         messages=[{"role": "user", "content": "Test trace openai with temperature 0.7"}],