Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-08-15 06:00:48 +00:00)

Merge 1f29aaa2e1 into 61582f327c
This commit is contained in commit 6a5dadc395
4 changed files with 134 additions and 43 deletions
@@ -52,9 +52,9 @@ runs:
       git add tests/integration/recordings/

       if [ "${{ inputs.run-vision-tests }}" == "true" ]; then
-        git commit -m "Recordings update from CI (vision)"
+        git commit -m "Recordings update from CI (vision) (${{ inputs.provider }})"
       else
-        git commit -m "Recordings update from CI"
+        git commit -m "Recordings update from CI (${{ inputs.provider }})"
       fi

       git fetch origin ${{ github.event.pull_request.head.ref }}
@@ -70,7 +70,8 @@ runs:
      if: ${{ always() }}
      shell: bash
      run: |
-       sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log || true
+       sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log 2>&1 || true
+       sudo docker logs vllm > vllm-${{ inputs.inference-mode }}.log 2>&1 || true

    - name: Upload logs
      if: ${{ always() }}
.github/workflows/integration-tests.yml (46 changes, vendored)
@@ -20,7 +20,6 @@ on:
   schedule:
     # If changing the cron schedule, update the provider in the test-matrix job
     - cron: '0 0 * * *' # (test latest client) Daily at 12 AM UTC
-    - cron: '1 0 * * 0' # (test vllm) Weekly on Sunday at 1 AM UTC
   workflow_dispatch:
     inputs:
       test-all-client-versions:
@@ -38,28 +37,7 @@ concurrency:
   cancel-in-progress: true

 jobs:
-  discover-tests:
-    runs-on: ubuntu-latest
-    outputs:
-      test-types: ${{ steps.generate-test-types.outputs.test-types }}
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
-      - name: Generate test types
-        id: generate-test-types
-        run: |
-          # Get test directories dynamically, excluding non-test directories
-          # NOTE: we are excluding post_training since the tests take too long
-          TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d |
-            sed 's|tests/integration/||' |
-            grep -Ev "^(__pycache__|fixtures|test_cases|recordings|non_ci|post_training)$" |
-            sort | jq -R -s -c 'split("\n")[:-1]')
-          echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT
-
   run-replay-mode-tests:
-    needs: discover-tests
     runs-on: ubuntu-latest
     name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, vision={4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.run-vision-tests) }}
@@ -68,11 +46,14 @@ jobs:
       matrix:
         client-type: [library, server]
         # Use vllm on weekly schedule, otherwise use test-provider input (defaults to ollama)
-        provider: ${{ (github.event.schedule == '1 0 * * 0') && fromJSON('["vllm"]') || fromJSON(format('["{0}"]', github.event.inputs.test-provider || 'ollama')) }}
+        provider: [ollama, vllm]
         # Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
         python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
         client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
         run-vision-tests: [true, false]
+        exclude:
+          - provider: vllm
+            run-vision-tests: true

     steps:
       - name: Checkout repository
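The replay matrix now enumerates both providers directly and prunes the unsupported vllm + vision combination with `exclude`. As a rough illustration of the combinations this yields, here is a Python sketch; the axis values are taken from the hunk above, the schedule-dependent axes are shown at their default single values, and the expansion logic only approximates how GitHub Actions builds a matrix:

```python
from itertools import product

# Axis values as they appear in the matrix above (schedule-dependent axes
# reduced to their default, non-scheduled values for this illustration).
axes = {
    "client-type": ["library", "server"],
    "provider": ["ollama", "vllm"],
    "python-version": ["3.12"],
    "client-version": ["latest"],
    "run-vision-tests": [True, False],
}
exclude = [{"provider": "vllm", "run-vision-tests": True}]


def expand(axes: dict, exclude: list) -> list:
    """Cross-product of every axis, minus any combination matching an exclude entry."""
    combos = [dict(zip(axes.keys(), values)) for values in product(*axes.values())]
    return [c for c in combos if not any(all(c[k] == v for k, v in e.items()) for e in exclude)]


for job in expand(axes, exclude):
    print(job)
# 6 of the 8 raw (client-type, provider, vision) combinations survive;
# vision tests never run against vllm.
```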
@@ -87,10 +68,27 @@ jobs:
           run-vision-tests: ${{ matrix.run-vision-tests }}
           inference-mode: 'replay'

+      - name: Generate test types
+        id: generate-test-types
+        run: |
+          # Only run inference tests for vllm as these are more likely to exercise the vllm provider
+          # TODO: Add agent tests for vllm
+          if [ ${{ matrix.provider }} == "vllm" ]; then
+            echo "test-types=[\"inference\"]" >> $GITHUB_OUTPUT
+            exit 0
+          fi
+          # Get test directories dynamically, excluding non-test directories
+          # NOTE: we are excluding post_training since the tests take too long
+          TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d |
+            sed 's|tests/integration/||' |
+            grep -Ev "^(__pycache__|fixtures|test_cases|recordings|non_ci|post_training)$" |
+            sort | jq -R -s -c 'split("\n")[:-1]')
+          echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT
+
       - name: Run tests
         uses: ./.github/actions/run-and-record-tests
         with:
-          test-types: ${{ needs.discover-tests.outputs.test-types }}
+          test-types: ${{ steps.generate-test-types.outputs.test-types }}
           stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
           provider: ${{ matrix.provider }}
           inference-mode: 'replay'
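For reference, the `Generate test types` step above either hard-codes `["inference"]` for vllm or turns the `tests/integration` subdirectories into a compact JSON array via find/sed/grep/jq. A hedged Python equivalent of that pipeline; the directory names in the final comment are hypothetical:

```python
import json
import re
from pathlib import Path

# Mirrors the grep -Ev filter in the step above.
EXCLUDED = re.compile(r"^(__pycache__|fixtures|test_cases|recordings|non_ci|post_training)$")


def generate_test_types(provider: str, root: str = "tests/integration") -> str:
    # vllm only exercises the inference suite (the early-exit branch above).
    if provider == "vllm":
        return json.dumps(["inference"])
    # Everything else: immediate subdirectories, minus excluded names, sorted,
    # serialized as a compact JSON array -- the same shape jq -c produces.
    dirs = sorted(p.name for p in Path(root).iterdir() if p.is_dir() and not EXCLUDED.match(p.name))
    return json.dumps(dirs, separators=(",", ":"))


# generate_test_types("ollama") might yield '["agents","inference","safety"]'
# (hypothetical directory names); the value is written to $GITHUB_OUTPUT as test-types.
```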
.github/workflows/record-integration-tests.yml (35 changes, vendored)
@@ -15,12 +15,6 @@ on:
       - '.github/actions/setup-ollama/action.yml'
       - '.github/actions/setup-test-environment/action.yml'
       - '.github/actions/run-and-record-tests/action.yml'
-  workflow_dispatch:
-    inputs:
-      test-provider:
-        description: 'Test against a specific provider'
-        type: string
-        default: 'ollama'

 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
@@ -42,12 +36,6 @@ jobs:
       - name: Generate test types
         id: generate-test-types
         run: |
-          # Get test directories dynamically, excluding non-test directories
-          TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
-            grep -Ev "^(__pycache__|fixtures|test_cases|recordings|post_training)$" |
-            sort | jq -R -s -c 'split("\n")[:-1]')
-          echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT
-
           labels=$(gh pr view ${{ github.event.pull_request.number }} --json labels --jq '.labels[].name')
           echo "labels=$labels"
@@ -82,6 +70,10 @@ jobs:
       fail-fast: false
       matrix:
         mode: ${{ fromJSON(needs.discover-tests.outputs.matrix-modes) }}
+        provider: [ollama, vllm]
+        exclude:
+          - mode: vision
+            provider: vllm

     steps:
       - name: Checkout repository
@@ -90,20 +82,33 @@ jobs:
           ref: ${{ github.event.pull_request.head.ref }}
           fetch-depth: 0

+      - name: Generate test types
+        id: generate-test-types
+        run: |
+          if [ ${{ matrix.provider }} == "vllm" ]; then
+            echo "test-types=[\"inference\"]" >> $GITHUB_OUTPUT
+          else
+            # Get test directories dynamically, excluding non-test directories
+            TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
+              grep -Ev "^(__pycache__|fixtures|test_cases|recordings|non_ci|post_training)$" |
+              sort | jq -R -s -c 'split("\n")[:-1]')
+            echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT
+          fi

       - name: Setup test environment
         uses: ./.github/actions/setup-test-environment
         with:
           python-version: "3.12" # Use single Python version for recording
           client-version: "latest"
-          provider: ${{ inputs.test-provider || 'ollama' }}
+          provider: ${{ matrix.provider }}
           run-vision-tests: ${{ matrix.mode == 'vision' && 'true' || 'false' }}
           inference-mode: 'record'

       - name: Run and record tests
         uses: ./.github/actions/run-and-record-tests
         with:
-          test-types: ${{ needs.discover-tests.outputs.test-types }}
+          test-types: ${{ steps.generate-test-types.outputs.test-types }}
           stack-config: 'server:ci-tests' # recording must be done with server since more tests are run
-          provider: ${{ inputs.test-provider || 'ollama' }}
+          provider: ${{ matrix.provider }}
           inference-mode: 'record'
           run-vision-tests: ${{ matrix.mode == 'vision' && 'true' || 'false' }}
@@ -10,12 +10,15 @@ import hashlib
 import json
 import os
 import sqlite3
+import uuid
 from collections.abc import Generator
 from contextlib import contextmanager
 from enum import StrEnum
 from pathlib import Path
 from typing import Any, Literal, cast

+from openai.types.chat import ChatCompletion, ChatCompletionChunk
+
 from llama_stack.log import get_logger

 logger = get_logger(__name__, category="testing")
@@ -105,13 +108,29 @@ def _deserialize_response(data: dict[str, Any]) -> Any:
     try:
         # Import the original class and reconstruct the object
         module_path, class_name = data["__type__"].rsplit(".", 1)
+
+        # Handle generic types (e.g. AsyncPage[Model]) by removing the generic part
+        if "[" in class_name and "]" in class_name:
+            class_name = class_name.split("[")[0]
+
         module = __import__(module_path, fromlist=[class_name])
         cls = getattr(module, class_name)

         if not hasattr(cls, "model_validate"):
             raise ValueError(f"Pydantic class {cls} does not support model_validate?")

-        return cls.model_validate(data["__data__"])
+        # Special handling for AsyncPage - convert nested model dicts to proper model objects
+        validate_data = data["__data__"]
+        if class_name == "AsyncPage" and isinstance(validate_data, dict) and "data" in validate_data:
+            # Convert model dictionaries to objects with attributes so they work with .id access
+            from types import SimpleNamespace
+
+            validate_data = dict(validate_data)
+            validate_data["data"] = [
+                SimpleNamespace(**item) if isinstance(item, dict) else item for item in validate_data["data"]
+            ]
+
+        return cls.model_validate(validate_data)
     except (ImportError, AttributeError, TypeError, ValueError) as e:
         logger.warning(f"Failed to deserialize object of type {data['__type__']}: {e}")
         return data["__data__"]
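The new AsyncPage branch is needed because a recorded page stores its models as plain dicts, while replay-time callers expect attribute access such as `model.id`. A minimal standalone sketch of the same conversion, using a made-up recorded payload:

```python
from types import SimpleNamespace

# Hypothetical recorded body for an AsyncPage response: nested models are plain dicts.
recorded = {"data": [{"id": "llama3.2:3b", "object": "model"}, {"id": "all-minilm:latest", "object": "model"}]}

# Same transformation as above: wrap each dict so attribute access works after replay.
converted = dict(recorded)
converted["data"] = [SimpleNamespace(**item) if isinstance(item, dict) else item for item in converted["data"]]

for model in converted["data"]:
    print(model.id)  # attribute access now works, e.g. "llama3.2:3b"
```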
@@ -248,6 +267,20 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
         recording = _current_storage.find_recording(request_hash)
         if recording:
+            response_body = recording["response"]["body"]
+            if (
+                isinstance(response_body, list)
+                and len(response_body) > 0
+                and isinstance(response_body[0], ChatCompletionChunk)
+            ):
+                # We can't replay chatcompletions with the same id and we store them in a sqlite database with a unique constraint on the id.
+                # So we generate a new id and replace the old one.
+                newid = uuid.uuid4().hex
+                response_body[0].id = "chatcmpl-" + newid
+            elif isinstance(response_body, ChatCompletion):
+                # We can't replay chatcompletions with the same id and we store them in a sqlite database with a unique constraint on the id.
+                # So we generate a new id and replace the old one.
+                newid = uuid.uuid4().hex
+                response_body.id = "chatcmpl-" + newid
+
             if recording["response"].get("is_streaming", False):
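Per the comments above, recorded chat completions cannot be replayed with their original id because completions end up in a SQLite table with a unique constraint on the id, so every replay mints a fresh one. A small sketch of that rewrite; a plain dict stands in for the real ChatCompletion/ChatCompletionChunk object:

```python
import uuid


def fresh_completion_id() -> str:
    # Same scheme as the patched replay path: "chatcmpl-" plus a random hex uuid.
    return "chatcmpl-" + uuid.uuid4().hex


# Hypothetical replayed body; the real code mutates the deserialized
# ChatCompletion / ChatCompletionChunk object rather than a dict.
replayed = {"id": "chatcmpl-recorded123", "object": "chat.completion"}
replayed["id"] = fresh_completion_id()
print(replayed["id"])  # e.g. chatcmpl-3f9c... (unique on every replay)
```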
@@ -315,9 +348,11 @@ def patch_inference_clients():
     from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions
     from openai.resources.completions import AsyncCompletions
     from openai.resources.embeddings import AsyncEmbeddings
+    from openai.resources.models import AsyncModels

     # Store original methods for both OpenAI and Ollama clients
     _original_methods = {
+        "model_list": AsyncModels.list,
         "chat_completions_create": AsyncChatCompletions.create,
         "completions_create": AsyncCompletions.create,
         "embeddings_create": AsyncEmbeddings.create,
@@ -330,6 +365,55 @@ def patch_inference_clients():
     }

     # Create patched methods for OpenAI client
+    def patched_model_list(self, *args, **kwargs):
+        # The original models.list() returns an AsyncPaginator that can be used with async for
+        # We need to create a wrapper that preserves this behavior
+        class PatchedAsyncPaginator:
+            def __init__(self, original_method, instance, client_type, endpoint, args, kwargs):
+                self.original_method = original_method
+                self.instance = instance
+                self.client_type = client_type
+                self.endpoint = endpoint
+                self.args = args
+                self.kwargs = kwargs
+                self._result = None
+
+            def __await__(self):
+                # Make it awaitable like the original AsyncPaginator
+                async def _await():
+                    self._result = await _patched_inference_method(
+                        self.original_method, self.instance, self.client_type, self.endpoint, *self.args, **self.kwargs
+                    )
+                    return self._result
+
+                return _await().__await__()
+
+            def __aiter__(self):
+                # Make it async iterable like the original AsyncPaginator
+                return self
+
+            async def __anext__(self):
+                # Get the result if we haven't already
+                if self._result is None:
+                    self._result = await _patched_inference_method(
+                        self.original_method, self.instance, self.client_type, self.endpoint, *self.args, **self.kwargs
+                    )
+
+                # Initialize iteration on first call
+                if not hasattr(self, "_iter_index"):
+                    # Extract the data list from the result
+                    self._data_list = self._result.data
+                    self._iter_index = 0
+
+                # Return next item from the list
+                if self._iter_index >= len(self._data_list):
+                    raise StopAsyncIteration
+                item = self._data_list[self._iter_index]
+                self._iter_index += 1
+                return item
+
+        return PatchedAsyncPaginator(_original_methods["model_list"], self, "openai", "/v1/models", args, kwargs)
+
     async def patched_chat_completions_create(self, *args, **kwargs):
         return await _patched_inference_method(
             _original_methods["chat_completions_create"], self, "openai", "/v1/chat/completions", *args, **kwargs
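PatchedAsyncPaginator exists because the SDK's models.list() returns an AsyncPaginator that callers either await or iterate with async for; the wrapper has to support both so recording and replay stay transparent. A usage sketch of the two call patterns the patched method must keep working; the client endpoint and key here are illustrative placeholders:

```python
import asyncio

from openai import AsyncOpenAI


async def main() -> None:
    # Hypothetical local endpoint; any AsyncOpenAI client goes through the patched methods.
    client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="unused")

    # Pattern 1: await the paginator to get the full page, then read .data
    page = await client.models.list()
    print([m.id for m in page.data])

    # Pattern 2: iterate the paginator directly; __aiter__/__anext__ on the
    # wrapper keep this working against a replayed recording.
    async for model in client.models.list():
        print(model.id)


asyncio.run(main())
```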
@@ -346,6 +430,7 @@ def patch_inference_clients():
         )

     # Apply OpenAI patches
+    AsyncModels.list = patched_model_list
     AsyncChatCompletions.create = patched_chat_completions_create
     AsyncCompletions.create = patched_completions_create
     AsyncEmbeddings.create = patched_embeddings_create
@@ -402,8 +487,10 @@ def unpatch_inference_clients():
     from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions
     from openai.resources.completions import AsyncCompletions
     from openai.resources.embeddings import AsyncEmbeddings
+    from openai.resources.models import AsyncModels

     # Restore OpenAI client methods
+    AsyncModels.list = _original_methods["model_list"]
     AsyncChatCompletions.create = _original_methods["chat_completions_create"]
     AsyncCompletions.create = _original_methods["completions_create"]
     AsyncEmbeddings.create = _original_methods["embeddings_create"]