Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-12-23 00:32:26 +00:00
fake mode
# What does this PR do?
Adds a fake inference mode that generates synthetic OpenAI-style responses locally, so inference providers can be exercised without making real API calls.

## Test Plan
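A quick sanity check (a sketch written against this diff, not a recorded run; it assumes the module is importable from the path shown below):

```python
import os

from llama_stack.testing.fake_responses import (
    FakeConfig,
    generate_fake_chat_completion,
    parse_fake_config,
)

# Configure fake mode via the environment variable parsed by parse_fake_config().
os.environ["LLAMA_STACK_TEST_INFERENCE_MODE"] = "fake:response_length=20:latency_ms=1"
cfg = parse_fake_config()
assert cfg == FakeConfig(response_length=20, latency_ms=1)

# The fake completion honors the configured response length.
body = {"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "hello"}]}
completion = generate_fake_chat_completion(body, cfg)
assert completion.usage.prompt_tokens == 1
assert completion.usage.completion_tokens == 20
```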
This commit is contained in:
parent
05cfa213b6
commit
12e46b7a4a
3 changed files with 484 additions and 8 deletions
256
llama_stack/testing/fake_responses.py
Normal file
@@ -0,0 +1,256 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""
Fake response generation for testing inference providers without making real API calls.
"""

import asyncio
import os
import time
from collections.abc import AsyncGenerator
from typing import Any

from openai.types.chat import ChatCompletion
from pydantic import BaseModel


class FakeConfig(BaseModel):
    response_length: int = 100
    latency_ms: int = 50


def parse_fake_config() -> FakeConfig:
    """Parse fake mode configuration from environment variable."""
    mode_str = os.environ.get("LLAMA_STACK_TEST_INFERENCE_MODE", "live").lower()
    config = {}

    if ":" in mode_str:
        parts = mode_str.split(":")
        for part in parts[1:]:
            if "=" in part:
                key, value = part.split("=", 1)
                config[key] = int(value)
    return FakeConfig(**config)

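
# Usage sketch (illustrative; not part of the original diff):
#
#   os.environ["LLAMA_STACK_TEST_INFERENCE_MODE"] = "fake:response_length=120:latency_ms=10"
#   cfg = parse_fake_config()
#   assert cfg.response_length == 120 and cfg.latency_ms == 10
#
# A bare "fake" (no ":" options) falls back to the defaults defined on FakeConfig.
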
def generate_fake_content(word_count: int) -> str:
    """Generate fake response content with specified word count."""
    words = [
        "This", "is", "a", "synthetic", "response", "generated", "for",
        "testing", "purposes", "only", "The", "content", "simulates",
        "realistic", "language", "model", "output", "patterns", "and",
        "structures", "It", "includes", "various", "sentence", "types",
        "and", "maintains", "coherent", "flow", "throughout", "These",
        "responses", "help", "test", "system", "performance", "without",
        "requiring", "real", "model", "calls",
    ]

    return " ".join(words[i % len(words)] for i in range(word_count)) + "."

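
# Example (derived from the word list above; not part of the original diff):
#
#   generate_fake_content(5)  # -> "This is a synthetic response."
#
# Requests longer than the 41-word list wrap around via the modulo index.
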
def generate_fake_chat_completion(body: dict[str, Any], config: FakeConfig) -> Any:
    """Generate fake OpenAI chat completion response."""
    model = body.get("model", "gpt-3.5-turbo")
    messages = body.get("messages", [])

    # Calculate fake token counts based on input
    prompt_tokens = 0
    for msg in messages:
        content = msg.get("content", "")
        if isinstance(content, str):
            prompt_tokens += len(content.split())
        elif isinstance(content, list):
            # Handle content arrays (images, etc.)
            for item in content:
                if isinstance(item, dict) and item.get("type") == "text":
                    prompt_tokens += len(item.get("text", "").split())

    response_length = config.response_length
    fake_content = generate_fake_content(response_length)
    completion_tokens = len(fake_content.split())

    response_data = {
        "id": f"chatcmpl-fake-{int(time.time() * 1000)}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model,
        "choices": [
            {
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": fake_content,
                    "function_call": None,
                    "tool_calls": None,
                },
                "finish_reason": "stop",
                "logprobs": None,
            }
        ],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        },
        "system_fingerprint": None,
    }
    # Simulate provider latency before returning the validated response
    time.sleep(config.latency_ms / 1000.0)

    return ChatCompletion.model_validate(response_data)

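
# Example (illustrative; not part of the original diff):
#
#   body = {"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "hi there"}]}
#   completion = generate_fake_chat_completion(body, FakeConfig())
#   completion.usage.prompt_tokens       # 2 (whitespace-split words in the input)
#   completion.usage.completion_tokens   # 100 (the default response_length)
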
def generate_fake_completion(body: dict[str, Any], config: FakeConfig) -> dict[str, Any]:
    """Generate fake OpenAI completion response."""
    raise NotImplementedError("Fake completions not implemented yet")


def generate_fake_embeddings(body: dict[str, Any], config: FakeConfig) -> dict[str, Any]:
    """Generate fake OpenAI embeddings response."""
    raise NotImplementedError("Fake embeddings not implemented yet")


def generate_fake_models_list(config: FakeConfig) -> dict[str, Any]:
    """Generate fake OpenAI models list response."""
    raise NotImplementedError("Fake models list not implemented yet")

async def generate_fake_stream(
    response_data: Any, endpoint: str, config: FakeConfig
) -> AsyncGenerator[dict[str, Any], None]:
    """Convert fake response to streaming chunks."""
    latency_seconds = config.latency_ms / 1000.0

    if endpoint == "/v1/chat/completions":
        if hasattr(response_data, "choices"):
            content = response_data.choices[0].message.content
            chunk_id = response_data.id
            model = response_data.model
        else:
            content = response_data["choices"][0]["message"]["content"]
            chunk_id = response_data["id"]
            model = response_data["model"]

        words = content.split()

        # Opening chunk carries the assistant role with empty content
        yield {
            "id": chunk_id,
            "object": "chat.completion.chunk",
            "created": int(time.time()),
            "model": model,
            "choices": [
                {
                    "index": 0,
                    "delta": {
                        "role": "assistant",
                        "content": "",
                        "function_call": None,
                        "tool_calls": None,
                    },
                    "finish_reason": None,
                    "logprobs": None,
                }
            ],
            "system_fingerprint": None,
        }

        await asyncio.sleep(latency_seconds)

        # One chunk per word, preserving inter-word spacing
        for i, word in enumerate(words):
            chunk_content = word + (" " if i < len(words) - 1 else "")

            yield {
                "id": chunk_id,
                "object": "chat.completion.chunk",
                "created": int(time.time()),
                "model": model,
                "choices": [
                    {
                        "index": 0,
                        "delta": {
                            "content": chunk_content,
                            "function_call": None,
                            "tool_calls": None,
                        },
                        "finish_reason": None,
                        "logprobs": None,
                    }
                ],
                "system_fingerprint": None,
            }

            await asyncio.sleep(latency_seconds)

        # Final chunk carries the finish_reason
        yield {
            "id": chunk_id,
            "object": "chat.completion.chunk",
            "created": int(time.time()),
            "model": model,
            "choices": [
                {
                    "index": 0,
                    "delta": {
                        "content": None,
                        "function_call": None,
                        "tool_calls": None,
                    },
                    "finish_reason": "stop",
                    "logprobs": None,
                }
            ],
            "system_fingerprint": None,
        }

    elif endpoint == "/v1/completions":
        raise NotImplementedError("Fake streaming completions not implemented yet")

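
# Streaming sketch (illustrative; not part of the original diff): reassemble the
# full text from the per-word deltas.
#
#   async def _collect() -> str:
#       completion = generate_fake_chat_completion(
#           {"model": "m", "messages": []}, FakeConfig(latency_ms=0)
#       )
#       parts = []
#       async for chunk in generate_fake_stream(
#           completion, "/v1/chat/completions", FakeConfig(latency_ms=0)
#       ):
#           parts.append(chunk["choices"][0]["delta"]["content"] or "")
#       assert "".join(parts) == completion.choices[0].message.content
#       return "".join(parts)
#
#   asyncio.run(_collect())
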
def generate_fake_response(endpoint: str, body: dict[str, Any], config: FakeConfig) -> Any:
    """Generate fake responses based on endpoint and request."""
    if endpoint == "/v1/chat/completions":
        return generate_fake_chat_completion(body, config)
    elif endpoint == "/v1/completions":
        return generate_fake_completion(body, config)
    elif endpoint == "/v1/embeddings":
        return generate_fake_embeddings(body, config)
    elif endpoint == "/v1/models":
        return generate_fake_models_list(config)
    else:
        raise ValueError(f"Unsupported endpoint for fake mode: {endpoint}")
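
# End-to-end sketch (illustrative; not part of the original diff):
#
#   cfg = parse_fake_config()  # e.g. with LLAMA_STACK_TEST_INFERENCE_MODE="fake"
#   resp = generate_fake_response(
#       "/v1/chat/completions", {"model": "m", "messages": []}, cfg
#   )
#   resp.choices[0].finish_reason  # "stop"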