Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-04 12:07:34 +00:00)

Merge branch 'main' into ragtool-migration

Commit 0b27182037: 15 changed files with 193 additions and 178 deletions
@@ -34,13 +34,12 @@ This data enables data-driven architectural decisions and performance optimization

**1. Deploy base k8s infrastructure:**
```bash
cd ../k8s
cd ../../docs/source/distributions/k8s
./apply.sh
```

**2. Deploy benchmark components:**
```bash
cd ../k8s-benchmark
./apply.sh
```
@@ -56,7 +55,6 @@ kubectl get pods

**Benchmark Llama Stack (default):**
```bash
cd docs/source/distributions/k8s-benchmark/
./run-benchmark.sh
```
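As a rough companion to the README steps above, the benchmark can also be driven by hand once the pods are up. In this sketch the Service name and the `benchmark.py` filename are assumptions for illustration; the flags and env-var defaults (`BENCHMARK_BASE_URL`, `INFERENCE_MODEL`) come from the benchmark script's argparse definitions further down in this diff.

```bash
# Sketch only: the Service name and benchmark.py filename are assumptions, not taken from the manifests.
kubectl get pods                                         # confirm the stack pods are Running
kubectl port-forward svc/llama-stack-service 8000:8000 &

python benchmark.py \
  --base-url "${BENCHMARK_BASE_URL:-http://localhost:8000/v1/openai/v1}" \
  --model "${INFERENCE_MODEL:-test-model}" \
  --duration 60 \
  --concurrent 10
```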
@@ -14,7 +14,7 @@ import os
import random
import statistics
import time
from typing import Tuple

import aiohttp
@@ -55,50 +55,50 @@ class BenchmarkStats:
total_time = self.end_time - self.start_time
success_rate = (self.success_count / self.total_requests) * 100

print(f"\n{'='*60}")
print(f"BENCHMARK RESULTS")

print(f"\nResponse Time Statistics:")

print(f"\n{'=' * 60}")
print("BENCHMARK RESULTS")

print("\nResponse Time Statistics:")
print(f" Mean: {statistics.mean(self.response_times):.3f}s")
print(f" Median: {statistics.median(self.response_times):.3f}s")
print(f" Min: {min(self.response_times):.3f}s")
print(f" Max: {max(self.response_times):.3f}s")

if len(self.response_times) > 1:
print(f" Std Dev: {statistics.stdev(self.response_times):.3f}s")

percentiles = [50, 90, 95, 99]
sorted_times = sorted(self.response_times)
print(f"\nPercentiles:")
print("\nPercentiles:")
for p in percentiles:
idx = int(len(sorted_times) * p / 100) - 1
idx = max(0, min(idx, len(sorted_times) - 1))
print(f" P{p}: {sorted_times[idx]:.3f}s")

if self.ttft_times:
print(f"\nTime to First Token (TTFT) Statistics:")
print("\nTime to First Token (TTFT) Statistics:")
print(f" Mean: {statistics.mean(self.ttft_times):.3f}s")
print(f" Median: {statistics.median(self.ttft_times):.3f}s")
print(f" Min: {min(self.ttft_times):.3f}s")
print(f" Max: {max(self.ttft_times):.3f}s")

if len(self.ttft_times) > 1:
print(f" Std Dev: {statistics.stdev(self.ttft_times):.3f}s")

sorted_ttft = sorted(self.ttft_times)
print(f"\nTTFT Percentiles:")
print("\nTTFT Percentiles:")
for p in percentiles:
idx = int(len(sorted_ttft) * p / 100) - 1
idx = max(0, min(idx, len(sorted_ttft) - 1))
print(f" P{p}: {sorted_ttft[idx]:.3f}s")

if self.chunks_received:
print(f"\nStreaming Statistics:")
print("\nStreaming Statistics:")
print(f" Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
print(f" Total chunks received: {sum(self.chunks_received)}")

print(f"{'='*60}")

print(f"{'=' * 60}")
print(f"Total time: {total_time:.2f}s")
print(f"Concurrent users: {self.concurrent_users}")
print(f"Total requests: {self.total_requests}")
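The percentile lines in print_summary above come from a nearest-rank lookup into the sorted response times (no interpolation), with the index clamped to stay in range. A standalone sketch of that arithmetic, using invented sample timings:

```python
import statistics

# Invented sample; in the script these are BenchmarkStats.response_times.
response_times = [0.8, 1.1, 0.9, 2.4, 1.0, 1.3, 0.7, 1.9, 1.2, 1.5]

sorted_times = sorted(response_times)
print(f"Mean: {statistics.mean(sorted_times):.3f}s")
for p in [50, 90, 95, 99]:
    # Nearest-rank index, clamped to the valid range (same arithmetic as print_summary).
    idx = int(len(sorted_times) * p / 100) - 1
    idx = max(0, min(idx, len(sorted_times) - 1))
    print(f"  P{p}: {sorted_times[idx]:.3f}s")
```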
@@ -106,16 +106,16 @@ class BenchmarkStats:
print(f"Failed requests: {len(self.errors)}")
print(f"Success rate: {success_rate:.1f}%")
print(f"Requests per second: {self.success_count / total_time:.2f}")

if self.errors:
print(f"\nErrors (showing first 5):")
print("\nErrors (showing first 5):")
for error in self.errors[:5]:
print(f" {error}")

class LlamaStackBenchmark:
def __init__(self, base_url: str, model_id: str):
self.base_url = base_url.rstrip('/')
self.base_url = base_url.rstrip("/")
self.model_id = model_id
self.headers = {"Content-Type": "application/json"}
self.test_messages = [
@@ -126,74 +126,67 @@ class LlamaStackBenchmark:
[
{"role": "user", "content": "What is machine learning?"},
{"role": "assistant", "content": "Machine learning is a subset of AI..."},
{"role": "user", "content": "Can you give me a practical example?"}
]
{"role": "user", "content": "Can you give me a practical example?"},
],
]

async def make_async_streaming_request(self) -> Tuple[float, int, float | None, str | None]:
async def make_async_streaming_request(self) -> tuple[float, int, float | None, str | None]:
"""Make a single async streaming chat completion request."""
messages = random.choice(self.test_messages)
payload = {
"model": self.model_id,
"messages": messages,
"stream": True,
"max_tokens": 100
}

payload = {"model": self.model_id, "messages": messages, "stream": True, "max_tokens": 100}

start_time = time.time()
chunks_received = 0
ttft = None
error = None

session = aiohttp.ClientSession()

try:
async with session.post(
f"{self.base_url}/chat/completions",
headers=self.headers,
json=payload,
timeout=aiohttp.ClientTimeout(total=30)
timeout=aiohttp.ClientTimeout(total=30),
) as response:
if response.status == 200:
async for line in response.content:
if line:
line_str = line.decode('utf-8').strip()
if line_str.startswith('data: '):
line_str = line.decode("utf-8").strip()
if line_str.startswith("data: "):
chunks_received += 1
if ttft is None:
ttft = time.time() - start_time
if line_str == 'data: [DONE]':
if line_str == "data: [DONE]":
break

if chunks_received == 0:
error = "No streaming chunks received"
else:
text = await response.text()
error = f"HTTP {response.status}: {text[:100]}"

except Exception as e:
error = f"Request error: {str(e)}"
finally:
await session.close()

response_time = time.time() - start_time
return response_time, chunks_received, ttft, error

async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats:
"""Run benchmark using async requests for specified duration."""
stats = BenchmarkStats()
stats.concurrent_users = concurrent_users
stats.start_time = time.time()

print(f"Starting benchmark: {duration}s duration, {concurrent_users} concurrent users")
print(f"Target URL: {self.base_url}/chat/completions")
print(f"Model: {self.model_id}")

connector = aiohttp.TCPConnector(limit=concurrent_users)
async with aiohttp.ClientSession(connector=connector) as session:

async with aiohttp.ClientSession(connector=connector):

async def worker(worker_id: int):
"""Worker that sends requests sequentially until canceled."""
request_count = 0
@@ -202,12 +195,12 @@ class LlamaStackBenchmark:
response_time, chunks, ttft, error = await self.make_async_streaming_request()
await stats.add_result(response_time, chunks, ttft, error)
request_count += 1

except asyncio.CancelledError:
break
except Exception as e:
await stats.add_result(0, 0, None, f"Worker {worker_id} error: {str(e)}")

# Progress reporting task
async def progress_reporter():
last_report_time = time.time()
@@ -216,48 +209,52 @@ class LlamaStackBenchmark:
await asyncio.sleep(1) # Report every second
if time.time() >= last_report_time + 10: # Report every 10 seconds
elapsed = time.time() - stats.start_time
print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s, RPS: {stats.total_requests / elapsed:.1f}")
print(
f"Completed: {stats.total_requests} requests in {elapsed:.1f}s, RPS: {stats.total_requests / elapsed:.1f}"
)
last_report_time = time.time()
except asyncio.CancelledError:
break

# Spawn concurrent workers
tasks = [asyncio.create_task(worker(i)) for i in range(concurrent_users)]
progress_task = asyncio.create_task(progress_reporter())
tasks.append(progress_task)

# Wait for duration then cancel all tasks
await asyncio.sleep(duration)

for task in tasks:
task.cancel()

# Wait for all tasks to complete
await asyncio.gather(*tasks, return_exceptions=True)

stats.end_time = time.time()
return stats

def main():
parser = argparse.ArgumentParser(description="Llama Stack Benchmark Tool")
parser.add_argument("--base-url", default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)")
parser.add_argument("--model", default=os.getenv("INFERENCE_MODEL", "test-model"),
help="Model ID to use for requests")
parser.add_argument("--duration", type=int, default=60,
help="Duration in seconds to run benchmark (default: 60)")
parser.add_argument("--concurrent", type=int, default=10,
help="Number of concurrent users (default: 10)")

parser.add_argument(
"--base-url",
default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)",
)
parser.add_argument(
"--model", default=os.getenv("INFERENCE_MODEL", "test-model"), help="Model ID to use for requests"
)
parser.add_argument("--duration", type=int, default=60, help="Duration in seconds to run benchmark (default: 60)")
parser.add_argument("--concurrent", type=int, default=10, help="Number of concurrent users (default: 10)")

args = parser.parse_args()

benchmark = LlamaStackBenchmark(args.base_url, args.model)

try:
stats = asyncio.run(benchmark.run_benchmark(args.duration, args.concurrent))
stats.print_summary()

except KeyboardInterrupt:
print("\nBenchmark interrupted by user")
except Exception as e:
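For reference, main() above boils down to the following minimal programmatic run; the `benchmark` module name is an assumption for the sketch, and the URL and model are simply the script's defaults.

```python
import asyncio

from benchmark import LlamaStackBenchmark  # module name is an assumption

# Defaults mirror the CLI flags: 60s duration, 10 concurrent users.
bench = LlamaStackBenchmark("http://localhost:8000/v1/openai/v1", "test-model")
stats = asyncio.run(bench.run_benchmark(duration=60, concurrent_users=10))
stats.print_summary()
```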
@@ -11,180 +11,192 @@ OpenAI-compatible mock server that returns:
- Valid OpenAI-formatted chat completion responses with dynamic content
"""

from flask import Flask, request, jsonify, Response
import time
import random
import uuid
import json
import argparse
import json
import os
import random
import time
import uuid

from flask import Flask, Response, jsonify, request

app = Flask(__name__)

# Models from environment variables
def get_models():
models_str = os.getenv("MOCK_MODELS", "meta-llama/Llama-3.2-3B-Instruct")
model_ids = [m.strip() for m in models_str.split(",") if m.strip()]

return {
"object": "list",
"data": [
{
"id": model_id,
"object": "model",
"created": 1234567890,
"owned_by": "vllm"
}
for model_id in model_ids
]
{"id": model_id, "object": "model", "created": 1234567890, "owned_by": "vllm"} for model_id in model_ids
],
}

def generate_random_text(length=50):
"""Generate random but coherent text for responses."""
words = [
"Hello", "there", "I'm", "an", "AI", "assistant", "ready", "to", "help", "you",
"with", "your", "questions", "and", "tasks", "today", "Let", "me","know", "what",
"you'd", "like", "to", "discuss", "or", "explore", "together", "I", "can", "assist",
"with", "various", "topics", "including", "coding", "writing", "analysis", "and", "more"
"Hello",
"there",
"I'm",
"an",
"AI",
"assistant",
"ready",
"to",
"help",
"you",
"with",
"your",
"questions",
"and",
"tasks",
"today",
"Let",
"me",
"know",
"what",
"you'd",
"like",
"to",
"discuss",
"or",
"explore",
"together",
"I",
"can",
"assist",
"with",
"various",
"topics",
"including",
"coding",
"writing",
"analysis",
"and",
"more",
]
return " ".join(random.choices(words, k=length))
@app.route('/v1/models', methods=['GET'])

@app.route("/v1/models", methods=["GET"])
def list_models():
models = get_models()
print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}")
return jsonify(models)

@app.route('/v1/chat/completions', methods=['POST'])

@app.route("/v1/chat/completions", methods=["POST"])
def chat_completions():
"""Return OpenAI-formatted chat completion responses."""
data = request.get_json()
default_model = get_models()['data'][0]['id']
model = data.get('model', default_model)
messages = data.get('messages', [])
stream = data.get('stream', False)

default_model = get_models()["data"][0]["id"]
model = data.get("model", default_model)
messages = data.get("messages", [])
stream = data.get("stream", False)

print(f"[MOCK] Chat completion request - model: {model}, stream: {stream}")

if stream:
return handle_streaming_completion(model, messages)
else:
return handle_non_streaming_completion(model, messages)

def handle_non_streaming_completion(model, messages):
response_text = generate_random_text(random.randint(20, 80))

# Calculate realistic token counts
prompt_tokens = sum(len(str(msg.get('content', '')).split()) for msg in messages)
prompt_tokens = sum(len(str(msg.get("content", "")).split()) for msg in messages)
completion_tokens = len(response_text.split())

response = {
"id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
"object": "chat.completion",
"created": int(time.time()),
"model": model,
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": response_text
},
"finish_reason": "stop"
}
],
"choices": [{"index": 0, "message": {"role": "assistant", "content": response_text}, "finish_reason": "stop"}],
"usage": {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"total_tokens": prompt_tokens + completion_tokens
}
"total_tokens": prompt_tokens + completion_tokens,
},
}

return jsonify(response)
def handle_streaming_completion(model, messages):
def generate_stream():
# Generate response text
full_response = generate_random_text(random.randint(30, 100))
words = full_response.split()

# Send initial chunk
initial_chunk = {
"id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
"object": "chat.completion.chunk",
"created": int(time.time()),
"model": model,
"choices": [
{
"index": 0,
"delta": {"role": "assistant", "content": ""}
}
]
"choices": [{"index": 0, "delta": {"role": "assistant", "content": ""}}],
}
yield f"data: {json.dumps(initial_chunk)}\n\n"

# Send word by word
for i, word in enumerate(words):
chunk = {
"id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
"object": "chat.completion.chunk",
"created": int(time.time()),
"model": model,
"choices": [
{
"index": 0,
"delta": {"content": f"{word} " if i < len(words) - 1 else word}
}
]
"choices": [{"index": 0, "delta": {"content": f"{word} " if i < len(words) - 1 else word}}],
}
yield f"data: {json.dumps(chunk)}\n\n"
# Configurable delay to simulate realistic streaming
stream_delay = float(os.getenv("STREAM_DELAY_SECONDS", "0.005"))
time.sleep(stream_delay)

# Send final chunk
final_chunk = {
"id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
"object": "chat.completion.chunk",
"created": int(time.time()),
"model": model,
"choices": [
{
"index": 0,
"delta": {"content": ""},
"finish_reason": "stop"
}
]
"choices": [{"index": 0, "delta": {"content": ""}, "finish_reason": "stop"}],
}
yield f"data: {json.dumps(final_chunk)}\n\n"
yield "data: [DONE]\n\n"

return Response(
generate_stream(),
mimetype='text/event-stream',
mimetype="text/event-stream",
headers={
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Access-Control-Allow-Origin': '*',
}
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"Access-Control-Allow-Origin": "*",
},
)

@app.route('/health', methods=['GET'])

@app.route("/health", methods=["GET"])
def health():
return jsonify({"status": "healthy", "type": "openai-mock"})

if __name__ == '__main__':
parser = argparse.ArgumentParser(description='OpenAI-compatible mock server')
parser.add_argument('--port', type=int, default=8081,
help='Port to run the server on (default: 8081)')

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="OpenAI-compatible mock server")
parser.add_argument("--port", type=int, default=8081, help="Port to run the server on (default: 8081)")
args = parser.parse_args()

port = args.port

models = get_models()
print("Starting OpenAI-compatible mock server...")
print(f"- /models endpoint with: {[m['id'] for m in models['data']]}")
print("- OpenAI-formatted chat/completion responses with dynamic content")
print("- Streaming support with valid SSE format")
print(f"- Listening on: http://0.0.0.0:{port}")
app.run(host='0.0.0.0', port=port, debug=False)
app.run(host="0.0.0.0", port=port, debug=False)
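A quick way to exercise the mock server above is to reuse the same aiohttp/SSE parsing the benchmark uses. This sketch assumes the server is already running locally on its default port 8081 with the default MOCK_MODELS value.

```python
import asyncio

import aiohttp

BASE = "http://localhost:8081"  # mock server's default --port


async def smoke_test():
    async with aiohttp.ClientSession() as session:
        # List the models the mock advertises (driven by MOCK_MODELS).
        async with session.get(f"{BASE}/v1/models") as resp:
            print([m["id"] for m in (await resp.json())["data"]])

        # Stream one chat completion and count SSE chunks, mirroring the benchmark's parsing.
        payload = {
            "model": "meta-llama/Llama-3.2-3B-Instruct",
            "messages": [{"role": "user", "content": "Hi"}],
            "stream": True,
        }
        chunks = 0
        async with session.post(f"{BASE}/v1/chat/completions", json=payload) as resp:
            async for line in resp.content:
                line_str = line.decode("utf-8").strip()
                if line_str == "data: [DONE]":
                    break
                if line_str.startswith("data: "):
                    chunks += 1
        print(f"received {chunks} streaming chunks")


asyncio.run(smoke_test())
```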
@@ -35,5 +35,5 @@ testing/record-replay

### Benchmarking

```{include} ../../../docs/source/distributions/k8s-benchmark/README.md
```{include} ../../../benchmarking/k8s-benchmark/README.md
```
llama_stack/ui/package-lock.json (generated; 20 changed lines)
@@ -3578,6 +3578,13 @@
"tailwindcss": "4.1.6"
}
},
"node_modules/@tailwindcss/node/node_modules/tailwindcss": {
"version": "4.1.6",
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.6.tgz",
"integrity": "sha512-j0cGLTreM6u4OWzBeLBpycK0WIh8w7kSwcUsQZoGLHZ7xDTdM69lN64AgoIEEwFi0tnhs4wSykUa5YWxAzgFYg==",
"dev": true,
"license": "MIT"
},
"node_modules/@tailwindcss/oxide": {
"version": "4.1.6",
"resolved": "https://registry.npmjs.org/@tailwindcss/oxide/-/oxide-4.1.6.tgz",
@@ -3838,6 +3845,13 @@
"tailwindcss": "4.1.6"
}
},
"node_modules/@tailwindcss/postcss/node_modules/tailwindcss": {
"version": "4.1.6",
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.6.tgz",
"integrity": "sha512-j0cGLTreM6u4OWzBeLBpycK0WIh8w7kSwcUsQZoGLHZ7xDTdM69lN64AgoIEEwFi0tnhs4wSykUa5YWxAzgFYg==",
"dev": true,
"license": "MIT"
},
"node_modules/@testing-library/dom": {
"version": "10.4.1",
"resolved": "https://registry.npmjs.org/@testing-library/dom/-/dom-10.4.1.tgz",
@@ -13843,9 +13857,9 @@
}
},
"node_modules/tailwindcss": {
"version": "4.1.6",
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.6.tgz",
"integrity": "sha512-j0cGLTreM6u4OWzBeLBpycK0WIh8w7kSwcUsQZoGLHZ7xDTdM69lN64AgoIEEwFi0tnhs4wSykUa5YWxAzgFYg==",
"version": "4.1.13",
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.13.tgz",
"integrity": "sha512-i+zidfmTqtwquj4hMEwdjshYYgMbOrPzb9a0M3ZgNa0JMoZeFC6bxZvO8yr8ozS6ix2SDz0+mvryPeBs2TFE+w==",
"dev": true,
"license": "MIT"
},
@@ -49,16 +49,13 @@ def setup_openai_telemetry_data(llama_stack_client, text_model_id):
traces = llama_stack_client.telemetry.query_traces(limit=10)
if len(traces) >= 5: # 5 OpenAI completion traces
break
time.sleep(1)
time.sleep(0.1)

if len(traces) < 5:
pytest.fail(
f"Failed to create sufficient OpenAI completion telemetry data after 30s. Got {len(traces)} traces."
)

# Wait for 5 seconds to ensure traces has completed logging
time.sleep(5)

yield
@@ -185,11 +182,13 @@ def test_openai_completion_creates_telemetry(llama_stack_client, text_model_id):
assert len(response.choices) > 0, "Response should have at least one choice"

# Wait for telemetry to be recorded
time.sleep(3)

# Check that we have more traces now
final_traces = llama_stack_client.telemetry.query_traces(limit=20)
final_count = len(final_traces)
start_time = time.time()
while time.time() - start_time < 30:
final_traces = llama_stack_client.telemetry.query_traces(limit=20)
final_count = len(final_traces)
if final_count > initial_count:
break
time.sleep(0.1)

# Should have at least as many traces as before (might have more due to other activity)
assert final_count >= initial_count, "Should have at least as many traces after OpenAI call"
@@ -42,14 +42,11 @@ def setup_telemetry_data(llama_stack_client, text_model_id):
traces = llama_stack_client.telemetry.query_traces(limit=10)
if len(traces) >= 4:
break
time.sleep(1)
time.sleep(0.1)

if len(traces) < 4:
pytest.fail(f"Failed to create sufficient telemetry data after 30s. Got {len(traces)} traces.")

# Wait for 5 seconds to ensure traces has completed logging
time.sleep(5)

yield
@@ -46,10 +46,7 @@ def setup_telemetry_metrics_data(openai_client, client_with_models, text_model_id):
break
except Exception:
pass
time.sleep(1)

# Wait additional time to ensure all metrics are processed
time.sleep(5)
time.sleep(0.1)

# Return the token lists for use in tests
return {"prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens, "total_tokens": total_tokens}
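The fixture changes above all follow the same pattern: poll in 0.1s steps instead of sleeping whole seconds, and drop the extra fixed 5s sleep at the end. A generic version of that poll-until-ready helper (the predicate in the comment is just an example taken from these fixtures):

```python
import time


def wait_until(predicate, timeout: float = 30.0, interval: float = 0.1) -> bool:
    """Poll `predicate` until it returns True or `timeout` seconds elapse."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        if predicate():
            return True
        time.sleep(interval)
    return False


# e.g. wait_until(lambda: len(llama_stack_client.telemetry.query_traces(limit=10)) >= 5)
```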
uv.lock (generated; 19 changed lines)
@@ -2023,7 +2023,7 @@ wheels = [

[[package]]
name = "locust"
version = "2.39.1"
version = "2.40.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "configargparse" },
@@ -2035,6 +2035,7 @@ dependencies = [
{ name = "locust-cloud" },
{ name = "msgpack" },
{ name = "psutil" },
{ name = "pytest" },
{ name = "python-engineio" },
{ name = "python-socketio", extra = ["client"] },
{ name = "pywin32", marker = "sys_platform == 'win32'" },
@@ -2043,9 +2044,9 @@ dependencies = [
{ name = "setuptools" },
{ name = "werkzeug" },
]
sdist = { url = "https://files.pythonhosted.org/packages/95/c8/10aa5445c404eed389b56877e6714c1787190cc09dd70059ce3765979ec5/locust-2.39.1.tar.gz", hash = "sha256:6bdd19e27edf9a1c84391d6cf6e9a737dfb832be7dfbf39053191ae31b9cc498", size = 1409902, upload-time = "2025-08-29T17:41:01.544Z" }
sdist = { url = "https://files.pythonhosted.org/packages/01/22/82f40176473a98c9479bed667d3ad21bb859d2cb67f6880a6b0b6a725e45/locust-2.40.1.tar.gz", hash = "sha256:5bde76c1cf7e412071670f926f34844e119210c93f07a4cf9fc4cb93c60a578a", size = 1411606, upload-time = "2025-09-05T15:57:35.76Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ec/b3/b2f4b2ca88b1e72eba7be2b2982533b887f8b709d222db78eb9602aa5121/locust-2.39.1-py3-none-any.whl", hash = "sha256:fd5148f2f1a4ed34aee968abc4393674e69d1b5e1b54db50a397f6eb09ce0b04", size = 1428155, upload-time = "2025-08-29T17:41:00.245Z" },
{ url = "https://files.pythonhosted.org/packages/3b/e6/9c6335ab16becf4f8ad3da6083ab78793c56ec1ca496d6f7c74660c21c3f/locust-2.40.1-py3-none-any.whl", hash = "sha256:ef0517f9bb5ed0afa7035014faaf944802917e07da8649461aaaf5e5f3ba8a65", size = 1430154, upload-time = "2025-09-05T15:57:33.233Z" },
]

[[package]]
@@ -2619,7 +2620,7 @@ wheels = [

[[package]]
name = "openai"
version = "1.102.0"
version = "1.107.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "anyio" },
@@ -2631,9 +2632,9 @@ dependencies = [
{ name = "tqdm" },
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/07/55/da5598ed5c6bdd9939633854049cddc5cbac0da938dfcfcb3c6b119c16c0/openai-1.102.0.tar.gz", hash = "sha256:2e0153bcd64a6523071e90211cbfca1f2bbc5ceedd0993ba932a5869f93b7fc9", size = 519027, upload-time = "2025-08-26T20:50:29.397Z" }
sdist = { url = "https://files.pythonhosted.org/packages/88/67/d6498de300f83ff57a79cb7aa96ef3bef8d6f070c3ded0f1b5b45442a6bc/openai-1.107.0.tar.gz", hash = "sha256:43e04927584e57d0e9e640ee0077c78baf8150098be96ebd5c512539b6c4e9a4", size = 566056, upload-time = "2025-09-08T19:25:47.604Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/bd/0d/c9e7016d82c53c5b5e23e2bad36daebb8921ed44f69c0a985c6529a35106/openai-1.102.0-py3-none-any.whl", hash = "sha256:d751a7e95e222b5325306362ad02a7aa96e1fab3ed05b5888ce1c7ca63451345", size = 812015, upload-time = "2025-08-26T20:50:27.219Z" },
{ url = "https://files.pythonhosted.org/packages/91/ed/e8a4fd20390f2858b95227c288df8fe0c835f7c77625f7583609161684ba/openai-1.107.0-py3-none-any.whl", hash = "sha256:3dcfa3cbb116bd6924b27913b8da28c4a787379ff60049588547a1013e6d6438", size = 950968, upload-time = "2025-09-08T19:25:45.552Z" },
]

[[package]]
@@ -3540,7 +3541,7 @@ wheels = [

[[package]]
name = "pytest"
version = "8.4.1"
version = "8.4.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "colorama", marker = "sys_platform == 'win32'" },
@@ -3549,9 +3550,9 @@ dependencies = [
{ name = "pluggy" },
{ name = "pygments" },
]
sdist = { url = "https://files.pythonhosted.org/packages/08/ba/45911d754e8eba3d5a841a5ce61a65a685ff1798421ac054f85aa8747dfb/pytest-8.4.1.tar.gz", hash = "sha256:7c67fd69174877359ed9371ec3af8a3d2b04741818c51e5e99cc1742251fa93c", size = 1517714, upload-time = "2025-06-18T05:48:06.109Z" }
sdist = { url = "https://files.pythonhosted.org/packages/a3/5c/00a0e072241553e1a7496d638deababa67c5058571567b92a7eaa258397c/pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01", size = 1519618, upload-time = "2025-09-04T14:34:22.711Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/29/16/c8a903f4c4dffe7a12843191437d7cd8e32751d5de349d45d3fe69544e87/pytest-8.4.1-py3-none-any.whl", hash = "sha256:539c70ba6fcead8e78eebbf1115e8b589e7565830d7d006a8723f19ac8a0afb7", size = 365474, upload-time = "2025-06-18T05:48:03.955Z" },
{ url = "https://files.pythonhosted.org/packages/a8/a4/20da314d277121d6534b3a980b29035dcd51e6744bd79075a6ce8fa4eb8d/pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79", size = 365750, upload-time = "2025-09-04T14:34:20.226Z" },
]

[[package]]