Merge branch 'main' into use-openai-for-ollama

Matthew Farrellee 2025-09-15 15:31:03 -04:00
commit 91fb6f42cb
74 changed files with 8761 additions and 971 deletions

View file

@@ -13,11 +13,8 @@ on:
     branches: [ main ]
     types: [opened, synchronize, reopened]
     paths:
-      - 'llama_stack/**'
-      - '!llama_stack/ui/**'
-      - 'tests/**'
-      - 'uv.lock'
-      - 'pyproject.toml'
+      - 'docs/_static/llama-stack-spec.yaml'
+      - 'docs/_static/llama-stack-spec.html'
       - '.github/workflows/conformance.yml' # This workflow itself

 concurrency:
@@ -43,10 +40,27 @@ jobs:
           ref: ${{ github.event.pull_request.base.ref }}
           path: 'base'

+      # Cache oasdiff to avoid checksum failures and speed up builds
+      - name: Cache oasdiff
+        id: cache-oasdiff
+        uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809
+        with:
+          path: ~/oasdiff
+          key: oasdiff-${{ runner.os }}
+
       # Install oasdiff: https://github.com/oasdiff/oasdiff, a tool for detecting breaking changes in OpenAPI specs.
       - name: Install oasdiff
+        if: steps.cache-oasdiff.outputs.cache-hit != 'true'
         run: |
           curl -fsSL https://raw.githubusercontent.com/oasdiff/oasdiff/main/install.sh | sh
+          cp /usr/local/bin/oasdiff ~/oasdiff
+
+      # Setup cached oasdiff
+      - name: Setup cached oasdiff
+        if: steps.cache-oasdiff.outputs.cache-hit == 'true'
+        run: |
+          sudo cp ~/oasdiff /usr/local/bin/oasdiff
+          sudo chmod +x /usr/local/bin/oasdiff

       # Run oasdiff to detect breaking changes in the API specification
       # This step will fail if incompatible changes are detected, preventing breaking changes from being merged

View file

@@ -47,11 +47,21 @@ jobs:
         run: npm ci
         working-directory: llama_stack/ui

-      - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
+      - name: Run pre-commit
+        id: precommit
+        uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
+        continue-on-error: true
         env:
           SKIP: no-commit-to-branch
           RUFF_OUTPUT_FORMAT: github

+      - name: Check pre-commit results
+        if: steps.precommit.outcome == 'failure'
+        run: |
+          echo "::error::Pre-commit hooks failed. Please run 'pre-commit run --all-files' locally and commit the fixes."
+          echo "::warning::Some pre-commit hooks failed. Check the output above for details."
+          exit 1
+
       - name: Debug
         run: |
           echo "github.ref: ${{ github.ref }}"
@@ -79,17 +89,23 @@
             echo "No changes to commit"
           fi

-      - name: Verify if there are any diff files after pre-commit
+      - name: Verify no uncommitted changes
        if: github.actor != 'dependabot[bot]'
        run: |
-          git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)
+          if ! git diff --exit-code; then
+            echo "::error::There are uncommitted changes after pre-commit. Please run 'pre-commit run --all-files' locally and commit the fixes."
+            echo "::warning::Files with changes:"
+            git diff --name-status
+            exit 1
+          fi

       - name: Verify if there are any new files after pre-commit
         if: github.actor != 'dependabot[bot]'
         run: |
           unstaged_files=$(git ls-files --others --exclude-standard)
           if [ -n "$unstaged_files" ]; then
-            echo "There are uncommitted new files, run pre-commit locally and commit again"
+            echo "::error::There are new untracked files after pre-commit. Please run 'pre-commit run --all-files' locally and commit the fixes."
+            echo "::warning::New files:"
             echo "$unstaged_files"
             exit 1
           fi

View file

@@ -34,13 +34,12 @@ This data enables data-driven architectural decisions and performance optimization
 **1. Deploy base k8s infrastructure:**
 ```bash
-cd ../k8s
+cd ../../docs/source/distributions/k8s
 ./apply.sh
 ```

 **2. Deploy benchmark components:**
 ```bash
-cd ../k8s-benchmark
 ./apply.sh
 ```
@@ -56,7 +55,6 @@ kubectl get pods
 **Benchmark Llama Stack (default):**
 ```bash
-cd docs/source/distributions/k8s-benchmark/
 ./run-benchmark.sh
 ```

View file

@@ -14,7 +14,7 @@ import os
 import random
 import statistics
 import time
-from typing import Tuple

 import aiohttp

@@ -57,17 +57,9 @@ class BenchmarkStats:
         success_rate = (self.success_count / self.total_requests) * 100

         print(f"\n{'=' * 60}")
-        print(f"BENCHMARK RESULTS")
-        print(f"{'='*60}")
-        print(f"Total time: {total_time:.2f}s")
-        print(f"Concurrent users: {self.concurrent_users}")
-        print(f"Total requests: {self.total_requests}")
-        print(f"Successful requests: {self.success_count}")
-        print(f"Failed requests: {len(self.errors)}")
-        print(f"Success rate: {success_rate:.1f}%")
-        print(f"Requests per second: {self.success_count / total_time:.2f}")
+        print("BENCHMARK RESULTS")

-        print(f"\nResponse Time Statistics:")
+        print("\nResponse Time Statistics:")
         print(f"  Mean: {statistics.mean(self.response_times):.3f}s")
         print(f"  Median: {statistics.median(self.response_times):.3f}s")
         print(f"  Min: {min(self.response_times):.3f}s")
@@ -78,14 +70,14 @@ class BenchmarkStats:
         percentiles = [50, 90, 95, 99]
         sorted_times = sorted(self.response_times)
-        print(f"\nPercentiles:")
+        print("\nPercentiles:")
         for p in percentiles:
             idx = int(len(sorted_times) * p / 100) - 1
             idx = max(0, min(idx, len(sorted_times) - 1))
             print(f"  P{p}: {sorted_times[idx]:.3f}s")

         if self.ttft_times:
-            print(f"\nTime to First Token (TTFT) Statistics:")
+            print("\nTime to First Token (TTFT) Statistics:")
             print(f"  Mean: {statistics.mean(self.ttft_times):.3f}s")
             print(f"  Median: {statistics.median(self.ttft_times):.3f}s")
             print(f"  Min: {min(self.ttft_times):.3f}s")
@@ -95,26 +87,35 @@ class BenchmarkStats:
             print(f"  Std Dev: {statistics.stdev(self.ttft_times):.3f}s")

             sorted_ttft = sorted(self.ttft_times)
-            print(f"\nTTFT Percentiles:")
+            print("\nTTFT Percentiles:")
             for p in percentiles:
                 idx = int(len(sorted_ttft) * p / 100) - 1
                 idx = max(0, min(idx, len(sorted_ttft) - 1))
                 print(f"  P{p}: {sorted_ttft[idx]:.3f}s")

         if self.chunks_received:
-            print(f"\nStreaming Statistics:")
+            print("\nStreaming Statistics:")
             print(f"  Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
             print(f"  Total chunks received: {sum(self.chunks_received)}")

+        print(f"{'=' * 60}")
+        print(f"Total time: {total_time:.2f}s")
+        print(f"Concurrent users: {self.concurrent_users}")
+        print(f"Total requests: {self.total_requests}")
+        print(f"Successful requests: {self.success_count}")
+        print(f"Failed requests: {len(self.errors)}")
+        print(f"Success rate: {success_rate:.1f}%")
+        print(f"Requests per second: {self.success_count / total_time:.2f}")
+
         if self.errors:
-            print(f"\nErrors (showing first 5):")
+            print("\nErrors (showing first 5):")
             for error in self.errors[:5]:
                 print(f"  {error}")


 class LlamaStackBenchmark:
     def __init__(self, base_url: str, model_id: str):
-        self.base_url = base_url.rstrip('/')
+        self.base_url = base_url.rstrip("/")
         self.model_id = model_id
         self.headers = {"Content-Type": "application/json"}
         self.test_messages = [
@@ -125,20 +126,14 @@ class LlamaStackBenchmark:
             [
                 {"role": "user", "content": "What is machine learning?"},
                 {"role": "assistant", "content": "Machine learning is a subset of AI..."},
-                {"role": "user", "content": "Can you give me a practical example?"}
-            ]
+                {"role": "user", "content": "Can you give me a practical example?"},
+            ],
         ]

-    async def make_async_streaming_request(self) -> Tuple[float, int, float | None, str | None]:
+    async def make_async_streaming_request(self) -> tuple[float, int, float | None, str | None]:
         """Make a single async streaming chat completion request."""
         messages = random.choice(self.test_messages)
-        payload = {
-            "model": self.model_id,
-            "messages": messages,
-            "stream": True,
-            "max_tokens": 100
-        }
+        payload = {"model": self.model_id, "messages": messages, "stream": True, "max_tokens": 100}

         start_time = time.time()
         chunks_received = 0
@@ -152,17 +147,17 @@ class LlamaStackBenchmark:
                 f"{self.base_url}/chat/completions",
                 headers=self.headers,
                 json=payload,
-                timeout=aiohttp.ClientTimeout(total=30)
+                timeout=aiohttp.ClientTimeout(total=30),
             ) as response:
                 if response.status == 200:
                     async for line in response.content:
                         if line:
-                            line_str = line.decode('utf-8').strip()
-                            if line_str.startswith('data: '):
+                            line_str = line.decode("utf-8").strip()
+                            if line_str.startswith("data: "):
                                 chunks_received += 1
                                 if ttft is None:
                                     ttft = time.time() - start_time
-                                if line_str == 'data: [DONE]':
+                                if line_str == "data: [DONE]":
                                     break

                     if chunks_received == 0:
@@ -179,7 +174,6 @@ class LlamaStackBenchmark:
         response_time = time.time() - start_time
         return response_time, chunks_received, ttft, error

     async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats:
         """Run benchmark using async requests for specified duration."""
         stats = BenchmarkStats()
@@ -191,7 +185,7 @@ class LlamaStackBenchmark:
         print(f"Model: {self.model_id}")

         connector = aiohttp.TCPConnector(limit=concurrent_users)
-        async with aiohttp.ClientSession(connector=connector) as session:
+        async with aiohttp.ClientSession(connector=connector):

             async def worker(worker_id: int):
                 """Worker that sends requests sequentially until canceled."""
@@ -215,7 +209,9 @@ class LlamaStackBenchmark:
                     await asyncio.sleep(1)  # Report every second
                     if time.time() >= last_report_time + 10:  # Report every 10 seconds
                         elapsed = time.time() - stats.start_time
-                        print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s")
+                        print(
+                            f"Completed: {stats.total_requests} requests in {elapsed:.1f}s, RPS: {stats.total_requests / elapsed:.1f}"
+                        )
                         last_report_time = time.time()
                 except asyncio.CancelledError:
                     break
@@ -240,14 +236,16 @@ class LlamaStackBenchmark:
 def main():
     parser = argparse.ArgumentParser(description="Llama Stack Benchmark Tool")
-    parser.add_argument("--base-url", default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
-                        help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)")
-    parser.add_argument("--model", default=os.getenv("INFERENCE_MODEL", "test-model"),
-                        help="Model ID to use for requests")
-    parser.add_argument("--duration", type=int, default=60,
-                        help="Duration in seconds to run benchmark (default: 60)")
-    parser.add_argument("--concurrent", type=int, default=10,
-                        help="Number of concurrent users (default: 10)")
+    parser.add_argument(
+        "--base-url",
+        default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
+        help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)",
+    )
+    parser.add_argument(
+        "--model", default=os.getenv("INFERENCE_MODEL", "test-model"), help="Model ID to use for requests"
+    )
+    parser.add_argument("--duration", type=int, default=60, help="Duration in seconds to run benchmark (default: 60)")
+    parser.add_argument("--concurrent", type=int, default=10, help="Number of concurrent users (default: 10)")

     args = parser.parse_args()

View file

@@ -11,16 +11,18 @@ OpenAI-compatible mock server that returns:
 - Valid OpenAI-formatted chat completion responses with dynamic content
 """

-from flask import Flask, request, jsonify, Response
-import time
-import random
-import uuid
-import json
 import argparse
+import json
 import os
+import random
+import time
+import uuid
+
+from flask import Flask, Response, jsonify, request

 app = Flask(__name__)


 # Models from environment variables
 def get_models():
     models_str = os.getenv("MOCK_MODELS", "meta-llama/Llama-3.2-3B-Instruct")
@@ -29,40 +31,72 @@ def get_models():
     return {
         "object": "list",
         "data": [
-            {
-                "id": model_id,
-                "object": "model",
-                "created": 1234567890,
-                "owned_by": "vllm"
-            }
-            for model_id in model_ids
-        ]
+            {"id": model_id, "object": "model", "created": 1234567890, "owned_by": "vllm"} for model_id in model_ids
+        ],
     }


 def generate_random_text(length=50):
     """Generate random but coherent text for responses."""
     words = [
-        "Hello", "there", "I'm", "an", "AI", "assistant", "ready", "to", "help", "you",
-        "with", "your", "questions", "and", "tasks", "today", "Let", "me","know", "what",
-        "you'd", "like", "to", "discuss", "or", "explore", "together", "I", "can", "assist",
-        "with", "various", "topics", "including", "coding", "writing", "analysis", "and", "more"
+        "Hello",
+        "there",
+        "I'm",
+        "an",
+        "AI",
+        "assistant",
+        "ready",
+        "to",
+        "help",
+        "you",
+        "with",
+        "your",
+        "questions",
+        "and",
+        "tasks",
+        "today",
+        "Let",
+        "me",
+        "know",
+        "what",
+        "you'd",
+        "like",
+        "to",
+        "discuss",
+        "or",
+        "explore",
+        "together",
+        "I",
+        "can",
+        "assist",
+        "with",
+        "various",
+        "topics",
+        "including",
+        "coding",
+        "writing",
+        "analysis",
+        "and",
+        "more",
     ]
     return " ".join(random.choices(words, k=length))


-@app.route('/v1/models', methods=['GET'])
+@app.route("/v1/models", methods=["GET"])
 def list_models():
     models = get_models()
     print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}")
     return jsonify(models)


-@app.route('/v1/chat/completions', methods=['POST'])
+@app.route("/v1/chat/completions", methods=["POST"])
 def chat_completions():
     """Return OpenAI-formatted chat completion responses."""
     data = request.get_json()
-    default_model = get_models()['data'][0]['id']
-    model = data.get('model', default_model)
-    messages = data.get('messages', [])
-    stream = data.get('stream', False)
+    default_model = get_models()["data"][0]["id"]
+    model = data.get("model", default_model)
+    messages = data.get("messages", [])
+    stream = data.get("stream", False)

     print(f"[MOCK] Chat completion request - model: {model}, stream: {stream}")

@@ -71,11 +105,12 @@ def chat_completions():
     else:
         return handle_non_streaming_completion(model, messages)

+
 def handle_non_streaming_completion(model, messages):
     response_text = generate_random_text(random.randint(20, 80))

     # Calculate realistic token counts
-    prompt_tokens = sum(len(str(msg.get('content', '')).split()) for msg in messages)
+    prompt_tokens = sum(len(str(msg.get("content", "")).split()) for msg in messages)
     completion_tokens = len(response_text.split())

     response = {
@@ -83,25 +118,17 @@ def handle_non_streaming_completion(model, messages):
         "object": "chat.completion",
         "created": int(time.time()),
         "model": model,
-        "choices": [
-            {
-                "index": 0,
-                "message": {
-                    "role": "assistant",
-                    "content": response_text
-                },
-                "finish_reason": "stop"
-            }
-        ],
+        "choices": [{"index": 0, "message": {"role": "assistant", "content": response_text}, "finish_reason": "stop"}],
         "usage": {
             "prompt_tokens": prompt_tokens,
             "completion_tokens": completion_tokens,
-            "total_tokens": prompt_tokens + completion_tokens
-        }
+            "total_tokens": prompt_tokens + completion_tokens,
+        },
     }

     return jsonify(response)

+
 def handle_streaming_completion(model, messages):
     def generate_stream():
         # Generate response text
@@ -114,12 +141,7 @@ def handle_streaming_completion(model, messages):
             "object": "chat.completion.chunk",
             "created": int(time.time()),
             "model": model,
-            "choices": [
-                {
-                    "index": 0,
-                    "delta": {"role": "assistant", "content": ""}
-                }
-            ]
+            "choices": [{"index": 0, "delta": {"role": "assistant", "content": ""}}],
         }
         yield f"data: {json.dumps(initial_chunk)}\n\n"

@@ -130,12 +152,7 @@ def handle_streaming_completion(model, messages):
                 "object": "chat.completion.chunk",
                 "created": int(time.time()),
                 "model": model,
-                "choices": [
-                    {
-                        "index": 0,
-                        "delta": {"content": f"{word} " if i < len(words) - 1 else word}
-                    }
-                ]
+                "choices": [{"index": 0, "delta": {"content": f"{word} " if i < len(words) - 1 else word}}],
             }
             yield f"data: {json.dumps(chunk)}\n\n"
             # Configurable delay to simulate realistic streaming
@@ -148,35 +165,30 @@ def handle_streaming_completion(model, messages):
             "object": "chat.completion.chunk",
             "created": int(time.time()),
             "model": model,
-            "choices": [
-                {
-                    "index": 0,
-                    "delta": {"content": ""},
-                    "finish_reason": "stop"
-                }
-            ]
+            "choices": [{"index": 0, "delta": {"content": ""}, "finish_reason": "stop"}],
         }
         yield f"data: {json.dumps(final_chunk)}\n\n"
         yield "data: [DONE]\n\n"

     return Response(
         generate_stream(),
-        mimetype='text/event-stream',
+        mimetype="text/event-stream",
         headers={
-            'Cache-Control': 'no-cache',
-            'Connection': 'keep-alive',
-            'Access-Control-Allow-Origin': '*',
-        }
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "Access-Control-Allow-Origin": "*",
+        },
     )

-@app.route('/health', methods=['GET'])
+
+@app.route("/health", methods=["GET"])
 def health():
     return jsonify({"status": "healthy", "type": "openai-mock"})

-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='OpenAI-compatible mock server')
-    parser.add_argument('--port', type=int, default=8081,
-                        help='Port to run the server on (default: 8081)')
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="OpenAI-compatible mock server")
+    parser.add_argument("--port", type=int, default=8081, help="Port to run the server on (default: 8081)")
     args = parser.parse_args()
     port = args.port
@@ -187,4 +199,4 @@ if __name__ == '__main__':
     print("- OpenAI-formatted chat/completion responses with dynamic content")
     print("- Streaming support with valid SSE format")
     print(f"- Listening on: http://0.0.0.0:{port}")
-    app.run(host='0.0.0.0', port=port, debug=False)
+    app.run(host="0.0.0.0", port=port, debug=False)

View file

@@ -2,6 +2,7 @@ version: '2'
 image_name: kubernetes-benchmark-demo
 apis:
 - agents
+- files
 - inference
-- files
 - safety
@@ -20,6 +21,14 @@ providers:
   - provider_id: sentence-transformers
     provider_type: inline::sentence-transformers
     config: {}
+  files:
+  - provider_id: meta-reference-files
+    provider_type: inline::localfs
+    config:
+      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
+      metadata_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
   vector_io:
   - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
     provider_type: remote::chromadb

View file

@@ -1,5 +1,106 @@
@import url("theme.css");
/* Horizontal Navigation Bar */
.horizontal-nav {
background-color: #ffffff;
border-bottom: 1px solid #e5e5e5;
padding: 0;
position: fixed;
top: 0;
left: 0;
right: 0;
z-index: 1050;
height: 50px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
[data-theme="dark"] .horizontal-nav {
background-color: #1a1a1a;
border-bottom: 1px solid #333;
}
.horizontal-nav .nav-container {
max-width: 1200px;
margin: 0 auto;
display: flex;
align-items: center;
justify-content: space-between;
padding: 0 20px;
height: 100%;
}
.horizontal-nav .nav-brand {
font-size: 18px;
font-weight: 600;
color: #333;
text-decoration: none;
}
[data-theme="dark"] .horizontal-nav .nav-brand {
color: #fff;
}
.horizontal-nav .nav-links {
display: flex;
align-items: center;
gap: 30px;
list-style: none;
margin: 0;
padding: 0;
}
.horizontal-nav .nav-links a {
color: #666;
text-decoration: none;
font-size: 14px;
font-weight: 500;
padding: 8px 12px;
border-radius: 6px;
transition: all 0.2s ease;
}
.horizontal-nav .nav-links a:hover,
.horizontal-nav .nav-links a.active {
color: #333;
background-color: #f5f5f5;
}
.horizontal-nav .nav-links a.active {
font-weight: 600;
}
[data-theme="dark"] .horizontal-nav .nav-links a {
color: #ccc;
}
[data-theme="dark"] .horizontal-nav .nav-links a:hover,
[data-theme="dark"] .horizontal-nav .nav-links a.active {
color: #fff;
background-color: #333;
}
.horizontal-nav .nav-links .github-link {
display: flex;
align-items: center;
gap: 6px;
}
.horizontal-nav .nav-links .github-icon {
width: 16px;
height: 16px;
fill: currentColor;
}
/* Adjust main content to account for fixed nav */
.wy-nav-side {
top: 50px;
height: calc(100vh - 50px);
}
.wy-nav-content-wrap {
margin-top: 50px;
}
.wy-nav-content {
  max-width: 90%;
}

docs/_static/js/horizontal_nav.js (new file, 44 lines)
View file

@@ -0,0 +1,44 @@
// Horizontal Navigation Bar for Llama Stack Documentation
document.addEventListener('DOMContentLoaded', function() {
// Create the horizontal navigation HTML
const navHTML = `
<nav class="horizontal-nav">
<div class="nav-container">
<a href="/" class="nav-brand">Llama Stack</a>
<ul class="nav-links">
<li><a href="/">Docs</a></li>
<li><a href="/references/api_reference/">API Reference</a></li>
<li><a href="https://github.com/meta-llama/llama-stack" target="_blank" class="github-link">
<svg class="github-icon" viewBox="0 0 16 16" aria-hidden="true">
<path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0016 8c0-4.42-3.58-8-8-8z"/>
</svg>
GitHub
</a></li>
</ul>
</div>
</nav>
`;
// Insert the navigation at the beginning of the body
document.body.insertAdjacentHTML('afterbegin', navHTML);
// Update navigation links based on current page
updateActiveNav();
});
function updateActiveNav() {
const currentPath = window.location.pathname;
const navLinks = document.querySelectorAll('.horizontal-nav .nav-links a');
navLinks.forEach(link => {
// Remove any existing active classes
link.classList.remove('active');
// Add active class based on current path
if (currentPath === '/' && link.getAttribute('href') === '/') {
link.classList.add('active');
} else if (currentPath.includes('/references/api_reference/') && link.getAttribute('href').includes('api_reference')) {
link.classList.add('active');
}
});
}

View file

@@ -0,0 +1,701 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "1ztegmwm4sp",
"metadata": {},
"source": [
"## LlamaStack + LangChain Integration Tutorial\n",
"\n",
"This notebook demonstrates how to integrate **LlamaStack** with **LangChain** to build a complete RAG (Retrieval-Augmented Generation) system.\n",
"\n",
"### Overview\n",
"\n",
"- **LlamaStack**: Provides the infrastructure for running LLMs and Open AI Compatible Vector Stores\n",
"- **LangChain**: Provides the framework for chaining operations and prompt templates\n",
"- **Integration**: Uses LlamaStack's OpenAI-compatible API with LangChain\n",
"\n",
"### What You'll See\n",
"\n",
"1. Setting up LlamaStack server with Fireworks AI provider\n",
"2. Creating and Querying Vector Stores\n",
"3. Building RAG chains with LangChain + LLAMAStack\n",
"4. Querying the chain for relevant information\n",
"\n",
"### Prerequisites\n",
"\n",
"- Fireworks API key\n",
"\n",
"---\n",
"\n",
"### 1. Installation and Setup"
]
},
{
"cell_type": "markdown",
"id": "2ktr5ls2cas",
"metadata": {},
"source": [
"#### Install Required Dependencies\n",
"\n",
"First, we install all the necessary packages for LangChain and FastAPI integration."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "5b6a6a17-b931-4bea-8273-0d6e5563637a",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: uv in /Users/swapna942/miniconda3/lib/python3.12/site-packages (0.7.20)\n",
"\u001b[2mUsing Python 3.12.11 environment at: /Users/swapna942/miniconda3\u001b[0m\n",
"\u001b[2mAudited \u001b[1m7 packages\u001b[0m \u001b[2min 42ms\u001b[0m\u001b[0m\n"
]
}
],
"source": [
"!pip install uv\n",
"!uv pip install fastapi uvicorn \"langchain>=0.2\" langchain-openai \\\n",
" langchain-community langchain-text-splitters \\\n",
" faiss-cpu"
]
},
{
"cell_type": "markdown",
"id": "wmt9jvqzh7n",
"metadata": {},
"source": [
"### 2. LlamaStack Server Setup\n",
"\n",
"#### Build and Start LlamaStack Server\n",
"\n",
"This section sets up the LlamaStack server with:\n",
"- **Fireworks AI** as the inference provider\n",
"- **Sentence Transformers** for embeddings\n",
"\n",
"The server runs on `localhost:8321` and provides OpenAI-compatible endpoints."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "dd2dacf3-ec8b-4cc7-8ff4-b5b6ea4a6e9e",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import os\n",
"import subprocess\n",
"import time\n",
"\n",
"# Remove UV_SYSTEM_PYTHON to ensure uv creates a proper virtual environment\n",
"# instead of trying to use system Python globally, which could cause permission issues\n",
"# and package conflicts with the system's Python installation\n",
"if \"UV_SYSTEM_PYTHON\" in os.environ:\n",
" del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
"\n",
"def run_llama_stack_server_background():\n",
" \"\"\"Build and run LlamaStack server in one step using --run flag\"\"\"\n",
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
" process = subprocess.Popen(\n",
" \"uv run --with llama-stack llama stack build --distro starter --image-type venv --run\",\n",
" shell=True,\n",
" stdout=log_file,\n",
" stderr=log_file,\n",
" text=True,\n",
" )\n",
"\n",
" print(f\"Building and starting Llama Stack server with PID: {process.pid}\")\n",
" return process\n",
"\n",
"\n",
"def wait_for_server_to_start():\n",
" import requests\n",
" from requests.exceptions import ConnectionError\n",
"\n",
" url = \"http://0.0.0.0:8321/v1/health\"\n",
" max_retries = 30\n",
" retry_interval = 1\n",
"\n",
" print(\"Waiting for server to start\", end=\"\")\n",
" for _ in range(max_retries):\n",
" try:\n",
" response = requests.get(url)\n",
" if response.status_code == 200:\n",
" print(\"\\nServer is ready!\")\n",
" return True\n",
" except ConnectionError:\n",
" print(\".\", end=\"\", flush=True)\n",
" time.sleep(retry_interval)\n",
"\n",
" print(\"\\nServer failed to start after\", max_retries * retry_interval, \"seconds\")\n",
" return False\n",
"\n",
"\n",
"def kill_llama_stack_server():\n",
" # Kill any existing llama stack server processes using pkill command\n",
" os.system(\"pkill -f llama_stack.core.server.server\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "28bd8dbd-4576-4e76-813f-21ab94db44a2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Building and starting Llama Stack server with PID: 19747\n",
"Waiting for server to start....\n",
"Server is ready!\n"
]
}
],
"source": [
"server_process = run_llama_stack_server_background()\n",
"assert wait_for_server_to_start()"
]
},
{
"cell_type": "markdown",
"id": "gr9cdcg4r7n",
"metadata": {},
"source": [
"#### Install LlamaStack Client\n",
"\n",
"Install the client library to interact with the LlamaStack server."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "487d2dbc-d071-400e-b4f0-dcee58f8dc95",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2mUsing Python 3.12.11 environment at: /Users/swapna942/miniconda3\u001b[0m\n",
"\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 27ms\u001b[0m\u001b[0m\n"
]
}
],
"source": [
"!uv pip install llama_stack_client"
]
},
{
"cell_type": "markdown",
"id": "0j5hag7l9x89",
"metadata": {},
"source": [
"### 3. Initialize LlamaStack Client\n",
"\n",
"Create a client connection to the LlamaStack server with API keys for different providers:\n",
"\n",
"- **Fireworks API Key**: For Fireworks models\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ab4eff97-4565-4c73-b1b3-0020a4c7e2a5",
"metadata": {},
"outputs": [],
"source": [
"from llama_stack_client import LlamaStackClient\n",
"\n",
"client = LlamaStackClient(\n",
" base_url=\"http://0.0.0.0:8321\",\n",
" provider_data={\"fireworks_api_key\": \"***\"},\n",
")"
]
},
{
"cell_type": "markdown",
"id": "vwhexjy1e8o",
"metadata": {},
"source": [
"#### Explore Available Models and Safety Features\n",
"\n",
"Check what models and safety shields are available through your LlamaStack instance."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "880443ef-ac3c-48b1-a80a-7dab5b25ac61",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:httpx:HTTP Request: GET http://0.0.0.0:8321/v1/models \"HTTP/1.1 200 OK\"\n",
"INFO:httpx:HTTP Request: GET http://0.0.0.0:8321/v1/shields \"HTTP/1.1 200 OK\"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Available Fireworks models:\n",
"- fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct\n",
"- fireworks/accounts/fireworks/models/llama-v3p1-70b-instruct\n",
"- fireworks/accounts/fireworks/models/llama-v3p1-405b-instruct\n",
"- fireworks/accounts/fireworks/models/llama-v3p2-3b-instruct\n",
"- fireworks/accounts/fireworks/models/llama-v3p2-11b-vision-instruct\n",
"- fireworks/accounts/fireworks/models/llama-v3p2-90b-vision-instruct\n",
"- fireworks/accounts/fireworks/models/llama-v3p3-70b-instruct\n",
"- fireworks/accounts/fireworks/models/llama4-scout-instruct-basic\n",
"- fireworks/accounts/fireworks/models/llama4-maverick-instruct-basic\n",
"- fireworks/nomic-ai/nomic-embed-text-v1.5\n",
"- fireworks/accounts/fireworks/models/llama-guard-3-8b\n",
"- fireworks/accounts/fireworks/models/llama-guard-3-11b-vision\n",
"----\n",
"Available shields (safety models):\n",
"code-scanner\n",
"llama-guard\n",
"nemo-guardrail\n",
"----\n"
]
}
],
"source": [
"print(\"Available Fireworks models:\")\n",
"for m in client.models.list():\n",
" if m.identifier.startswith(\"fireworks/\"):\n",
" print(f\"- {m.identifier}\")\n",
"\n",
"print(\"----\")\n",
"print(\"Available shields (safety models):\")\n",
"for s in client.shields.list():\n",
" print(s.identifier)\n",
"print(\"----\")"
]
},
{
"cell_type": "markdown",
"id": "gojp7at31ht",
"metadata": {},
"source": [
"### 4. Vector Store Setup\n",
"\n",
"#### Create a Vector Store with File Upload\n",
"\n",
"Create a vector store using the OpenAI-compatible vector stores API:\n",
"\n",
"- **Vector Store**: OpenAI-compatible vector store for document storage\n",
"- **File Upload**: Automatic chunking and embedding of uploaded files \n",
"- **Embedding Model**: Sentence Transformers model for text embeddings\n",
"- **Dimensions**: 384-dimensional embeddings"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "be2c2899-ea53-4e5f-b6b8-ed425f5d6572",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/files \"HTTP/1.1 200 OK\"\n",
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/files \"HTTP/1.1 200 OK\"\n",
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/files \"HTTP/1.1 200 OK\"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File(id='file-54652c95c56c4c34918a97d7ff8a4320', bytes=41, created_at=1757442621, expires_at=1788978621, filename='shipping_policy.txt', object='file', purpose='assistants')\n",
"File(id='file-fb1227c1d1854da1bd774d21e5b7e41c', bytes=48, created_at=1757442621, expires_at=1788978621, filename='returns_policy.txt', object='file', purpose='assistants')\n",
"File(id='file-673f874852fe42798675a13d06a256e2', bytes=45, created_at=1757442621, expires_at=1788978621, filename='support.txt', object='file', purpose='assistants')\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/vector_stores \"HTTP/1.1 200 OK\"\n"
]
}
],
"source": [
"from io import BytesIO\n",
"\n",
"docs = [\n",
" (\"Acme ships globally in 3-5 business days.\", {\"title\": \"Shipping Policy\"}),\n",
" (\"Returns are accepted within 30 days of purchase.\", {\"title\": \"Returns Policy\"}),\n",
" (\"Support is available 24/7 via chat and email.\", {\"title\": \"Support\"}),\n",
"]\n",
"\n",
"file_ids = []\n",
"for content, metadata in docs:\n",
" with BytesIO(content.encode()) as file_buffer:\n",
" file_buffer.name = f\"{metadata['title'].replace(' ', '_').lower()}.txt\"\n",
" create_file_response = client.files.create(file=file_buffer, purpose=\"assistants\")\n",
" print(create_file_response)\n",
" file_ids.append(create_file_response.id)\n",
"\n",
"# Create vector store with files\n",
"vector_store = client.vector_stores.create(\n",
" name=\"acme_docs\",\n",
" file_ids=file_ids,\n",
" embedding_model=\"sentence-transformers/all-MiniLM-L6-v2\",\n",
" embedding_dimension=384,\n",
" provider_id=\"faiss\"\n",
")"
]
},
{
"cell_type": "markdown",
"id": "9061tmi1zpq",
"metadata": {},
"source": [
"#### Test Vector Store Search\n",
"\n",
"Query the vector store. This performs semantic search to find relevant documents based on the query."
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "ba9d1901-bd5e-4216-b3e6-19dc74551cc6",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/vector_stores/vs_708c060b-45da-423e-8354-68529b4fd1a6/search \"HTTP/1.1 200 OK\"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Acme ships globally in 3-5 business days.\n",
"Returns are accepted within 30 days of purchase.\n"
]
}
],
"source": [
"search_response = client.vector_stores.search(\n",
" vector_store_id=vector_store.id,\n",
" query=\"How long does shipping take?\",\n",
" max_num_results=2\n",
")\n",
"for result in search_response.data:\n",
" content = result.content[0].text\n",
" print(content)"
]
},
{
"cell_type": "markdown",
"id": "usne6mbspms",
"metadata": {},
"source": [
"### 5. LangChain Integration\n",
"\n",
"#### Configure LangChain with LlamaStack\n",
"\n",
"Set up LangChain to use LlamaStack's OpenAI-compatible API:\n",
"\n",
"- **Base URL**: Points to LlamaStack's OpenAI endpoint\n",
"- **Headers**: Include Fireworks API key for model access\n",
"- **Model**: Use Meta Llama v3p1 8b instruct model for inference"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "c378bd10-09c2-417c-bdfc-1e0a2dd19084",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"from langchain_openai import ChatOpenAI\n",
"\n",
"# Point LangChain to Llamastack Server\n",
"llm = ChatOpenAI(\n",
" base_url=\"http://0.0.0.0:8321/v1/openai/v1\",\n",
" api_key=\"dummy\",\n",
" model=\"fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct\",\n",
" default_headers={\"X-LlamaStack-Provider-Data\": '{\"fireworks_api_key\": \"***\"}'},\n",
")"
]
},
{
"cell_type": "markdown",
"id": "5a4ddpcuk3l",
"metadata": {},
"source": [
"#### Test LLM Connection\n",
"\n",
"Verify that LangChain can successfully communicate with the LlamaStack server."
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "f88ffb5a-657b-4916-9375-c6ddc156c25e",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n"
]
},
{
"data": {
"text/plain": [
"AIMessage(content=\"A llama's gentle eyes shine bright,\\nIn the Andes, it roams through morning light.\", additional_kwargs={'refusal': None}, response_metadata={'token_usage': None, 'model_name': 'fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct', 'system_fingerprint': None, 'id': 'chatcmpl-602b5967-82a3-476b-9cd2-7d3b29b76ee8', 'service_tier': None, 'finish_reason': 'stop', 'logprobs': None}, id='run--0933c465-ff4d-4a7b-b7fb-fd97dd8244f3-0')"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Test llm with simple message\n",
"messages = [\n",
" {\"role\": \"system\", \"content\": \"You are a friendly assistant.\"},\n",
" {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"},\n",
"]\n",
"llm.invoke(messages)"
]
},
{
"cell_type": "markdown",
"id": "0xh0jg6a0l4a",
"metadata": {},
"source": [
"### 6. Building the RAG Chain\n",
"\n",
"#### Create a Complete RAG Pipeline\n",
"\n",
"Build a LangChain pipeline that combines:\n",
"\n",
"1. **Vector Search**: Query LlamaStack's Open AI compatible Vector Store\n",
"2. **Context Assembly**: Format retrieved documents\n",
"3. **Prompt Template**: Structure the input for the LLM\n",
"4. **LLM Generation**: Generate answers using context\n",
"5. **Output Parsing**: Extract the final response\n",
"\n",
"**Chain Flow**: `Query → Vector Search → Context + Question → LLM → Response`"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "9684427d-dcc7-4544-9af5-8b110d014c42",
"metadata": {},
"outputs": [],
"source": [
"# LangChain for prompt template and chaining + LLAMA Stack Client Vector DB and LLM chat completion\n",
"from langchain_core.output_parsers import StrOutputParser\n",
"from langchain_core.prompts import ChatPromptTemplate\n",
"from langchain_core.runnables import RunnableLambda, RunnablePassthrough\n",
"\n",
"\n",
"def join_docs(docs):\n",
" return \"\\n\\n\".join([f\"[{d.filename}] {d.content[0].text}\" for d in docs.data])\n",
"\n",
"PROMPT = ChatPromptTemplate.from_messages(\n",
" [\n",
" (\"system\", \"You are a helpful assistant. Use the following context to answer.\"),\n",
" (\"user\", \"Question: {question}\\n\\nContext:\\n{context}\"),\n",
" ]\n",
")\n",
"\n",
"vector_step = RunnableLambda(\n",
" lambda x: client.vector_stores.search(\n",
" vector_store_id=vector_store.id,\n",
" query=x,\n",
" max_num_results=2\n",
" )\n",
" )\n",
"\n",
"chain = (\n",
" {\"context\": vector_step | RunnableLambda(join_docs), \"question\": RunnablePassthrough()}\n",
" | PROMPT\n",
" | llm\n",
" | StrOutputParser()\n",
")"
]
},
{
"cell_type": "markdown",
"id": "0onu6rhphlra",
"metadata": {},
"source": [
"### 7. Testing the RAG System\n",
"\n",
"#### Example 1: Shipping Query"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "03322188-9509-446a-a4a8-ce3bb83ec87c",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/vector_stores/vs_708c060b-45da-423e-8354-68529b4fd1a6/search \"HTTP/1.1 200 OK\"\n",
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"❓ How long does shipping take?\n",
"💡 Acme ships globally in 3-5 business days. This means that shipping typically takes between 3 to 5 working days from the date of dispatch or order fulfillment.\n"
]
}
],
"source": [
"query = \"How long does shipping take?\"\n",
"response = chain.invoke(query)\n",
"print(\"❓\", query)\n",
"print(\"💡\", response)"
]
},
{
"cell_type": "markdown",
"id": "b7krhqj88ku",
"metadata": {},
"source": [
"#### Example 2: Returns Policy Query"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "61995550-bb0b-46a8-a5d0-023207475d60",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/vector_stores/vs_708c060b-45da-423e-8354-68529b4fd1a6/search \"HTTP/1.1 200 OK\"\n",
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"❓ Can I return a product after 40 days?\n",
"💡 Based on the provided context, you cannot return a product after 40 days. The return window is limited to 30 days from the date of purchase.\n"
]
}
],
"source": [
"query = \"Can I return a product after 40 days?\"\n",
"response = chain.invoke(query)\n",
"print(\"❓\", query)\n",
"print(\"💡\", response)"
]
},
{
"cell_type": "markdown",
"id": "h4w24fadvjs",
"metadata": {},
"source": [
"---\n",
"We have successfully built a RAG system that combines:\n",
"\n",
"- **LlamaStack** for infrastructure (LLM serving + Vector Store)\n",
"- **LangChain** for orchestration (prompts + chains)\n",
"- **Fireworks** for high-quality language models\n",
"\n",
"### Key Benefits\n",
"\n",
"1. **Unified Infrastructure**: Single server for LLMs and Vector Store\n",
"2. **OpenAI Compatibility**: Easy integration with existing LangChain code\n",
"3. **Multi-Provider Support**: Switch between different LLM providers\n",
"4. **Production Ready**: Built-in safety shields and monitoring\n",
"\n",
"### Next Steps\n",
"\n",
"- Add more sophisticated document processing\n",
"- Implement conversation memory\n",
"- Add safety filtering and monitoring\n",
"- Scale to larger document collections\n",
"- Integrate with web frameworks like FastAPI or Streamlit\n",
"\n",
"---\n",
"\n",
"##### 🔧 Cleanup\n",
"\n",
"Don't forget to stop the LlamaStack server when you're done:\n",
"\n",
"```python\n",
"kill_llama_stack_server()\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "15647c46-22ce-4698-af3f-8161329d8e3a",
"metadata": {},
"outputs": [],
"source": [
"kill_llama_stack_server()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View file

@@ -93,10 +93,31 @@ chunks_response = client.vector_io.query(
 ### Using the RAG Tool

+> **⚠️ DEPRECATION NOTICE**: The RAG Tool is being deprecated in favor of directly using the OpenAI-compatible Search
+> API. We recommend migrating to the OpenAI APIs for better compatibility and future support.
+
 A better way to ingest documents is to use the RAG Tool. This tool allows you to ingest documents from URLs, files, etc.
 and automatically chunks them into smaller pieces. More examples for how to format a RAGDocument can be found in the
 [appendix](#more-ragdocument-examples).

+#### OpenAI API Integration & Migration
+
+The RAG tool has been updated to use OpenAI-compatible APIs. This provides several benefits:
+
+- **Files API Integration**: Documents are now uploaded using OpenAI's file upload endpoints
+- **Vector Stores API**: Vector storage operations use OpenAI's vector store format with configurable chunking strategies
+- **Error Resilience**: When processing multiple documents, individual failures are logged but don't crash the operation. Failed documents are skipped while successful ones continue processing.
+
+**Migration Path:**
+We recommend migrating to the OpenAI-compatible Search API for:
+1. **Better OpenAI Ecosystem Integration**: Direct compatibility with OpenAI tools and workflows including the Responses API
+2. **Future-Proof**: Continued support and feature development
+3. **Full OpenAI Compatibility**: Vector Stores, Files, and Search APIs are fully compatible with OpenAI's Responses API
+
+The OpenAI APIs are used under the hood, so you can continue to use your existing RAG Tool code with minimal changes.
+However, we recommend updating your code to use the new OpenAI-compatible APIs for better long-term support. If any
+documents fail to process, they will be logged in the response but will not cause the entire operation to fail.
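For illustration, a minimal sketch of this OpenAI-compatible path, patterned on the Files and Vector Stores calls used in the LangChain notebook added in this same commit; the document contents, store name, embedding settings, and query below are placeholders:

```python
# Sketch of the OpenAI-compatible ingestion path (Files + Vector Stores APIs),
# mirroring the notebook elsewhere in this commit. Values are illustrative only.
from io import BytesIO

from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Upload a document through the Files API.
with BytesIO(b"Acme ships globally in 3-5 business days.") as buf:
    buf.name = "shipping_policy.txt"
    uploaded = client.files.create(file=buf, purpose="assistants")

# Create an OpenAI-compatible vector store from the uploaded file;
# chunking and embedding happen on the server side.
store = client.vector_stores.create(
    name="acme_docs",
    file_ids=[uploaded.id],
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",
    embedding_dimension=384,
)

# Semantic search against the store.
results = client.vector_stores.search(
    vector_store_id=store.id,
    query="How long does shipping take?",
    max_num_results=2,
)
for hit in results.data:
    print(hit.content[0].text)
```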
 ```python
 from llama_stack_client import RAGDocument

View file

@@ -131,6 +131,7 @@ html_static_path = ["../_static"]
 def setup(app):
     app.add_css_file("css/my_theme.css")
     app.add_js_file("js/detect_theme.js")
+    app.add_js_file("js/horizontal_nav.js")
     app.add_js_file("js/keyboard_shortcuts.js")


 def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]):

View file

@@ -35,5 +35,5 @@ testing/record-replay
 ### Benchmarking

-```{include} ../../../docs/source/distributions/k8s-benchmark/README.md
+```{include} ../../../benchmarking/k8s-benchmark/README.md
 ```

View file

@@ -18,6 +18,7 @@ This section contains documentation for all available providers for the **inference** API.
 inline_meta-reference
 inline_sentence-transformers
 remote_anthropic
+remote_azure
 remote_bedrock
 remote_cerebras
 remote_databricks

View file

@@ -0,0 +1,29 @@
# remote::azure
## Description
Azure OpenAI inference provider for accessing GPT models and other Azure services.
Provider documentation
https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `api_key` | `<class 'pydantic.types.SecretStr'>` | No | | Azure API key for Azure |
| `api_base` | `<class 'pydantic.networks.HttpUrl'>` | No | | Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com) |
| `api_version` | `str \| None` | No | | Azure API version for Azure (e.g., 2024-12-01-preview) |
| `api_type` | `str \| None` | No | azure | Azure API type for Azure (e.g., azure) |
## Sample Configuration
```yaml
api_key: ${env.AZURE_API_KEY:=}
api_base: ${env.AZURE_API_BASE:=}
api_version: ${env.AZURE_API_VERSION:=}
api_type: ${env.AZURE_API_TYPE:=}
```
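As a usage sketch, the starter distribution wires this provider into its run.yaml roughly as follows, enabling it only when `AZURE_API_KEY` is set; this mirrors the run.yaml changes made elsewhere in this commit:

```yaml
providers:
  inference:
  - provider_id: ${env.AZURE_API_KEY:+azure}
    provider_type: remote::azure
    config:
      api_key: ${env.AZURE_API_KEY:=}
      api_base: ${env.AZURE_API_BASE:=}
      api_version: ${env.AZURE_API_VERSION:=}
      api_type: ${env.AZURE_API_TYPE:=}
```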

View file

@@ -48,15 +48,12 @@ def setup_verify_download_parser(parser: argparse.ArgumentParser) -> None:
     parser.set_defaults(func=partial(run_verify_cmd, parser=parser))


-def calculate_md5(filepath: Path, chunk_size: int = 8192) -> str:
-    # NOTE: MD5 is used here only for download integrity verification,
-    # not for security purposes
-    # TODO: switch to SHA256
-    md5_hash = hashlib.md5(usedforsecurity=False)
+def calculate_sha256(filepath: Path, chunk_size: int = 8192) -> str:
+    sha256_hash = hashlib.sha256()
     with open(filepath, "rb") as f:
         for chunk in iter(lambda: f.read(chunk_size), b""):
-            md5_hash.update(chunk)
-    return md5_hash.hexdigest()
+            sha256_hash.update(chunk)
+    return sha256_hash.hexdigest()


 def load_checksums(checklist_path: Path) -> dict[str, str]:
@@ -64,10 +61,10 @@ def load_checksums(checklist_path: Path) -> dict[str, str]:
     with open(checklist_path) as f:
         for line in f:
             if line.strip():
-                md5sum, filepath = line.strip().split(" ", 1)
+                sha256sum, filepath = line.strip().split(" ", 1)
                 # Remove leading './' if present
                 filepath = filepath.lstrip("./")
-                checksums[filepath] = md5sum
+                checksums[filepath] = sha256sum

     return checksums

@@ -88,7 +85,7 @@ def verify_files(model_dir: Path, checksums: dict[str, str], console: Console) -
         matches = False

         if exists:
-            actual_hash = calculate_md5(full_path)
+            actual_hash = calculate_sha256(full_path)
             matches = actual_hash == expected_hash

         results.append(

View file

@@ -431,6 +431,12 @@ class ServerConfig(BaseModel):
     )


+class InferenceStoreConfig(BaseModel):
+    sql_store_config: SqlStoreConfig
+    max_write_queue_size: int = Field(default=10000, description="Max queued writes for inference store")
+    num_writers: int = Field(default=4, description="Number of concurrent background writers")
+
+
 class StackRunConfig(BaseModel):
     version: int = LLAMA_STACK_RUN_CONFIG_VERSION

@@ -464,11 +470,12 @@ Configuration for the persistence store used by the distribution registry. If not specified,
 a default SQLite store will be used.""",
     )
-    inference_store: SqlStoreConfig | None = Field(
+    inference_store: InferenceStoreConfig | SqlStoreConfig | None = Field(
         default=None,
         description="""
-Configuration for the persistence store used by the inference API. If not specified,
-a default SQLite store will be used.""",
+Configuration for the persistence store used by the inference API. Can be either a
+InferenceStoreConfig (with queue tuning parameters) or a SqlStoreConfig (deprecated).
+If not specified, a default SQLite store will be used.""",
     )

     # registry of "resources" in the distribution
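For reference, a hedged sketch of how the richer `inference_store` block might look in a run.yaml, using the field names introduced above and the SQLite store pattern used elsewhere in this commit; the nested serialization of `sql_store_config` and the db_path are assumptions:

```yaml
# Sketch only: inference_store configured via InferenceStoreConfig rather than a bare SqlStoreConfig.
inference_store:
  sql_store_config:            # assumed to serialize like other sqlite stores in this commit
    type: sqlite
    db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/inference_store.db  # placeholder path
  max_write_queue_size: 10000  # default: max queued writes for the inference store
  num_writers: 4               # default: concurrent background writers
```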

View file

@@ -78,7 +78,10 @@ async def get_auto_router_impl(
     # TODO: move pass configs to routers instead
     if api == Api.inference and run_config.inference_store:
-        inference_store = InferenceStore(run_config.inference_store, policy)
+        inference_store = InferenceStore(
+            config=run_config.inference_store,
+            policy=policy,
+        )
         await inference_store.initialize()
         api_to_dep_impl["store"] = inference_store

View file

@@ -63,7 +63,7 @@ from llama_stack.models.llama.llama3.chat_format import ChatFormat
 from llama_stack.models.llama.llama3.tokenizer import Tokenizer
 from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable
 from llama_stack.providers.utils.inference.inference_store import InferenceStore
-from llama_stack.providers.utils.telemetry.tracing import get_current_span
+from llama_stack.providers.utils.telemetry.tracing import enqueue_event, get_current_span

 logger = get_logger(name=__name__, category="core::routers")

@@ -90,6 +90,11 @@ class InferenceRouter(Inference):
     async def shutdown(self) -> None:
         logger.debug("InferenceRouter.shutdown")
+        if self.store:
+            try:
+                await self.store.shutdown()
+            except Exception as e:
+                logger.warning(f"Error during InferenceStore shutdown: {e}")

     async def register_model(
         self,
@@ -160,7 +165,7 @@ class InferenceRouter(Inference):
         metrics = self._construct_metrics(prompt_tokens, completion_tokens, total_tokens, model)
         if self.telemetry:
             for metric in metrics:
-                await self.telemetry.log_event(metric)
+                enqueue_event(metric)
         return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics]

     async def _count_tokens(
@@ -431,7 +436,7 @@ class InferenceRouter(Inference):
             model=model_obj,
         )
         for metric in metrics:
-            await self.telemetry.log_event(metric)
+            enqueue_event(metric)

         # these metrics will show up in the client response.
         response.metrics = (
@@ -537,7 +542,7 @@
             model=model_obj,
         )
         for metric in metrics:
-            await self.telemetry.log_event(metric)
+            enqueue_event(metric)

         # these metrics will show up in the client response.
         response.metrics = (
             metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics
@@ -664,7 +669,7 @@
                 "completion_tokens",
                 "total_tokens",
             ]:  # Only log completion and total tokens
-                await self.telemetry.log_event(metric)
+                enqueue_event(metric)

         # Return metrics in response
         async_metrics = [
@@ -710,7 +715,7 @@
         )
         for metric in completion_metrics:
             if metric.metric in ["completion_tokens", "total_tokens"]:  # Only log completion and total tokens
-                await self.telemetry.log_event(metric)
+                enqueue_event(metric)

         # Return metrics in response
         return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics]
@@ -806,7 +811,7 @@
                     model=model,
                 )
                 for metric in metrics:
-                    await self.telemetry.log_event(metric)
+                    enqueue_event(metric)

                 yield chunk
             finally:

View file

@ -17,6 +17,7 @@ distribution_spec:
- provider_type: remote::vertexai - provider_type: remote::vertexai
- provider_type: remote::groq - provider_type: remote::groq
- provider_type: remote::sambanova - provider_type: remote::sambanova
- provider_type: remote::azure
- provider_type: inline::sentence-transformers - provider_type: inline::sentence-transformers
vector_io: vector_io:
- provider_type: inline::faiss - provider_type: inline::faiss

View file

@ -81,6 +81,13 @@ providers:
config: config:
url: https://api.sambanova.ai/v1 url: https://api.sambanova.ai/v1
api_key: ${env.SAMBANOVA_API_KEY:=} api_key: ${env.SAMBANOVA_API_KEY:=}
- provider_id: ${env.AZURE_API_KEY:+azure}
provider_type: remote::azure
config:
api_key: ${env.AZURE_API_KEY:=}
api_base: ${env.AZURE_API_BASE:=}
api_version: ${env.AZURE_API_VERSION:=}
api_type: ${env.AZURE_API_TYPE:=}
- provider_id: sentence-transformers - provider_id: sentence-transformers
provider_type: inline::sentence-transformers provider_type: inline::sentence-transformers
vector_io: vector_io:

View file

@ -18,6 +18,7 @@ distribution_spec:
- provider_type: remote::vertexai - provider_type: remote::vertexai
- provider_type: remote::groq - provider_type: remote::groq
- provider_type: remote::sambanova - provider_type: remote::sambanova
- provider_type: remote::azure
- provider_type: inline::sentence-transformers - provider_type: inline::sentence-transformers
vector_io: vector_io:
- provider_type: inline::faiss - provider_type: inline::faiss

View file

@ -81,6 +81,13 @@ providers:
config: config:
url: https://api.sambanova.ai/v1 url: https://api.sambanova.ai/v1
api_key: ${env.SAMBANOVA_API_KEY:=} api_key: ${env.SAMBANOVA_API_KEY:=}
- provider_id: ${env.AZURE_API_KEY:+azure}
provider_type: remote::azure
config:
api_key: ${env.AZURE_API_KEY:=}
api_base: ${env.AZURE_API_BASE:=}
api_version: ${env.AZURE_API_VERSION:=}
api_type: ${env.AZURE_API_TYPE:=}
- provider_id: sentence-transformers - provider_id: sentence-transformers
provider_type: inline::sentence-transformers provider_type: inline::sentence-transformers
vector_io: vector_io:

View file

@ -18,6 +18,7 @@ distribution_spec:
- provider_type: remote::vertexai - provider_type: remote::vertexai
- provider_type: remote::groq - provider_type: remote::groq
- provider_type: remote::sambanova - provider_type: remote::sambanova
- provider_type: remote::azure
- provider_type: inline::sentence-transformers - provider_type: inline::sentence-transformers
vector_io: vector_io:
- provider_type: inline::faiss - provider_type: inline::faiss

View file

@ -81,6 +81,13 @@ providers:
config: config:
url: https://api.sambanova.ai/v1 url: https://api.sambanova.ai/v1
api_key: ${env.SAMBANOVA_API_KEY:=} api_key: ${env.SAMBANOVA_API_KEY:=}
- provider_id: ${env.AZURE_API_KEY:+azure}
provider_type: remote::azure
config:
api_key: ${env.AZURE_API_KEY:=}
api_base: ${env.AZURE_API_BASE:=}
api_version: ${env.AZURE_API_VERSION:=}
api_type: ${env.AZURE_API_TYPE:=}
- provider_id: sentence-transformers - provider_id: sentence-transformers
provider_type: inline::sentence-transformers provider_type: inline::sentence-transformers
vector_io: vector_io:

View file

@ -59,6 +59,7 @@ ENABLED_INFERENCE_PROVIDERS = [
"cerebras", "cerebras",
"nvidia", "nvidia",
"bedrock", "bedrock",
"azure",
] ]
INFERENCE_PROVIDER_IDS = { INFERENCE_PROVIDER_IDS = {
@ -68,6 +69,7 @@ INFERENCE_PROVIDER_IDS = {
"cerebras": "${env.CEREBRAS_API_KEY:+cerebras}", "cerebras": "${env.CEREBRAS_API_KEY:+cerebras}",
"nvidia": "${env.NVIDIA_API_KEY:+nvidia}", "nvidia": "${env.NVIDIA_API_KEY:+nvidia}",
"vertexai": "${env.VERTEX_AI_PROJECT:+vertexai}", "vertexai": "${env.VERTEX_AI_PROJECT:+vertexai}",
"azure": "${env.AZURE_API_KEY:+azure}",
} }
@ -277,5 +279,21 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate:
"http://localhost:11434", "http://localhost:11434",
"Ollama URL", "Ollama URL",
), ),
"AZURE_API_KEY": (
"",
"Azure API Key",
),
"AZURE_API_BASE": (
"",
"Azure API Base",
),
"AZURE_API_VERSION": (
"",
"Azure API Version",
),
"AZURE_API_TYPE": (
"azure",
"Azure API Type",
),
}, },
) )

View file

@ -8,7 +8,7 @@
from jinja2 import Template from jinja2 import Template
from llama_stack.apis.common.content_types import InterleavedContent from llama_stack.apis.common.content_types import InterleavedContent
from llama_stack.apis.inference import UserMessage from llama_stack.apis.inference import OpenAIUserMessageParam
from llama_stack.apis.tools.rag_tool import ( from llama_stack.apis.tools.rag_tool import (
DefaultRAGQueryGeneratorConfig, DefaultRAGQueryGeneratorConfig,
LLMRAGQueryGeneratorConfig, LLMRAGQueryGeneratorConfig,
@ -61,16 +61,16 @@ async def llm_rag_query_generator(
messages = [interleaved_content_as_str(content)] messages = [interleaved_content_as_str(content)]
template = Template(config.template) template = Template(config.template)
content = template.render({"messages": messages}) rendered_content: str = template.render({"messages": messages})
model = config.model model = config.model
message = UserMessage(content=content) message = OpenAIUserMessageParam(content=rendered_content)
response = await inference_api.chat_completion( response = await inference_api.openai_chat_completion(
model_id=model, model=model,
messages=[message], messages=[message],
stream=False, stream=False,
) )
query = response.completion_message.content query = response.choices[0].message.content
return query return query

View file

@ -45,10 +45,7 @@ from llama_stack.apis.vector_io import (
from llama_stack.log import get_logger from llama_stack.log import get_logger
from llama_stack.providers.datatypes import ToolGroupsProtocolPrivate from llama_stack.providers.datatypes import ToolGroupsProtocolPrivate
from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
from llama_stack.providers.utils.memory.vector_store import ( from llama_stack.providers.utils.memory.vector_store import parse_data_url
content_from_doc,
parse_data_url,
)
from .config import RagToolRuntimeConfig from .config import RagToolRuntimeConfig
from .context_retriever import generate_rag_query from .context_retriever import generate_rag_query
@ -60,6 +57,47 @@ def make_random_string(length: int = 8):
return "".join(secrets.choice(string.ascii_letters + string.digits) for _ in range(length)) return "".join(secrets.choice(string.ascii_letters + string.digits) for _ in range(length))
async def raw_data_from_doc(doc: RAGDocument) -> tuple[bytes, str]:
"""Get raw binary data and mime type from a RAGDocument for file upload."""
if isinstance(doc.content, URL):
if doc.content.uri.startswith("data:"):
parts = parse_data_url(doc.content.uri)
mime_type = parts["mimetype"]
data = parts["data"]
if parts["is_base64"]:
file_data = base64.b64decode(data)
else:
file_data = data.encode("utf-8")
return file_data, mime_type
else:
async with httpx.AsyncClient() as client:
r = await client.get(doc.content.uri)
r.raise_for_status()
mime_type = r.headers.get("content-type", "application/octet-stream")
return r.content, mime_type
else:
if isinstance(doc.content, str):
content_str = doc.content
else:
content_str = interleaved_content_as_str(doc.content)
if content_str.startswith("data:"):
parts = parse_data_url(content_str)
mime_type = parts["mimetype"]
data = parts["data"]
if parts["is_base64"]:
file_data = base64.b64decode(data)
else:
file_data = data.encode("utf-8")
return file_data, mime_type
else:
return content_str.encode("utf-8"), "text/plain"
class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRuntime): class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRuntime):
def __init__( def __init__(
self, self,
@ -95,20 +133,12 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
return return
for doc in documents: for doc in documents:
if isinstance(doc.content, URL): try:
if doc.content.uri.startswith("data:"): try:
parts = parse_data_url(doc.content.uri) file_data, mime_type = await raw_data_from_doc(doc)
file_data = base64.b64decode(parts["data"]) if parts["is_base64"] else parts["data"].encode() except Exception as e:
mime_type = parts["mimetype"] log.error(f"Failed to extract content from document {doc.document_id}: {e}")
else: continue
async with httpx.AsyncClient() as client:
response = await client.get(doc.content.uri)
file_data = response.content
mime_type = doc.mime_type or response.headers.get("content-type", "application/octet-stream")
else:
content_str = await content_from_doc(doc)
file_data = content_str.encode("utf-8")
mime_type = doc.mime_type or "text/plain"
file_extension = mimetypes.guess_extension(mime_type) or ".txt" file_extension = mimetypes.guess_extension(mime_type) or ".txt"
filename = doc.metadata.get("filename", f"{doc.document_id}{file_extension}") filename = doc.metadata.get("filename", f"{doc.document_id}{file_extension}")
@ -118,9 +148,13 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
upload_file = UploadFile(file=file_obj, filename=filename) upload_file = UploadFile(file=file_obj, filename=filename)
try:
created_file = await self.files_api.openai_upload_file( created_file = await self.files_api.openai_upload_file(
file=upload_file, purpose=OpenAIFilePurpose.ASSISTANTS file=upload_file, purpose=OpenAIFilePurpose.ASSISTANTS
) )
except Exception as e:
log.error(f"Failed to upload file for document {doc.document_id}: {e}")
continue
chunking_strategy = VectorStoreChunkingStrategyStatic( chunking_strategy = VectorStoreChunkingStrategyStatic(
static=VectorStoreChunkingStrategyStaticConfig( static=VectorStoreChunkingStrategyStaticConfig(
@ -129,12 +163,22 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
) )
) )
try:
await self.vector_io_api.openai_attach_file_to_vector_store( await self.vector_io_api.openai_attach_file_to_vector_store(
vector_store_id=vector_db_id, vector_store_id=vector_db_id,
file_id=created_file.id, file_id=created_file.id,
attributes=doc.metadata, attributes=doc.metadata,
chunking_strategy=chunking_strategy, chunking_strategy=chunking_strategy,
) )
except Exception as e:
log.error(
f"Failed to attach file {created_file.id} to vector store {vector_db_id} for document {doc.document_id}: {e}"
)
continue
except Exception as e:
log.error(f"Unexpected error processing document {doc.document_id}: {e}")
continue
async def query( async def query(
self, self,
@ -274,7 +318,6 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
if query_config: if query_config:
query_config = TypeAdapter(RAGQueryConfig).validate_python(query_config) query_config = TypeAdapter(RAGQueryConfig).validate_python(query_config)
else: else:
# handle someone passing an empty dict
query_config = RAGQueryConfig() query_config = RAGQueryConfig()
query = kwargs["query"] query = kwargs["query"]
@ -285,6 +328,6 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
) )
return ToolInvocationResult( return ToolInvocationResult(
content=result.content, content=result.content or [],
metadata=result.metadata, metadata=result.metadata,
) )

View file

@ -13,7 +13,7 @@ def available_providers() -> list[ProviderSpec]:
InlineProviderSpec( InlineProviderSpec(
api=Api.batches, api=Api.batches,
provider_type="inline::reference", provider_type="inline::reference",
pip_packages=["openai"], pip_packages=[],
module="llama_stack.providers.inline.batches.reference", module="llama_stack.providers.inline.batches.reference",
config_class="llama_stack.providers.inline.batches.reference.config.ReferenceBatchesImplConfig", config_class="llama_stack.providers.inline.batches.reference.config.ReferenceBatchesImplConfig",
api_dependencies=[ api_dependencies=[

View file

@ -75,7 +75,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="vllm", adapter_type="vllm",
pip_packages=["openai"], pip_packages=[],
module="llama_stack.providers.remote.inference.vllm", module="llama_stack.providers.remote.inference.vllm",
config_class="llama_stack.providers.remote.inference.vllm.VLLMInferenceAdapterConfig", config_class="llama_stack.providers.remote.inference.vllm.VLLMInferenceAdapterConfig",
description="Remote vLLM inference provider for connecting to vLLM servers.", description="Remote vLLM inference provider for connecting to vLLM servers.",
@ -151,9 +151,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="databricks", adapter_type="databricks",
pip_packages=[ pip_packages=[],
"openai",
],
module="llama_stack.providers.remote.inference.databricks", module="llama_stack.providers.remote.inference.databricks",
config_class="llama_stack.providers.remote.inference.databricks.DatabricksImplConfig", config_class="llama_stack.providers.remote.inference.databricks.DatabricksImplConfig",
description="Databricks inference provider for running models on Databricks' unified analytics platform.", description="Databricks inference provider for running models on Databricks' unified analytics platform.",
@ -163,9 +161,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="nvidia", adapter_type="nvidia",
pip_packages=[ pip_packages=[],
"openai",
],
module="llama_stack.providers.remote.inference.nvidia", module="llama_stack.providers.remote.inference.nvidia",
config_class="llama_stack.providers.remote.inference.nvidia.NVIDIAConfig", config_class="llama_stack.providers.remote.inference.nvidia.NVIDIAConfig",
description="NVIDIA inference provider for accessing NVIDIA NIM models and AI services.", description="NVIDIA inference provider for accessing NVIDIA NIM models and AI services.",
@ -175,7 +171,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="runpod", adapter_type="runpod",
pip_packages=["openai"], pip_packages=[],
module="llama_stack.providers.remote.inference.runpod", module="llama_stack.providers.remote.inference.runpod",
config_class="llama_stack.providers.remote.inference.runpod.RunpodImplConfig", config_class="llama_stack.providers.remote.inference.runpod.RunpodImplConfig",
description="RunPod inference provider for running models on RunPod's cloud GPU platform.", description="RunPod inference provider for running models on RunPod's cloud GPU platform.",
@ -207,7 +203,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="gemini", adapter_type="gemini",
pip_packages=["litellm", "openai"], pip_packages=["litellm"],
module="llama_stack.providers.remote.inference.gemini", module="llama_stack.providers.remote.inference.gemini",
config_class="llama_stack.providers.remote.inference.gemini.GeminiConfig", config_class="llama_stack.providers.remote.inference.gemini.GeminiConfig",
provider_data_validator="llama_stack.providers.remote.inference.gemini.config.GeminiProviderDataValidator", provider_data_validator="llama_stack.providers.remote.inference.gemini.config.GeminiProviderDataValidator",
@ -218,7 +214,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="vertexai", adapter_type="vertexai",
pip_packages=["litellm", "google-cloud-aiplatform", "openai"], pip_packages=["litellm", "google-cloud-aiplatform"],
module="llama_stack.providers.remote.inference.vertexai", module="llama_stack.providers.remote.inference.vertexai",
config_class="llama_stack.providers.remote.inference.vertexai.VertexAIConfig", config_class="llama_stack.providers.remote.inference.vertexai.VertexAIConfig",
provider_data_validator="llama_stack.providers.remote.inference.vertexai.config.VertexAIProviderDataValidator", provider_data_validator="llama_stack.providers.remote.inference.vertexai.config.VertexAIProviderDataValidator",
@ -248,7 +244,7 @@ Available Models:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="groq", adapter_type="groq",
pip_packages=["litellm", "openai"], pip_packages=["litellm"],
module="llama_stack.providers.remote.inference.groq", module="llama_stack.providers.remote.inference.groq",
config_class="llama_stack.providers.remote.inference.groq.GroqConfig", config_class="llama_stack.providers.remote.inference.groq.GroqConfig",
provider_data_validator="llama_stack.providers.remote.inference.groq.config.GroqProviderDataValidator", provider_data_validator="llama_stack.providers.remote.inference.groq.config.GroqProviderDataValidator",
@ -270,7 +266,7 @@ Available Models:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="sambanova", adapter_type="sambanova",
pip_packages=["litellm", "openai"], pip_packages=["litellm"],
module="llama_stack.providers.remote.inference.sambanova", module="llama_stack.providers.remote.inference.sambanova",
config_class="llama_stack.providers.remote.inference.sambanova.SambaNovaImplConfig", config_class="llama_stack.providers.remote.inference.sambanova.SambaNovaImplConfig",
provider_data_validator="llama_stack.providers.remote.inference.sambanova.config.SambaNovaProviderDataValidator", provider_data_validator="llama_stack.providers.remote.inference.sambanova.config.SambaNovaProviderDataValidator",
@ -299,4 +295,19 @@ Available Models:
description="IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform.", description="IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform.",
), ),
), ),
remote_provider_spec(
api=Api.inference,
adapter=AdapterSpec(
adapter_type="azure",
pip_packages=["litellm"],
module="llama_stack.providers.remote.inference.azure",
config_class="llama_stack.providers.remote.inference.azure.AzureConfig",
provider_data_validator="llama_stack.providers.remote.inference.azure.config.AzureProviderDataValidator",
description="""
Azure OpenAI inference provider for accessing GPT models and other Azure services.
Provider documentation
https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
""",
),
),
] ]

View file

@ -38,7 +38,7 @@ def available_providers() -> list[ProviderSpec]:
InlineProviderSpec( InlineProviderSpec(
api=Api.scoring, api=Api.scoring,
provider_type="inline::braintrust", provider_type="inline::braintrust",
pip_packages=["autoevals", "openai"], pip_packages=["autoevals"],
module="llama_stack.providers.inline.scoring.braintrust", module="llama_stack.providers.inline.scoring.braintrust",
config_class="llama_stack.providers.inline.scoring.braintrust.BraintrustScoringConfig", config_class="llama_stack.providers.inline.scoring.braintrust.BraintrustScoringConfig",
api_dependencies=[ api_dependencies=[

View file

@ -0,0 +1,15 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .config import AzureConfig
async def get_adapter_impl(config: AzureConfig, _deps):
from .azure import AzureInferenceAdapter
impl = AzureInferenceAdapter(config)
await impl.initialize()
return impl

View file

@ -0,0 +1,64 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any
from urllib.parse import urljoin
from llama_stack.apis.inference import ChatCompletionRequest
from llama_stack.providers.utils.inference.litellm_openai_mixin import (
LiteLLMOpenAIMixin,
)
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import AzureConfig
from .models import MODEL_ENTRIES
class AzureInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
def __init__(self, config: AzureConfig) -> None:
LiteLLMOpenAIMixin.__init__(
self,
MODEL_ENTRIES,
litellm_provider_name="azure",
api_key_from_config=config.api_key.get_secret_value(),
provider_data_api_key_field="azure_api_key",
openai_compat_api_base=str(config.api_base),
)
self.config = config
# Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_base_url(self) -> str:
"""
Get the Azure API base URL.
Returns the Azure API base URL from the configuration.
"""
return urljoin(str(self.config.api_base), "/openai/v1")
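A quick stdlib illustration of what get_base_url produces (not part of the PR); note that urljoin with the absolute path "/openai/v1" replaces any path already present on api_base:

from urllib.parse import urljoin

assert urljoin("https://my-resource.openai.azure.com", "/openai/v1") == "https://my-resource.openai.azure.com/openai/v1"
assert urljoin("https://my-resource.openai.azure.com/some/path", "/openai/v1") == "https://my-resource.openai.azure.com/openai/v1"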
async def _get_params(self, request: ChatCompletionRequest) -> dict[str, Any]:
# Get base parameters from parent
params = await super()._get_params(request)
# Add Azure specific parameters
provider_data = self.get_request_provider_data()
if provider_data:
if getattr(provider_data, "azure_api_key", None):
params["api_key"] = provider_data.azure_api_key
if getattr(provider_data, "azure_api_base", None):
params["api_base"] = provider_data.azure_api_base
if getattr(provider_data, "azure_api_version", None):
params["api_version"] = provider_data.azure_api_version
if getattr(provider_data, "azure_api_type", None):
params["api_type"] = provider_data.azure_api_type
else:
params["api_key"] = self.config.api_key.get_secret_value()
params["api_base"] = str(self.config.api_base)
params["api_version"] = self.config.api_version
params["api_type"] = self.config.api_type
return params

View file

@ -0,0 +1,63 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
from typing import Any
from pydantic import BaseModel, Field, HttpUrl, SecretStr
from llama_stack.schema_utils import json_schema_type
class AzureProviderDataValidator(BaseModel):
azure_api_key: SecretStr = Field(
description="Azure API key for Azure",
)
azure_api_base: HttpUrl = Field(
description="Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com)",
)
azure_api_version: str | None = Field(
default=None,
description="Azure API version for Azure (e.g., 2024-06-01)",
)
azure_api_type: str | None = Field(
default="azure",
description="Azure API type for Azure (e.g., azure)",
)
@json_schema_type
class AzureConfig(BaseModel):
api_key: SecretStr = Field(
description="Azure API key for Azure",
)
api_base: HttpUrl = Field(
description="Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com)",
)
api_version: str | None = Field(
default_factory=lambda: os.getenv("AZURE_API_VERSION"),
description="Azure API version for Azure (e.g., 2024-12-01-preview)",
)
api_type: str | None = Field(
default_factory=lambda: os.getenv("AZURE_API_TYPE", "azure"),
description="Azure API type for Azure (e.g., azure)",
)
@classmethod
def sample_run_config(
cls,
api_key: str = "${env.AZURE_API_KEY:=}",
api_base: str = "${env.AZURE_API_BASE:=}",
api_version: str = "${env.AZURE_API_VERSION:=}",
api_type: str = "${env.AZURE_API_TYPE:=}",
**kwargs,
) -> dict[str, Any]:
return {
"api_key": api_key,
"api_base": api_base,
"api_version": api_version,
"api_type": api_type,
}

View file

@ -0,0 +1,28 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.providers.utils.inference.model_registry import (
ProviderModelEntry,
)
# https://learn.microsoft.com/en-us/azure/ai-foundry/openai/concepts/models?tabs=global-standard%2Cstandard-chat-completions
LLM_MODEL_IDS = [
"gpt-5",
"gpt-5-mini",
"gpt-5-nano",
"gpt-5-chat",
"o1",
"o1-mini",
"o3-mini",
"o4-mini",
"gpt-4.1",
"gpt-4.1-mini",
"gpt-4.1-nano",
]
SAFETY_MODELS_ENTRIES = list[ProviderModelEntry]()
MODEL_ENTRIES = [ProviderModelEntry(provider_model_id=m) for m in LLM_MODEL_IDS] + SAFETY_MODELS_ENTRIES

View file

@ -53,6 +53,43 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
from .models import MODEL_ENTRIES from .models import MODEL_ENTRIES
REGION_PREFIX_MAP = {
"us": "us.",
"eu": "eu.",
"ap": "ap.",
}
def _get_region_prefix(region: str | None) -> str:
# AWS requires region prefixes for inference profiles
if region is None:
return "us." # default to US when we don't know
# Handle case insensitive region matching
region_lower = region.lower()
for prefix in REGION_PREFIX_MAP:
if region_lower.startswith(f"{prefix}-"):
return REGION_PREFIX_MAP[prefix]
# Fallback to US for anything we don't recognize
return "us."
def _to_inference_profile_id(model_id: str, region: str = None) -> str:
# Return ARNs unchanged
if model_id.startswith("arn:"):
return model_id
# Return inference profile IDs that already have regional prefixes
if any(model_id.startswith(p) for p in REGION_PREFIX_MAP.values()):
return model_id
# Default to US East when no region is provided
if region is None:
region = "us-east-1"
return _get_region_prefix(region) + model_id
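Worked examples of the helpers above (illustrative model IDs and a fabricated ARN, not from the PR):

assert _to_inference_profile_id("meta.llama3-1-8b-instruct-v1:0", "us-east-1") == "us.meta.llama3-1-8b-instruct-v1:0"
assert _to_inference_profile_id("meta.llama3-1-8b-instruct-v1:0", "eu-west-1") == "eu.meta.llama3-1-8b-instruct-v1:0"
assert _to_inference_profile_id("meta.llama3-1-8b-instruct-v1:0") == "us.meta.llama3-1-8b-instruct-v1:0"  # region defaults to us-east-1
assert _to_inference_profile_id("ap.meta.llama3-1-8b-instruct-v1:0") == "ap.meta.llama3-1-8b-instruct-v1:0"  # already prefixed
assert _to_inference_profile_id("arn:aws:bedrock:us-east-1:000000000000:inference-profile/us.meta.llama3-1-8b-instruct-v1:0").startswith("arn:")  # ARNs pass through unchanged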
class BedrockInferenceAdapter( class BedrockInferenceAdapter(
ModelRegistryHelper, ModelRegistryHelper,
@ -166,8 +203,13 @@ class BedrockInferenceAdapter(
options["repetition_penalty"] = sampling_params.repetition_penalty options["repetition_penalty"] = sampling_params.repetition_penalty
prompt = await chat_completion_request_to_prompt(request, self.get_llama_model(request.model)) prompt = await chat_completion_request_to_prompt(request, self.get_llama_model(request.model))
# Convert foundation model ID to inference profile ID
region_name = self.client.meta.region_name
inference_profile_id = _to_inference_profile_id(bedrock_model, region_name)
return { return {
"modelId": bedrock_model, "modelId": inference_profile_id,
"body": json.dumps( "body": json.dumps(
{ {
"prompt": prompt, "prompt": prompt,
@ -185,6 +227,11 @@ class BedrockInferenceAdapter(
task_type: EmbeddingTaskType | None = None, task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse: ) -> EmbeddingsResponse:
model = await self.model_store.get_model(model_id) model = await self.model_store.get_model(model_id)
# Convert foundation model ID to inference profile ID
region_name = self.client.meta.region_name
inference_profile_id = _to_inference_profile_id(model.provider_resource_id, region_name)
embeddings = [] embeddings = []
for content in contents: for content in contents:
assert not content_has_media(content), "Bedrock does not support media for embeddings" assert not content_has_media(content), "Bedrock does not support media for embeddings"
@ -193,7 +240,7 @@ class BedrockInferenceAdapter(
body = json.dumps(input_body) body = json.dumps(input_body)
response = self.client.invoke_model( response = self.client.invoke_model(
body=body, body=body,
modelId=model.provider_resource_id, modelId=inference_profile_id,
accept="application/json", accept="application/json",
contentType="application/json", contentType="application/json",
) )

View file

@ -4,7 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
import json import json
from collections.abc import AsyncGenerator, AsyncIterator from collections.abc import AsyncGenerator
from typing import Any from typing import Any
import httpx import httpx
@ -38,13 +38,6 @@ from llama_stack.apis.inference import (
LogProbConfig, LogProbConfig,
Message, Message,
ModelStore, ModelStore,
OpenAIChatCompletion,
OpenAICompletion,
OpenAIEmbeddingData,
OpenAIEmbeddingsResponse,
OpenAIEmbeddingUsage,
OpenAIMessageParam,
OpenAIResponseFormatParam,
ResponseFormat, ResponseFormat,
SamplingParams, SamplingParams,
TextTruncation, TextTruncation,
@ -71,11 +64,11 @@ from llama_stack.providers.utils.inference.openai_compat import (
convert_message_to_openai_dict, convert_message_to_openai_dict,
convert_tool_call, convert_tool_call,
get_sampling_options, get_sampling_options,
prepare_openai_completion_params,
process_chat_completion_stream_response, process_chat_completion_stream_response,
process_completion_response, process_completion_response,
process_completion_stream_response, process_completion_stream_response,
) )
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import ( from llama_stack.providers.utils.inference.prompt_adapter import (
completion_request_to_prompt, completion_request_to_prompt,
content_has_media, content_has_media,
@ -288,7 +281,7 @@ async def _process_vllm_chat_completion_stream_response(
yield c yield c
class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): class VLLMInferenceAdapter(OpenAIMixin, Inference, ModelsProtocolPrivate):
# automatically set by the resolver when instantiating the provider # automatically set by the resolver when instantiating the provider
__provider_id__: str __provider_id__: str
model_store: ModelStore | None = None model_store: ModelStore | None = None
@ -296,7 +289,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
def __init__(self, config: VLLMInferenceAdapterConfig) -> None: def __init__(self, config: VLLMInferenceAdapterConfig) -> None:
self.register_helper = ModelRegistryHelper(build_hf_repo_model_entries()) self.register_helper = ModelRegistryHelper(build_hf_repo_model_entries())
self.config = config self.config = config
self.client = None
async def initialize(self) -> None: async def initialize(self) -> None:
if not self.config.url: if not self.config.url:
@ -308,8 +300,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
return self.config.refresh_models return self.config.refresh_models
async def list_models(self) -> list[Model] | None: async def list_models(self) -> list[Model] | None:
self._lazy_initialize_client()
assert self.client is not None # mypy
models = [] models = []
async for m in self.client.models.list(): async for m in self.client.models.list():
model_type = ModelType.llm # unclear how to determine embedding vs. llm models model_type = ModelType.llm # unclear how to determine embedding vs. llm models
@ -340,8 +330,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
HealthResponse: A dictionary containing the health status. HealthResponse: A dictionary containing the health status.
""" """
try: try:
client = self._create_client() if self.client is None else self.client _ = [m async for m in self.client.models.list()] # Ensure the client is initialized
_ = [m async for m in client.models.list()] # Ensure the client is initialized
return HealthResponse(status=HealthStatus.OK) return HealthResponse(status=HealthStatus.OK)
except Exception as e: except Exception as e:
return HealthResponse(status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}") return HealthResponse(status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}")
@ -351,19 +340,14 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
raise ValueError("Model store not set") raise ValueError("Model store not set")
return await self.model_store.get_model(model_id) return await self.model_store.get_model(model_id)
def _lazy_initialize_client(self): def get_api_key(self):
if self.client is not None: return self.config.api_token
return
log.info(f"Initializing vLLM client with base_url={self.config.url}") def get_base_url(self):
self.client = self._create_client() return self.config.url
def _create_client(self): def get_extra_client_params(self):
return AsyncOpenAI( return {"http_client": httpx.AsyncClient(verify=self.config.tls_verify)}
base_url=self.config.url,
api_key=self.config.api_token,
http_client=httpx.AsyncClient(verify=self.config.tls_verify),
)
async def completion( async def completion(
self, self,
@ -374,7 +358,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
stream: bool | None = False, stream: bool | None = False,
logprobs: LogProbConfig | None = None, logprobs: LogProbConfig | None = None,
) -> CompletionResponse | AsyncGenerator[CompletionResponseStreamChunk, None]: ) -> CompletionResponse | AsyncGenerator[CompletionResponseStreamChunk, None]:
self._lazy_initialize_client()
if sampling_params is None: if sampling_params is None:
sampling_params = SamplingParams() sampling_params = SamplingParams()
model = await self._get_model(model_id) model = await self._get_model(model_id)
@ -406,7 +389,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
logprobs: LogProbConfig | None = None, logprobs: LogProbConfig | None = None,
tool_config: ToolConfig | None = None, tool_config: ToolConfig | None = None,
) -> ChatCompletionResponse | AsyncGenerator[ChatCompletionResponseStreamChunk, None]: ) -> ChatCompletionResponse | AsyncGenerator[ChatCompletionResponseStreamChunk, None]:
self._lazy_initialize_client()
if sampling_params is None: if sampling_params is None:
sampling_params = SamplingParams() sampling_params = SamplingParams()
model = await self._get_model(model_id) model = await self._get_model(model_id)
@ -479,16 +461,12 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
yield chunk yield chunk
async def register_model(self, model: Model) -> Model: async def register_model(self, model: Model) -> Model:
# register_model is called during Llama Stack initialization, hence we cannot init self.client if not initialized yet.
# self.client should only be created after the initialization is complete to avoid asyncio cross-context errors.
# Changing this may lead to unpredictable behavior.
client = self._create_client() if self.client is None else self.client
try: try:
model = await self.register_helper.register_model(model) model = await self.register_helper.register_model(model)
except ValueError: except ValueError:
pass # Ignore statically unknown model, will check live listing pass # Ignore statically unknown model, will check live listing
try: try:
res = await client.models.list() res = await self.client.models.list()
except APIConnectionError as e: except APIConnectionError as e:
raise ValueError( raise ValueError(
f"Failed to connect to vLLM at {self.config.url}. Please check if vLLM is running and accessible at that URL." f"Failed to connect to vLLM at {self.config.url}. Please check if vLLM is running and accessible at that URL."
@ -543,8 +521,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
output_dimension: int | None = None, output_dimension: int | None = None,
task_type: EmbeddingTaskType | None = None, task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse: ) -> EmbeddingsResponse:
self._lazy_initialize_client()
assert self.client is not None
model = await self._get_model(model_id) model = await self._get_model(model_id)
kwargs = {} kwargs = {}
@ -560,154 +536,3 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
embeddings = [data.embedding for data in response.data] embeddings = [data.embedding for data in response.data]
return EmbeddingsResponse(embeddings=embeddings) return EmbeddingsResponse(embeddings=embeddings)
async def openai_embeddings(
self,
model: str,
input: str | list[str],
encoding_format: str | None = "float",
dimensions: int | None = None,
user: str | None = None,
) -> OpenAIEmbeddingsResponse:
self._lazy_initialize_client()
assert self.client is not None
model_obj = await self._get_model(model)
assert model_obj.model_type == ModelType.embedding
# Convert input to list if it's a string
input_list = [input] if isinstance(input, str) else input
# Call vLLM embeddings endpoint with encoding_format
response = await self.client.embeddings.create(
model=model_obj.provider_resource_id,
input=input_list,
dimensions=dimensions,
encoding_format=encoding_format,
)
# Convert response to OpenAI format
data = [
OpenAIEmbeddingData(
embedding=embedding_data.embedding,
index=i,
)
for i, embedding_data in enumerate(response.data)
]
# Not returning actual token usage since vLLM doesn't provide it
usage = OpenAIEmbeddingUsage(prompt_tokens=-1, total_tokens=-1)
return OpenAIEmbeddingsResponse(
data=data,
model=model_obj.provider_resource_id,
usage=usage,
)
async def openai_completion(
self,
model: str,
prompt: str | list[str] | list[int] | list[list[int]],
best_of: int | None = None,
echo: bool | None = None,
frequency_penalty: float | None = None,
logit_bias: dict[str, float] | None = None,
logprobs: bool | None = None,
max_tokens: int | None = None,
n: int | None = None,
presence_penalty: float | None = None,
seed: int | None = None,
stop: str | list[str] | None = None,
stream: bool | None = None,
stream_options: dict[str, Any] | None = None,
temperature: float | None = None,
top_p: float | None = None,
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
suffix: str | None = None,
) -> OpenAICompletion:
self._lazy_initialize_client()
model_obj = await self._get_model(model)
extra_body: dict[str, Any] = {}
if prompt_logprobs is not None and prompt_logprobs >= 0:
extra_body["prompt_logprobs"] = prompt_logprobs
if guided_choice:
extra_body["guided_choice"] = guided_choice
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
prompt=prompt,
best_of=best_of,
echo=echo,
frequency_penalty=frequency_penalty,
logit_bias=logit_bias,
logprobs=logprobs,
max_tokens=max_tokens,
n=n,
presence_penalty=presence_penalty,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
top_p=top_p,
user=user,
extra_body=extra_body,
)
return await self.client.completions.create(**params) # type: ignore
async def openai_chat_completion(
self,
model: str,
messages: list[OpenAIMessageParam],
frequency_penalty: float | None = None,
function_call: str | dict[str, Any] | None = None,
functions: list[dict[str, Any]] | None = None,
logit_bias: dict[str, float] | None = None,
logprobs: bool | None = None,
max_completion_tokens: int | None = None,
max_tokens: int | None = None,
n: int | None = None,
parallel_tool_calls: bool | None = None,
presence_penalty: float | None = None,
response_format: OpenAIResponseFormatParam | None = None,
seed: int | None = None,
stop: str | list[str] | None = None,
stream: bool | None = None,
stream_options: dict[str, Any] | None = None,
temperature: float | None = None,
tool_choice: str | dict[str, Any] | None = None,
tools: list[dict[str, Any]] | None = None,
top_logprobs: int | None = None,
top_p: float | None = None,
user: str | None = None,
) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
self._lazy_initialize_client()
model_obj = await self._get_model(model)
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
messages=messages,
frequency_penalty=frequency_penalty,
function_call=function_call,
functions=functions,
logit_bias=logit_bias,
logprobs=logprobs,
max_completion_tokens=max_completion_tokens,
max_tokens=max_tokens,
n=n,
parallel_tool_calls=parallel_tool_calls,
presence_penalty=presence_penalty,
response_format=response_format,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
tool_choice=tool_choice,
tools=tools,
top_logprobs=top_logprobs,
top_p=top_p,
user=user,
)
return await self.client.chat.completions.create(**params) # type: ignore

View file

@ -3,6 +3,11 @@
# #
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
import asyncio
from typing import Any
from sqlalchemy.exc import IntegrityError
from llama_stack.apis.inference import ( from llama_stack.apis.inference import (
ListOpenAIChatCompletionResponse, ListOpenAIChatCompletionResponse,
OpenAIChatCompletion, OpenAIChatCompletion,
@ -10,24 +15,43 @@ from llama_stack.apis.inference import (
OpenAIMessageParam, OpenAIMessageParam,
Order, Order,
) )
from llama_stack.core.datatypes import AccessRule from llama_stack.core.datatypes import AccessRule, InferenceStoreConfig
from llama_stack.core.utils.config_dirs import RUNTIME_BASE_DIR from llama_stack.log import get_logger
from ..sqlstore.api import ColumnDefinition, ColumnType from ..sqlstore.api import ColumnDefinition, ColumnType
from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore
from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, sqlstore_impl from ..sqlstore.sqlstore import SqlStoreConfig, SqlStoreType, sqlstore_impl
logger = get_logger(name=__name__, category="inference_store")
class InferenceStore: class InferenceStore:
def __init__(self, sql_store_config: SqlStoreConfig, policy: list[AccessRule]): def __init__(
if not sql_store_config: self,
sql_store_config = SqliteSqlStoreConfig( config: InferenceStoreConfig | SqlStoreConfig,
db_path=(RUNTIME_BASE_DIR / "sqlstore.db").as_posix(), policy: list[AccessRule],
):
# Handle backward compatibility
if not isinstance(config, InferenceStoreConfig):
# Legacy: SqlStoreConfig passed directly as config
config = InferenceStoreConfig(
sql_store_config=config,
) )
self.sql_store_config = sql_store_config
self.config = config
self.sql_store_config = config.sql_store_config
self.sql_store = None self.sql_store = None
self.policy = policy self.policy = policy
# Disable write queue for SQLite to avoid concurrency issues
self.enable_write_queue = self.sql_store_config.type != SqlStoreType.sqlite
# Async write queue and worker control
self._queue: asyncio.Queue[tuple[OpenAIChatCompletion, list[OpenAIMessageParam]]] | None = None
self._worker_tasks: list[asyncio.Task[Any]] = []
self._max_write_queue_size: int = config.max_write_queue_size
self._num_writers: int = max(1, config.num_writers)
async def initialize(self): async def initialize(self):
"""Create the necessary tables if they don't exist.""" """Create the necessary tables if they don't exist."""
self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.sql_store_config)) self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.sql_store_config))
@ -42,23 +66,109 @@ class InferenceStore:
}, },
) )
if self.enable_write_queue:
self._queue = asyncio.Queue(maxsize=self._max_write_queue_size)
for _ in range(self._num_writers):
self._worker_tasks.append(asyncio.create_task(self._worker_loop()))
else:
logger.info("Write queue disabled for SQLite to avoid concurrency issues")
async def shutdown(self) -> None:
if not self._worker_tasks:
return
if self._queue is not None:
await self._queue.join()
for t in self._worker_tasks:
if not t.done():
t.cancel()
for t in self._worker_tasks:
try:
await t
except asyncio.CancelledError:
pass
self._worker_tasks.clear()
async def flush(self) -> None:
"""Wait for all queued writes to complete. Useful for testing."""
if self.enable_write_queue and self._queue is not None:
await self._queue.join()
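A minimal usage sketch of the new queue-backed lifecycle (assumes an already-constructed InferenceStore plus a chat completion and its input messages; illustrative only):

async def demo_inference_store(store, chat_completion, input_messages):
    await store.initialize()                                             # creates tables and starts writer tasks
    await store.store_chat_completion(chat_completion, input_messages)   # enqueued; returns without waiting for the write
    await store.flush()                                                  # block until queued writes are persisted (handy in tests)
    await store.shutdown()                                               # drain the queue and stop the workers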
async def store_chat_completion( async def store_chat_completion(
self, chat_completion: OpenAIChatCompletion, input_messages: list[OpenAIMessageParam] self, chat_completion: OpenAIChatCompletion, input_messages: list[OpenAIMessageParam]
) -> None: ) -> None:
if not self.sql_store: if self.enable_write_queue:
if self._queue is None:
raise ValueError("Inference store is not initialized")
try:
self._queue.put_nowait((chat_completion, input_messages))
except asyncio.QueueFull:
logger.warning(
f"Write queue full; adding chat completion id={getattr(chat_completion, 'id', '<unknown>')}"
)
await self._queue.put((chat_completion, input_messages))
else:
await self._write_chat_completion(chat_completion, input_messages)
async def _worker_loop(self) -> None:
assert self._queue is not None
while True:
try:
item = await self._queue.get()
except asyncio.CancelledError:
break
chat_completion, input_messages = item
try:
await self._write_chat_completion(chat_completion, input_messages)
except Exception as e: # noqa: BLE001
logger.error(f"Error writing chat completion: {e}")
finally:
self._queue.task_done()
async def _write_chat_completion(
self, chat_completion: OpenAIChatCompletion, input_messages: list[OpenAIMessageParam]
) -> None:
if self.sql_store is None:
raise ValueError("Inference store is not initialized") raise ValueError("Inference store is not initialized")
data = chat_completion.model_dump() data = chat_completion.model_dump()
record_data = {
await self.sql_store.insert(
table="chat_completions",
data={
"id": data["id"], "id": data["id"],
"created": data["created"], "created": data["created"],
"model": data["model"], "model": data["model"],
"choices": data["choices"], "choices": data["choices"],
"input_messages": [message.model_dump() for message in input_messages], "input_messages": [message.model_dump() for message in input_messages],
}, }
try:
await self.sql_store.insert(
table="chat_completions",
data=record_data,
)
except IntegrityError as e:
# Duplicate chat completion IDs can be generated during tests especially if they are replaying
# recorded responses across different tests. No need to warn or error under those circumstances.
# In the wild, this is not likely to happen at all (no evidence) so we aren't really hiding any problem.
# Check if it's a unique constraint violation
error_message = str(e.orig) if e.orig else str(e)
if self._is_unique_constraint_error(error_message):
# Update the existing record instead
await self.sql_store.update(table="chat_completions", data=record_data, where={"id": data["id"]})
else:
# Re-raise if it's not a unique constraint error
raise
def _is_unique_constraint_error(self, error_message: str) -> bool:
"""Check if the error is specifically a unique constraint violation."""
error_lower = error_message.lower()
return any(
indicator in error_lower
for indicator in [
"unique constraint failed", # SQLite
"duplicate key", # PostgreSQL
"unique violation", # PostgreSQL alternative
"duplicate entry", # MySQL
]
) )
async def list_chat_completions( async def list_chat_completions(

View file

@ -67,6 +67,17 @@ class OpenAIMixin(ABC):
""" """
pass pass
def get_extra_client_params(self) -> dict[str, Any]:
"""
Get any extra parameters to pass to the AsyncOpenAI client.
Child classes can override this method to provide additional parameters
such as timeout settings, proxies, etc.
:return: A dictionary of extra parameters
"""
return {}
@property @property
def client(self) -> AsyncOpenAI: def client(self) -> AsyncOpenAI:
""" """
@ -78,6 +89,7 @@ class OpenAIMixin(ABC):
return AsyncOpenAI( return AsyncOpenAI(
api_key=self.get_api_key(), api_key=self.get_api_key(),
base_url=self.get_base_url(), base_url=self.get_base_url(),
**self.get_extra_client_params(),
) )
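A hypothetical subclass showing how an adapter can use the new hook, mirroring the vLLM change elsewhere in this diff (class name and values are illustrative; real adapters may implement additional OpenAIMixin members):

from typing import Any

import httpx

from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin


class ExampleAdapter(OpenAIMixin):
    def get_api_key(self) -> str:
        return "my-token"

    def get_base_url(self) -> str:
        return "http://localhost:8000/v1"

    def get_extra_client_params(self) -> dict[str, Any]:
        # e.g. relax TLS verification for a self-signed local endpoint
        return {"http_client": httpx.AsyncClient(verify=False)}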
async def _get_provider_model_id(self, model: str) -> str: async def _get_provider_model_id(self, model: str) -> str:
@ -124,10 +136,15 @@ class OpenAIMixin(ABC):
""" """
Direct OpenAI completion API call. Direct OpenAI completion API call.
""" """
if guided_choice is not None: # Handle parameters that are not supported by OpenAI API, but may be by the provider
logger.warning("guided_choice is not supported by the OpenAI API. Ignoring.") # prompt_logprobs is supported by vLLM
if prompt_logprobs is not None: # guided_choice is supported by vLLM
logger.warning("prompt_logprobs is not supported by the OpenAI API. Ignoring.") # TODO: test coverage
extra_body: dict[str, Any] = {}
if prompt_logprobs is not None and prompt_logprobs >= 0:
extra_body["prompt_logprobs"] = prompt_logprobs
if guided_choice:
extra_body["guided_choice"] = guided_choice
# TODO: fix openai_completion to return type compatible with OpenAI's API response # TODO: fix openai_completion to return type compatible with OpenAI's API response
return await self.client.completions.create( # type: ignore[no-any-return] return await self.client.completions.create( # type: ignore[no-any-return]
@ -150,7 +167,8 @@ class OpenAIMixin(ABC):
top_p=top_p, top_p=top_p,
user=user, user=user,
suffix=suffix, suffix=suffix,
) ),
extra_body=extra_body,
) )
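A hedged usage sketch of the relaxed parameter handling (assumes `adapter` is an OpenAIMixin-based provider pointed at a vLLM-compatible server; model name and prompt are placeholders):

async def sample_guided_completion(adapter):
    return await adapter.openai_completion(
        model="my-model",
        prompt="The capital of France is",
        guided_choice=["Paris", "London"],  # forwarded via extra_body instead of being dropped
        prompt_logprobs=0,                  # forwarded via extra_body when >= 0
    )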
async def openai_chat_completion( async def openai_chat_completion(

View file

@ -172,6 +172,20 @@ class AuthorizedSqlStore:
return results.data[0] if results.data else None return results.data[0] if results.data else None
async def update(self, table: str, data: Mapping[str, Any], where: Mapping[str, Any]) -> None:
"""Update rows with automatic access control attribute capture."""
enhanced_data = dict(data)
current_user = get_authenticated_user()
if current_user:
enhanced_data["owner_principal"] = current_user.principal
enhanced_data["access_attributes"] = current_user.attributes
else:
enhanced_data["owner_principal"] = None
enhanced_data["access_attributes"] = None
await self.sql_store.update(table, enhanced_data, where)
async def delete(self, table: str, where: Mapping[str, Any]) -> None: async def delete(self, table: str, where: Mapping[str, Any]) -> None:
"""Delete rows with automatic access control filtering.""" """Delete rows with automatic access control filtering."""
await self.sql_store.delete(table, where) await self.sql_store.delete(table, where)

View file

@ -18,6 +18,7 @@ from functools import wraps
from typing import Any from typing import Any
from llama_stack.apis.telemetry import ( from llama_stack.apis.telemetry import (
Event,
LogSeverity, LogSeverity,
Span, Span,
SpanEndPayload, SpanEndPayload,
@ -98,7 +99,7 @@ class BackgroundLogger:
def __init__(self, api: Telemetry, capacity: int = 100000): def __init__(self, api: Telemetry, capacity: int = 100000):
self.api = api self.api = api
self.log_queue: queue.Queue[Any] = queue.Queue(maxsize=capacity) self.log_queue: queue.Queue[Any] = queue.Queue(maxsize=capacity)
self.worker_thread = threading.Thread(target=self._process_logs, daemon=True) self.worker_thread = threading.Thread(target=self._worker, daemon=True)
self.worker_thread.start() self.worker_thread.start()
self._last_queue_full_log_time: float = 0.0 self._last_queue_full_log_time: float = 0.0
self._dropped_since_last_notice: int = 0 self._dropped_since_last_notice: int = 0
@ -118,12 +119,16 @@ class BackgroundLogger:
self._last_queue_full_log_time = current_time self._last_queue_full_log_time = current_time
self._dropped_since_last_notice = 0 self._dropped_since_last_notice = 0
def _process_logs(self): def _worker(self):
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(self._process_logs())
async def _process_logs(self):
while True: while True:
try: try:
event = self.log_queue.get() event = self.log_queue.get()
# figure out how to use a thread's native loop await self.api.log_event(event)
asyncio.run(self.api.log_event(event))
except Exception: except Exception:
import traceback import traceback
@ -136,6 +141,19 @@ class BackgroundLogger:
self.log_queue.join() self.log_queue.join()
def enqueue_event(event: Event) -> None:
"""Enqueue a telemetry event to the background logger if available.
This provides a non-blocking path for routers and other hot paths to
submit telemetry without awaiting the Telemetry API, reducing contention
with the main event loop.
"""
global BACKGROUND_LOGGER
if BACKGROUND_LOGGER is None:
raise RuntimeError("Telemetry API not initialized")
BACKGROUND_LOGGER.log_event(event)
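The _worker/_process_logs change above swaps per-event asyncio.run() calls for one long-lived event loop per worker thread; a generic, self-contained sketch of that pattern (not the actual class):

import asyncio
import queue
import threading


def start_background_worker(async_handler, q: queue.Queue) -> threading.Thread:
    def _worker() -> None:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        async def _process() -> None:
            while True:
                item = q.get()  # blocking get is acceptable: this loop owns the whole thread
                try:
                    await async_handler(item)
                finally:
                    q.task_done()

        loop.run_until_complete(_process())

    t = threading.Thread(target=_worker, daemon=True)
    t.start()
    return t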
class TraceContext: class TraceContext:
spans: list[Span] = [] spans: list[Span] = []
@ -256,11 +274,7 @@ class TelemetryHandler(logging.Handler):
if record.module in ("asyncio", "selector_events"): if record.module in ("asyncio", "selector_events"):
return return
global CURRENT_TRACE_CONTEXT, BACKGROUND_LOGGER global CURRENT_TRACE_CONTEXT
if BACKGROUND_LOGGER is None:
raise RuntimeError("Telemetry API not initialized")
context = CURRENT_TRACE_CONTEXT.get() context = CURRENT_TRACE_CONTEXT.get()
if context is None: if context is None:
return return
@ -269,7 +283,7 @@ class TelemetryHandler(logging.Handler):
if span is None: if span is None:
return return
BACKGROUND_LOGGER.log_event( enqueue_event(
UnstructuredLogEvent( UnstructuredLogEvent(
trace_id=span.trace_id, trace_id=span.trace_id,
span_id=span.span_id, span_id=span.span_id,

View file

@ -12,14 +12,12 @@ import uuid
def generate_chunk_id(document_id: str, chunk_text: str, chunk_window: str | None = None) -> str: def generate_chunk_id(document_id: str, chunk_text: str, chunk_window: str | None = None) -> str:
""" """
Generate a unique chunk ID using a hash of the document ID and chunk text. Generate a unique chunk ID using a hash of the document ID and chunk text.
Then use the first 32 characters of the hash to create a UUID.
Note: MD5 is used only to calculate an identifier, not for security purposes.
Adding usedforsecurity=False for compatibility with FIPS environments.
""" """
hash_input = f"{document_id}:{chunk_text}".encode() hash_input = f"{document_id}:{chunk_text}".encode()
if chunk_window: if chunk_window:
hash_input += f":{chunk_window}".encode() hash_input += f":{chunk_window}".encode()
return str(uuid.UUID(hashlib.md5(hash_input, usedforsecurity=False).hexdigest())) return str(uuid.UUID(hashlib.sha256(hash_input).hexdigest()[:32]))
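A worked example of the new ID scheme (illustrative input): the SHA-256 hex digest is 64 characters, and its first 32 hex characters are exactly what uuid.UUID accepts, so the chunk ID stays deterministic for a given document ID and chunk text:

import hashlib
import uuid

hash_input = "doc-1:hello world".encode()
digest = hashlib.sha256(hash_input).hexdigest()   # 64 hex characters
chunk_id = str(uuid.UUID(digest[:32]))            # first 32 hex chars rendered as a UUID string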
def proper_case(s: str) -> str: def proper_case(s: str) -> str:

View file

@ -15,6 +15,8 @@ from enum import StrEnum
from pathlib import Path from pathlib import Path
from typing import Any, Literal, cast from typing import Any, Literal, cast
from openai import NOT_GIVEN
from llama_stack.log import get_logger from llama_stack.log import get_logger
logger = get_logger(__name__, category="testing") logger = get_logger(__name__, category="testing")
@ -105,7 +107,11 @@ def _deserialize_response(data: dict[str, Any]) -> Any:
return cls.model_validate(data["__data__"]) return cls.model_validate(data["__data__"])
except (ImportError, AttributeError, TypeError, ValueError) as e: except (ImportError, AttributeError, TypeError, ValueError) as e:
logger.warning(f"Failed to deserialize object of type {data['__type__']}: {e}") logger.warning(f"Failed to deserialize object of type {data['__type__']} with model_validate: {e}")
try:
return cls.model_construct(**data["__data__"])
except Exception as e:
logger.warning(f"Failed to deserialize object of type {data['__type__']} with model_construct: {e}")
return data["__data__"] return data["__data__"]
return data return data
@ -194,20 +200,15 @@ def _model_identifiers_digest(endpoint: str, response: dict[str, Any]) -> str:
Supported endpoints: Supported endpoints:
- '/api/tags' (Ollama): response body has 'models': [ { name/model/digest/id/... }, ... ] - '/api/tags' (Ollama): response body has 'models': [ { name/model/digest/id/... }, ... ]
- '/v1/models' (OpenAI): response body has 'data': [ { id: ... }, ... ] - '/v1/models' (OpenAI): response body is: [ { id: ... }, ... ]
Returns a list of unique identifiers or None if structure doesn't match. Returns a list of unique identifiers or None if structure doesn't match.
""" """
body = response["body"] items = response["body"]
if endpoint == "/api/tags": idents = [m.model if endpoint == "/api/tags" else m.id for m in items]
items = body.get("models")
idents = [m.model for m in items]
else:
items = body.get("data")
idents = [m.id for m in items]
return sorted(set(idents)) return sorted(set(idents))
identifiers = _extract_model_identifiers() identifiers = _extract_model_identifiers()
return hashlib.sha1(("|".join(identifiers)).encode("utf-8")).hexdigest()[:8] return hashlib.sha256(("|".join(identifiers)).encode("utf-8")).hexdigest()[:8]
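A short sketch of how the new digest is computed from the extracted model identifiers (the identifier list below is illustrative only):

```python
import hashlib

# Sort and de-duplicate the identifiers, join them, and take the first 8 hex
# characters of a SHA-256 digest as a short, order-insensitive fingerprint.
identifiers = sorted({"llama3.2:3b", "all-minilm:l6-v2", "llama3.2:3b"})
digest = hashlib.sha256("|".join(identifiers).encode("utf-8")).hexdigest()[:8]
print(digest)
```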
def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]]) -> dict[str, Any] | None: def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]]) -> dict[str, Any] | None:
@ -215,17 +216,12 @@ def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]])
seen: dict[str, dict[str, Any]] = {} seen: dict[str, dict[str, Any]] = {}
for rec in records: for rec in records:
body = rec["response"]["body"] body = rec["response"]["body"]
if endpoint == "/api/tags":
items = body.models
elif endpoint == "/v1/models":
items = body.data
else:
items = []
for m in items:
if endpoint == "/v1/models": if endpoint == "/v1/models":
for m in body:
key = m.id key = m.id
else: seen[key] = m
elif endpoint == "/api/tags":
for m in body.models:
key = m.model key = m.model
seen[key] = m seen[key] = m
@ -234,9 +230,8 @@ def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]])
canonical_req = canonical.get("request", {}) canonical_req = canonical.get("request", {})
if isinstance(canonical_req, dict): if isinstance(canonical_req, dict):
canonical_req["endpoint"] = endpoint canonical_req["endpoint"] = endpoint
if endpoint == "/v1/models": body = ordered
body = {"data": ordered, "object": "list"} if endpoint == "/api/tags":
else:
from ollama import ListResponse from ollama import ListResponse
body = ListResponse(models=ordered) body = ListResponse(models=ordered)
@ -247,12 +242,17 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
global _current_mode, _current_storage global _current_mode, _current_storage
if _current_mode == InferenceMode.LIVE or _current_storage is None: if _current_mode == InferenceMode.LIVE or _current_storage is None:
# Normal operation if endpoint == "/v1/models":
return original_method(self, *args, **kwargs)
else:
return await original_method(self, *args, **kwargs) return await original_method(self, *args, **kwargs)
# Get base URL based on client type # Get base URL based on client type
if client_type == "openai": if client_type == "openai":
base_url = str(self._client.base_url) base_url = str(self._client.base_url)
# the OpenAI client methods may pass NOT_GIVEN for unset parameters; filter these out
kwargs = {k: v for k, v in kwargs.items() if v is not NOT_GIVEN}
elif client_type == "ollama": elif client_type == "ollama":
# Get base URL from the client (Ollama client uses host attribute) # Get base URL from the client (Ollama client uses host attribute)
base_url = getattr(self, "host", "http://localhost:11434") base_url = getattr(self, "host", "http://localhost:11434")
@ -296,8 +296,15 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
) )
elif _current_mode == InferenceMode.RECORD: elif _current_mode == InferenceMode.RECORD:
if endpoint == "/v1/models":
response = original_method(self, *args, **kwargs)
else:
response = await original_method(self, *args, **kwargs) response = await original_method(self, *args, **kwargs)
# we want to store the result of the iterator, not the iterator itself
if endpoint == "/v1/models":
response = [m async for m in response]
request_data = { request_data = {
"method": method, "method": method,
"url": url, "url": url,
@ -376,10 +383,14 @@ def patch_inference_clients():
_original_methods["embeddings_create"], self, "openai", "/v1/embeddings", *args, **kwargs _original_methods["embeddings_create"], self, "openai", "/v1/embeddings", *args, **kwargs
) )
async def patched_models_list(self, *args, **kwargs): def patched_models_list(self, *args, **kwargs):
return await _patched_inference_method( async def _iter():
for item in await _patched_inference_method(
_original_methods["models_list"], self, "openai", "/v1/models", *args, **kwargs _original_methods["models_list"], self, "openai", "/v1/models", *args, **kwargs
) ):
yield item
return _iter()
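A simplified sketch of the wrapper pattern used for `models.list` above: the patched method must hand back an async iterator synchronously, so it wraps the awaited, recorded list in a local async generator (the `fetch_models` coroutine is a stand-in for the patched inference method, not the real helper):

```python
import asyncio


async def fetch_models() -> list[str]:
    # Stand-in for _patched_inference_method returning a recorded model list.
    return ["gpt-5-mini", "llama3.2:3b"]


def patched_models_list():
    # Return an async iterator without awaiting here, matching the synchronous
    # signature of the original models.list().
    async def _iter():
        for item in await fetch_models():
            yield item

    return _iter()


async def main():
    async for model in patched_models_list():
        print(model)


asyncio.run(main())
```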
# Apply OpenAI patches # Apply OpenAI patches
AsyncChatCompletions.create = patched_chat_completions_create AsyncChatCompletions.create = patched_chat_completions_create

View file

@ -11,7 +11,7 @@
"@radix-ui/react-collapsible": "^1.1.12", "@radix-ui/react-collapsible": "^1.1.12",
"@radix-ui/react-dialog": "^1.1.13", "@radix-ui/react-dialog": "^1.1.13",
"@radix-ui/react-dropdown-menu": "^2.1.16", "@radix-ui/react-dropdown-menu": "^2.1.16",
"@radix-ui/react-select": "^2.2.5", "@radix-ui/react-select": "^2.2.6",
"@radix-ui/react-separator": "^1.1.7", "@radix-ui/react-separator": "^1.1.7",
"@radix-ui/react-slot": "^1.2.3", "@radix-ui/react-slot": "^1.2.3",
"@radix-ui/react-tooltip": "^1.2.8", "@radix-ui/react-tooltip": "^1.2.8",
@ -20,7 +20,7 @@
"framer-motion": "^12.23.12", "framer-motion": "^12.23.12",
"llama-stack-client": "^0.2.21", "llama-stack-client": "^0.2.21",
"lucide-react": "^0.542.0", "lucide-react": "^0.542.0",
"next": "15.3.3", "next": "15.5.3",
"next-auth": "^4.24.11", "next-auth": "^4.24.11",
"next-themes": "^0.4.6", "next-themes": "^0.4.6",
"react": "^19.0.0", "react": "^19.0.0",
@ -664,9 +664,9 @@
} }
}, },
"node_modules/@emnapi/runtime": { "node_modules/@emnapi/runtime": {
"version": "1.4.3", "version": "1.5.0",
"resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.4.3.tgz", "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.5.0.tgz",
"integrity": "sha512-pBPWdu6MLKROBX05wSNKcNb++m5Er+KQ9QkB+WVM+pW2Kx9hoSrVTnu3BdkI5eBLZoKu/J6mW/B6i6bJB2ytXQ==", "integrity": "sha512-97/BJ3iXHww3djw6hYIfErCZFee7qCtrneuLa20UXFCOTCfBM2cvQHjWJ2EG0s0MtdNwInarqCTz35i4wWXHsQ==",
"license": "MIT", "license": "MIT",
"optional": true, "optional": true,
"dependencies": { "dependencies": {
@ -927,9 +927,9 @@
} }
}, },
"node_modules/@img/sharp-darwin-arm64": { "node_modules/@img/sharp-darwin-arm64": {
"version": "0.34.1", "version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-darwin-arm64/-/sharp-darwin-arm64-0.34.1.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-darwin-arm64/-/sharp-darwin-arm64-0.34.3.tgz",
"integrity": "sha512-pn44xgBtgpEbZsu+lWf2KNb6OAf70X68k+yk69Ic2Xz11zHR/w24/U49XT7AeRwJ0Px+mhALhU5LPci1Aymk7A==", "integrity": "sha512-ryFMfvxxpQRsgZJqBd4wsttYQbCxsJksrv9Lw/v798JcQ8+w84mBWuXwl+TT0WJ/WrYOLaYpwQXi3sA9nTIaIg==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
@ -945,13 +945,13 @@
"url": "https://opencollective.com/libvips" "url": "https://opencollective.com/libvips"
}, },
"optionalDependencies": { "optionalDependencies": {
"@img/sharp-libvips-darwin-arm64": "1.1.0" "@img/sharp-libvips-darwin-arm64": "1.2.0"
} }
}, },
"node_modules/@img/sharp-darwin-x64": { "node_modules/@img/sharp-darwin-x64": {
"version": "0.34.1", "version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-darwin-x64/-/sharp-darwin-x64-0.34.1.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-darwin-x64/-/sharp-darwin-x64-0.34.3.tgz",
"integrity": "sha512-VfuYgG2r8BpYiOUN+BfYeFo69nP/MIwAtSJ7/Zpxc5QF3KS22z8Pvg3FkrSFJBPNQ7mmcUcYQFBmEQp7eu1F8Q==", "integrity": "sha512-yHpJYynROAj12TA6qil58hmPmAwxKKC7reUqtGLzsOHfP7/rniNGTL8tjWX6L3CTV4+5P4ypcS7Pp+7OB+8ihA==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
@ -967,13 +967,13 @@
"url": "https://opencollective.com/libvips" "url": "https://opencollective.com/libvips"
}, },
"optionalDependencies": { "optionalDependencies": {
"@img/sharp-libvips-darwin-x64": "1.1.0" "@img/sharp-libvips-darwin-x64": "1.2.0"
} }
}, },
"node_modules/@img/sharp-libvips-darwin-arm64": { "node_modules/@img/sharp-libvips-darwin-arm64": {
"version": "1.1.0", "version": "1.2.0",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-arm64/-/sharp-libvips-darwin-arm64-1.1.0.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-arm64/-/sharp-libvips-darwin-arm64-1.2.0.tgz",
"integrity": "sha512-HZ/JUmPwrJSoM4DIQPv/BfNh9yrOA8tlBbqbLz4JZ5uew2+o22Ik+tHQJcih7QJuSa0zo5coHTfD5J8inqj9DA==", "integrity": "sha512-sBZmpwmxqwlqG9ueWFXtockhsxefaV6O84BMOrhtg/YqbTaRdqDE7hxraVE3y6gVM4eExmfzW4a8el9ArLeEiQ==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
@ -987,9 +987,9 @@
} }
}, },
"node_modules/@img/sharp-libvips-darwin-x64": { "node_modules/@img/sharp-libvips-darwin-x64": {
"version": "1.1.0", "version": "1.2.0",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-x64/-/sharp-libvips-darwin-x64-1.1.0.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-x64/-/sharp-libvips-darwin-x64-1.2.0.tgz",
"integrity": "sha512-Xzc2ToEmHN+hfvsl9wja0RlnXEgpKNmftriQp6XzY/RaSfwD9th+MSh0WQKzUreLKKINb3afirxW7A0fz2YWuQ==", "integrity": "sha512-M64XVuL94OgiNHa5/m2YvEQI5q2cl9d/wk0qFTDVXcYzi43lxuiFTftMR1tOnFQovVXNZJ5TURSDK2pNe9Yzqg==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
@ -1003,9 +1003,9 @@
} }
}, },
"node_modules/@img/sharp-libvips-linux-arm": { "node_modules/@img/sharp-libvips-linux-arm": {
"version": "1.1.0", "version": "1.2.0",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm/-/sharp-libvips-linux-arm-1.1.0.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm/-/sharp-libvips-linux-arm-1.2.0.tgz",
"integrity": "sha512-s8BAd0lwUIvYCJyRdFqvsj+BJIpDBSxs6ivrOPm/R7piTs5UIwY5OjXrP2bqXC9/moGsyRa37eYWYCOGVXxVrA==", "integrity": "sha512-mWd2uWvDtL/nvIzThLq3fr2nnGfyr/XMXlq8ZJ9WMR6PXijHlC3ksp0IpuhK6bougvQrchUAfzRLnbsen0Cqvw==",
"cpu": [ "cpu": [
"arm" "arm"
], ],
@ -1019,9 +1019,9 @@
} }
}, },
"node_modules/@img/sharp-libvips-linux-arm64": { "node_modules/@img/sharp-libvips-linux-arm64": {
"version": "1.1.0", "version": "1.2.0",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm64/-/sharp-libvips-linux-arm64-1.1.0.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm64/-/sharp-libvips-linux-arm64-1.2.0.tgz",
"integrity": "sha512-IVfGJa7gjChDET1dK9SekxFFdflarnUB8PwW8aGwEoF3oAsSDuNUTYS+SKDOyOJxQyDC1aPFMuRYLoDInyV9Ew==", "integrity": "sha512-RXwd0CgG+uPRX5YYrkzKyalt2OJYRiJQ8ED/fi1tq9WQW2jsQIn0tqrlR5l5dr/rjqq6AHAxURhj2DVjyQWSOA==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
@ -1035,9 +1035,9 @@
} }
}, },
"node_modules/@img/sharp-libvips-linux-ppc64": { "node_modules/@img/sharp-libvips-linux-ppc64": {
"version": "1.1.0", "version": "1.2.0",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-ppc64/-/sharp-libvips-linux-ppc64-1.1.0.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-ppc64/-/sharp-libvips-linux-ppc64-1.2.0.tgz",
"integrity": "sha512-tiXxFZFbhnkWE2LA8oQj7KYR+bWBkiV2nilRldT7bqoEZ4HiDOcePr9wVDAZPi/Id5fT1oY9iGnDq20cwUz8lQ==", "integrity": "sha512-Xod/7KaDDHkYu2phxxfeEPXfVXFKx70EAFZ0qyUdOjCcxbjqyJOEUpDe6RIyaunGxT34Anf9ue/wuWOqBW2WcQ==",
"cpu": [ "cpu": [
"ppc64" "ppc64"
], ],
@ -1051,9 +1051,9 @@
} }
}, },
"node_modules/@img/sharp-libvips-linux-s390x": { "node_modules/@img/sharp-libvips-linux-s390x": {
"version": "1.1.0", "version": "1.2.0",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-s390x/-/sharp-libvips-linux-s390x-1.1.0.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-s390x/-/sharp-libvips-linux-s390x-1.2.0.tgz",
"integrity": "sha512-xukSwvhguw7COyzvmjydRb3x/09+21HykyapcZchiCUkTThEQEOMtBj9UhkaBRLuBrgLFzQ2wbxdeCCJW/jgJA==", "integrity": "sha512-eMKfzDxLGT8mnmPJTNMcjfO33fLiTDsrMlUVcp6b96ETbnJmd4uvZxVJSKPQfS+odwfVaGifhsB07J1LynFehw==",
"cpu": [ "cpu": [
"s390x" "s390x"
], ],
@ -1067,9 +1067,9 @@
} }
}, },
"node_modules/@img/sharp-libvips-linux-x64": { "node_modules/@img/sharp-libvips-linux-x64": {
"version": "1.1.0", "version": "1.2.0",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-x64/-/sharp-libvips-linux-x64-1.1.0.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-x64/-/sharp-libvips-linux-x64-1.2.0.tgz",
"integrity": "sha512-yRj2+reB8iMg9W5sULM3S74jVS7zqSzHG3Ol/twnAAkAhnGQnpjj6e4ayUz7V+FpKypwgs82xbRdYtchTTUB+Q==", "integrity": "sha512-ZW3FPWIc7K1sH9E3nxIGB3y3dZkpJlMnkk7z5tu1nSkBoCgw2nSRTFHI5pB/3CQaJM0pdzMF3paf9ckKMSE9Tg==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
@ -1083,9 +1083,9 @@
} }
}, },
"node_modules/@img/sharp-libvips-linuxmusl-arm64": { "node_modules/@img/sharp-libvips-linuxmusl-arm64": {
"version": "1.1.0", "version": "1.2.0",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-arm64/-/sharp-libvips-linuxmusl-arm64-1.1.0.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-arm64/-/sharp-libvips-linuxmusl-arm64-1.2.0.tgz",
"integrity": "sha512-jYZdG+whg0MDK+q2COKbYidaqW/WTz0cc1E+tMAusiDygrM4ypmSCjOJPmFTvHHJ8j/6cAGyeDWZOsK06tP33w==", "integrity": "sha512-UG+LqQJbf5VJ8NWJ5Z3tdIe/HXjuIdo4JeVNADXBFuG7z9zjoegpzzGIyV5zQKi4zaJjnAd2+g2nna8TZvuW9Q==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
@ -1099,9 +1099,9 @@
} }
}, },
"node_modules/@img/sharp-libvips-linuxmusl-x64": { "node_modules/@img/sharp-libvips-linuxmusl-x64": {
"version": "1.1.0", "version": "1.2.0",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-x64/-/sharp-libvips-linuxmusl-x64-1.1.0.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-x64/-/sharp-libvips-linuxmusl-x64-1.2.0.tgz",
"integrity": "sha512-wK7SBdwrAiycjXdkPnGCPLjYb9lD4l6Ze2gSdAGVZrEL05AOUJESWU2lhlC+Ffn5/G+VKuSm6zzbQSzFX/P65A==", "integrity": "sha512-SRYOLR7CXPgNze8akZwjoGBoN1ThNZoqpOgfnOxmWsklTGVfJiGJoC/Lod7aNMGA1jSsKWM1+HRX43OP6p9+6Q==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
@ -1115,9 +1115,9 @@
} }
}, },
"node_modules/@img/sharp-linux-arm": { "node_modules/@img/sharp-linux-arm": {
"version": "0.34.1", "version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-arm/-/sharp-linux-arm-0.34.1.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm/-/sharp-linux-arm-0.34.3.tgz",
"integrity": "sha512-anKiszvACti2sGy9CirTlNyk7BjjZPiML1jt2ZkTdcvpLU1YH6CXwRAZCA2UmRXnhiIftXQ7+Oh62Ji25W72jA==", "integrity": "sha512-oBK9l+h6KBN0i3dC8rYntLiVfW8D8wH+NPNT3O/WBHeW0OQWCjfWksLUaPidsrDKpJgXp3G3/hkmhptAW0I3+A==",
"cpu": [ "cpu": [
"arm" "arm"
], ],
@ -1133,13 +1133,13 @@
"url": "https://opencollective.com/libvips" "url": "https://opencollective.com/libvips"
}, },
"optionalDependencies": { "optionalDependencies": {
"@img/sharp-libvips-linux-arm": "1.1.0" "@img/sharp-libvips-linux-arm": "1.2.0"
} }
}, },
"node_modules/@img/sharp-linux-arm64": { "node_modules/@img/sharp-linux-arm64": {
"version": "0.34.1", "version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-arm64/-/sharp-linux-arm64-0.34.1.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm64/-/sharp-linux-arm64-0.34.3.tgz",
"integrity": "sha512-kX2c+vbvaXC6vly1RDf/IWNXxrlxLNpBVWkdpRq5Ka7OOKj6nr66etKy2IENf6FtOgklkg9ZdGpEu9kwdlcwOQ==", "integrity": "sha512-QdrKe3EvQrqwkDrtuTIjI0bu6YEJHTgEeqdzI3uWJOH6G1O8Nl1iEeVYRGdj1h5I21CqxSvQp1Yv7xeU3ZewbA==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
@ -1155,13 +1155,35 @@
"url": "https://opencollective.com/libvips" "url": "https://opencollective.com/libvips"
}, },
"optionalDependencies": { "optionalDependencies": {
"@img/sharp-libvips-linux-arm64": "1.1.0" "@img/sharp-libvips-linux-arm64": "1.2.0"
}
},
"node_modules/@img/sharp-linux-ppc64": {
"version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-ppc64/-/sharp-linux-ppc64-0.34.3.tgz",
"integrity": "sha512-GLtbLQMCNC5nxuImPR2+RgrviwKwVql28FWZIW1zWruy6zLgA5/x2ZXk3mxj58X/tszVF69KK0Is83V8YgWhLA==",
"cpu": [
"ppc64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-linux-ppc64": "1.2.0"
} }
}, },
"node_modules/@img/sharp-linux-s390x": { "node_modules/@img/sharp-linux-s390x": {
"version": "0.34.1", "version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-s390x/-/sharp-linux-s390x-0.34.1.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-linux-s390x/-/sharp-linux-s390x-0.34.3.tgz",
"integrity": "sha512-7s0KX2tI9mZI2buRipKIw2X1ufdTeaRgwmRabt5bi9chYfhur+/C1OXg3TKg/eag1W+6CCWLVmSauV1owmRPxA==", "integrity": "sha512-3gahT+A6c4cdc2edhsLHmIOXMb17ltffJlxR0aC2VPZfwKoTGZec6u5GrFgdR7ciJSsHT27BD3TIuGcuRT0KmQ==",
"cpu": [ "cpu": [
"s390x" "s390x"
], ],
@ -1177,13 +1199,13 @@
"url": "https://opencollective.com/libvips" "url": "https://opencollective.com/libvips"
}, },
"optionalDependencies": { "optionalDependencies": {
"@img/sharp-libvips-linux-s390x": "1.1.0" "@img/sharp-libvips-linux-s390x": "1.2.0"
} }
}, },
"node_modules/@img/sharp-linux-x64": { "node_modules/@img/sharp-linux-x64": {
"version": "0.34.1", "version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-x64/-/sharp-linux-x64-0.34.1.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-linux-x64/-/sharp-linux-x64-0.34.3.tgz",
"integrity": "sha512-wExv7SH9nmoBW3Wr2gvQopX1k8q2g5V5Iag8Zk6AVENsjwd+3adjwxtp3Dcu2QhOXr8W9NusBU6XcQUohBZ5MA==", "integrity": "sha512-8kYso8d806ypnSq3/Ly0QEw90V5ZoHh10yH0HnrzOCr6DKAPI6QVHvwleqMkVQ0m+fc7EH8ah0BB0QPuWY6zJQ==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
@ -1199,13 +1221,13 @@
"url": "https://opencollective.com/libvips" "url": "https://opencollective.com/libvips"
}, },
"optionalDependencies": { "optionalDependencies": {
"@img/sharp-libvips-linux-x64": "1.1.0" "@img/sharp-libvips-linux-x64": "1.2.0"
} }
}, },
"node_modules/@img/sharp-linuxmusl-arm64": { "node_modules/@img/sharp-linuxmusl-arm64": {
"version": "0.34.1", "version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-arm64/-/sharp-linuxmusl-arm64-0.34.1.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-arm64/-/sharp-linuxmusl-arm64-0.34.3.tgz",
"integrity": "sha512-DfvyxzHxw4WGdPiTF0SOHnm11Xv4aQexvqhRDAoD00MzHekAj9a/jADXeXYCDFH/DzYruwHbXU7uz+H+nWmSOQ==", "integrity": "sha512-vAjbHDlr4izEiXM1OTggpCcPg9tn4YriK5vAjowJsHwdBIdx0fYRsURkxLG2RLm9gyBq66gwtWI8Gx0/ov+JKQ==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
@ -1221,13 +1243,13 @@
"url": "https://opencollective.com/libvips" "url": "https://opencollective.com/libvips"
}, },
"optionalDependencies": { "optionalDependencies": {
"@img/sharp-libvips-linuxmusl-arm64": "1.1.0" "@img/sharp-libvips-linuxmusl-arm64": "1.2.0"
} }
}, },
"node_modules/@img/sharp-linuxmusl-x64": { "node_modules/@img/sharp-linuxmusl-x64": {
"version": "0.34.1", "version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-x64/-/sharp-linuxmusl-x64-0.34.1.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-x64/-/sharp-linuxmusl-x64-0.34.3.tgz",
"integrity": "sha512-pax/kTR407vNb9qaSIiWVnQplPcGU8LRIJpDT5o8PdAx5aAA7AS3X9PS8Isw1/WfqgQorPotjrZL3Pqh6C5EBg==", "integrity": "sha512-gCWUn9547K5bwvOn9l5XGAEjVTTRji4aPTqLzGXHvIr6bIDZKNTA34seMPgM0WmSf+RYBH411VavCejp3PkOeQ==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
@ -1243,20 +1265,20 @@
"url": "https://opencollective.com/libvips" "url": "https://opencollective.com/libvips"
}, },
"optionalDependencies": { "optionalDependencies": {
"@img/sharp-libvips-linuxmusl-x64": "1.1.0" "@img/sharp-libvips-linuxmusl-x64": "1.2.0"
} }
}, },
"node_modules/@img/sharp-wasm32": { "node_modules/@img/sharp-wasm32": {
"version": "0.34.1", "version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-wasm32/-/sharp-wasm32-0.34.1.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-wasm32/-/sharp-wasm32-0.34.3.tgz",
"integrity": "sha512-YDybQnYrLQfEpzGOQe7OKcyLUCML4YOXl428gOOzBgN6Gw0rv8dpsJ7PqTHxBnXnwXr8S1mYFSLSa727tpz0xg==", "integrity": "sha512-+CyRcpagHMGteySaWos8IbnXcHgfDn7pO2fiC2slJxvNq9gDipYBN42/RagzctVRKgxATmfqOSulgZv5e1RdMg==",
"cpu": [ "cpu": [
"wasm32" "wasm32"
], ],
"license": "Apache-2.0 AND LGPL-3.0-or-later AND MIT", "license": "Apache-2.0 AND LGPL-3.0-or-later AND MIT",
"optional": true, "optional": true,
"dependencies": { "dependencies": {
"@emnapi/runtime": "^1.4.0" "@emnapi/runtime": "^1.4.4"
}, },
"engines": { "engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0" "node": "^18.17.0 || ^20.3.0 || >=21.0.0"
@ -1265,10 +1287,29 @@
"url": "https://opencollective.com/libvips" "url": "https://opencollective.com/libvips"
} }
}, },
"node_modules/@img/sharp-win32-arm64": {
"version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-win32-arm64/-/sharp-win32-arm64-0.34.3.tgz",
"integrity": "sha512-MjnHPnbqMXNC2UgeLJtX4XqoVHHlZNd+nPt1kRPmj63wURegwBhZlApELdtxM2OIZDRv/DFtLcNhVbd1z8GYXQ==",
"cpu": [
"arm64"
],
"license": "Apache-2.0 AND LGPL-3.0-or-later",
"optional": true,
"os": [
"win32"
],
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
}
},
"node_modules/@img/sharp-win32-ia32": { "node_modules/@img/sharp-win32-ia32": {
"version": "0.34.1", "version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-win32-ia32/-/sharp-win32-ia32-0.34.1.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-win32-ia32/-/sharp-win32-ia32-0.34.3.tgz",
"integrity": "sha512-WKf/NAZITnonBf3U1LfdjoMgNO5JYRSlhovhRhMxXVdvWYveM4kM3L8m35onYIdh75cOMCo1BexgVQcCDzyoWw==", "integrity": "sha512-xuCdhH44WxuXgOM714hn4amodJMZl3OEvf0GVTm0BEyMeA2to+8HEdRPShH0SLYptJY1uBw+SCFP9WVQi1Q/cw==",
"cpu": [ "cpu": [
"ia32" "ia32"
], ],
@ -1285,9 +1326,9 @@
} }
}, },
"node_modules/@img/sharp-win32-x64": { "node_modules/@img/sharp-win32-x64": {
"version": "0.34.1", "version": "0.34.3",
"resolved": "https://registry.npmjs.org/@img/sharp-win32-x64/-/sharp-win32-x64-0.34.1.tgz", "resolved": "https://registry.npmjs.org/@img/sharp-win32-x64/-/sharp-win32-x64-0.34.3.tgz",
"integrity": "sha512-hw1iIAHpNE8q3uMIRCgGOeDoz9KtFNarFLQclLxr/LK1VBkj8nby18RjFvr6aP7USRYAjTZW6yisnBWMX571Tw==", "integrity": "sha512-OWwz05d++TxzLEv4VnsTz5CmZ6mI6S05sfQGEMrNrQcOEERbX46332IvE7pO/EUiw7jUrrS40z/M7kPyjfl04g==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
@ -1849,9 +1890,10 @@
} }
}, },
"node_modules/@next/env": { "node_modules/@next/env": {
"version": "15.3.3", "version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/env/-/env-15.3.3.tgz", "resolved": "https://registry.npmjs.org/@next/env/-/env-15.5.3.tgz",
"integrity": "sha512-OdiMrzCl2Xi0VTjiQQUK0Xh7bJHnOuET2s+3V+Y40WJBAXrJeGA3f+I8MZJ/YQ3mVGi5XGR1L66oFlgqXhQ4Vw==" "integrity": "sha512-RSEDTRqyihYXygx/OJXwvVupfr9m04+0vH8vyy0HfZ7keRto6VX9BbEk0J2PUk0VGy6YhklJUSrgForov5F9pw==",
"license": "MIT"
}, },
"node_modules/@next/eslint-plugin-next": { "node_modules/@next/eslint-plugin-next": {
"version": "15.5.2", "version": "15.5.2",
@ -1864,12 +1906,13 @@
} }
}, },
"node_modules/@next/swc-darwin-arm64": { "node_modules/@next/swc-darwin-arm64": {
"version": "15.3.3", "version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-15.3.3.tgz", "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-15.5.3.tgz",
"integrity": "sha512-WRJERLuH+O3oYB4yZNVahSVFmtxRNjNF1I1c34tYMoJb0Pve+7/RaLAJJizyYiFhjYNGHRAE1Ri2Fd23zgDqhg==", "integrity": "sha512-nzbHQo69+au9wJkGKTU9lP7PXv0d1J5ljFpvb+LnEomLtSbJkbZyEs6sbF3plQmiOB2l9OBtN2tNSvCH1nQ9Jg==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
"license": "MIT",
"optional": true, "optional": true,
"os": [ "os": [
"darwin" "darwin"
@ -1879,12 +1922,13 @@
} }
}, },
"node_modules/@next/swc-darwin-x64": { "node_modules/@next/swc-darwin-x64": {
"version": "15.3.3", "version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-15.3.3.tgz", "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-15.5.3.tgz",
"integrity": "sha512-XHdzH/yBc55lu78k/XwtuFR/ZXUTcflpRXcsu0nKmF45U96jt1tsOZhVrn5YH+paw66zOANpOnFQ9i6/j+UYvw==", "integrity": "sha512-w83w4SkOOhekJOcA5HBvHyGzgV1W/XvOfpkrxIse4uPWhYTTRwtGEM4v/jiXwNSJvfRvah0H8/uTLBKRXlef8g==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
"license": "MIT",
"optional": true, "optional": true,
"os": [ "os": [
"darwin" "darwin"
@ -1894,12 +1938,13 @@
} }
}, },
"node_modules/@next/swc-linux-arm64-gnu": { "node_modules/@next/swc-linux-arm64-gnu": {
"version": "15.3.3", "version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-15.3.3.tgz", "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-15.5.3.tgz",
"integrity": "sha512-VZ3sYL2LXB8znNGcjhocikEkag/8xiLgnvQts41tq6i+wql63SMS1Q6N8RVXHw5pEUjiof+II3HkDd7GFcgkzw==", "integrity": "sha512-+m7pfIs0/yvgVu26ieaKrifV8C8yiLe7jVp9SpcIzg7XmyyNE7toC1fy5IOQozmr6kWl/JONC51osih2RyoXRw==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
"license": "MIT",
"optional": true, "optional": true,
"os": [ "os": [
"linux" "linux"
@ -1909,12 +1954,13 @@
} }
}, },
"node_modules/@next/swc-linux-arm64-musl": { "node_modules/@next/swc-linux-arm64-musl": {
"version": "15.3.3", "version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-15.3.3.tgz", "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-15.5.3.tgz",
"integrity": "sha512-h6Y1fLU4RWAp1HPNJWDYBQ+e3G7sLckyBXhmH9ajn8l/RSMnhbuPBV/fXmy3muMcVwoJdHL+UtzRzs0nXOf9SA==", "integrity": "sha512-u3PEIzuguSenoZviZJahNLgCexGFhso5mxWCrrIMdvpZn6lkME5vc/ADZG8UUk5K1uWRy4hqSFECrON6UKQBbQ==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
"license": "MIT",
"optional": true, "optional": true,
"os": [ "os": [
"linux" "linux"
@ -1924,12 +1970,13 @@
} }
}, },
"node_modules/@next/swc-linux-x64-gnu": { "node_modules/@next/swc-linux-x64-gnu": {
"version": "15.3.3", "version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-15.3.3.tgz", "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-15.5.3.tgz",
"integrity": "sha512-jJ8HRiF3N8Zw6hGlytCj5BiHyG/K+fnTKVDEKvUCyiQ/0r5tgwO7OgaRiOjjRoIx2vwLR+Rz8hQoPrnmFbJdfw==", "integrity": "sha512-lDtOOScYDZxI2BENN9m0pfVPJDSuUkAD1YXSvlJF0DKwZt0WlA7T7o3wrcEr4Q+iHYGzEaVuZcsIbCps4K27sA==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
"license": "MIT",
"optional": true, "optional": true,
"os": [ "os": [
"linux" "linux"
@ -1939,12 +1986,13 @@
} }
}, },
"node_modules/@next/swc-linux-x64-musl": { "node_modules/@next/swc-linux-x64-musl": {
"version": "15.3.3", "version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-15.3.3.tgz", "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-15.5.3.tgz",
"integrity": "sha512-HrUcTr4N+RgiiGn3jjeT6Oo208UT/7BuTr7K0mdKRBtTbT4v9zJqCDKO97DUqqoBK1qyzP1RwvrWTvU6EPh/Cw==", "integrity": "sha512-9vWVUnsx9PrY2NwdVRJ4dUURAQ8Su0sLRPqcCCxtX5zIQUBES12eRVHq6b70bbfaVaxIDGJN2afHui0eDm+cLg==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
"license": "MIT",
"optional": true, "optional": true,
"os": [ "os": [
"linux" "linux"
@ -1954,12 +2002,13 @@
} }
}, },
"node_modules/@next/swc-win32-arm64-msvc": { "node_modules/@next/swc-win32-arm64-msvc": {
"version": "15.3.3", "version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-15.3.3.tgz", "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-15.5.3.tgz",
"integrity": "sha512-SxorONgi6K7ZUysMtRF3mIeHC5aA3IQLmKFQzU0OuhuUYwpOBc1ypaLJLP5Bf3M9k53KUUUj4vTPwzGvl/NwlQ==", "integrity": "sha512-1CU20FZzY9LFQigRi6jM45oJMU3KziA5/sSG+dXeVaTm661snQP6xu3ykGxxwU5sLG3sh14teO/IOEPVsQMRfA==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
"license": "MIT",
"optional": true, "optional": true,
"os": [ "os": [
"win32" "win32"
@ -1969,12 +2018,13 @@
} }
}, },
"node_modules/@next/swc-win32-x64-msvc": { "node_modules/@next/swc-win32-x64-msvc": {
"version": "15.3.3", "version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-15.3.3.tgz", "resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-15.5.3.tgz",
"integrity": "sha512-4QZG6F8enl9/S2+yIiOiju0iCTFd93d8VC1q9LZS4p/Xuk81W2QDjCFeoogmrWWkAD59z8ZxepBQap2dKS5ruw==", "integrity": "sha512-JMoLAq3n3y5tKXPQwCK5c+6tmwkuFDa2XAxz8Wm4+IVthdBZdZGh+lmiLUHg9f9IDwIQpUjp+ysd6OkYTyZRZw==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
"license": "MIT",
"optional": true, "optional": true,
"os": [ "os": [
"win32" "win32"
@ -2874,22 +2924,22 @@
} }
}, },
"node_modules/@radix-ui/react-select": { "node_modules/@radix-ui/react-select": {
"version": "2.2.5", "version": "2.2.6",
"resolved": "https://registry.npmjs.org/@radix-ui/react-select/-/react-select-2.2.5.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-select/-/react-select-2.2.6.tgz",
"integrity": "sha512-HnMTdXEVuuyzx63ME0ut4+sEMYW6oouHWNGUZc7ddvUWIcfCva/AMoqEW/3wnEllriMWBa0RHspCYnfCWJQYmA==", "integrity": "sha512-I30RydO+bnn2PQztvo25tswPH+wFBjehVGtmagkU78yMdwTwVf12wnAOF+AeP8S2N8xD+5UPbGhkUfPyvT+mwQ==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@radix-ui/number": "1.1.1", "@radix-ui/number": "1.1.1",
"@radix-ui/primitive": "1.1.2", "@radix-ui/primitive": "1.1.3",
"@radix-ui/react-collection": "1.1.7", "@radix-ui/react-collection": "1.1.7",
"@radix-ui/react-compose-refs": "1.1.2", "@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2", "@radix-ui/react-context": "1.1.2",
"@radix-ui/react-direction": "1.1.1", "@radix-ui/react-direction": "1.1.1",
"@radix-ui/react-dismissable-layer": "1.1.10", "@radix-ui/react-dismissable-layer": "1.1.11",
"@radix-ui/react-focus-guards": "1.1.2", "@radix-ui/react-focus-guards": "1.1.3",
"@radix-ui/react-focus-scope": "1.1.7", "@radix-ui/react-focus-scope": "1.1.7",
"@radix-ui/react-id": "1.1.1", "@radix-ui/react-id": "1.1.1",
"@radix-ui/react-popper": "1.2.7", "@radix-ui/react-popper": "1.2.8",
"@radix-ui/react-portal": "1.1.9", "@radix-ui/react-portal": "1.1.9",
"@radix-ui/react-primitive": "2.1.3", "@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-slot": "1.2.3", "@radix-ui/react-slot": "1.2.3",
@ -2916,13 +2966,19 @@
} }
} }
}, },
"node_modules/@radix-ui/react-select/node_modules/@radix-ui/primitive": {
"version": "1.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.1.3.tgz",
"integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==",
"license": "MIT"
},
"node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-dismissable-layer": { "node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-dismissable-layer": {
"version": "1.1.10", "version": "1.1.11",
"resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.10.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.11.tgz",
"integrity": "sha512-IM1zzRV4W3HtVgftdQiiOmA0AdJlCtMLe00FXaHwgt3rAnNsIyDqshvkIW3hj/iu5hu8ERP7KIYki6NkqDxAwQ==", "integrity": "sha512-Nqcp+t5cTB8BinFkZgXiMJniQH0PsUt2k51FUhbdfeKvc4ACcG2uQniY/8+h1Yv6Kza4Q7lD7PQV0z0oicE0Mg==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@radix-ui/primitive": "1.1.2", "@radix-ui/primitive": "1.1.3",
"@radix-ui/react-compose-refs": "1.1.2", "@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-primitive": "2.1.3", "@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-callback-ref": "1.1.1", "@radix-ui/react-use-callback-ref": "1.1.1",
@ -2943,6 +2999,21 @@
} }
} }
}, },
"node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-focus-guards": {
"version": "1.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-focus-guards/-/react-focus-guards-1.1.3.tgz",
"integrity": "sha512-0rFg/Rj2Q62NCm62jZw0QX7a3sz6QCQU0LpZdNrJX8byRGaGVTqbrW9jAoIAHyMQqsNpeZ81YgSizOt5WXq0Pw==",
"license": "MIT",
"peerDependencies": {
"@types/react": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-focus-scope": { "node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-focus-scope": {
"version": "1.1.7", "version": "1.1.7",
"resolved": "https://registry.npmjs.org/@radix-ui/react-focus-scope/-/react-focus-scope-1.1.7.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-focus-scope/-/react-focus-scope-1.1.7.tgz",
@ -2968,38 +3039,6 @@
} }
} }
}, },
"node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-popper": {
"version": "1.2.7",
"resolved": "https://registry.npmjs.org/@radix-ui/react-popper/-/react-popper-1.2.7.tgz",
"integrity": "sha512-IUFAccz1JyKcf/RjB552PlWwxjeCJB8/4KxT7EhBHOJM+mN7LdW+B3kacJXILm32xawcMMjb2i0cIZpo+f9kiQ==",
"license": "MIT",
"dependencies": {
"@floating-ui/react-dom": "^2.0.0",
"@radix-ui/react-arrow": "1.1.7",
"@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2",
"@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-callback-ref": "1.1.1",
"@radix-ui/react-use-layout-effect": "1.1.1",
"@radix-ui/react-use-rect": "1.1.1",
"@radix-ui/react-use-size": "1.1.1",
"@radix-ui/rect": "1.1.1"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-portal": { "node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-portal": {
"version": "1.1.9", "version": "1.1.9",
"resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.9.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.9.tgz",
@ -3547,12 +3586,6 @@
"@sinonjs/commons": "^3.0.0" "@sinonjs/commons": "^3.0.0"
} }
}, },
"node_modules/@swc/counter": {
"version": "0.1.3",
"resolved": "https://registry.npmjs.org/@swc/counter/-/counter-0.1.3.tgz",
"integrity": "sha512-e2BR4lsJkkRlKZ/qCHPw9ZaSxc0MVUd7gtbtaB7aMvHeJVYe8sOB8DBZkP2DtISHGSku9sCK6T6cnY0CtXrOCQ==",
"license": "Apache-2.0"
},
"node_modules/@swc/helpers": { "node_modules/@swc/helpers": {
"version": "0.5.15", "version": "0.5.15",
"resolved": "https://registry.npmjs.org/@swc/helpers/-/helpers-0.5.15.tgz", "resolved": "https://registry.npmjs.org/@swc/helpers/-/helpers-0.5.15.tgz",
@ -3578,6 +3611,13 @@
"tailwindcss": "4.1.6" "tailwindcss": "4.1.6"
} }
}, },
"node_modules/@tailwindcss/node/node_modules/tailwindcss": {
"version": "4.1.6",
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.6.tgz",
"integrity": "sha512-j0cGLTreM6u4OWzBeLBpycK0WIh8w7kSwcUsQZoGLHZ7xDTdM69lN64AgoIEEwFi0tnhs4wSykUa5YWxAzgFYg==",
"dev": true,
"license": "MIT"
},
"node_modules/@tailwindcss/oxide": { "node_modules/@tailwindcss/oxide": {
"version": "4.1.6", "version": "4.1.6",
"resolved": "https://registry.npmjs.org/@tailwindcss/oxide/-/oxide-4.1.6.tgz", "resolved": "https://registry.npmjs.org/@tailwindcss/oxide/-/oxide-4.1.6.tgz",
@ -3838,6 +3878,13 @@
"tailwindcss": "4.1.6" "tailwindcss": "4.1.6"
} }
}, },
"node_modules/@tailwindcss/postcss/node_modules/tailwindcss": {
"version": "4.1.6",
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.6.tgz",
"integrity": "sha512-j0cGLTreM6u4OWzBeLBpycK0WIh8w7kSwcUsQZoGLHZ7xDTdM69lN64AgoIEEwFi0tnhs4wSykUa5YWxAzgFYg==",
"dev": true,
"license": "MIT"
},
"node_modules/@testing-library/dom": { "node_modules/@testing-library/dom": {
"version": "10.4.1", "version": "10.4.1",
"resolved": "https://registry.npmjs.org/@testing-library/dom/-/dom-10.4.1.tgz", "resolved": "https://registry.npmjs.org/@testing-library/dom/-/dom-10.4.1.tgz",
@ -5461,17 +5508,6 @@
"dev": true, "dev": true,
"license": "MIT" "license": "MIT"
}, },
"node_modules/busboy": {
"version": "1.6.0",
"resolved": "https://registry.npmjs.org/busboy/-/busboy-1.6.0.tgz",
"integrity": "sha512-8SFQbg/0hQ9xy3UNTB0YEnsNBbWfhf7RtnzpL7TkBiTBRfrQ9Fxcnz7VJsleJpyp6rVLvXiuORqjlHi5q+PYuA==",
"dependencies": {
"streamsearch": "^1.1.0"
},
"engines": {
"node": ">=10.16.0"
}
},
"node_modules/bytes": { "node_modules/bytes": {
"version": "3.1.2", "version": "3.1.2",
"resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz", "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz",
@ -8281,9 +8317,9 @@
} }
}, },
"node_modules/is-arrayish": { "node_modules/is-arrayish": {
"version": "0.3.2", "version": "0.3.4",
"resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.2.tgz", "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.4.tgz",
"integrity": "sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ==", "integrity": "sha512-m6UrgzFVUYawGBh1dUsWR5M2Clqic9RVXC/9f8ceNlv2IcO9j9J/z8UoCLPqtsPBFNzEpfR3xftohbfqDx8EQA==",
"license": "MIT", "license": "MIT",
"optional": true "optional": true
}, },
@ -11528,14 +11564,13 @@
} }
}, },
"node_modules/next": { "node_modules/next": {
"version": "15.3.3", "version": "15.5.3",
"resolved": "https://registry.npmjs.org/next/-/next-15.3.3.tgz", "resolved": "https://registry.npmjs.org/next/-/next-15.5.3.tgz",
"integrity": "sha512-JqNj29hHNmCLtNvd090SyRbXJiivQ+58XjCcrC50Crb5g5u2zi7Y2YivbsEfzk6AtVI80akdOQbaMZwWB1Hthw==", "integrity": "sha512-r/liNAx16SQj4D+XH/oI1dlpv9tdKJ6cONYPwwcCC46f2NjpaRWY+EKCzULfgQYV6YKXjHBchff2IZBSlZmJNw==",
"license": "MIT",
"dependencies": { "dependencies": {
"@next/env": "15.3.3", "@next/env": "15.5.3",
"@swc/counter": "0.1.3",
"@swc/helpers": "0.5.15", "@swc/helpers": "0.5.15",
"busboy": "1.6.0",
"caniuse-lite": "^1.0.30001579", "caniuse-lite": "^1.0.30001579",
"postcss": "8.4.31", "postcss": "8.4.31",
"styled-jsx": "5.1.6" "styled-jsx": "5.1.6"
@ -11547,19 +11582,19 @@
"node": "^18.18.0 || ^19.8.0 || >= 20.0.0" "node": "^18.18.0 || ^19.8.0 || >= 20.0.0"
}, },
"optionalDependencies": { "optionalDependencies": {
"@next/swc-darwin-arm64": "15.3.3", "@next/swc-darwin-arm64": "15.5.3",
"@next/swc-darwin-x64": "15.3.3", "@next/swc-darwin-x64": "15.5.3",
"@next/swc-linux-arm64-gnu": "15.3.3", "@next/swc-linux-arm64-gnu": "15.5.3",
"@next/swc-linux-arm64-musl": "15.3.3", "@next/swc-linux-arm64-musl": "15.5.3",
"@next/swc-linux-x64-gnu": "15.3.3", "@next/swc-linux-x64-gnu": "15.5.3",
"@next/swc-linux-x64-musl": "15.3.3", "@next/swc-linux-x64-musl": "15.5.3",
"@next/swc-win32-arm64-msvc": "15.3.3", "@next/swc-win32-arm64-msvc": "15.5.3",
"@next/swc-win32-x64-msvc": "15.3.3", "@next/swc-win32-x64-msvc": "15.5.3",
"sharp": "^0.34.1" "sharp": "^0.34.3"
}, },
"peerDependencies": { "peerDependencies": {
"@opentelemetry/api": "^1.1.0", "@opentelemetry/api": "^1.1.0",
"@playwright/test": "^1.41.2", "@playwright/test": "^1.51.1",
"babel-plugin-react-compiler": "*", "babel-plugin-react-compiler": "*",
"react": "^18.2.0 || 19.0.0-rc-de68d2f4-20241204 || ^19.0.0", "react": "^18.2.0 || 19.0.0-rc-de68d2f4-20241204 || ^19.0.0",
"react-dom": "^18.2.0 || 19.0.0-rc-de68d2f4-20241204 || ^19.0.0", "react-dom": "^18.2.0 || 19.0.0-rc-de68d2f4-20241204 || ^19.0.0",
@ -13226,16 +13261,16 @@
"license": "ISC" "license": "ISC"
}, },
"node_modules/sharp": { "node_modules/sharp": {
"version": "0.34.1", "version": "0.34.3",
"resolved": "https://registry.npmjs.org/sharp/-/sharp-0.34.1.tgz", "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.34.3.tgz",
"integrity": "sha512-1j0w61+eVxu7DawFJtnfYcvSv6qPFvfTaqzTQ2BLknVhHTwGS8sc63ZBF4rzkWMBVKybo4S5OBtDdZahh2A1xg==", "integrity": "sha512-eX2IQ6nFohW4DbvHIOLRB3MHFpYqaqvXd3Tp5e/T/dSH83fxaNJQRvDMhASmkNTsNTVF2/OOopzRCt7xokgPfg==",
"hasInstallScript": true, "hasInstallScript": true,
"license": "Apache-2.0", "license": "Apache-2.0",
"optional": true, "optional": true,
"dependencies": { "dependencies": {
"color": "^4.2.3", "color": "^4.2.3",
"detect-libc": "^2.0.3", "detect-libc": "^2.0.4",
"semver": "^7.7.1" "semver": "^7.7.2"
}, },
"engines": { "engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0" "node": "^18.17.0 || ^20.3.0 || >=21.0.0"
@ -13244,26 +13279,28 @@
"url": "https://opencollective.com/libvips" "url": "https://opencollective.com/libvips"
}, },
"optionalDependencies": { "optionalDependencies": {
"@img/sharp-darwin-arm64": "0.34.1", "@img/sharp-darwin-arm64": "0.34.3",
"@img/sharp-darwin-x64": "0.34.1", "@img/sharp-darwin-x64": "0.34.3",
"@img/sharp-libvips-darwin-arm64": "1.1.0", "@img/sharp-libvips-darwin-arm64": "1.2.0",
"@img/sharp-libvips-darwin-x64": "1.1.0", "@img/sharp-libvips-darwin-x64": "1.2.0",
"@img/sharp-libvips-linux-arm": "1.1.0", "@img/sharp-libvips-linux-arm": "1.2.0",
"@img/sharp-libvips-linux-arm64": "1.1.0", "@img/sharp-libvips-linux-arm64": "1.2.0",
"@img/sharp-libvips-linux-ppc64": "1.1.0", "@img/sharp-libvips-linux-ppc64": "1.2.0",
"@img/sharp-libvips-linux-s390x": "1.1.0", "@img/sharp-libvips-linux-s390x": "1.2.0",
"@img/sharp-libvips-linux-x64": "1.1.0", "@img/sharp-libvips-linux-x64": "1.2.0",
"@img/sharp-libvips-linuxmusl-arm64": "1.1.0", "@img/sharp-libvips-linuxmusl-arm64": "1.2.0",
"@img/sharp-libvips-linuxmusl-x64": "1.1.0", "@img/sharp-libvips-linuxmusl-x64": "1.2.0",
"@img/sharp-linux-arm": "0.34.1", "@img/sharp-linux-arm": "0.34.3",
"@img/sharp-linux-arm64": "0.34.1", "@img/sharp-linux-arm64": "0.34.3",
"@img/sharp-linux-s390x": "0.34.1", "@img/sharp-linux-ppc64": "0.34.3",
"@img/sharp-linux-x64": "0.34.1", "@img/sharp-linux-s390x": "0.34.3",
"@img/sharp-linuxmusl-arm64": "0.34.1", "@img/sharp-linux-x64": "0.34.3",
"@img/sharp-linuxmusl-x64": "0.34.1", "@img/sharp-linuxmusl-arm64": "0.34.3",
"@img/sharp-wasm32": "0.34.1", "@img/sharp-linuxmusl-x64": "0.34.3",
"@img/sharp-win32-ia32": "0.34.1", "@img/sharp-wasm32": "0.34.3",
"@img/sharp-win32-x64": "0.34.1" "@img/sharp-win32-arm64": "0.34.3",
"@img/sharp-win32-ia32": "0.34.3",
"@img/sharp-win32-x64": "0.34.3"
} }
}, },
"node_modules/shebang-command": { "node_modules/shebang-command": {
@ -13389,9 +13426,9 @@
"license": "ISC" "license": "ISC"
}, },
"node_modules/simple-swizzle": { "node_modules/simple-swizzle": {
"version": "0.2.2", "version": "0.2.4",
"resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz", "resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.4.tgz",
"integrity": "sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==", "integrity": "sha512-nAu1WFPQSMNr2Zn9PGSZK9AGn4t/y97lEm+MXTtUDwfP0ksAIX4nO+6ruD9Jwut4C49SB1Ws+fbXsm/yScWOHw==",
"license": "MIT", "license": "MIT",
"optional": true, "optional": true,
"dependencies": { "dependencies": {
@ -13512,14 +13549,6 @@
"node": ">= 0.8" "node": ">= 0.8"
} }
}, },
"node_modules/streamsearch": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/streamsearch/-/streamsearch-1.1.0.tgz",
"integrity": "sha512-Mcc5wHehp9aXz1ax6bZUyY5afg9u2rv5cqQI3mRrYkGC8rW2hM02jWuwjtL++LS5qinSyhj2QfLyNsuc+VsExg==",
"engines": {
"node": ">=10.0.0"
}
},
"node_modules/string-length": { "node_modules/string-length": {
"version": "4.0.2", "version": "4.0.2",
"resolved": "https://registry.npmjs.org/string-length/-/string-length-4.0.2.tgz", "resolved": "https://registry.npmjs.org/string-length/-/string-length-4.0.2.tgz",
@ -13843,9 +13872,9 @@
} }
}, },
"node_modules/tailwindcss": { "node_modules/tailwindcss": {
"version": "4.1.6", "version": "4.1.13",
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.6.tgz", "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.13.tgz",
"integrity": "sha512-j0cGLTreM6u4OWzBeLBpycK0WIh8w7kSwcUsQZoGLHZ7xDTdM69lN64AgoIEEwFi0tnhs4wSykUa5YWxAzgFYg==", "integrity": "sha512-i+zidfmTqtwquj4hMEwdjshYYgMbOrPzb9a0M3ZgNa0JMoZeFC6bxZvO8yr8ozS6ix2SDz0+mvryPeBs2TFE+w==",
"dev": true, "dev": true,
"license": "MIT" "license": "MIT"
}, },

View file

@ -16,7 +16,7 @@
"@radix-ui/react-collapsible": "^1.1.12", "@radix-ui/react-collapsible": "^1.1.12",
"@radix-ui/react-dialog": "^1.1.13", "@radix-ui/react-dialog": "^1.1.13",
"@radix-ui/react-dropdown-menu": "^2.1.16", "@radix-ui/react-dropdown-menu": "^2.1.16",
"@radix-ui/react-select": "^2.2.5", "@radix-ui/react-select": "^2.2.6",
"@radix-ui/react-separator": "^1.1.7", "@radix-ui/react-separator": "^1.1.7",
"@radix-ui/react-slot": "^1.2.3", "@radix-ui/react-slot": "^1.2.3",
"@radix-ui/react-tooltip": "^1.2.8", "@radix-ui/react-tooltip": "^1.2.8",
@ -25,7 +25,7 @@
"framer-motion": "^12.23.12", "framer-motion": "^12.23.12",
"llama-stack-client": "^0.2.21", "llama-stack-client": "^0.2.21",
"lucide-react": "^0.542.0", "lucide-react": "^0.542.0",
"next": "15.3.3", "next": "15.5.3",
"next-auth": "^4.24.11", "next-auth": "^4.24.11",
"next-themes": "^0.4.6", "next-themes": "^0.4.6",
"react": "^19.0.0", "react": "^19.0.0",

View file

@ -32,7 +32,7 @@ dependencies = [
"jinja2>=3.1.6", "jinja2>=3.1.6",
"jsonschema", "jsonschema",
"llama-stack-client>=0.2.21", "llama-stack-client>=0.2.21",
"openai>=1.99.6", "openai>=1.100.0", # for expires_after support
"prompt-toolkit", "prompt-toolkit",
"python-dotenv", "python-dotenv",
"python-jose[cryptography]", "python-jose[cryptography]",
@ -80,7 +80,6 @@ dev = [
unit = [ unit = [
"sqlite-vec", "sqlite-vec",
"ollama", "ollama",
"openai",
"aiosqlite", "aiosqlite",
"aiohttp", "aiohttp",
"psycopg2-binary>=2.9.0", "psycopg2-binary>=2.9.0",
@ -105,7 +104,6 @@ unit = [
# separately. If you are using "uv" to execute your tests, you can use the "--group" flag to specify extra # separately. If you are using "uv" to execute your tests, you can use the "--group" flag to specify extra
# dependencies. # dependencies.
test = [ test = [
"openai>=1.100.0", # for expires_after support
"aiosqlite", "aiosqlite",
"aiohttp", "aiohttp",
"torch>=2.6.0", "torch>=2.6.0",
@ -356,6 +354,7 @@ warn_required_dynamic_aliases = true
classmethod-decorators = ["classmethod", "pydantic.field_validator"] classmethod-decorators = ["classmethod", "pydantic.field_validator"]
[tool.pytest.ini_options] [tool.pytest.ini_options]
addopts = ["--durations=10"]
asyncio_mode = "auto" asyncio_mode = "auto"
markers = [ markers = [
"allow_network: Allow network access for specific unit tests", "allow_network: Allow network access for specific unit tests",

View file

@ -239,8 +239,9 @@ echo "Test pattern: ${TEST_PATTERN:-"(none)"}"
echo "" echo ""
# Prepare inputs for gh workflow run # Prepare inputs for gh workflow run
INPUTS=
if [[ -n "$TEST_SUBDIRS" ]]; then if [[ -n "$TEST_SUBDIRS" ]]; then
INPUTS="-f subdirs='$TEST_SUBDIRS'" INPUTS="$INPUTS -f subdirs='$TEST_SUBDIRS'"
fi fi
if [[ -n "$TEST_SETUP" ]]; then if [[ -n "$TEST_SETUP" ]]; then
INPUTS="$INPUTS -f test-setup='$TEST_SETUP'" INPUTS="$INPUTS -f test-setup='$TEST_SETUP'"

View file

@ -6,12 +6,25 @@
import time import time
import unicodedata
import pytest import pytest
from ..test_cases.test_case import TestCase from ..test_cases.test_case import TestCase
def _normalize_text(text: str) -> str:
"""
Normalize Unicode text by removing diacritical marks for comparison.
The test case streaming_01 expects the answer "Sol" for the question "What's the name of the Sun
in latin?", but the model is returning "sōl" (with a macron over the 'o'), which is the correct
Latin spelling. The test is failing because it's doing a simple case-insensitive string search
for "sol" but the actual response contains the diacritical mark.
"""
return unicodedata.normalize("NFD", text).encode("ascii", "ignore").decode("ascii").lower()
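A quick illustration of the normalization helper added above: NFD decomposition splits off the combining macron so the ASCII encode/ignore round-trip can drop it, letting a plain substring check match (example strings are illustrative):

```python
import unicodedata


def _normalize_text(text: str) -> str:
    # Decompose accented characters (NFD), drop combining marks via the ASCII
    # encode/ignore round-trip, and lowercase for comparison.
    return unicodedata.normalize("NFD", text).encode("ascii", "ignore").decode("ascii").lower()


print(_normalize_text("s\u014dl"))  # "sōl" -> "sol"
assert "sol" in _normalize_text("The Latin name is s\u014dl.")
```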
def provider_from_model(client_with_models, model_id): def provider_from_model(client_with_models, model_id):
models = {m.identifier: m for m in client_with_models.models.list()} models = {m.identifier: m for m in client_with_models.models.list()}
models.update({m.provider_resource_id: m for m in client_with_models.models.list()}) models.update({m.provider_resource_id: m for m in client_with_models.models.list()})
@ -42,6 +55,10 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
"remote::groq", "remote::groq",
"remote::gemini", # https://generativelanguage.googleapis.com/v1beta/openai/completions -> 404 "remote::gemini", # https://generativelanguage.googleapis.com/v1beta/openai/completions -> 404
"remote::anthropic", # at least claude-3-{5,7}-{haiku,sonnet}-* / claude-{sonnet,opus}-4-* are not supported "remote::anthropic", # at least claude-3-{5,7}-{haiku,sonnet}-* / claude-{sonnet,opus}-4-* are not supported
"remote::azure", # {'error': {'code': 'OperationNotSupported', 'message': 'The completion operation
# does not work with the specified model, gpt-5-mini. Please choose different model and try
# again. You can learn more about which models can be used with each operation here:
# https://go.microsoft.com/fwlink/?linkid=2197993.'}}"}
): ):
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.") pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")
@ -157,7 +174,8 @@ def test_openai_completion_non_streaming_suffix(llama_stack_client, client_with_
assert len(response.choices) > 0 assert len(response.choices) > 0
choice = response.choices[0] choice = response.choices[0]
assert len(choice.text) > 5 assert len(choice.text) > 5
assert "france" in choice.text.lower() normalized_text = _normalize_text(choice.text)
assert "france" in normalized_text
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -248,7 +266,9 @@ def test_openai_chat_completion_non_streaming(compat_client, client_with_models,
) )
message_content = response.choices[0].message.content.lower().strip() message_content = response.choices[0].message.content.lower().strip()
assert len(message_content) > 0 assert len(message_content) > 0
assert expected.lower() in message_content normalized_expected = _normalize_text(expected)
normalized_content = _normalize_text(message_content)
assert normalized_expected in normalized_content
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -272,10 +292,13 @@ def test_openai_chat_completion_streaming(compat_client, client_with_models, tex
) )
streamed_content = [] streamed_content = []
for chunk in response: for chunk in response:
if chunk.choices[0].delta.content: # On some providers like Azure, the choices are empty on the first chunk, so we need to check for that
if chunk.choices and len(chunk.choices) > 0 and chunk.choices[0].delta.content:
streamed_content.append(chunk.choices[0].delta.content.lower().strip()) streamed_content.append(chunk.choices[0].delta.content.lower().strip())
assert len(streamed_content) > 0 assert len(streamed_content) > 0
assert expected.lower() in "".join(streamed_content) normalized_expected = _normalize_text(expected)
normalized_content = _normalize_text("".join(streamed_content))
assert normalized_expected in normalized_content
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -308,8 +331,12 @@ def test_openai_chat_completion_streaming_with_n(compat_client, client_with_mode
streamed_content.get(choice.index, "") + choice.delta.content.lower().strip() streamed_content.get(choice.index, "") + choice.delta.content.lower().strip()
) )
assert len(streamed_content) == 2 assert len(streamed_content) == 2
normalized_expected = _normalize_text(expected)
for i, content in streamed_content.items(): for i, content in streamed_content.items():
assert expected.lower() in content, f"Choice {i}: Expected {expected.lower()} in {content}" normalized_content = _normalize_text(content)
assert normalized_expected in normalized_content, (
f"Choice {i}: Expected {normalized_expected} in {normalized_content}"
)
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -339,9 +366,9 @@ def test_inference_store(compat_client, client_with_models, text_model_id, strea
content = "" content = ""
response_id = None response_id = None
for chunk in response: for chunk in response:
if response_id is None: if response_id is None and chunk.id:
response_id = chunk.id response_id = chunk.id
if chunk.choices[0].delta.content: if chunk.choices and len(chunk.choices) > 0 and chunk.choices[0].delta.content:
content += chunk.choices[0].delta.content content += chunk.choices[0].delta.content
else: else:
response_id = response.id response_id = response.id
@ -410,8 +437,9 @@ def test_inference_store_tool_calls(compat_client, client_with_models, text_mode
content = "" content = ""
response_id = None response_id = None
for chunk in response: for chunk in response:
if response_id is None: if response_id is None and chunk.id:
response_id = chunk.id response_id = chunk.id
if chunk.choices and len(chunk.choices) > 0:
if delta := chunk.choices[0].delta: if delta := chunk.choices[0].delta:
if delta.content: if delta.content:
content += delta.content content += delta.content
@ -484,4 +512,5 @@ def test_openai_chat_completion_non_streaming_with_file(openai_client, client_wi
stream=False, stream=False,
) )
message_content = response.choices[0].message.content.lower().strip() message_content = response.choices[0].message.content.lower().strip()
assert "hello world" in message_content normalized_content = _normalize_text(message_content)
assert "hello world" in normalized_content

View file

@ -32,6 +32,7 @@ def skip_if_model_doesnt_support_completion(client_with_models, model_id):
"remote::vertexai", "remote::vertexai",
"remote::groq", "remote::groq",
"remote::sambanova", "remote::sambanova",
"remote::azure",
) )
or "openai-compat" in provider.provider_type or "openai-compat" in provider.provider_type
): ):
@ -44,7 +45,7 @@ def skip_if_model_doesnt_support_json_schema_structured_output(client_with_model
provider_id = models[model_id].provider_id provider_id = models[model_id].provider_id
providers = {p.provider_id: p for p in client_with_models.providers.list()} providers = {p.provider_id: p for p in client_with_models.providers.list()}
provider = providers[provider_id] provider = providers[provider_id]
if provider.provider_type in ("remote::sambanova",): if provider.provider_type in ("remote::sambanova", "remote::azure"):
pytest.skip( pytest.skip(
f"Model {model_id} hosted by {provider.provider_type} doesn't support json_schema structured output" f"Model {model_id} hosted by {provider.provider_type} doesn't support json_schema structured output"
) )

View file

@ -0,0 +1,71 @@
{
"request": {
"method": "POST",
"url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "gpt-5-mini",
"messages": [
{
"role": "user",
"content": "Which planet do humans live on?"
}
],
"stream": false
},
"endpoint": "/v1/chat/completions",
"model": "gpt-5-mini"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-CECIXqfvjuluKkZtG3q2QJoSQhBU0",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "Humans live on Earth \u2014 the third planet from the Sun. It's the only known planet that naturally supports life, with a breathable atmosphere, liquid water, and temperatures suitable for living organisms.",
"refusal": null,
"role": "assistant",
"annotations": [],
"audio": null,
"function_call": null,
"tool_calls": null
},
"content_filter_results": {}
}
],
"created": 1757499901,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 112,
"prompt_tokens": 13,
"total_tokens": 125,
"completion_tokens_details": {
"accepted_prediction_tokens": 0,
"audio_tokens": 0,
"reasoning_tokens": 64,
"rejected_prediction_tokens": 0
},
"prompt_tokens_details": {
"audio_tokens": 0,
"cached_tokens": 0
}
},
"prompt_filter_results": [
{
"prompt_index": 0,
"content_filter_results": {}
}
]
}
},
"is_streaming": false
}
}

View file

@ -0,0 +1,448 @@
{
"request": {
"method": "POST",
"url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "gpt-5-mini",
"messages": [
{
"role": "user",
"content": "Hello, world!"
}
],
"stream": true
},
"endpoint": "/v1/chat/completions",
"model": "gpt-5-mini"
},
"response": {
"body": [
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "",
"choices": [],
"created": 0,
"model": "",
"object": "",
"service_tier": null,
"system_fingerprint": null,
"usage": null,
"prompt_filter_results": [
{
"prompt_index": 0,
"content_filter_results": {}
}
]
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": "",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": "Hello",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": ",",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": " world",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": "!",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": " Hi",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": " \u2014",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": " how",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": " can",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": " I",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": " help",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": " you",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": " today",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": "?",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIgeXOClAuSm8xHAS6CYQ87lB8O",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499910,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
}
],
"is_streaming": true
}
}

View file

@@ -0,0 +1,71 @@
{
"request": {
"method": "POST",
"url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "gpt-5-mini",
"messages": [
{
"role": "user",
"content": "Which planet has rings around it with a name starting with letter S?"
}
],
"stream": false
},
"endpoint": "/v1/chat/completions",
"model": "gpt-5-mini"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-CECIkT5cbqFazpungtewksVePcUNa",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "Saturn. It's the planet famous for its prominent ring system made of ice and rock.",
"refusal": null,
"role": "assistant",
"annotations": [],
"audio": null,
"function_call": null,
"tool_calls": null
},
"content_filter_results": {}
}
],
"created": 1757499914,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 156,
"prompt_tokens": 20,
"total_tokens": 176,
"completion_tokens_details": {
"accepted_prediction_tokens": 0,
"audio_tokens": 0,
"reasoning_tokens": 128,
"rejected_prediction_tokens": 0
},
"prompt_tokens_details": {
"audio_tokens": 0,
"cached_tokens": 0
}
},
"prompt_filter_results": [
{
"prompt_index": 0,
"content_filter_results": {}
}
]
}
},
"is_streaming": false
}
}

View file

@@ -0,0 +1,71 @@
{
"request": {
"method": "POST",
"url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "gpt-5-mini",
"messages": [
{
"role": "user",
"content": "Hello, world!"
}
],
"stream": false
},
"endpoint": "/v1/chat/completions",
"model": "gpt-5-mini"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-CECIuyylsMNXspa83k8LrD8SQadNY",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "Hello! \ud83d\udc4b How can I help you today \u2014 answer a question, write or edit something, debug code, brainstorm ideas, or anything else?",
"refusal": null,
"role": "assistant",
"annotations": [],
"audio": null,
"function_call": null,
"tool_calls": null
},
"content_filter_results": {}
}
],
"created": 1757499924,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 40,
"prompt_tokens": 10,
"total_tokens": 50,
"completion_tokens_details": {
"accepted_prediction_tokens": 0,
"audio_tokens": 0,
"reasoning_tokens": 0,
"rejected_prediction_tokens": 0
},
"prompt_tokens_details": {
"audio_tokens": 0,
"cached_tokens": 0
}
},
"prompt_filter_results": [
{
"prompt_index": 0,
"content_filter_results": {}
}
]
}
},
"is_streaming": false
}
}

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@@ -0,0 +1,98 @@
{
"request": {
"method": "POST",
"url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "gpt-5-mini",
"messages": [
{
"role": "user",
"content": "What's the weather in Tokyo? Use the get_weather function to get the weather."
}
],
"stream": false,
"tools": [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the weather in a given city",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to get the weather for"
}
}
}
}
}
]
},
"endpoint": "/v1/chat/completions",
"model": "gpt-5-mini"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-CECIwq9Odd0mOJMmw7ytv8iEazH4H",
"choices": [
{
"finish_reason": "tool_calls",
"index": 0,
"logprobs": null,
"message": {
"content": null,
"refusal": null,
"role": "assistant",
"annotations": [],
"audio": null,
"function_call": null,
"tool_calls": [
{
"id": "call_yw18spRc1jjUlEyabbXBhB33",
"function": {
"arguments": "{\"city\":\"Tokyo\"}",
"name": "get_weather"
},
"type": "function"
}
]
},
"content_filter_results": {}
}
],
"created": 1757499926,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": null,
"usage": {
"completion_tokens": 88,
"prompt_tokens": 151,
"total_tokens": 239,
"completion_tokens_details": {
"accepted_prediction_tokens": 0,
"audio_tokens": 0,
"reasoning_tokens": 64,
"rejected_prediction_tokens": 0
},
"prompt_tokens_details": {
"audio_tokens": 0,
"cached_tokens": 0
}
},
"prompt_filter_results": [
{
"prompt_index": 0,
"content_filter_results": {}
}
]
}
},
"is_streaming": false
}
}

File diff suppressed because it is too large

View file

@@ -0,0 +1,310 @@
{
"request": {
"method": "POST",
"url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "gpt-5-mini",
"messages": [
{
"role": "user",
"content": "What's the weather in Tokyo? Use the get_weather function to get the weather."
}
],
"stream": true,
"tools": [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the weather in a given city",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to get the weather for"
}
}
}
}
}
]
},
"endpoint": "/v1/chat/completions",
"model": "gpt-5-mini"
},
"response": {
"body": [
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "",
"choices": [],
"created": 0,
"model": "",
"object": "",
"service_tier": null,
"system_fingerprint": null,
"usage": null,
"prompt_filter_results": [
{
"prompt_index": 0,
"content_filter_results": {}
}
]
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": [
{
"index": 0,
"id": "call_TMbEoYn9q0ZKtoxav5LpD9Ts",
"function": {
"arguments": "",
"name": "get_weather"
},
"type": "function"
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499912,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "{\"",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499912,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "city",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499912,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "\":\"",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499912,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "Tokyo",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499912,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": [
{
"index": 0,
"id": null,
"function": {
"arguments": "\"}",
"name": null
},
"type": null
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499912,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECIiMMWyfACuKUYWEyYSazcnvRVo",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": "tool_calls",
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499912,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
}
],
"is_streaming": true
}
}
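In the streaming recording above, the tool-call arguments arrive as JSON fragments spread across several chunks and only become valid JSON once concatenated. A minimal sketch of how a consumer of these recordings could reassemble them, assuming chunks is the list of parsed __data__ dicts from response.body:

import json


def assemble_tool_calls(chunks: list[dict]) -> dict[str, dict]:
    """Concatenate streamed tool-call argument fragments per call, then parse the JSON."""
    buffers: dict[int, dict] = {}  # tool-call index -> {"name": ..., "arguments": ...}
    for chunk in chunks:
        for choice in chunk.get("choices", []):
            for tool_call in choice.get("delta", {}).get("tool_calls") or []:
                buf = buffers.setdefault(tool_call["index"], {"name": "", "arguments": ""})
                fn = tool_call.get("function") or {}
                if fn.get("name"):
                    buf["name"] = fn["name"]
                if fn.get("arguments"):
                    buf["arguments"] += fn["arguments"]
    return {buf["name"]: json.loads(buf["arguments"]) for buf in buffers.values()}


# For the recording above this yields {"get_weather": {"city": "Tokyo"}}.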

View file

@@ -0,0 +1,556 @@
{
"request": {
"method": "POST",
"url": "https://shan-mfbb618r-eastus2.cognitiveservices.azure.com/openai/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "gpt-5-mini",
"messages": [
{
"role": "user",
"content": "What is the name of the US captial?"
}
],
"stream": true
},
"endpoint": "/v1/chat/completions",
"model": "gpt-5-mini"
},
"response": {
"body": [
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "",
"choices": [],
"created": 0,
"model": "",
"object": "",
"service_tier": null,
"system_fingerprint": null,
"usage": null,
"prompt_filter_results": [
{
"prompt_index": 0,
"content_filter_results": {}
}
]
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": "",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": "The",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": " capital",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": " of",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": " the",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": " United",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": " States",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": " is",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": " Washington",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": ",",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": " D",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": ".C",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": ".",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": " (",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": "District",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": " of",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": " Columbia",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": ").",
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-CECImr5TLfMFiZN3FUlfVdBLr51Fs",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": null,
"tool_calls": null
},
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"content_filter_results": {}
}
],
"created": 1757499916,
"model": "gpt-5-mini-2025-08-07",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
}
],
"is_streaming": true
}
}

View file

@@ -49,16 +49,13 @@ def setup_openai_telemetry_data(llama_stack_client, text_model_id):
traces = llama_stack_client.telemetry.query_traces(limit=10) traces = llama_stack_client.telemetry.query_traces(limit=10)
if len(traces) >= 5: # 5 OpenAI completion traces if len(traces) >= 5: # 5 OpenAI completion traces
break break
time.sleep(1) time.sleep(0.1)
if len(traces) < 5: if len(traces) < 5:
pytest.fail( pytest.fail(
f"Failed to create sufficient OpenAI completion telemetry data after 30s. Got {len(traces)} traces." f"Failed to create sufficient OpenAI completion telemetry data after 30s. Got {len(traces)} traces."
) )
# Wait for 5 seconds to ensure traces has completed logging
time.sleep(5)
yield yield
@@ -185,11 +182,13 @@ def test_openai_completion_creates_telemetry(llama_stack_client, text_model_id):
assert len(response.choices) > 0, "Response should have at least one choice" assert len(response.choices) > 0, "Response should have at least one choice"
# Wait for telemetry to be recorded # Wait for telemetry to be recorded
time.sleep(3) start_time = time.time()
while time.time() - start_time < 30:
# Check that we have more traces now
final_traces = llama_stack_client.telemetry.query_traces(limit=20) final_traces = llama_stack_client.telemetry.query_traces(limit=20)
final_count = len(final_traces) final_count = len(final_traces)
if final_count > initial_count:
break
time.sleep(0.1)
# Should have at least as many traces as before (might have more due to other activity) # Should have at least as many traces as before (might have more due to other activity)
assert final_count >= initial_count, "Should have at least as many traces after OpenAI call" assert final_count >= initial_count, "Should have at least as many traces after OpenAI call"
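The hunk above replaces fixed sleeps with a poll loop bounded by a timeout. A small helper capturing that pattern, offered only as a sketch; the tests in this commit inline the loop rather than calling a helper like this:

import time


def wait_for(condition, timeout: float = 30.0, interval: float = 0.1):
    """Poll condition() until it returns a truthy value or the timeout elapses."""
    start = time.time()
    while time.time() - start < timeout:
        result = condition()
        if result:
            return result
        time.sleep(interval)
    raise TimeoutError(f"condition not met within {timeout}s")


# e.g. wait_for(lambda: len(llama_stack_client.telemetry.query_traces(limit=10)) >= 5)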

View file

@@ -42,14 +42,11 @@ def setup_telemetry_data(llama_stack_client, text_model_id):
traces = llama_stack_client.telemetry.query_traces(limit=10) traces = llama_stack_client.telemetry.query_traces(limit=10)
if len(traces) >= 4: if len(traces) >= 4:
break break
time.sleep(1) time.sleep(0.1)
if len(traces) < 4: if len(traces) < 4:
pytest.fail(f"Failed to create sufficient telemetry data after 30s. Got {len(traces)} traces.") pytest.fail(f"Failed to create sufficient telemetry data after 30s. Got {len(traces)} traces.")
# Wait for 5 seconds to ensure traces has completed logging
time.sleep(5)
yield yield

View file

@@ -46,10 +46,7 @@ def setup_telemetry_metrics_data(openai_client, client_with_models, text_model_i
break break
except Exception: except Exception:
pass pass
time.sleep(1) time.sleep(0.1)
# Wait additional time to ensure all metrics are processed
time.sleep(5)
# Return the token lists for use in tests # Return the token lists for use in tests
return {"prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens, "total_tokens": total_tokens} return {"prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens, "total_tokens": total_tokens}

View file

@@ -183,6 +183,110 @@ def test_vector_db_insert_from_url_and_query(
assert any("llama2" in chunk.content.lower() for chunk in response2.chunks) assert any("llama2" in chunk.content.lower() for chunk in response2.chunks)
def test_rag_tool_openai_apis(client_with_empty_registry, embedding_model_id, embedding_dimension):
vector_db_id = "test_openai_vector_db"
client_with_empty_registry.vector_dbs.register(
vector_db_id=vector_db_id,
embedding_model=embedding_model_id,
embedding_dimension=embedding_dimension,
)
available_vector_dbs = [vector_db.identifier for vector_db in client_with_empty_registry.vector_dbs.list()]
actual_vector_db_id = available_vector_dbs[0]
# different document formats that should work with OpenAI APIs
documents = [
Document(
document_id="text-doc",
content="This is a plain text document about machine learning algorithms.",
metadata={"type": "text", "category": "AI"},
),
Document(
document_id="url-doc",
content="https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/chat.rst",
mime_type="text/plain",
metadata={"type": "url", "source": "pytorch"},
),
Document(
document_id="data-url-doc",
content="data:text/plain;base64,VGhpcyBpcyBhIGRhdGEgVVJMIGRvY3VtZW50IGFib3V0IGRlZXAgbGVhcm5pbmcu", # "This is a data URL document about deep learning."
metadata={"type": "data_url", "encoding": "base64"},
),
]
client_with_empty_registry.tool_runtime.rag_tool.insert(
documents=documents,
vector_db_id=actual_vector_db_id,
chunk_size_in_tokens=256,
)
files_list = client_with_empty_registry.files.list()
assert len(files_list.data) >= len(documents), (
f"Expected at least {len(documents)} files, got {len(files_list.data)}"
)
vector_store_files = client_with_empty_registry.vector_io.openai_list_files_in_vector_store(
vector_store_id=actual_vector_db_id
)
assert len(vector_store_files.data) >= len(documents), f"Expected at least {len(documents)} files in vector store"
response = client_with_empty_registry.tool_runtime.rag_tool.query(
vector_db_ids=[actual_vector_db_id],
content="Tell me about machine learning and deep learning",
)
assert_valid_text_response(response)
content_text = " ".join([chunk.text for chunk in response.content]).lower()
assert "machine learning" in content_text or "deep learning" in content_text
def test_rag_tool_exception_handling(client_with_empty_registry, embedding_model_id, embedding_dimension):
vector_db_id = "test_exception_handling"
client_with_empty_registry.vector_dbs.register(
vector_db_id=vector_db_id,
embedding_model=embedding_model_id,
embedding_dimension=embedding_dimension,
)
available_vector_dbs = [vector_db.identifier for vector_db in client_with_empty_registry.vector_dbs.list()]
actual_vector_db_id = available_vector_dbs[0]
documents = [
Document(
document_id="valid-doc",
content="This is a valid document that should be processed successfully.",
metadata={"status": "valid"},
),
Document(
document_id="invalid-url-doc",
content="https://nonexistent-domain-12345.com/invalid.txt",
metadata={"status": "invalid_url"},
),
Document(
document_id="another-valid-doc",
content="This is another valid document for testing resilience.",
metadata={"status": "valid"},
),
]
client_with_empty_registry.tool_runtime.rag_tool.insert(
documents=documents,
vector_db_id=actual_vector_db_id,
chunk_size_in_tokens=256,
)
response = client_with_empty_registry.tool_runtime.rag_tool.query(
vector_db_ids=[actual_vector_db_id],
content="valid document",
)
assert_valid_text_response(response)
content_text = " ".join([chunk.text for chunk in response.content]).lower()
assert "valid document" in content_text
def test_rag_tool_insert_and_query(client_with_empty_registry, embedding_model_id, embedding_dimension): def test_rag_tool_insert_and_query(client_with_empty_registry, embedding_model_id, embedding_dimension):
providers = [p for p in client_with_empty_registry.providers.list() if p.api == "vector_io"] providers = [p for p in client_with_empty_registry.providers.list() if p.api == "vector_io"]
assert len(providers) > 0 assert len(providers) > 0
@@ -249,3 +353,107 @@ def test_rag_tool_insert_and_query(client_with_empty_registry, embedding_model_i
"chunk_template": "This should raise a ValueError because it is missing the proper template variables", "chunk_template": "This should raise a ValueError because it is missing the proper template variables",
}, },
) )
def test_rag_tool_query_generation(client_with_empty_registry, embedding_model_id, embedding_dimension):
vector_db_id = "test_query_generation_db"
client_with_empty_registry.vector_dbs.register(
vector_db_id=vector_db_id,
embedding_model=embedding_model_id,
embedding_dimension=embedding_dimension,
)
available_vector_dbs = [vector_db.identifier for vector_db in client_with_empty_registry.vector_dbs.list()]
actual_vector_db_id = available_vector_dbs[0]
documents = [
Document(
document_id="ai-doc",
content="Artificial intelligence and machine learning are transforming technology.",
metadata={"category": "AI"},
),
Document(
document_id="banana-doc",
content="Don't bring a banana to a knife fight.",
metadata={"category": "wisdom"},
),
]
client_with_empty_registry.tool_runtime.rag_tool.insert(
documents=documents,
vector_db_id=actual_vector_db_id,
chunk_size_in_tokens=256,
)
response = client_with_empty_registry.tool_runtime.rag_tool.query(
vector_db_ids=[actual_vector_db_id],
content="Tell me about AI",
)
assert_valid_text_response(response)
content_text = " ".join([chunk.text for chunk in response.content]).lower()
assert "artificial intelligence" in content_text or "machine learning" in content_text
def test_rag_tool_pdf_data_url_handling(client_with_empty_registry, embedding_model_id, embedding_dimension):
vector_db_id = "test_pdf_data_url_db"
client_with_empty_registry.vector_dbs.register(
vector_db_id=vector_db_id,
embedding_model=embedding_model_id,
embedding_dimension=embedding_dimension,
)
available_vector_dbs = [vector_db.identifier for vector_db in client_with_empty_registry.vector_dbs.list()]
actual_vector_db_id = available_vector_dbs[0]
sample_pdf = b"%PDF-1.3\n3 0 obj\n<</Type /Page\n/Parent 1 0 R\n/Resources 2 0 R\n/Contents 4 0 R>>\nendobj\n4 0 obj\n<</Filter /FlateDecode /Length 115>>\nstream\nx\x9c\x15\xcc1\x0e\x820\x18@\xe1\x9dS\xbcM]jk$\xd5\xd5(\x83!\x86\xa1\x17\xf8\xa3\xa5`LIh+\xd7W\xc6\xf7\r\xef\xc0\xbd\xd2\xaa\xb6,\xd5\xc5\xb1o\x0c\xa6VZ\xe3znn%\xf3o\xab\xb1\xe7\xa3:Y\xdc\x8bm\xeb\xf3&1\xc8\xd7\xd3\x97\xc82\xe6\x81\x87\xe42\xcb\x87Vb(\x12<\xdd<=}Jc\x0cL\x91\xee\xda$\xb5\xc3\xbd\xd7\xe9\x0f\x8d\x97 $\nendstream\nendobj\n1 0 obj\n<</Type /Pages\n/Kids [3 0 R ]\n/Count 1\n/MediaBox [0 0 595.28 841.89]\n>>\nendobj\n5 0 obj\n<</Type /Font\n/BaseFont /Helvetica\n/Subtype /Type1\n/Encoding /WinAnsiEncoding\n>>\nendobj\n2 0 obj\n<<\n/ProcSet [/PDF /Text /ImageB /ImageC /ImageI]\n/Font <<\n/F1 5 0 R\n>>\n/XObject <<\n>>\n>>\nendobj\n6 0 obj\n<<\n/Producer (PyFPDF 1.7.2 http://pyfpdf.googlecode.com/)\n/Title (This is a sample title.)\n/Author (Llama Stack Developers)\n/CreationDate (D:20250312165548)\n>>\nendobj\n7 0 obj\n<<\n/Type /Catalog\n/Pages 1 0 R\n/OpenAction [3 0 R /FitH null]\n/PageLayout /OneColumn\n>>\nendobj\nxref\n0 8\n0000000000 65535 f \n0000000272 00000 n \n0000000455 00000 n \n0000000009 00000 n \n0000000087 00000 n \n0000000359 00000 n \n0000000559 00000 n \n0000000734 00000 n \ntrailer\n<<\n/Size 8\n/Root 7 0 R\n/Info 6 0 R\n>>\nstartxref\n837\n%%EOF\n"
import base64
pdf_base64 = base64.b64encode(sample_pdf).decode("utf-8")
pdf_data_url = f"data:application/pdf;base64,{pdf_base64}"
documents = [
Document(
document_id="test-pdf-data-url",
content=pdf_data_url,
metadata={"type": "pdf", "source": "data_url"},
),
]
client_with_empty_registry.tool_runtime.rag_tool.insert(
documents=documents,
vector_db_id=actual_vector_db_id,
chunk_size_in_tokens=256,
)
files_list = client_with_empty_registry.files.list()
assert len(files_list.data) >= 1, "PDF should have been uploaded to Files API"
pdf_file = None
for file in files_list.data:
if file.filename and "test-pdf-data-url" in file.filename:
pdf_file = file
break
assert pdf_file is not None, "PDF file should be found in Files API"
assert pdf_file.bytes == len(sample_pdf), f"File size should match original PDF ({len(sample_pdf)} bytes)"
file_content = client_with_empty_registry.files.retrieve_content(pdf_file.id)
assert file_content.startswith(b"%PDF-"), "Retrieved file should be a valid PDF"
vector_store_files = client_with_empty_registry.vector_io.openai_list_files_in_vector_store(
vector_store_id=actual_vector_db_id
)
assert len(vector_store_files.data) >= 1, "PDF should be attached to vector store"
response = client_with_empty_registry.tool_runtime.rag_tool.query(
vector_db_ids=[actual_vector_db_id],
content="sample title",
)
assert_valid_text_response(response)
content_text = " ".join([chunk.text for chunk in response.content]).lower()
assert "sample title" in content_text or "title" in content_text
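Both new tests above feed documents in as base64 data URLs, plain text in one case and a generated PDF in the other. A minimal sketch of building such a data URL from raw bytes and a MIME type:

import base64


def to_data_url(payload: bytes, mime_type: str) -> str:
    """Encode raw bytes as a base64 data URL, as the RAG tool tests above do."""
    encoded = base64.b64encode(payload).decode("utf-8")
    return f"data:{mime_type};base64,{encoded}"


# e.g. to_data_url(b"This is a data URL document about deep learning.", "text/plain")
# e.g. to_data_url(sample_pdf, "application/pdf")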

View file

@@ -6,16 +6,18 @@
import tempfile import tempfile
from pathlib import Path from pathlib import Path
from unittest.mock import patch from unittest.mock import AsyncMock, Mock, patch
import pytest import pytest
from openai import AsyncOpenAI from openai import NOT_GIVEN, AsyncOpenAI
from openai.types.model import Model as OpenAIModel
# Import the real Pydantic response types instead of using Mocks # Import the real Pydantic response types instead of using Mocks
from llama_stack.apis.inference import ( from llama_stack.apis.inference import (
OpenAIAssistantMessageParam, OpenAIAssistantMessageParam,
OpenAIChatCompletion, OpenAIChatCompletion,
OpenAIChoice, OpenAIChoice,
OpenAICompletion,
OpenAIEmbeddingData, OpenAIEmbeddingData,
OpenAIEmbeddingsResponse, OpenAIEmbeddingsResponse,
OpenAIEmbeddingUsage, OpenAIEmbeddingUsage,
@@ -153,24 +155,22 @@ class TestInferenceRecording:
async def test_recording_mode(self, temp_storage_dir, real_openai_chat_response): async def test_recording_mode(self, temp_storage_dir, real_openai_chat_response):
"""Test that recording mode captures and stores responses.""" """Test that recording mode captures and stores responses."""
async def mock_create(*args, **kwargs):
return real_openai_chat_response
temp_storage_dir = temp_storage_dir / "test_recording_mode" temp_storage_dir = temp_storage_dir / "test_recording_mode"
with patch("openai.resources.chat.completions.AsyncCompletions.create", side_effect=mock_create):
with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)):
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
client.chat.completions._post = AsyncMock(return_value=real_openai_chat_response)
response = await client.chat.completions.create( response = await client.chat.completions.create(
model="llama3.2:3b", model="llama3.2:3b",
messages=[{"role": "user", "content": "Hello, how are you?"}], messages=[{"role": "user", "content": "Hello, how are you?"}],
temperature=0.7, temperature=0.7,
max_tokens=50, max_tokens=50,
user=NOT_GIVEN,
) )
# Verify the response was returned correctly # Verify the response was returned correctly
assert response.choices[0].message.content == "Hello! I'm doing well, thank you for asking." assert response.choices[0].message.content == "Hello! I'm doing well, thank you for asking."
client.chat.completions._post.assert_called_once()
# Verify recording was stored # Verify recording was stored
storage = ResponseStorage(temp_storage_dir) storage = ResponseStorage(temp_storage_dir)
@@ -178,27 +178,25 @@ class TestInferenceRecording:
async def test_replay_mode(self, temp_storage_dir, real_openai_chat_response): async def test_replay_mode(self, temp_storage_dir, real_openai_chat_response):
"""Test that replay mode returns stored responses without making real calls.""" """Test that replay mode returns stored responses without making real calls."""
async def mock_create(*args, **kwargs):
return real_openai_chat_response
temp_storage_dir = temp_storage_dir / "test_replay_mode" temp_storage_dir = temp_storage_dir / "test_replay_mode"
# First, record a response # First, record a response
with patch("openai.resources.chat.completions.AsyncCompletions.create", side_effect=mock_create):
with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)):
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
client.chat.completions._post = AsyncMock(return_value=real_openai_chat_response)
response = await client.chat.completions.create( response = await client.chat.completions.create(
model="llama3.2:3b", model="llama3.2:3b",
messages=[{"role": "user", "content": "Hello, how are you?"}], messages=[{"role": "user", "content": "Hello, how are you?"}],
temperature=0.7, temperature=0.7,
max_tokens=50, max_tokens=50,
user=NOT_GIVEN,
) )
client.chat.completions._post.assert_called_once()
# Now test replay mode - should not call the original method # Now test replay mode - should not call the original method
with patch("openai.resources.chat.completions.AsyncCompletions.create") as mock_create_patch:
with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)): with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)):
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
client.chat.completions._post = AsyncMock(return_value=real_openai_chat_response)
response = await client.chat.completions.create( response = await client.chat.completions.create(
model="llama3.2:3b", model="llama3.2:3b",
@@ -211,7 +209,43 @@ class TestInferenceRecording:
assert response.choices[0].message.content == "Hello! I'm doing well, thank you for asking." assert response.choices[0].message.content == "Hello! I'm doing well, thank you for asking."
# Verify the original method was NOT called # Verify the original method was NOT called
mock_create_patch.assert_not_called() client.chat.completions._post.assert_not_called()
async def test_replay_mode_models(self, temp_storage_dir):
"""Test that replay mode returns stored responses without making real model listing calls."""
async def _async_iterator(models):
for model in models:
yield model
models = [
OpenAIModel(id="foo", created=1, object="model", owned_by="test"),
OpenAIModel(id="bar", created=2, object="model", owned_by="test"),
]
expected_ids = {m.id for m in models}
temp_storage_dir = temp_storage_dir / "test_replay_mode_models"
# baseline - mock works without recording
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
client.models._get_api_list = Mock(return_value=_async_iterator(models))
assert {m.id async for m in client.models.list()} == expected_ids
client.models._get_api_list.assert_called_once()
# record the call
with inference_recording(mode=InferenceMode.RECORD, storage_dir=temp_storage_dir):
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
client.models._get_api_list = Mock(return_value=_async_iterator(models))
assert {m.id async for m in client.models.list()} == expected_ids
client.models._get_api_list.assert_called_once()
# replay the call
with inference_recording(mode=InferenceMode.REPLAY, storage_dir=temp_storage_dir):
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
client.models._get_api_list = Mock(return_value=_async_iterator(models))
assert {m.id async for m in client.models.list()} == expected_ids
client.models._get_api_list.assert_not_called()
async def test_replay_missing_recording(self, temp_storage_dir): async def test_replay_missing_recording(self, temp_storage_dir):
"""Test that replay mode fails when no recording is found.""" """Test that replay mode fails when no recording is found."""
@@ -228,28 +262,42 @@ class TestInferenceRecording:
async def test_embeddings_recording(self, temp_storage_dir, real_embeddings_response): async def test_embeddings_recording(self, temp_storage_dir, real_embeddings_response):
"""Test recording and replay of embeddings calls.""" """Test recording and replay of embeddings calls."""
async def mock_create(*args, **kwargs): # baseline - mock works without recording
return real_embeddings_response client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
client.embeddings._post = AsyncMock(return_value=real_embeddings_response)
response = await client.embeddings.create(
model=real_embeddings_response.model,
input=["Hello world", "Test embedding"],
encoding_format=NOT_GIVEN,
)
assert len(response.data) == 2
assert response.data[0].embedding == [0.1, 0.2, 0.3]
client.embeddings._post.assert_called_once()
temp_storage_dir = temp_storage_dir / "test_embeddings_recording" temp_storage_dir = temp_storage_dir / "test_embeddings_recording"
# Record # Record
with patch("openai.resources.embeddings.AsyncEmbeddings.create", side_effect=mock_create):
with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)): with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)):
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
client.embeddings._post = AsyncMock(return_value=real_embeddings_response)
response = await client.embeddings.create( response = await client.embeddings.create(
model="nomic-embed-text", input=["Hello world", "Test embedding"] model=real_embeddings_response.model,
input=["Hello world", "Test embedding"],
encoding_format=NOT_GIVEN,
dimensions=NOT_GIVEN,
user=NOT_GIVEN,
) )
assert len(response.data) == 2 assert len(response.data) == 2
# Replay # Replay
with patch("openai.resources.embeddings.AsyncEmbeddings.create") as mock_create_patch:
with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)): with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)):
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test") client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
client.embeddings._post = AsyncMock(return_value=real_embeddings_response)
response = await client.embeddings.create( response = await client.embeddings.create(
model="nomic-embed-text", input=["Hello world", "Test embedding"] model=real_embeddings_response.model,
input=["Hello world", "Test embedding"],
) )
# Verify we got the recorded response # Verify we got the recorded response
@@ -257,7 +305,67 @@ class TestInferenceRecording:
assert response.data[0].embedding == [0.1, 0.2, 0.3] assert response.data[0].embedding == [0.1, 0.2, 0.3]
# Verify original method was not called # Verify original method was not called
mock_create_patch.assert_not_called() client.embeddings._post.assert_not_called()
async def test_completions_recording(self, temp_storage_dir):
real_completions_response = OpenAICompletion(
id="test_completion",
object="text_completion",
created=1234567890,
model="llama3.2:3b",
choices=[
{
"text": "Hello! I'm doing well, thank you for asking.",
"index": 0,
"logprobs": None,
"finish_reason": "stop",
}
],
)
temp_storage_dir = temp_storage_dir / "test_completions_recording"
# baseline - mock works without recording
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
client.completions._post = AsyncMock(return_value=real_completions_response)
response = await client.completions.create(
model=real_completions_response.model,
prompt="Hello, how are you?",
temperature=0.7,
max_tokens=50,
user=NOT_GIVEN,
)
assert response.choices[0].text == real_completions_response.choices[0].text
client.completions._post.assert_called_once()
# Record
with inference_recording(mode=InferenceMode.RECORD, storage_dir=str(temp_storage_dir)):
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
client.completions._post = AsyncMock(return_value=real_completions_response)
response = await client.completions.create(
model=real_completions_response.model,
prompt="Hello, how are you?",
temperature=0.7,
max_tokens=50,
user=NOT_GIVEN,
)
assert response.choices[0].text == real_completions_response.choices[0].text
client.completions._post.assert_called_once()
# Replay
with inference_recording(mode=InferenceMode.REPLAY, storage_dir=str(temp_storage_dir)):
client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
client.completions._post = AsyncMock(return_value=real_completions_response)
response = await client.completions.create(
model=real_completions_response.model,
prompt="Hello, how are you?",
temperature=0.7,
max_tokens=50,
)
assert response.choices[0].text == real_completions_response.choices[0].text
client.completions._post.assert_not_called()
async def test_live_mode(self, real_openai_chat_response): async def test_live_mode(self, real_openai_chat_response):
"""Test that live mode passes through to original methods.""" """Test that live mode passes through to original methods."""

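The rewritten tests above stop patching openai.resources.*.create and instead replace each resource's private _post transport method with an AsyncMock, so the public create() path, and any recording wrapper installed around it, still executes. A condensed sketch of that pattern, assuming _post remains the single transport entry point for these resource classes:

from unittest.mock import AsyncMock

from openai import AsyncOpenAI


async def call_with_stubbed_transport(canned_response):
    # Only the HTTP transport layer (_post) is stubbed; no network call is made,
    # while the public create() code path runs normally and can be recorded.
    client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="test")
    client.chat.completions._post = AsyncMock(return_value=canned_response)
    response = await client.chat.completions.create(
        model="llama3.2:3b",
        messages=[{"role": "user", "content": "Hello, how are you?"}],
    )
    client.chat.completions._post.assert_called_once()
    return response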
View file

@@ -6,19 +6,15 @@
import asyncio import asyncio
import json import json
import logging # allow-direct-logging
import threading
import time import time
from http.server import BaseHTTPRequestHandler, HTTPServer from unittest.mock import AsyncMock, MagicMock, PropertyMock, patch
from typing import Any
from unittest.mock import AsyncMock, MagicMock, patch
import pytest import pytest
from openai.types.chat.chat_completion_chunk import ( from openai.types.chat.chat_completion_chunk import (
ChatCompletionChunk as OpenAIChatCompletionChunk, ChatCompletionChunk as OpenAIChatCompletionChunk,
) )
from openai.types.chat.chat_completion_chunk import ( from openai.types.chat.chat_completion_chunk import (
Choice as OpenAIChoice, Choice as OpenAIChoiceChunk,
) )
from openai.types.chat.chat_completion_chunk import ( from openai.types.chat.chat_completion_chunk import (
ChoiceDelta as OpenAIChoiceDelta, ChoiceDelta as OpenAIChoiceDelta,
@@ -35,6 +31,9 @@ from llama_stack.apis.inference import (
ChatCompletionRequest, ChatCompletionRequest,
ChatCompletionResponseEventType, ChatCompletionResponseEventType,
CompletionMessage, CompletionMessage,
OpenAIAssistantMessageParam,
OpenAIChatCompletion,
OpenAIChoice,
SystemMessage, SystemMessage,
ToolChoice, ToolChoice,
ToolConfig, ToolConfig,
@@ -61,41 +60,6 @@ from llama_stack.providers.remote.inference.vllm.vllm import (
# -v -s --tb=short --disable-warnings # -v -s --tb=short --disable-warnings
class MockInferenceAdapterWithSleep:
def __init__(self, sleep_time: int, response: dict[str, Any]):
self.httpd = None
class DelayedRequestHandler(BaseHTTPRequestHandler):
# ruff: noqa: N802
def do_POST(self):
time.sleep(sleep_time)
response_body = json.dumps(response).encode("utf-8")
self.send_response(code=200)
self.send_header("Content-Type", "application/json")
self.send_header("Content-Length", len(response_body))
self.end_headers()
self.wfile.write(response_body)
self.request_handler = DelayedRequestHandler
def __enter__(self):
httpd = HTTPServer(("", 0), self.request_handler)
self.httpd = httpd
host, port = httpd.server_address
httpd_thread = threading.Thread(target=httpd.serve_forever)
httpd_thread.daemon = True # stop server if this thread terminates
httpd_thread.start()
config = VLLMInferenceAdapterConfig(url=f"http://{host}:{port}")
inference_adapter = VLLMInferenceAdapter(config)
return inference_adapter
def __exit__(self, _exc_type, _exc_value, _traceback):
if self.httpd:
self.httpd.shutdown()
self.httpd.server_close()
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def mock_openai_models_list(): def mock_openai_models_list():
with patch("openai.resources.models.AsyncModels.list", new_callable=AsyncMock) as mock_list: with patch("openai.resources.models.AsyncModels.list", new_callable=AsyncMock) as mock_list:
@@ -150,10 +114,12 @@ async def test_tool_call_response(vllm_inference_adapter):
"""Verify that tool call arguments from a CompletionMessage are correctly converted """Verify that tool call arguments from a CompletionMessage are correctly converted
into the expected JSON format.""" into the expected JSON format."""
# Patch the call to vllm so we can inspect the arguments sent were correct # Patch the client property to avoid instantiating a real AsyncOpenAI client
with patch.object( with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_create_client:
vllm_inference_adapter.client.chat.completions, "create", new_callable=AsyncMock mock_client = MagicMock()
) as mock_nonstream_completion: mock_client.chat.completions.create = AsyncMock()
mock_create_client.return_value = mock_client
messages = [ messages = [
SystemMessage(content="You are a helpful assistant"), SystemMessage(content="You are a helpful assistant"),
UserMessage(content="How many?"), UserMessage(content="How many?"),
@@ -179,7 +145,7 @@ async def test_tool_call_response(vllm_inference_adapter):
tool_config=ToolConfig(tool_choice=ToolChoice.auto), tool_config=ToolConfig(tool_choice=ToolChoice.auto),
) )
assert mock_nonstream_completion.call_args.kwargs["messages"][2]["tool_calls"] == [ assert mock_client.chat.completions.create.call_args.kwargs["messages"][2]["tool_calls"] == [
{ {
"id": "foo", "id": "foo",
"type": "function", "type": "function",
@@ -199,7 +165,7 @@ async def test_tool_call_delta_empty_tool_call_buf():
async def mock_stream(): async def mock_stream():
delta = OpenAIChoiceDelta(content="", tool_calls=None) delta = OpenAIChoiceDelta(content="", tool_calls=None)
choices = [OpenAIChoice(delta=delta, finish_reason="stop", index=0)] choices = [OpenAIChoiceChunk(delta=delta, finish_reason="stop", index=0)]
mock_chunk = OpenAIChatCompletionChunk( mock_chunk = OpenAIChatCompletionChunk(
id="chunk-1", id="chunk-1",
created=1, created=1,
@@ -225,7 +191,7 @@ async def test_tool_call_delta_streaming_arguments_dict():
model="foo", model="foo",
object="chat.completion.chunk", object="chat.completion.chunk",
choices=[ choices=[
OpenAIChoice( OpenAIChoiceChunk(
delta=OpenAIChoiceDelta( delta=OpenAIChoiceDelta(
content="", content="",
tool_calls=[ tool_calls=[
@@ -250,7 +216,7 @@ async def test_tool_call_delta_streaming_arguments_dict():
model="foo", model="foo",
object="chat.completion.chunk", object="chat.completion.chunk",
choices=[ choices=[
OpenAIChoice( OpenAIChoiceChunk(
delta=OpenAIChoiceDelta( delta=OpenAIChoiceDelta(
content="", content="",
tool_calls=[ tool_calls=[
@ -275,7 +241,9 @@ async def test_tool_call_delta_streaming_arguments_dict():
model="foo", model="foo",
object="chat.completion.chunk", object="chat.completion.chunk",
choices=[ choices=[
OpenAIChoice(delta=OpenAIChoiceDelta(content="", tool_calls=None), finish_reason="tool_calls", index=0) OpenAIChoiceChunk(
delta=OpenAIChoiceDelta(content="", tool_calls=None), finish_reason="tool_calls", index=0
)
], ],
) )
for chunk in [mock_chunk_1, mock_chunk_2, mock_chunk_3]: for chunk in [mock_chunk_1, mock_chunk_2, mock_chunk_3]:
@@ -299,7 +267,7 @@ async def test_multiple_tool_calls():
model="foo", model="foo",
object="chat.completion.chunk", object="chat.completion.chunk",
choices=[ choices=[
OpenAIChoice( OpenAIChoiceChunk(
delta=OpenAIChoiceDelta( delta=OpenAIChoiceDelta(
content="", content="",
tool_calls=[ tool_calls=[
@@ -324,7 +292,7 @@ async def test_multiple_tool_calls():
model="foo", model="foo",
object="chat.completion.chunk", object="chat.completion.chunk",
choices=[ choices=[
OpenAIChoice( OpenAIChoiceChunk(
delta=OpenAIChoiceDelta( delta=OpenAIChoiceDelta(
content="", content="",
tool_calls=[ tool_calls=[
@@ -349,7 +317,9 @@ async def test_multiple_tool_calls():
model="foo", model="foo",
object="chat.completion.chunk", object="chat.completion.chunk",
choices=[ choices=[
OpenAIChoice(delta=OpenAIChoiceDelta(content="", tool_calls=None), finish_reason="tool_calls", index=0) OpenAIChoiceChunk(
delta=OpenAIChoiceDelta(content="", tool_calls=None), finish_reason="tool_calls", index=0
)
], ],
) )
for chunk in [mock_chunk_1, mock_chunk_2, mock_chunk_3]: for chunk in [mock_chunk_1, mock_chunk_2, mock_chunk_3]:
@@ -393,59 +363,6 @@ async def test_process_vllm_chat_completion_stream_response_no_choices():
assert chunks[0].event.event_type.value == "start" assert chunks[0].event.event_type.value == "start"
@pytest.mark.allow_network
def test_chat_completion_doesnt_block_event_loop(caplog):
loop = asyncio.new_event_loop()
loop.set_debug(True)
caplog.set_level(logging.WARNING)
# Log when event loop is blocked for more than 200ms
loop.slow_callback_duration = 0.5
# Sleep for 500ms in our delayed http response
sleep_time = 0.5
mock_model = Model(identifier="mock-model", provider_resource_id="mock-model", provider_id="vllm-inference")
mock_response = {
"id": "chatcmpl-abc123",
"object": "chat.completion",
"created": 1,
"modle": "mock-model",
"choices": [
{
"message": {"content": ""},
"logprobs": None,
"finish_reason": "stop",
"index": 0,
}
],
}
async def do_chat_completion():
await inference_adapter.chat_completion(
"mock-model",
[],
stream=False,
tools=None,
tool_config=ToolConfig(tool_choice=ToolChoice.auto),
)
with MockInferenceAdapterWithSleep(sleep_time, mock_response) as inference_adapter:
inference_adapter.model_store = AsyncMock()
inference_adapter.model_store.get_model.return_value = mock_model
loop.run_until_complete(inference_adapter.initialize())
# Clear the logs so far and run the actual chat completion we care about
caplog.clear()
loop.run_until_complete(do_chat_completion())
# Ensure we don't have any asyncio warnings in the captured log
# records from our chat completion call. A message gets logged
# here any time we exceed the slow_callback_duration configured
# above.
asyncio_warnings = [record.message for record in caplog.records if record.name == "asyncio"]
assert not asyncio_warnings
async def test_get_params_empty_tools(vllm_inference_adapter): async def test_get_params_empty_tools(vllm_inference_adapter):
request = ChatCompletionRequest( request = ChatCompletionRequest(
tools=[], tools=[],
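The deleted test_chat_completion_doesnt_block_event_loop above detected event-loop blocking through asyncio's debug instrumentation: with loop.set_debug(True) and a lowered loop.slow_callback_duration, asyncio logs a warning on the "asyncio" logger whenever one task step runs longer than the threshold, and the test asserted that no such warning was captured. A minimal, standalone sketch of that detection pattern follows; detect_blocking and the two sample coroutines are illustrative names, not part of this code base.

    import asyncio
    import logging
    import time


    def detect_blocking(coro_factory, threshold: float = 0.1) -> list[str]:
        """Run a coroutine on a debug-enabled loop and collect asyncio's slow-callback warnings."""
        captured: list[str] = []

        class _Capture(logging.Handler):
            def emit(self, record: logging.LogRecord) -> None:
                captured.append(record.getMessage())

        handler = _Capture()
        logging.getLogger("asyncio").addHandler(handler)
        try:
            loop = asyncio.new_event_loop()
            loop.set_debug(True)                     # turn on slow-callback reporting
            loop.slow_callback_duration = threshold  # warn when one step blocks longer than this
            loop.run_until_complete(coro_factory())
            loop.close()
        finally:
            logging.getLogger("asyncio").removeHandler(handler)
        return captured


    async def blocks_the_loop():
        time.sleep(0.2)  # synchronous sleep: nothing else can run on the loop meanwhile


    async def yields_to_the_loop():
        await asyncio.sleep(0.2)  # cooperative sleep: control returns to the loop


    assert detect_blocking(blocks_the_loop)         # slow-callback warning captured
    assert not detect_blocking(yields_to_the_loop)  # no warnings

Its replacement further down in this file takes a timing approach instead: several mocked calls run concurrently and the total elapsed time must stay well below the serial sum of their sleeps.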
@@ -641,9 +558,7 @@ async def test_health_status_success(vllm_inference_adapter):
This test verifies that the health method returns a HealthResponse with status OK, only This test verifies that the health method returns a HealthResponse with status OK, only
when the connection to the vLLM server is successful. when the connection to the vLLM server is successful.
""" """
# Set vllm_inference_adapter.client to None to ensure _create_client is called with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_create_client:
vllm_inference_adapter.client = None
with patch.object(vllm_inference_adapter, "_create_client") as mock_create_client:
# Create mock client and models # Create mock client and models
mock_client = MagicMock() mock_client = MagicMock()
mock_models = MagicMock() mock_models = MagicMock()
@@ -674,8 +589,7 @@ async def test_health_status_failure(vllm_inference_adapter):
This test verifies that the health method returns a HealthResponse with status ERROR This test verifies that the health method returns a HealthResponse with status ERROR
and an appropriate error message when the connection to the vLLM server fails. and an appropriate error message when the connection to the vLLM server fails.
""" """
vllm_inference_adapter.client = None with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_create_client:
with patch.object(vllm_inference_adapter, "_create_client") as mock_create_client:
# Create mock client and models # Create mock client and models
mock_client = MagicMock() mock_client = MagicMock()
mock_models = MagicMock() mock_models = MagicMock()
@@ -697,3 +611,48 @@ async def test_health_status_failure(vllm_inference_adapter):
assert "Health check failed: Connection failed" in health_response["message"] assert "Health check failed: Connection failed" in health_response["message"]
mock_models.list.assert_called_once() mock_models.list.assert_called_once()
async def test_openai_chat_completion_is_async(vllm_inference_adapter):
"""
Verify that openai_chat_completion is async and doesn't block the event loop.
To do this we mock the underlying inference with a sleep, start multiple
inference calls in parallel, and ensure the total time taken is less
than the sum of the individual sleep times.
"""
sleep_time = 0.5
async def mock_create(*args, **kwargs):
await asyncio.sleep(sleep_time)
return OpenAIChatCompletion(
id="chatcmpl-abc123",
created=1,
model="mock-model",
choices=[
OpenAIChoice(
message=OpenAIAssistantMessageParam(
content="nothing interesting",
),
finish_reason="stop",
index=0,
)
],
)
async def do_inference():
await vllm_inference_adapter.openai_chat_completion(
"mock-model", messages=["one fish", "two fish"], stream=False
)
with patch.object(VLLMInferenceAdapter, "client", new_callable=PropertyMock) as mock_create_client:
mock_client = MagicMock()
mock_client.chat.completions.create = AsyncMock(side_effect=mock_create)
mock_create_client.return_value = mock_client
start_time = time.time()
await asyncio.gather(do_inference(), do_inference(), do_inference(), do_inference())
total_time = time.time() - start_time
assert mock_create_client.call_count == 4 # no cheating
assert total_time < (sleep_time * 2), f"Total time taken: {total_time}s exceeded expected max"
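The rewritten hunks above all rely on the same mocking idiom: because client is now a lazily constructed property on VLLMInferenceAdapter, it is patched on the class with PropertyMock (properties are resolved on the type, so patching an instance attribute would not work), and the fake OpenAI-style client is returned from that property. A small sketch of the idiom against a stand-in Adapter class, not the real one:

    import asyncio
    from unittest.mock import AsyncMock, MagicMock, PropertyMock, patch


    class Adapter:
        """Stand-in for an adapter whose SDK client is built lazily via a property."""

        @property
        def client(self):
            raise RuntimeError("would construct a real AsyncOpenAI client here")


    async def call_chat(adapter: Adapter) -> None:
        await adapter.client.chat.completions.create(model="m", messages=[])


    async def main() -> None:
        # Properties live on the type, so the patch targets the class attribute.
        with patch.object(Adapter, "client", new_callable=PropertyMock) as client_prop:
            fake = MagicMock()
            fake.chat.completions.create = AsyncMock()
            client_prop.return_value = fake

            await call_chat(Adapter())

            fake.chat.completions.create.assert_awaited_once()
            assert fake.chat.completions.create.call_args.kwargs["model"] == "m"


    asyncio.run(main())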

View file

@@ -0,0 +1,53 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.providers.remote.inference.bedrock.bedrock import (
_get_region_prefix,
_to_inference_profile_id,
)
def test_region_prefixes():
assert _get_region_prefix("us-east-1") == "us."
assert _get_region_prefix("eu-west-1") == "eu."
assert _get_region_prefix("ap-south-1") == "ap."
assert _get_region_prefix("ca-central-1") == "us."
# Test case insensitive
assert _get_region_prefix("US-EAST-1") == "us."
assert _get_region_prefix("EU-WEST-1") == "eu."
assert _get_region_prefix("Ap-South-1") == "ap."
# Test None region
assert _get_region_prefix(None) == "us."
def test_model_id_conversion():
# Basic conversion
assert (
_to_inference_profile_id("meta.llama3-1-70b-instruct-v1:0", "us-east-1") == "us.meta.llama3-1-70b-instruct-v1:0"
)
# Already has prefix
assert (
_to_inference_profile_id("us.meta.llama3-1-70b-instruct-v1:0", "us-east-1")
== "us.meta.llama3-1-70b-instruct-v1:0"
)
# ARN should be returned unchanged
arn = "arn:aws:bedrock:us-east-1:123456789012:inference-profile/us.meta.llama3-1-70b-instruct-v1:0"
assert _to_inference_profile_id(arn, "us-east-1") == arn
# ARN should be returned unchanged even without region
assert _to_inference_profile_id(arn) == arn
# Optional region parameter defaults to us-east-1
assert _to_inference_profile_id("meta.llama3-1-70b-instruct-v1:0") == "us.meta.llama3-1-70b-instruct-v1:0"
# Different regions work with optional parameter
assert (
_to_inference_profile_id("meta.llama3-1-70b-instruct-v1:0", "eu-west-1") == "eu.meta.llama3-1-70b-instruct-v1:0"
)
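These new tests pin down the Bedrock helpers purely by their observable behavior: the us., eu. and ap. prefixes are recognized case-insensitively, anything else (including a missing region) falls back to us., and model IDs that are already ARNs or already carry a prefix pass through unchanged. The sketch below is a hypothetical reconstruction written only to satisfy these assertions; the real module may be implemented differently.

    # Hypothetical reconstruction of the helpers exercised above.
    _KNOWN_PREFIXES = ("us", "eu", "ap")


    def _get_region_prefix(region: str | None) -> str:
        if region:
            prefix = region.lower().split("-")[0]
            if prefix in _KNOWN_PREFIXES:
                return f"{prefix}."
        return "us."  # unknown or missing regions fall back to the US prefix


    def _to_inference_profile_id(model_id: str, region: str = "us-east-1") -> str:
        if model_id.startswith("arn:"):
            return model_id  # full ARNs are already unambiguous
        if model_id.startswith(tuple(f"{p}." for p in _KNOWN_PREFIXES)):
            return model_id  # already a cross-region inference profile id
        return _get_region_prefix(region) + model_id


    assert _get_region_prefix("ca-central-1") == "us."
    assert _to_inference_profile_id("meta.llama3-1-70b-instruct-v1:0", "eu-west-1") == "eu.meta.llama3-1-70b-instruct-v1:0"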

View file

@@ -178,3 +178,41 @@ def test_content_from_data_and_mime_type_both_encodings_fail():
# Should raise an exception instead of returning empty string # Should raise an exception instead of returning empty string
with pytest.raises(UnicodeDecodeError): with pytest.raises(UnicodeDecodeError):
content_from_data_and_mime_type(data, mime_type) content_from_data_and_mime_type(data, mime_type)
async def test_memory_tool_error_handling():
"""Test that memory tool handles various failures gracefully without crashing."""
from llama_stack.providers.inline.tool_runtime.rag.config import RagToolRuntimeConfig
from llama_stack.providers.inline.tool_runtime.rag.memory import MemoryToolRuntimeImpl
config = RagToolRuntimeConfig()
memory_tool = MemoryToolRuntimeImpl(
config=config,
vector_io_api=AsyncMock(),
inference_api=AsyncMock(),
files_api=AsyncMock(),
)
docs = [
RAGDocument(document_id="good_doc", content="Good content", metadata={}),
RAGDocument(document_id="bad_url_doc", content=URL(uri="https://bad.url"), metadata={}),
RAGDocument(document_id="another_good_doc", content="Another good content", metadata={}),
]
mock_file1 = MagicMock()
mock_file1.id = "file_good1"
mock_file2 = MagicMock()
mock_file2.id = "file_good2"
memory_tool.files_api.openai_upload_file.side_effect = [mock_file1, mock_file2]
with patch("httpx.AsyncClient") as mock_client:
mock_instance = AsyncMock()
mock_instance.get.side_effect = Exception("Bad URL")
mock_client.return_value.__aenter__.return_value = mock_instance
# won't raise exception despite one document failing
await memory_tool.insert(docs, "vector_store_123")
# processed 2 documents successfully, skipped 1
assert memory_tool.files_api.openai_upload_file.call_count == 2
assert memory_tool.vector_io_api.openai_attach_file_to_vector_store.call_count == 2
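The memory-tool test above checks only the observable outcome: two of the three documents are uploaded and attached, and the bad-URL document is skipped without raising. The pattern it depends on is per-document error isolation during ingestion; a rough sketch with illustrative names (insert_documents, fetch, upload, attach), not the actual MemoryToolRuntimeImpl code:

    import asyncio
    import logging
    from unittest.mock import AsyncMock

    logger = logging.getLogger(__name__)


    async def insert_documents(docs, fetch, upload, attach, vector_store_id: str) -> int:
        """Ingest documents one at a time; a failure on any single document is logged and skipped."""
        inserted = 0
        for doc in docs:
            try:
                content = await fetch(doc)           # may raise, e.g. for an unreachable URL
                file = await upload(content)         # stand-in for files_api.openai_upload_file(...)
                await attach(vector_store_id, file)  # stand-in for openai_attach_file_to_vector_store(...)
                inserted += 1
            except Exception:
                logger.exception("Failed to insert document %s; skipping", getattr(doc, "document_id", doc))
        return inserted


    async def demo() -> None:
        fetch = AsyncMock(side_effect=["good one", Exception("bad URL"), "good two"])
        upload = AsyncMock(side_effect=lambda content: {"id": content})
        attach = AsyncMock()
        assert await insert_documents(["a", "b", "c"], fetch, upload, attach, "vs_123") == 2


    asyncio.run(demo())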

View file

@@ -26,9 +26,9 @@ def test_generate_chunk_id():
chunk_ids = sorted([chunk.chunk_id for chunk in chunks]) chunk_ids = sorted([chunk.chunk_id for chunk in chunks])
assert chunk_ids == [ assert chunk_ids == [
"177a1368-f6a8-0c50-6e92-18677f2c3de3", "31d1f9a3-c8d2-66e7-3c37-af2acd329778",
"bc744db3-1b25-0a9c-cdff-b6ba3df73c36", "d07dade7-29c0-cda7-df29-0249a1dcbc3e",
"f68df25d-d9aa-ab4d-5684-64a233add20d", "d14f75a1-5855-7f72-2c78-d9fc4275a346",
] ]
@@ -36,14 +36,14 @@ def test_generate_chunk_id_with_window():
chunk = Chunk(content="test", metadata={"document_id": "doc-1"}) chunk = Chunk(content="test", metadata={"document_id": "doc-1"})
chunk_id1 = generate_chunk_id("doc-1", chunk, chunk_window="0-1") chunk_id1 = generate_chunk_id("doc-1", chunk, chunk_window="0-1")
chunk_id2 = generate_chunk_id("doc-1", chunk, chunk_window="1-2") chunk_id2 = generate_chunk_id("doc-1", chunk, chunk_window="1-2")
assert chunk_id1 == "149018fe-d0eb-0f8d-5f7f-726bdd2aeedb" assert chunk_id1 == "8630321a-d9cb-2bb6-cd28-ebf68dafd866"
assert chunk_id2 == "4562c1ee-9971-1f3b-51a6-7d05e5211154" assert chunk_id2 == "13a1c09a-cbda-b61a-2d1a-7baa90888685"
def test_chunk_id(): def test_chunk_id():
# Test with existing chunk ID # Test with existing chunk ID
chunk_with_id = Chunk(content="test", metadata={"document_id": "existing-id"}) chunk_with_id = Chunk(content="test", metadata={"document_id": "existing-id"})
assert chunk_with_id.chunk_id == "84ededcc-b80b-a83e-1a20-ca6515a11350" assert chunk_with_id.chunk_id == "11704f92-42b6-61df-bf85-6473e7708fbd"
# Test with document ID in metadata # Test with document ID in metadata
chunk_with_doc_id = Chunk(content="test", metadata={"document_id": "doc-1"}) chunk_with_doc_id = Chunk(content="test", metadata={"document_id": "doc-1"})
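The expected chunk IDs change here because each ID is a deterministic digest of the chunk's identity (document ID, content, and optional chunk window), so any change to the hashing inputs or scheme shifts every golden value at once. The sketch below only illustrates the deterministic-ID idea with uuid5; it is not the project's generate_chunk_id and will not reproduce the values asserted above.

    import uuid


    def deterministic_chunk_id(document_id: str, content: str, chunk_window: str | None = None) -> str:
        """Illustrative only: derive a stable, UUID-shaped id from the chunk's identity fields."""
        key = f"{document_id}:{content}"
        if chunk_window:
            key += f":{chunk_window}"
        # uuid5 is deterministic for the same namespace and name, so equal inputs give equal ids
        return str(uuid.uuid5(uuid.NAMESPACE_DNS, key))


    assert deterministic_chunk_id("doc-1", "test") == deterministic_chunk_id("doc-1", "test")
    assert deterministic_chunk_id("doc-1", "test", "0-1") != deterministic_chunk_id("doc-1", "test", "1-2")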

View file

@@ -65,6 +65,9 @@ async def test_inference_store_pagination_basic():
input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")] input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")]
await store.store_chat_completion(completion, input_messages) await store.store_chat_completion(completion, input_messages)
# Wait for all queued writes to complete
await store.flush()
# Test 1: First page with limit=2, descending order (default) # Test 1: First page with limit=2, descending order (default)
result = await store.list_chat_completions(limit=2, order=Order.desc) result = await store.list_chat_completions(limit=2, order=Order.desc)
assert len(result.data) == 2 assert len(result.data) == 2
@@ -108,6 +111,9 @@ async def test_inference_store_pagination_ascending():
input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")] input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")]
await store.store_chat_completion(completion, input_messages) await store.store_chat_completion(completion, input_messages)
# Wait for all queued writes to complete
await store.flush()
# Test ascending order pagination # Test ascending order pagination
result = await store.list_chat_completions(limit=1, order=Order.asc) result = await store.list_chat_completions(limit=1, order=Order.asc)
assert len(result.data) == 1 assert len(result.data) == 1
@@ -143,6 +149,9 @@ async def test_inference_store_pagination_with_model_filter():
input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")] input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")]
await store.store_chat_completion(completion, input_messages) await store.store_chat_completion(completion, input_messages)
# Wait for all queued writes to complete
await store.flush()
# Test pagination with model filter # Test pagination with model filter
result = await store.list_chat_completions(limit=1, model="model-a", order=Order.desc) result = await store.list_chat_completions(limit=1, model="model-a", order=Order.desc)
assert len(result.data) == 1 assert len(result.data) == 1
@@ -190,6 +199,9 @@ async def test_inference_store_pagination_no_limit():
input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")] input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")]
await store.store_chat_completion(completion, input_messages) await store.store_chat_completion(completion, input_messages)
# Wait for all queued writes to complete
await store.flush()
# Test without limit # Test without limit
result = await store.list_chat_completions(order=Order.desc) result = await store.list_chat_completions(order=Order.desc)
assert len(result.data) == 2 assert len(result.data) == 2
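Each of these hunks adds the same await store.flush() before reading results back, which indicates that store_chat_completion now enqueues writes for a background worker instead of committing them inline, so the tests must drain the backlog before asserting. A generic queue-plus-flush shape is sketched below; QueuedWriter is illustrative, not the InferenceStore implementation.

    import asyncio


    class QueuedWriter:
        """Writes are enqueued and applied by a background task; flush() waits out the backlog."""

        def __init__(self) -> None:
            self._queue: asyncio.Queue = asyncio.Queue()
            self._rows: list[dict] = []
            self._worker = asyncio.create_task(self._drain())

        async def _drain(self) -> None:
            while True:
                row = await self._queue.get()
                self._rows.append(row)      # stand-in for the real database write
                self._queue.task_done()

        async def store(self, row: dict) -> None:
            await self._queue.put(row)      # returns before the write has actually landed

        async def flush(self) -> None:
            await self._queue.join()        # blocks until every queued write has been processed


    async def main() -> None:
        store = QueuedWriter()
        await store.store({"id": "chatcmpl-1"})
        await store.flush()                 # without this, a read could race the background write
        assert store._rows == [{"id": "chatcmpl-1"}]


    asyncio.run(main())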

uv.lock generated
View file

@@ -1,5 +1,5 @@
version = 1 version = 1
revision = 3 revision = 2
requires-python = ">=3.12" requires-python = ">=3.12"
resolution-markers = [ resolution-markers = [
"(python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.13' and sys_platform != 'darwin' and sys_platform != 'linux')", "(python_full_version >= '3.13' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version >= '3.13' and sys_platform != 'darwin' and sys_platform != 'linux')",
@@ -1839,7 +1839,6 @@ test = [
{ name = "datasets" }, { name = "datasets" },
{ name = "mcp" }, { name = "mcp" },
{ name = "milvus-lite" }, { name = "milvus-lite" },
{ name = "openai" },
{ name = "psycopg2-binary" }, { name = "psycopg2-binary" },
{ name = "pymilvus" }, { name = "pymilvus" },
{ name = "pypdf" }, { name = "pypdf" },
@@ -1865,7 +1864,6 @@ unit = [
{ name = "milvus-lite" }, { name = "milvus-lite" },
{ name = "moto", extra = ["s3"] }, { name = "moto", extra = ["s3"] },
{ name = "ollama" }, { name = "ollama" },
{ name = "openai" },
{ name = "psycopg2-binary" }, { name = "psycopg2-binary" },
{ name = "pymilvus" }, { name = "pymilvus" },
{ name = "pypdf" }, { name = "pypdf" },
@@ -1889,7 +1887,7 @@ requires-dist = [
{ name = "jsonschema" }, { name = "jsonschema" },
{ name = "llama-stack-client", specifier = ">=0.2.21" }, { name = "llama-stack-client", specifier = ">=0.2.21" },
{ name = "llama-stack-client", marker = "extra == 'ui'", specifier = ">=0.2.21" }, { name = "llama-stack-client", marker = "extra == 'ui'", specifier = ">=0.2.21" },
{ name = "openai", specifier = ">=1.99.6" }, { name = "openai", specifier = ">=1.100.0" },
{ name = "opentelemetry-exporter-otlp-proto-http", specifier = ">=1.30.0" }, { name = "opentelemetry-exporter-otlp-proto-http", specifier = ">=1.30.0" },
{ name = "opentelemetry-sdk", specifier = ">=1.30.0" }, { name = "opentelemetry-sdk", specifier = ">=1.30.0" },
{ name = "pandas", marker = "extra == 'ui'" }, { name = "pandas", marker = "extra == 'ui'" },
@@ -1959,7 +1957,6 @@ test = [
{ name = "datasets", specifier = ">=4.0.0" }, { name = "datasets", specifier = ">=4.0.0" },
{ name = "mcp" }, { name = "mcp" },
{ name = "milvus-lite", specifier = ">=2.5.0" }, { name = "milvus-lite", specifier = ">=2.5.0" },
{ name = "openai", specifier = ">=1.100.0" },
{ name = "psycopg2-binary", specifier = ">=2.9.0" }, { name = "psycopg2-binary", specifier = ">=2.9.0" },
{ name = "pymilvus", specifier = ">=2.6.1" }, { name = "pymilvus", specifier = ">=2.6.1" },
{ name = "pypdf" }, { name = "pypdf" },
@@ -1984,7 +1981,6 @@ unit = [
{ name = "milvus-lite", specifier = ">=2.5.0" }, { name = "milvus-lite", specifier = ">=2.5.0" },
{ name = "moto", extras = ["s3"], specifier = ">=5.1.10" }, { name = "moto", extras = ["s3"], specifier = ">=5.1.10" },
{ name = "ollama" }, { name = "ollama" },
{ name = "openai" },
{ name = "psycopg2-binary", specifier = ">=2.9.0" }, { name = "psycopg2-binary", specifier = ">=2.9.0" },
{ name = "pymilvus", specifier = ">=2.6.1" }, { name = "pymilvus", specifier = ">=2.6.1" },
{ name = "pypdf" }, { name = "pypdf" },
@@ -2023,7 +2019,7 @@ wheels = [
[[package]] [[package]]
name = "locust" name = "locust"
version = "2.39.1" version = "2.40.1"
source = { registry = "https://pypi.org/simple" } source = { registry = "https://pypi.org/simple" }
dependencies = [ dependencies = [
{ name = "configargparse" }, { name = "configargparse" },
@@ -2035,6 +2031,7 @@ dependencies = [
{ name = "locust-cloud" }, { name = "locust-cloud" },
{ name = "msgpack" }, { name = "msgpack" },
{ name = "psutil" }, { name = "psutil" },
{ name = "pytest" },
{ name = "python-engineio" }, { name = "python-engineio" },
{ name = "python-socketio", extra = ["client"] }, { name = "python-socketio", extra = ["client"] },
{ name = "pywin32", marker = "sys_platform == 'win32'" }, { name = "pywin32", marker = "sys_platform == 'win32'" },
@@ -2043,9 +2040,9 @@ dependencies = [
{ name = "setuptools" }, { name = "setuptools" },
{ name = "werkzeug" }, { name = "werkzeug" },
] ]
sdist = { url = "https://files.pythonhosted.org/packages/95/c8/10aa5445c404eed389b56877e6714c1787190cc09dd70059ce3765979ec5/locust-2.39.1.tar.gz", hash = "sha256:6bdd19e27edf9a1c84391d6cf6e9a737dfb832be7dfbf39053191ae31b9cc498", size = 1409902, upload-time = "2025-08-29T17:41:01.544Z" } sdist = { url = "https://files.pythonhosted.org/packages/01/22/82f40176473a98c9479bed667d3ad21bb859d2cb67f6880a6b0b6a725e45/locust-2.40.1.tar.gz", hash = "sha256:5bde76c1cf7e412071670f926f34844e119210c93f07a4cf9fc4cb93c60a578a", size = 1411606, upload-time = "2025-09-05T15:57:35.76Z" }
wheels = [ wheels = [
{ url = "https://files.pythonhosted.org/packages/ec/b3/b2f4b2ca88b1e72eba7be2b2982533b887f8b709d222db78eb9602aa5121/locust-2.39.1-py3-none-any.whl", hash = "sha256:fd5148f2f1a4ed34aee968abc4393674e69d1b5e1b54db50a397f6eb09ce0b04", size = 1428155, upload-time = "2025-08-29T17:41:00.245Z" }, { url = "https://files.pythonhosted.org/packages/3b/e6/9c6335ab16becf4f8ad3da6083ab78793c56ec1ca496d6f7c74660c21c3f/locust-2.40.1-py3-none-any.whl", hash = "sha256:ef0517f9bb5ed0afa7035014faaf944802917e07da8649461aaaf5e5f3ba8a65", size = 1430154, upload-time = "2025-09-05T15:57:33.233Z" },
] ]
[[package]] [[package]]
@@ -2619,7 +2616,7 @@ wheels = [
[[package]] [[package]]
name = "openai" name = "openai"
version = "1.102.0" version = "1.107.0"
source = { registry = "https://pypi.org/simple" } source = { registry = "https://pypi.org/simple" }
dependencies = [ dependencies = [
{ name = "anyio" }, { name = "anyio" },
@@ -2631,9 +2628,9 @@ dependencies = [
{ name = "tqdm" }, { name = "tqdm" },
{ name = "typing-extensions" }, { name = "typing-extensions" },
] ]
sdist = { url = "https://files.pythonhosted.org/packages/07/55/da5598ed5c6bdd9939633854049cddc5cbac0da938dfcfcb3c6b119c16c0/openai-1.102.0.tar.gz", hash = "sha256:2e0153bcd64a6523071e90211cbfca1f2bbc5ceedd0993ba932a5869f93b7fc9", size = 519027, upload-time = "2025-08-26T20:50:29.397Z" } sdist = { url = "https://files.pythonhosted.org/packages/88/67/d6498de300f83ff57a79cb7aa96ef3bef8d6f070c3ded0f1b5b45442a6bc/openai-1.107.0.tar.gz", hash = "sha256:43e04927584e57d0e9e640ee0077c78baf8150098be96ebd5c512539b6c4e9a4", size = 566056, upload-time = "2025-09-08T19:25:47.604Z" }
wheels = [ wheels = [
{ url = "https://files.pythonhosted.org/packages/bd/0d/c9e7016d82c53c5b5e23e2bad36daebb8921ed44f69c0a985c6529a35106/openai-1.102.0-py3-none-any.whl", hash = "sha256:d751a7e95e222b5325306362ad02a7aa96e1fab3ed05b5888ce1c7ca63451345", size = 812015, upload-time = "2025-08-26T20:50:27.219Z" }, { url = "https://files.pythonhosted.org/packages/91/ed/e8a4fd20390f2858b95227c288df8fe0c835f7c77625f7583609161684ba/openai-1.107.0-py3-none-any.whl", hash = "sha256:3dcfa3cbb116bd6924b27913b8da28c4a787379ff60049588547a1013e6d6438", size = 950968, upload-time = "2025-09-08T19:25:45.552Z" },
] ]
[[package]] [[package]]
@ -3540,7 +3537,7 @@ wheels = [
[[package]] [[package]]
name = "pytest" name = "pytest"
version = "8.4.1" version = "8.4.2"
source = { registry = "https://pypi.org/simple" } source = { registry = "https://pypi.org/simple" }
dependencies = [ dependencies = [
{ name = "colorama", marker = "sys_platform == 'win32'" }, { name = "colorama", marker = "sys_platform == 'win32'" },
@@ -3549,9 +3546,9 @@ dependencies = [
{ name = "pluggy" }, { name = "pluggy" },
{ name = "pygments" }, { name = "pygments" },
] ]
sdist = { url = "https://files.pythonhosted.org/packages/08/ba/45911d754e8eba3d5a841a5ce61a65a685ff1798421ac054f85aa8747dfb/pytest-8.4.1.tar.gz", hash = "sha256:7c67fd69174877359ed9371ec3af8a3d2b04741818c51e5e99cc1742251fa93c", size = 1517714, upload-time = "2025-06-18T05:48:06.109Z" } sdist = { url = "https://files.pythonhosted.org/packages/a3/5c/00a0e072241553e1a7496d638deababa67c5058571567b92a7eaa258397c/pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01", size = 1519618, upload-time = "2025-09-04T14:34:22.711Z" }
wheels = [ wheels = [
{ url = "https://files.pythonhosted.org/packages/29/16/c8a903f4c4dffe7a12843191437d7cd8e32751d5de349d45d3fe69544e87/pytest-8.4.1-py3-none-any.whl", hash = "sha256:539c70ba6fcead8e78eebbf1115e8b589e7565830d7d006a8723f19ac8a0afb7", size = 365474, upload-time = "2025-06-18T05:48:03.955Z" }, { url = "https://files.pythonhosted.org/packages/a8/a4/20da314d277121d6534b3a980b29035dcd51e6744bd79075a6ce8fa4eb8d/pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79", size = 365750, upload-time = "2025-09-04T14:34:20.226Z" },
] ]
[[package]] [[package]]