Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-04 04:04:14 +00:00)
chore: introduce write queue for inference_store (#3383)
# What does this PR do?

Adds a write worker queue for writes to the inference store. This avoids overwhelming request processing with slow inference writes.

## Test Plan

Benchmark:

```
cd docs/source/distributions/k8s-benchmark

# start mock server
python openai-mock-server.py --port 8000

# start stack server
LLAMA_STACK_LOGGING="all=WARNING" uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml

# run benchmark script
uv run python3 benchmark.py --duration 120 --concurrent 50 --base-url=http://localhost:8321/v1/openai/v1 --model=vllm-inference/meta-llama/Llama-3.2-3B-Instruct
```

## Results

RPS improved from 21 to 57.
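The write queue itself does not appear in the diff excerpt below, which only touches the benchmark script and its run config. As a rough illustration of the pattern the summary describes, here is a minimal sketch (not the actual llama-stack implementation), assuming a store object that exposes an async `write(record)` coroutine; `QueuedInferenceWriter`, `enqueue`, `queue_size`, and `num_workers` are hypothetical names:

```python
import asyncio
from typing import Any


class QueuedInferenceWriter:
    """Sketch of a write queue: request handlers enqueue records and return
    immediately, while background workers perform the slow store writes."""

    def __init__(self, store: Any, queue_size: int = 1000, num_workers: int = 4) -> None:
        self._store = store  # assumed to expose an async write(record) coroutine
        self._queue: asyncio.Queue = asyncio.Queue(maxsize=queue_size)
        self._num_workers = num_workers
        self._workers: list[asyncio.Task] = []

    async def start(self) -> None:
        # Spawn the background writer tasks; call once, inside a running event loop.
        self._workers = [asyncio.create_task(self._drain()) for _ in range(self._num_workers)]

    async def enqueue(self, record: dict) -> None:
        # Request path: hand the record off and return without waiting on the database.
        await self._queue.put(record)

    async def _drain(self) -> None:
        # Background path: pull records off the queue and write them to the store.
        while True:
            record = await self._queue.get()
            try:
                await self._store.write(record)
            except Exception as exc:  # one failed write should not kill the worker
                print(f"inference store write failed: {exc}")
            finally:
                self._queue.task_done()

    async def stop(self) -> None:
        # Flush anything still queued, then cancel the workers.
        await self._queue.join()
        for task in self._workers:
            task.cancel()
        await asyncio.gather(*self._workers, return_exceptions=True)
```

In this sketch the request path only awaits `enqueue`, so a slow database write cannot stall response handling, and the bounded queue applies backpressure if writes fall too far behind.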
parent e6edc1f934
commit e980436a2e
7 changed files with 139 additions and 22 deletions
docs/source/distributions/k8s-benchmark/benchmark.py

```diff
@@ -58,14 +58,6 @@ class BenchmarkStats:
         print(f"\n{'='*60}")
         print(f"BENCHMARK RESULTS")
         print(f"{'='*60}")
-        print(f"Total time: {total_time:.2f}s")
-        print(f"Concurrent users: {self.concurrent_users}")
-        print(f"Total requests: {self.total_requests}")
-        print(f"Successful requests: {self.success_count}")
-        print(f"Failed requests: {len(self.errors)}")
-        print(f"Success rate: {success_rate:.1f}%")
-        print(f"Requests per second: {self.success_count / total_time:.2f}")
-
         print(f"\nResponse Time Statistics:")
         print(f" Mean: {statistics.mean(self.response_times):.3f}s")
@@ -106,6 +98,15 @@ class BenchmarkStats:
             print(f" Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
             print(f" Total chunks received: {sum(self.chunks_received)}")
 
+        print(f"{'='*60}")
+        print(f"Total time: {total_time:.2f}s")
+        print(f"Concurrent users: {self.concurrent_users}")
+        print(f"Total requests: {self.total_requests}")
+        print(f"Successful requests: {self.success_count}")
+        print(f"Failed requests: {len(self.errors)}")
+        print(f"Success rate: {success_rate:.1f}%")
+        print(f"Requests per second: {self.success_count / total_time:.2f}")
+
         if self.errors:
             print(f"\nErrors (showing first 5):")
             for error in self.errors[:5]:
@@ -215,7 +216,7 @@ class LlamaStackBenchmark:
                 await asyncio.sleep(1)  # Report every second
                 if time.time() >= last_report_time + 10:  # Report every 10 seconds
                     elapsed = time.time() - stats.start_time
-                    print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s")
+                    print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s, RPS: {stats.total_requests / elapsed:.1f}")
                     last_report_time = time.time()
             except asyncio.CancelledError:
                 break
```
docs/source/distributions/k8s-benchmark/stack_run_config.yaml

```diff
@@ -2,6 +2,7 @@ version: '2'
 image_name: kubernetes-benchmark-demo
 apis:
 - agents
+- files
 - inference
 - safety
@@ -20,6 +21,14 @@ providers:
   - provider_id: sentence-transformers
     provider_type: inline::sentence-transformers
     config: {}
+  files:
+  - provider_id: meta-reference-files
+    provider_type: inline::localfs
+    config:
+      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
+      metadata_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
   vector_io:
   - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
     provider_type: remote::chromadb
```