mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-10-08 21:04:39 +00:00
chore: refactor server.main
# What does this PR do? Refactor main to split out the app construction so that we can use `uvicorn --workers` to enable multi-process stack. ## Test Plan CI > uv run --with llama-stack python -m llama_stack.core.server.server benchmarking/k8s-benchmark/stack_run_config.yaml works. > LLAMA_STACK_CONFIG=benchmarking/k8s-benchmark/stack_run_config.yaml uv run uvicorn llama_stack.core.server.server:create_app --port 8321 --workers 4 works.
This commit is contained in:
parent
ac1414b571
commit
a285f9c95f
7 changed files with 233 additions and 146 deletions
|
@ -52,9 +52,20 @@ spec:
|
|||
value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
|
||||
- name: VLLM_TLS_VERIFY
|
||||
value: "false"
|
||||
command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8323"]
|
||||
- name: LLAMA_STACK_LOGGING
|
||||
value: "all=WARNING"
|
||||
- name: LLAMA_STACK_CONFIG
|
||||
value: "/etc/config/stack_run_config.yaml"
|
||||
- name: LLAMA_STACK_WORKERS
|
||||
value: "${LLAMA_STACK_WORKERS}"
|
||||
command: ["uvicorn", "llama_stack.core.server.server:create_app", "--host", "0.0.0.0", "--port", "8323", "--workers", "$LLAMA_STACK_WORKERS", "--factory"]
|
||||
ports:
|
||||
- containerPort: 8323
|
||||
resources:
|
||||
requests:
|
||||
cpu: "${LLAMA_STACK_WORKERS}"
|
||||
limits:
|
||||
cpu: "${LLAMA_STACK_WORKERS}"
|
||||
volumeMounts:
|
||||
- name: llama-storage
|
||||
mountPath: /root/.llama
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue