chore: refactor server.main

# What does this PR do?
Refactor `main` to split out app construction so that we can use `uvicorn --workers` to run a multi-process stack.
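At a high level, the entry point becomes an app factory: uvicorn imports `create_app`, calls it once per worker process, and each worker builds its own app instance from environment-provided config. A minimal sketch of the shape (the body, the FastAPI usage, and the env-var handling here are illustrative assumptions, not the actual implementation):

```python
# Sketch only: the real create_app lives in llama_stack.core.server.server
# and its internals differ; this just illustrates the factory pattern.
import os

from fastapi import FastAPI


def create_app() -> FastAPI:
    # Workers are separate processes, so configuration must come from the
    # environment (e.g. LLAMA_STACK_CONFIG) rather than argparse in main().
    config_path = os.environ.get("LLAMA_STACK_CONFIG", "run.yaml")
    app = FastAPI()
    # ... build routes/middleware/providers from the config at config_path ...
    return app
```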


## Test Plan
CI

> uv run --with llama-stack python -m llama_stack.core.server.server benchmarking/k8s-benchmark/stack_run_config.yaml

works.

> LLAMA_STACK_CONFIG=benchmarking/k8s-benchmark/stack_run_config.yaml uv run uvicorn llama_stack.core.server.server:create_app --port 8321 --workers 4

works.
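For completeness, the same factory can be driven programmatically via `uvicorn.run`; the sketch below mirrors the CLI invocation above. `factory=True` tells uvicorn to call `create_app` rather than treat it as an app object (recent uvicorn versions also auto-detect factories with a warning, which is likely why the command above works without `--factory`):

```python
# Programmatic equivalent of the uvicorn CLI invocation above (sketch).
import os

import uvicorn

if __name__ == "__main__":
    os.environ["LLAMA_STACK_CONFIG"] = "benchmarking/k8s-benchmark/stack_run_config.yaml"
    uvicorn.run(
        "llama_stack.core.server.server:create_app",
        port=8321,
        workers=4,      # multi-process; requires the import-string form above
        factory=True,   # call create_app() in each worker to build the app
    )
```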
Eric Huang 2025-09-17 12:24:13 -07:00
parent ac1414b571
commit a285f9c95f
7 changed files with 233 additions and 146 deletions


@@ -52,9 +52,20 @@ spec:
             value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
           - name: VLLM_TLS_VERIFY
             value: "false"
-        command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8323"]
+          - name: LLAMA_STACK_LOGGING
+            value: "all=WARNING"
+          - name: LLAMA_STACK_CONFIG
+            value: "/etc/config/stack_run_config.yaml"
+          - name: LLAMA_STACK_WORKERS
+            value: "${LLAMA_STACK_WORKERS}"
+        command: ["uvicorn", "llama_stack.core.server.server:create_app", "--host", "0.0.0.0", "--port", "8323", "--workers", "$LLAMA_STACK_WORKERS", "--factory"]
         ports:
           - containerPort: 8323
+        resources:
+          requests:
+            cpu: "${LLAMA_STACK_WORKERS}"
+          limits:
+            cpu: "${LLAMA_STACK_WORKERS}"
         volumeMounts:
         - name: llama-storage
           mountPath: /root/.llama
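The `${LLAMA_STACK_WORKERS}` placeholders suggest the manifest is rendered before being applied; assuming envsubst-style templating (an assumption — the rendering step and filename are not shown in this diff), deployment would look something like:

> export LLAMA_STACK_WORKERS=4
> envsubst < stack-k8s.yaml | kubectl apply -f -

Tying the CPU request and limit to the worker count gives each uvicorn worker roughly one core.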