chore: refactor server.main (#3462)

# What does this PR do?

As shown in #3421, we can scale the stack to handle more RPS with k8s replicas. This PR enables a multi-process stack with `uvicorn --workers` so that we can achieve the same scaling without being in k8s.

To achieve that, we refactor `main` to split out the app construction logic. This app-construction method needs to be non-async. We created a new `Stack` class to house the impls; it has a `start()` method that is called in the lifespan to start background tasks, instead of starting them in the old `construct_stack`. This way we avoid having to manage an event loop manually.

## Test Plan

CI

> uv run --with llama-stack python -m llama_stack.core.server.server benchmarking/k8s-benchmark/stack_run_config.yaml

works.

> LLAMA_STACK_CONFIG=benchmarking/k8s-benchmark/stack_run_config.yaml uv run uvicorn llama_stack.core.server.server:create_app --port 8321 --workers 4

works.
2025-12-03 18:00:36 +00:00 · 2025-09-18 21:11:13 -07:00 · 2025-09-18 21:11:13 -07:00 · 4c2fcb6b51
commit 4c2fcb6b51
parent 8422bd102a
7 changed files with 233 additions and 146 deletions
--- a/benchmarking/k8s-benchmark/apply.sh
+++ b/benchmarking/k8s-benchmark/apply.sh
@@ -17,11 +17,8 @@ export POSTGRES_PASSWORD=llamastack
 export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

-export MOCK_INFERENCE_MODEL=mock-inference
-
-export MOCK_INFERENCE_URL=openai-mock-service:8080
-
 export BENCHMARK_INFERENCE_MODEL=$INFERENCE_MODEL
+export LLAMA_STACK_WORKERS=4

 set -euo pipefail
 set -x
--- a/benchmarking/k8s-benchmark/stack-configmap.yaml
+++ b/benchmarking/k8s-benchmark/stack-configmap.yaml
@@ -5,6 +5,7 @@ data:
    image_name: kubernetes-benchmark-demo
    apis:
    - agents
+    - files
    - inference
    - files
    - safety
@@ -23,6 +24,14 @@ data:
      - provider_id: sentence-transformers
        provider_type: inline::sentence-transformers
        config: {}
+      files:
+      - provider_id: meta-reference-files
+        provider_type: inline::localfs
+        config:
+          storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
+          metadata_store:
+            type: sqlite
+            db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
      vector_io:
      - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
        provider_type: remote::chromadb
--- a/benchmarking/k8s-benchmark/stack-k8s.yaml.template
+++ b/benchmarking/k8s-benchmark/stack-k8s.yaml.template
@@ -52,9 +52,20 @@ spec:
          value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
        - name: VLLM_TLS_VERIFY
          value: "false"
-        command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8323"]
+        - name: LLAMA_STACK_LOGGING
+          value: "all=WARNING"
+        - name: LLAMA_STACK_CONFIG
+          value: "/etc/config/stack_run_config.yaml"
+        - name: LLAMA_STACK_WORKERS
+          value: "${LLAMA_STACK_WORKERS}"
+        command: ["uvicorn", "llama_stack.core.server.server:create_app", "--host", "0.0.0.0", "--port", "8323", "--workers", "$LLAMA_STACK_WORKERS", "--factory"]
        ports:
          - containerPort: 8323
+        resources:
+          requests:
+            cpu: "${LLAMA_STACK_WORKERS}"
+          limits:
+            cpu: "${LLAMA_STACK_WORKERS}"
        volumeMounts:
          - name: llama-storage
            mountPath: /root/.llama