Merge branch 'main' into aslib-useful-init-error

2025-12-25 07:38:03 +00:00 · 2025-07-29 17:42:45 -04:00 · 2025-07-29 17:42:45 -04:00 · 6092341d90
commit 6092341d90
parent c78ea22ff1 81c7d6fa2e
106 changed files with 34234 additions and 1610 deletions
--- a/llama_stack/distribution/build_container.sh
+++ b/llama_stack/distribution/build_container.sh
@ -18,10 +18,6 @@ UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT:-500}

 # mounting is not supported by docker buildx, so we use COPY instead
 USE_COPY_NOT_MOUNT=${USE_COPY_NOT_MOUNT:-}
-
-# Mount command for cache container .cache, can be overridden by the user if needed
-MOUNT_CACHE=${MOUNT_CACHE:-"--mount=type=cache,id=llama-stack-cache,target=/root/.cache"}
-
 # Path to the run.yaml file in the container
 RUN_CONFIG_PATH=/app/run.yaml

@ -176,18 +172,13 @@ RUN pip install uv
 EOF
 fi

-# Set the link mode to copy so that uv doesn't attempt to symlink to the cache directory
-add_to_container << EOF
-ENV UV_LINK_MODE=copy
-EOF
-
 # Add pip dependencies first since llama-stack is what will change most often
 # so we can reuse layers.
 if [ -n "$normal_deps" ]; then
  read -ra pip_args <<<  "$normal_deps"
  quoted_deps=$(printf " %q" "${pip_args[@]}")
  add_to_container << EOF
-RUN $MOUNT_CACHE uv pip install $quoted_deps
+RUN uv pip install --no-cache $quoted_deps
 EOF
 fi

@ -197,7 +188,7 @@ if [ -n "$optional_deps" ]; then
    read -ra pip_args <<< "$part"
    quoted_deps=$(printf " %q" "${pip_args[@]}")
    add_to_container <<EOF
-RUN $MOUNT_CACHE uv pip install $quoted_deps
+RUN uv pip install --no-cache $quoted_deps
 EOF
  done
 fi
@ -208,10 +199,10 @@ if [ -n "$external_provider_deps" ]; then
    read -ra pip_args <<< "$part"
    quoted_deps=$(printf " %q" "${pip_args[@]}")
    add_to_container <<EOF
-RUN $MOUNT_CACHE uv pip install $quoted_deps
+RUN uv pip install --no-cache $quoted_deps
 EOF
    add_to_container <<EOF
-RUN python3 - <<PYTHON | $MOUNT_CACHE uv pip install -r -
+RUN python3 - <<PYTHON | uv pip install --no-cache -r -
 import importlib
 import sys

@ -293,7 +284,7 @@ COPY $dir $mount_point
 EOF
  fi
  add_to_container << EOF
-RUN $MOUNT_CACHE uv pip install -e $mount_point
+RUN uv pip install --no-cache -e $mount_point
 EOF
 }

@ -308,10 +299,10 @@ else
  if [ -n "$TEST_PYPI_VERSION" ]; then
    # these packages are damaged in test-pypi, so install them first
    add_to_container << EOF
-RUN $MOUNT_CACHE uv pip install fastapi libcst
+RUN uv pip install --no-cache fastapi libcst
 EOF
    add_to_container << EOF
-RUN $MOUNT_CACHE uv pip install --extra-index-url https://test.pypi.org/simple/ \
+RUN uv pip install --no-cache --extra-index-url https://test.pypi.org/simple/ \
  --index-strategy unsafe-best-match \
  llama-stack==$TEST_PYPI_VERSION

@ -323,7 +314,7 @@ EOF
      SPEC_VERSION="llama-stack"
    fi
    add_to_container << EOF
-RUN $MOUNT_CACHE uv pip install $SPEC_VERSION
+RUN uv pip install --no-cache $SPEC_VERSION
 EOF
  fi
 fi
--- a/llama_stack/distribution/routers/inference.py
+++ b/llama_stack/distribution/routers/inference.py
@ -79,11 +79,9 @@ class InferenceRouter(Inference):

    async def initialize(self) -> None:
        logger.debug("InferenceRouter.initialize")
-        pass

    async def shutdown(self) -> None:
        logger.debug("InferenceRouter.shutdown")
-        pass

    async def register_model(
        self,
--- a/llama_stack/distribution/stack.py
+++ b/llama_stack/distribution/stack.py
@ -94,6 +94,7 @@ RESOURCES = [

 REGISTRY_REFRESH_INTERVAL_SECONDS = 300
 REGISTRY_REFRESH_TASK = None
+TEST_RECORDING_CONTEXT = None


 async def register_resources(run_config: StackRunConfig, impls: dict[Api, Any]):
@ -307,6 +308,15 @@ def add_internal_implementations(impls: dict[Api, Any], run_config: StackRunConf
 async def construct_stack(
    run_config: StackRunConfig, provider_registry: ProviderRegistry | None = None
 ) -> dict[Api, Any]:
+    if "LLAMA_STACK_TEST_INFERENCE_MODE" in os.environ:
+        from llama_stack.testing.inference_recorder import setup_inference_recording
+
+        global TEST_RECORDING_CONTEXT
+        TEST_RECORDING_CONTEXT = setup_inference_recording()
+        if TEST_RECORDING_CONTEXT:
+            TEST_RECORDING_CONTEXT.__enter__()
+            logger.info(f"Inference recording enabled: mode={os.environ.get('LLAMA_STACK_TEST_INFERENCE_MODE')}")
+
    dist_registry, _ = await create_dist_registry(run_config.metadata_store, run_config.image_name)
    policy = run_config.server.auth.access_policy if run_config.server.auth else []
    impls = await resolve_impls(
@ -352,6 +362,13 @@ async def shutdown_stack(impls: dict[Api, Any]):
        except (Exception, asyncio.CancelledError) as e:
            logger.exception(f"Failed to shutdown {impl_name}: {e}")

+    global TEST_RECORDING_CONTEXT
+    if TEST_RECORDING_CONTEXT:
+        try:
+            TEST_RECORDING_CONTEXT.__exit__(None, None, None)
+        except Exception as e:
+            logger.error(f"Error during inference recording cleanup: {e}")
+
    global REGISTRY_REFRESH_TASK
    if REGISTRY_REFRESH_TASK:
        REGISTRY_REFRESH_TASK.cancel()