chore(package): migrate to src/ layout (#3920)

Migrates package structure to src/ layout following Python packaging best practices. All code moved from `llama_stack/` to `src/llama_stack/`. Public API unchanged - imports remain `import llama_stack.*`. Updated build configs, pre-commit hooks, scripts, and GitHub workflows accordingly. All hooks pass, package builds cleanly. **Developer note**: Reinstall after pulling: `pip install -e .`
2025-12-03 09:53:45 +00:00 · 2025-10-27 12:02:21 -07:00 · 2025-10-27 12:02:21 -07:00 · 471b1b248b
commit 471b1b248b
parent 98a5047f9d
791 changed files with 2983 additions and 456 deletions
--- a/src/llama_stack/core/server/init.py
+++ b/src/llama_stack/core/server/init.py
@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
--- a/src/llama_stack/core/server/auth.py
+++ b/src/llama_stack/core/server/auth.py
@ -0,0 +1,187 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import json
+
+import httpx
+from aiohttp import hdrs
+
+from llama_stack.core.datatypes import AuthenticationConfig, User
+from llama_stack.core.request_headers import user_from_scope
+from llama_stack.core.server.auth_providers import create_auth_provider
+from llama_stack.core.server.routes import find_matching_route, initialize_route_impls
+from llama_stack.log import get_logger
+
+logger = get_logger(name=__name__, category="core::auth")
+
+
+class AuthenticationMiddleware:
+    """Middleware that authenticates requests using configured authentication provider.
+
+    This middleware:
+    1. Extracts the Bearer token from the Authorization header
+    2. Uses the configured auth provider to validate the token
+    3. Extracts user attributes from the provider's response
+    4. Makes these attributes available to the route handlers for access control
+
+    Unauthenticated Access:
+    Endpoints can opt out of authentication by setting require_authentication=False
+    in their @webmethod decorator. This is typically used for operational endpoints
+    like /health and /version to support monitoring, load balancers, and observability tools.
+
+    The middleware supports multiple authentication providers through the AuthProvider interface:
+    - Kubernetes: Validates tokens against the Kubernetes API server
+    - Custom: Validates tokens against a custom endpoint
+
+    Authentication Request Format for Custom Auth Provider:
+    ```json
+    {
+        "api_key": "the-api-key-extracted-from-auth-header",
+        "request": {
+            "path": "/models/list",
+            "headers": {
+                "content-type": "application/json",
+                "user-agent": "..."
+                // All headers except Authorization
+            },
+            "params": {
+                "limit": ["100"],
+                "offset": ["0"]
+                // Query parameters as key -> list of values
+            }
+        }
+    }
+    ```
+
+    Expected Auth Endpoint Response Format:
+    ```json
+    {
+        "access_attributes": {    // Structured attribute format
+            "roles": ["admin", "user"],
+            "teams": ["ml-team", "nlp-team"],
+            "projects": ["llama-3", "project-x"],
+            "namespaces": ["research"]
+        },
+        "message": "Optional message about auth result"
+    }
+    ```
+
+    Token Validation:
+    Each provider implements its own token validation logic:
+    - Kubernetes: Uses TokenReview API to validate service account tokens
+    - Custom: Sends token to custom endpoint for validation
+
+    Attribute-Based Access Control:
+    The attributes returned by the auth provider are used to determine which
+    resources the user can access. Resources can specify required attributes
+    using the access_attributes field. For a user to access a resource:
+
+    1. All attribute categories specified in the resource must be present in the user's attributes
+    2. For each category, the user must have at least one matching value
+
+    If the auth provider doesn't return any attributes, the user will only be able to
+    access resources that don't have access_attributes defined.
+    """
+
+    def __init__(self, app, auth_config: AuthenticationConfig, impls):
+        self.app = app
+        self.impls = impls
+        self.auth_provider = create_auth_provider(auth_config)
+
+    async def __call__(self, scope, receive, send):
+        if scope["type"] == "http":
+            # Find the route and check if authentication is required
+            path = scope.get("path", "")
+            method = scope.get("method", hdrs.METH_GET)
+
+            if not hasattr(self, "route_impls"):
+                self.route_impls = initialize_route_impls(self.impls)
+
+            webmethod = None
+            try:
+                _, _, _, webmethod = find_matching_route(method, path, self.route_impls)
+            except ValueError:
+                # If no matching endpoint is found, pass here to run auth anyways
+                pass
+
+            # If webmethod explicitly sets require_authentication=False, allow without auth
+            if webmethod and webmethod.require_authentication is False:
+                logger.debug(f"Allowing unauthenticated access to endpoint: {path}")
+                return await self.app(scope, receive, send)
+
+            # Handle authentication
+            headers = dict(scope.get("headers", []))
+            auth_header = headers.get(b"authorization", b"").decode()
+
+            if not auth_header:
+                error_msg = self.auth_provider.get_auth_error_message(scope)
+                return await self._send_auth_error(send, error_msg)
+
+            if not auth_header.startswith("Bearer "):
+                return await self._send_auth_error(send, "Invalid Authorization header format")
+
+            token = auth_header.split("Bearer ", 1)[1]
+
+            # Validate token and get access attributes
+            try:
+                validation_result = await self.auth_provider.validate_token(token, scope)
+            except httpx.TimeoutException:
+                logger.exception("Authentication request timed out")
+                return await self._send_auth_error(send, "Authentication service timeout")
+            except ValueError as e:
+                logger.exception("Error during authentication")
+                return await self._send_auth_error(send, str(e))
+            except Exception:
+                logger.exception("Error during authentication")
+                return await self._send_auth_error(send, "Authentication service error")
+
+            # Store the client ID in the request scope so that downstream middleware (like QuotaMiddleware)
+            # can identify the requester and enforce per-client rate limits.
+            scope["authenticated_client_id"] = token
+
+            # Store attributes in request scope
+            scope["principal"] = validation_result.principal
+            if validation_result.attributes:
+                scope["user_attributes"] = validation_result.attributes
+            logger.debug(
+                f"Authentication successful: {validation_result.principal} with {len(validation_result.attributes)} attributes"
+            )
+
+            # Scope-based API access control
+            if webmethod and webmethod.required_scope:
+                user = user_from_scope(scope)
+                if not _has_required_scope(webmethod.required_scope, user):
+                    return await self._send_auth_error(
+                        send,
+                        f"Access denied: user does not have required scope: {webmethod.required_scope}",
+                        status=403,
+                    )
+
+        return await self.app(scope, receive, send)
+
+    async def _send_auth_error(self, send, message, status=401):
+        await send(
+            {
+                "type": "http.response.start",
+                "status": status,
+                "headers": [[b"content-type", b"application/json"]],
+            }
+        )
+        error_key = "message" if status == 401 else "detail"
+        error_msg = json.dumps({"error": {error_key: message}}).encode()
+        await send({"type": "http.response.body", "body": error_msg})
+
+
+def _has_required_scope(required_scope: str, user: User | None) -> bool:
+    # if no user, assume auth is not enabled
+    if not user:
+        return True
+
+    if not user.attributes:
+        return False
+
+    user_scopes = user.attributes.get("scopes", [])
+    return required_scope in user_scopes
--- a/src/llama_stack/core/server/auth_providers.py
+++ b/src/llama_stack/core/server/auth_providers.py
@ -0,0 +1,494 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import ssl
+from abc import ABC, abstractmethod
+from urllib.parse import parse_qs, urljoin, urlparse
+
+import httpx
+import jwt
+from pydantic import BaseModel, Field
+
+from llama_stack.apis.common.errors import TokenValidationError
+from llama_stack.core.datatypes import (
+    AuthenticationConfig,
+    CustomAuthConfig,
+    GitHubTokenAuthConfig,
+    KubernetesAuthProviderConfig,
+    OAuth2TokenAuthConfig,
+    User,
+)
+from llama_stack.log import get_logger
+
+logger = get_logger(name=__name__, category="core::auth")
+
+
+class AuthResponse(BaseModel):
+    """The format of the authentication response from the auth endpoint."""
+
+    principal: str
+    # further attributes that may be used for access control decisions
+    attributes: dict[str, list[str]] | None = None
+    message: str | None = Field(
+        default=None, description="Optional message providing additional context about the authentication result."
+    )
+
+
+class AuthRequestContext(BaseModel):
+    path: str = Field(description="The path of the request being authenticated")
+
+    headers: dict[str, str] = Field(description="HTTP headers from the original request (excluding Authorization)")
+
+    params: dict[str, list[str]] = Field(default_factory=dict, description="Query parameters from the original request")
+
+
+class AuthRequest(BaseModel):
+    api_key: str = Field(description="The API key extracted from the Authorization header")
+
+    request: AuthRequestContext = Field(description="Context information about the request being authenticated")
+
+
+class AuthProvider(ABC):
+    """Abstract base class for authentication providers."""
+
+    @abstractmethod
+    async def validate_token(self, token: str, scope: dict | None = None) -> User:
+        """Validate a token and return access attributes."""
+        pass
+
+    @abstractmethod
+    async def close(self):
+        """Clean up any resources."""
+        pass
+
+    def get_auth_error_message(self, scope: dict | None = None) -> str:
+        """Return provider-specific authentication error message."""
+        return "Authentication required"
+
+
+def get_attributes_from_claims(claims: dict[str, str], mapping: dict[str, str]) -> dict[str, list[str]]:
+    attributes: dict[str, list[str]] = {}
+    for claim_key, attribute_key in mapping.items():
+        # First try dot notation for nested traversal (e.g., "resource_access.llamastack.roles")
+        # Then fall back to literal key with dots (e.g., "my.dotted.key")
+        claim: object = claims
+        keys = claim_key.split(".")
+        for key in keys:
+            if isinstance(claim, dict) and key in claim:
+                claim = claim[key]
+            else:
+                claim = None
+                break
+
+        if claim is None and claim_key in claims:
+            # Fall back to checking if claim_key exists as a literal key
+            claim = claims[claim_key]
+
+        if claim is None:
+            continue
+
+        if isinstance(claim, list):
+            values = claim
+        elif isinstance(claim, str):
+            values = claim.split()
+        else:
+            continue
+
+        if attribute_key in attributes:
+            attributes[attribute_key].extend(values)
+        else:
+            attributes[attribute_key] = values
+    return attributes
+
+
+class OAuth2TokenAuthProvider(AuthProvider):
+    """
+    JWT token authentication provider that validates a JWT token and extracts access attributes.
+
+    This should be the standard authentication provider for most use cases.
+    """
+
+    def __init__(self, config: OAuth2TokenAuthConfig):
+        self.config = config
+        self._jwks_client: jwt.PyJWKClient | None = None
+
+    async def validate_token(self, token: str, scope: dict | None = None) -> User:
+        if self.config.jwks:
+            return await self.validate_jwt_token(token, scope)
+        if self.config.introspection:
+            return await self.introspect_token(token, scope)
+        raise ValueError("One of jwks or introspection must be configured")
+
+    def _get_jwks_client(self) -> jwt.PyJWKClient:
+        if self._jwks_client is None:
+            ssl_context = None
+            if not self.config.verify_tls:
+                # Disable SSL verification if verify_tls is False
+                ssl_context = ssl.create_default_context()
+                ssl_context.check_hostname = False
+                ssl_context.verify_mode = ssl.CERT_NONE
+            elif self.config.tls_cafile:
+                # Use custom CA file if provided
+                ssl_context = ssl.create_default_context(
+                    cafile=self.config.tls_cafile.as_posix(),
+                )
+                # If verify_tls is True and no tls_cafile, ssl_context remains None (use system defaults)
+
+            # Prepare headers for JWKS request - this is needed for Kubernetes to authenticate
+            # to the JWK endpoint, we must use the token in the config to authenticate
+            headers = {}
+            if self.config.jwks and self.config.jwks.token:
+                headers["Authorization"] = f"Bearer {self.config.jwks.token}"
+
+            self._jwks_client = jwt.PyJWKClient(
+                self.config.jwks.uri if self.config.jwks else None,
+                cache_keys=True,
+                max_cached_keys=10,
+                lifespan=self.config.jwks.key_recheck_period if self.config.jwks else None,
+                headers=headers,
+                ssl_context=ssl_context,
+            )
+        return self._jwks_client
+
+    async def validate_jwt_token(self, token: str, scope: dict | None = None) -> User:
+        """Validate a token using the JWT token."""
+        try:
+            jwks_client: jwt.PyJWKClient = self._get_jwks_client()
+            signing_key = jwks_client.get_signing_key_from_jwt(token)
+            algorithm = jwt.get_unverified_header(token)["alg"]
+            claims = jwt.decode(
+                token,
+                signing_key.key,
+                algorithms=[algorithm],
+                audience=self.config.audience,
+                issuer=self.config.issuer,
+                options={"verify_exp": True, "verify_aud": True, "verify_iss": True},
+            )
+
+            # Decode and verify the JWT
+            claims = jwt.decode(
+                token,
+                signing_key.key,
+                algorithms=[algorithm],
+                audience=self.config.audience,
+                issuer=self.config.issuer,
+                options={"verify_exp": True, "verify_aud": True, "verify_iss": True},
+            )
+        except Exception as exc:
+            raise ValueError("Invalid JWT token") from exc
+
+        # There are other standard claims, the most relevant of which is `scope`.
+        # We should incorporate these into the access attributes.
+        principal = claims["sub"]
+        access_attributes = get_attributes_from_claims(claims, self.config.claims_mapping)
+        return User(
+            principal=principal,
+            attributes=access_attributes,
+        )
+
+    async def introspect_token(self, token: str, scope: dict | None = None) -> User:
+        """Validate a token using token introspection as defined by RFC 7662."""
+        form = {
+            "token": token,
+        }
+        if self.config.introspection is None:
+            raise ValueError("Introspection is not configured")
+
+        if self.config.introspection.send_secret_in_body:
+            form["client_id"] = self.config.introspection.client_id
+            form["client_secret"] = self.config.introspection.client_secret
+            auth = None
+        else:
+            auth = (self.config.introspection.client_id, self.config.introspection.client_secret)
+        ssl_ctxt = None
+        if self.config.tls_cafile:
+            ssl_ctxt = ssl.create_default_context(cafile=self.config.tls_cafile.as_posix())
+        try:
+            async with httpx.AsyncClient(verify=ssl_ctxt) as client:
+                response = await client.post(
+                    self.config.introspection.url,
+                    data=form,
+                    auth=auth,
+                    timeout=10.0,  # Add a reasonable timeout
+                )
+                if response.status_code != httpx.codes.OK:
+                    logger.warning(f"Token introspection failed with status code: {response.status_code}")
+                    raise ValueError(f"Token introspection failed: {response.status_code}")
+
+                fields = response.json()
+                if not fields["active"]:
+                    raise ValueError("Token not active")
+                principal = fields["sub"] or fields["username"]
+                access_attributes = get_attributes_from_claims(fields, self.config.claims_mapping)
+                return User(
+                    principal=principal,
+                    attributes=access_attributes,
+                )
+        except httpx.TimeoutException:
+            logger.exception("Token introspection request timed out")
+            raise
+        except ValueError:
+            # Re-raise ValueError exceptions to preserve their message
+            raise
+        except Exception as e:
+            logger.exception("Error during token introspection")
+            raise ValueError("Token introspection error") from e
+
+    async def close(self):
+        pass
+
+    def get_auth_error_message(self, scope: dict | None = None) -> str:
+        """Return OAuth2-specific authentication error message."""
+        if self.config.issuer:
+            return f"Authentication required. Please provide a valid OAuth2 Bearer token from {self.config.issuer}"
+        elif self.config.introspection:
+            # Extract domain from introspection URL for a cleaner message
+            domain = urlparse(self.config.introspection.url).netloc
+            return f"Authentication required. Please provide a valid OAuth2 Bearer token validated by {domain}"
+        else:
+            return "Authentication required. Please provide a valid OAuth2 Bearer token in the Authorization header"
+
+
+class CustomAuthProvider(AuthProvider):
+    """Custom authentication provider that uses an external endpoint."""
+
+    def __init__(self, config: CustomAuthConfig):
+        self.config = config
+        self._client = None
+
+    async def validate_token(self, token: str, scope: dict | None = None) -> User:
+        """Validate a token using the custom authentication endpoint."""
+        if scope is None:
+            scope = {}
+
+        headers = dict(scope.get("headers", []))
+        path = scope.get("path", "")
+        request_headers = {k.decode(): v.decode() for k, v in headers.items()}
+
+        # Remove sensitive headers
+        if "authorization" in request_headers:
+            del request_headers["authorization"]
+
+        query_string = scope.get("query_string", b"").decode()
+        params = parse_qs(query_string)
+
+        # Build the auth request model
+        auth_request = AuthRequest(
+            api_key=token,
+            request=AuthRequestContext(
+                path=path,
+                headers=request_headers,
+                params=params,
+            ),
+        )
+
+        # Validate with authentication endpoint
+        try:
+            async with httpx.AsyncClient() as client:
+                response = await client.post(
+                    self.config.endpoint,
+                    json=auth_request.model_dump(),
+                    timeout=10.0,  # Add a reasonable timeout
+                )
+                if response.status_code != httpx.codes.OK:
+                    logger.warning(f"Authentication failed with status code: {response.status_code}")
+                    raise ValueError(f"Authentication failed: {response.status_code}")
+
+                # Parse and validate the auth response
+                try:
+                    response_data = response.json()
+                    auth_response = AuthResponse(**response_data)
+                    return User(principal=auth_response.principal, attributes=auth_response.attributes)
+                except Exception as e:
+                    logger.exception("Error parsing authentication response")
+                    raise ValueError("Invalid authentication response format") from e
+
+        except httpx.TimeoutException:
+            logger.exception("Authentication request timed out")
+            raise
+        except ValueError:
+            # Re-raise ValueError exceptions to preserve their message
+            raise
+        except Exception as e:
+            logger.exception("Error during authentication")
+            raise ValueError("Authentication service error") from e
+
+    async def close(self):
+        """Close the HTTP client."""
+        if self._client:
+            await self._client.aclose()
+            self._client = None
+
+    def get_auth_error_message(self, scope: dict | None = None) -> str:
+        """Return custom auth provider-specific authentication error message."""
+        domain = urlparse(self.config.endpoint).netloc
+        if domain:
+            return f"Authentication required. Please provide your API key as a Bearer token (validated by {domain})"
+        else:
+            return "Authentication required. Please provide your API key as a Bearer token in the Authorization header"
+
+
+class GitHubTokenAuthProvider(AuthProvider):
+    """
+    GitHub token authentication provider that validates GitHub access tokens directly.
+
+    This provider accepts GitHub personal access tokens or OAuth tokens and verifies
+    them against the GitHub API to get user information.
+    """
+
+    def __init__(self, config: GitHubTokenAuthConfig):
+        self.config = config
+
+    async def validate_token(self, token: str, scope: dict | None = None) -> User:
+        """Validate a GitHub token by calling the GitHub API.
+
+        This validates tokens issued by GitHub (personal access tokens or OAuth tokens).
+        """
+        try:
+            user_info = await _get_github_user_info(token, self.config.github_api_base_url)
+        except httpx.HTTPStatusError as e:
+            logger.warning(f"GitHub token validation failed: {e}")
+            raise ValueError("GitHub token validation failed. Please check your token and try again.") from e
+
+        principal = user_info["user"]["login"]
+
+        github_data = {
+            "login": user_info["user"]["login"],
+            "id": str(user_info["user"]["id"]),
+            "organizations": user_info.get("organizations", []),
+        }
+
+        access_attributes = get_attributes_from_claims(github_data, self.config.claims_mapping)
+
+        return User(
+            principal=principal,
+            attributes=access_attributes,
+        )
+
+    async def close(self):
+        """Clean up any resources."""
+        pass
+
+    def get_auth_error_message(self, scope: dict | None = None) -> str:
+        """Return GitHub-specific authentication error message."""
+        return "Authentication required. Please provide a valid GitHub access token (https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens) in the Authorization header (Bearer <token>)"
+
+
+async def _get_github_user_info(access_token: str, github_api_base_url: str) -> dict:
+    """Fetch user info and organizations from GitHub API."""
+    headers = {
+        "Authorization": f"Bearer {access_token}",
+        "Accept": "application/vnd.github.v3+json",
+        "User-Agent": "llama-stack",
+    }
+
+    async with httpx.AsyncClient() as client:
+        user_response = await client.get(f"{github_api_base_url}/user", headers=headers, timeout=10.0)
+        user_response.raise_for_status()
+        user_data = user_response.json()
+
+        return {
+            "user": user_data,
+        }
+
+
+class KubernetesAuthProvider(AuthProvider):
+    """
+    Kubernetes authentication provider that validates tokens using the Kubernetes SelfSubjectReview API.
+    This provider integrates with Kubernetes API server by using the
+    /apis/authentication.k8s.io/v1/selfsubjectreviews endpoint to validate tokens and extract user information.
+    """
+
+    def __init__(self, config: KubernetesAuthProviderConfig):
+        self.config = config
+
+    def _httpx_verify_value(self) -> bool | str:
+        """
+        Build the value for httpx's `verify` parameter.
+        - False disables verification.
+        - Path string points to a CA bundle.
+        - True uses system defaults.
+        """
+        if not self.config.verify_tls:
+            return False
+        if self.config.tls_cafile:
+            return self.config.tls_cafile.as_posix()
+        return True
+
+    async def validate_token(self, token: str, scope: dict | None = None) -> User:
+        """Validate a token using Kubernetes SelfSubjectReview API endpoint."""
+        # Build the Kubernetes SelfSubjectReview API endpoint URL
+        review_api_url = urljoin(self.config.api_server_url, "/apis/authentication.k8s.io/v1/selfsubjectreviews")
+
+        # Create SelfSubjectReview request body
+        review_request = {"apiVersion": "authentication.k8s.io/v1", "kind": "SelfSubjectReview"}
+        verify = self._httpx_verify_value()
+
+        try:
+            async with httpx.AsyncClient(verify=verify, timeout=10.0) as client:
+                response = await client.post(
+                    review_api_url,
+                    json=review_request,
+                    headers={
+                        "Authorization": f"Bearer {token}",
+                        "Content-Type": "application/json",
+                    },
+                )
+
+                if response.status_code == httpx.codes.UNAUTHORIZED:
+                    raise TokenValidationError("Invalid token")
+                if response.status_code != httpx.codes.CREATED:
+                    logger.warning(f"Kubernetes SelfSubjectReview API failed with status code: {response.status_code}")
+                    raise TokenValidationError(f"Token validation failed: {response.status_code}")
+
+                review_response = response.json()
+                # Extract user information from SelfSubjectReview response
+                status = review_response.get("status", {})
+                if not status:
+                    raise ValueError("No status found in SelfSubjectReview response")
+
+                user_info = status.get("userInfo", {})
+                if not user_info:
+                    raise ValueError("No userInfo found in SelfSubjectReview response")
+
+                username = user_info.get("username")
+                if not username:
+                    raise ValueError("No username found in SelfSubjectReview response")
+
+                # Build user attributes from Kubernetes user info
+                user_attributes = get_attributes_from_claims(user_info, self.config.claims_mapping)
+
+                return User(
+                    principal=username,
+                    attributes=user_attributes,
+                )
+
+        except httpx.TimeoutException:
+            logger.warning("Kubernetes SelfSubjectReview API request timed out")
+            raise ValueError("Token validation timeout") from None
+        except Exception as e:
+            logger.warning(f"Error during token validation: {str(e)}")
+            raise ValueError(f"Token validation error: {str(e)}") from e
+
+    async def close(self):
+        """Close any resources."""
+        pass
+
+
+def create_auth_provider(config: AuthenticationConfig) -> AuthProvider:
+    """Factory function to create the appropriate auth provider."""
+    provider_config = config.provider_config
+
+    if isinstance(provider_config, CustomAuthConfig):
+        return CustomAuthProvider(provider_config)
+    elif isinstance(provider_config, OAuth2TokenAuthConfig):
+        return OAuth2TokenAuthProvider(provider_config)
+    elif isinstance(provider_config, GitHubTokenAuthConfig):
+        return GitHubTokenAuthProvider(provider_config)
+    elif isinstance(provider_config, KubernetesAuthProviderConfig):
+        return KubernetesAuthProvider(provider_config)
+    else:
+        raise ValueError(f"Unknown authentication provider config type: {type(provider_config)}")
--- a/src/llama_stack/core/server/quota.py
+++ b/src/llama_stack/core/server/quota.py
@ -0,0 +1,110 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import json
+import time
+from datetime import UTC, datetime, timedelta
+
+from starlette.types import ASGIApp, Receive, Scope, Send
+
+from llama_stack.core.storage.datatypes import KVStoreReference, StorageBackendType
+from llama_stack.log import get_logger
+from llama_stack.providers.utils.kvstore.api import KVStore
+from llama_stack.providers.utils.kvstore.kvstore import _KVSTORE_BACKENDS, kvstore_impl
+
+logger = get_logger(name=__name__, category="core::server")
+
+
+class QuotaMiddleware:
+    """
+    ASGI middleware that enforces separate quotas for authenticated and anonymous clients
+    within a configurable time window.
+
+    - For authenticated requests, it reads the client ID from the
+      `Authorization: Bearer <client_id>` header.
+    - For anonymous requests, it falls back to the IP address of the client.
+    Requests are counted in a KV store (e.g., SQLite), and HTTP 429 is returned
+    once a client exceeds its quota.
+    """
+
+    def __init__(
+        self,
+        app: ASGIApp,
+        kv_config: KVStoreReference,
+        anonymous_max_requests: int,
+        authenticated_max_requests: int,
+        window_seconds: int = 86400,
+    ):
+        self.app = app
+        self.kv_config = kv_config
+        self.kv: KVStore | None = None
+        self.anonymous_max_requests = anonymous_max_requests
+        self.authenticated_max_requests = authenticated_max_requests
+        self.window_seconds = window_seconds
+
+    async def _get_kv(self) -> KVStore:
+        if self.kv is None:
+            self.kv = await kvstore_impl(self.kv_config)
+            backend_config = _KVSTORE_BACKENDS.get(self.kv_config.backend)
+            if backend_config and backend_config.type == StorageBackendType.KV_SQLITE:
+                logger.warning(
+                    "QuotaMiddleware: Using SQLite backend. Expiry/TTL is not enforced; cleanup is manual. "
+                    f"window_seconds={self.window_seconds}"
+                )
+        return self.kv
+
+    async def __call__(self, scope: Scope, receive: Receive, send: Send):
+        if scope["type"] == "http":
+            # pick key & limit based on auth
+            auth_id = scope.get("authenticated_client_id")
+            if auth_id:
+                key_id = auth_id
+                limit = self.authenticated_max_requests
+            else:
+                # fallback to IP
+                client = scope.get("client")
+                key_id = client[0] if client else "anonymous"
+                limit = self.anonymous_max_requests
+
+            current_window = int(time.time() // self.window_seconds)
+            key = f"quota:{key_id}:{current_window}"
+
+            try:
+                kv = await self._get_kv()
+                prev = await kv.get(key) or "0"
+                count = int(prev) + 1
+
+                if int(prev) == 0:
+                    # Set with expiration datetime when it is the first request in the window.
+                    expiration = datetime.now(UTC) + timedelta(seconds=self.window_seconds)
+                    await kv.set(key, str(count), expiration=expiration)
+                else:
+                    await kv.set(key, str(count))
+            except Exception:
+                logger.exception("Failed to access KV store for quota")
+                return await self._send_error(send, 500, "Quota service error")
+
+            if count > limit:
+                logger.warning(
+                    "Quota exceeded for client %s: %d/%d",
+                    key_id,
+                    count,
+                    limit,
+                )
+                return await self._send_error(send, 429, "Quota exceeded")
+
+        return await self.app(scope, receive, send)
+
+    async def _send_error(self, send: Send, status: int, message: str):
+        await send(
+            {
+                "type": "http.response.start",
+                "status": status,
+                "headers": [[b"content-type", b"application/json"]],
+            }
+        )
+        body = json.dumps({"error": {"message": message}}).encode()
+        await send({"type": "http.response.body", "body": body})
--- a/src/llama_stack/core/server/routes.py
+++ b/src/llama_stack/core/server/routes.py
@ -0,0 +1,141 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import inspect
+import re
+from collections.abc import Callable
+from typing import Any
+
+from aiohttp import hdrs
+from starlette.routing import Route
+
+from llama_stack.apis.datatypes import Api, ExternalApiSpec
+from llama_stack.apis.tools import RAGToolRuntime, SpecialToolGroup
+from llama_stack.core.resolver import api_protocol_map
+from llama_stack.schema_utils import WebMethod
+
+EndpointFunc = Callable[..., Any]
+PathParams = dict[str, str]
+RouteInfo = tuple[EndpointFunc, str, WebMethod]
+PathImpl = dict[str, RouteInfo]
+RouteImpls = dict[str, PathImpl]
+RouteMatch = tuple[EndpointFunc, PathParams, str, WebMethod]
+
+
+def toolgroup_protocol_map():
+    return {
+        SpecialToolGroup.rag_tool: RAGToolRuntime,
+    }
+
+
+def get_all_api_routes(
+    external_apis: dict[Api, ExternalApiSpec] | None = None,
+) -> dict[Api, list[tuple[Route, WebMethod]]]:
+    apis = {}
+
+    protocols = api_protocol_map(external_apis)
+    toolgroup_protocols = toolgroup_protocol_map()
+    for api, protocol in protocols.items():
+        routes = []
+        protocol_methods = inspect.getmembers(protocol, predicate=inspect.isfunction)
+
+        # HACK ALERT
+        if api == Api.tool_runtime:
+            for tool_group in SpecialToolGroup:
+                sub_protocol = toolgroup_protocols[tool_group]
+                sub_protocol_methods = inspect.getmembers(sub_protocol, predicate=inspect.isfunction)
+                for name, method in sub_protocol_methods:
+                    if not hasattr(method, "__webmethod__"):
+                        continue
+                    protocol_methods.append((f"{tool_group.value}.{name}", method))
+
+        for name, method in protocol_methods:
+            # Get all webmethods for this method (supports multiple decorators)
+            webmethods = getattr(method, "__webmethods__", [])
+            if not webmethods:
+                continue
+
+            # Create routes for each webmethod decorator
+            for webmethod in webmethods:
+                path = f"/{webmethod.level}/{webmethod.route.lstrip('/')}"
+                if webmethod.method == hdrs.METH_GET:
+                    http_method = hdrs.METH_GET
+                elif webmethod.method == hdrs.METH_DELETE:
+                    http_method = hdrs.METH_DELETE
+                else:
+                    http_method = hdrs.METH_POST
+                routes.append(
+                    (Route(path=path, methods=[http_method], name=name, endpoint=None), webmethod)
+                )  # setting endpoint to None since don't use a Router object
+
+        apis[api] = routes
+
+    return apis
+
+
+def initialize_route_impls(impls, external_apis: dict[Api, ExternalApiSpec] | None = None) -> RouteImpls:
+    api_to_routes = get_all_api_routes(external_apis)
+    route_impls: RouteImpls = {}
+
+    def _convert_path_to_regex(path: str) -> str:
+        # Convert {param} to named capture groups
+        # handle {param:path} as well which allows for forward slashes in the param value
+        pattern = re.sub(
+            r"{(\w+)(?::path)?}",
+            lambda m: f"(?P<{m.group(1)}>{'[^/]+' if not m.group(0).endswith(':path') else '.+'})",
+            path,
+        )
+
+        return f"^{pattern}$"
+
+    for api, api_routes in api_to_routes.items():
+        if api not in impls:
+            continue
+        for route, webmethod in api_routes:
+            impl = impls[api]
+            func = getattr(impl, route.name)
+            # Get the first (and typically only) method from the set, filtering out HEAD
+            available_methods = [m for m in route.methods if m != "HEAD"]
+            if not available_methods:
+                continue  # Skip if only HEAD method is available
+            method = available_methods[0].lower()
+            if method not in route_impls:
+                route_impls[method] = {}
+            route_impls[method][_convert_path_to_regex(route.path)] = (
+                func,
+                route.path,
+                webmethod,
+            )
+
+    return route_impls
+
+
+def find_matching_route(method: str, path: str, route_impls: RouteImpls) -> RouteMatch:
+    """Find the matching endpoint implementation for a given method and path.
+
+    Args:
+        method: HTTP method (GET, POST, etc.)
+        path: URL path to match against
+        route_impls: A dictionary of endpoint implementations
+
+    Returns:
+        A tuple of (endpoint_function, path_params, route_path, webmethod_metadata)
+
+    Raises:
+        ValueError: If no matching endpoint is found
+    """
+    impls = route_impls.get(method.lower())
+    if not impls:
+        raise ValueError(f"No endpoint found for {path}")
+
+    for regex, (func, route_path, webmethod) in impls.items():
+        match = re.match(regex, path)
+        if match:
+            # Extract named groups from the regex match
+            path_params = match.groupdict()
+            return func, path_params, route_path, webmethod
+
+    raise ValueError(f"No endpoint found for {path}")
--- a/src/llama_stack/core/server/server.py
+++ b/src/llama_stack/core/server/server.py
@ -0,0 +1,536 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import asyncio
+import concurrent.futures
+import functools
+import inspect
+import json
+import logging  # allow-direct-logging
+import os
+import sys
+import traceback
+import warnings
+from collections.abc import Callable
+from contextlib import asynccontextmanager
+from importlib.metadata import version as parse_version
+from pathlib import Path
+from typing import Annotated, Any, get_origin
+
+import httpx
+import rich.pretty
+import yaml
+from fastapi import Body, FastAPI, HTTPException, Request, Response
+from fastapi import Path as FastapiPath
+from fastapi.exceptions import RequestValidationError
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse, StreamingResponse
+from openai import BadRequestError
+from pydantic import BaseModel, ValidationError
+
+from llama_stack.apis.common.errors import ConflictError, ResourceNotFoundError
+from llama_stack.apis.common.responses import PaginatedResponse
+from llama_stack.core.access_control.access_control import AccessDeniedError
+from llama_stack.core.datatypes import (
+    AuthenticationRequiredError,
+    StackRunConfig,
+    process_cors_config,
+)
+from llama_stack.core.distribution import builtin_automatically_routed_apis
+from llama_stack.core.external import load_external_apis
+from llama_stack.core.request_headers import (
+    PROVIDER_DATA_VAR,
+    request_provider_data_context,
+    user_from_scope,
+)
+from llama_stack.core.server.routes import get_all_api_routes
+from llama_stack.core.stack import (
+    Stack,
+    cast_image_name_to_string,
+    replace_env_vars,
+)
+from llama_stack.core.telemetry import Telemetry
+from llama_stack.core.telemetry.tracing import CURRENT_TRACE_CONTEXT, setup_logger
+from llama_stack.core.utils.config import redact_sensitive_fields
+from llama_stack.core.utils.config_resolution import Mode, resolve_config_or_distro
+from llama_stack.core.utils.context import preserve_contexts_async_generator
+from llama_stack.log import LoggingConfig, get_logger, setup_logging
+from llama_stack.providers.datatypes import Api
+
+from .auth import AuthenticationMiddleware
+from .quota import QuotaMiddleware
+from .tracing import TracingMiddleware
+
+REPO_ROOT = Path(__file__).parent.parent.parent.parent
+
+logger = get_logger(name=__name__, category="core::server")
+
+
+def warn_with_traceback(message, category, filename, lineno, file=None, line=None):
+    log = file if hasattr(file, "write") else sys.stderr
+    traceback.print_stack(file=log)
+    log.write(warnings.formatwarning(message, category, filename, lineno, line))
+
+
+if os.environ.get("LLAMA_STACK_TRACE_WARNINGS"):
+    warnings.showwarning = warn_with_traceback
+
+
+def create_sse_event(data: Any) -> str:
+    if isinstance(data, BaseModel):
+        data = data.model_dump_json()
+    else:
+        data = json.dumps(data)
+
+    return f"data: {data}\n\n"
+
+
+async def global_exception_handler(request: Request, exc: Exception):
+    traceback.print_exception(exc)
+    http_exc = translate_exception(exc)
+
+    return JSONResponse(status_code=http_exc.status_code, content={"error": {"detail": http_exc.detail}})
+
+
+def translate_exception(exc: Exception) -> HTTPException | RequestValidationError:
+    if isinstance(exc, ValidationError):
+        exc = RequestValidationError(exc.errors())
+
+    if isinstance(exc, RequestValidationError):
+        return HTTPException(
+            status_code=httpx.codes.BAD_REQUEST,
+            detail={
+                "errors": [
+                    {
+                        "loc": list(error["loc"]),
+                        "msg": error["msg"],
+                        "type": error["type"],
+                    }
+                    for error in exc.errors()
+                ]
+            },
+        )
+    elif isinstance(exc, ConflictError):
+        return HTTPException(status_code=httpx.codes.CONFLICT, detail=str(exc))
+    elif isinstance(exc, ResourceNotFoundError):
+        return HTTPException(status_code=httpx.codes.NOT_FOUND, detail=str(exc))
+    elif isinstance(exc, ValueError):
+        return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=f"Invalid value: {str(exc)}")
+    elif isinstance(exc, BadRequestError):
+        return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=str(exc))
+    elif isinstance(exc, PermissionError | AccessDeniedError):
+        return HTTPException(status_code=httpx.codes.FORBIDDEN, detail=f"Permission denied: {str(exc)}")
+    elif isinstance(exc, ConnectionError | httpx.ConnectError):
+        return HTTPException(status_code=httpx.codes.BAD_GATEWAY, detail=str(exc))
+    elif isinstance(exc, asyncio.TimeoutError | TimeoutError):
+        return HTTPException(status_code=httpx.codes.GATEWAY_TIMEOUT, detail=f"Operation timed out: {str(exc)}")
+    elif isinstance(exc, NotImplementedError):
+        return HTTPException(status_code=httpx.codes.NOT_IMPLEMENTED, detail=f"Not implemented: {str(exc)}")
+    elif isinstance(exc, AuthenticationRequiredError):
+        return HTTPException(status_code=httpx.codes.UNAUTHORIZED, detail=f"Authentication required: {str(exc)}")
+    elif hasattr(exc, "status_code") and isinstance(getattr(exc, "status_code", None), int):
+        # Handle provider SDK exceptions (e.g., OpenAI's APIStatusError and subclasses)
+        # These include AuthenticationError (401), PermissionDeniedError (403), etc.
+        # This preserves the actual HTTP status code from the provider
+        status_code = exc.status_code
+        detail = str(exc)
+        return HTTPException(status_code=status_code, detail=detail)
+    else:
+        return HTTPException(
+            status_code=httpx.codes.INTERNAL_SERVER_ERROR,
+            detail="Internal server error: An unexpected error occurred.",
+        )
+
+
+class StackApp(FastAPI):
+    """
+    A wrapper around the FastAPI application to hold a reference to the Stack instance so that we can
+    start background tasks (e.g. refresh model registry periodically) from the lifespan context manager.
+    """
+
+    def __init__(self, config: StackRunConfig, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.stack: Stack = Stack(config)
+
+        # This code is called from a running event loop managed by uvicorn so we cannot simply call
+        # asyncio.run() to initialize the stack. We cannot await either since this is not an async
+        # function.
+        # As a workaround, we use a thread pool executor to run the initialize() method
+        # in a separate thread.
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            future = executor.submit(asyncio.run, self.stack.initialize())
+            future.result()
+
+
+@asynccontextmanager
+async def lifespan(app: StackApp):
+    server_version = parse_version("llama-stack")
+
+    logger.info(f"Starting up Llama Stack server (version: {server_version})")
+    assert app.stack is not None
+    app.stack.create_registry_refresh_task()
+    yield
+    logger.info("Shutting down")
+    await app.stack.shutdown()
+
+
+def is_streaming_request(func_name: str, request: Request, **kwargs):
+    # TODO: pass the api method and punt it to the Protocol definition directly
+    # If there's a stream parameter at top level, use it
+    if "stream" in kwargs:
+        return kwargs["stream"]
+
+    # If there's a stream parameter inside a "params" parameter, e.g. openai_chat_completion() use it
+    if "params" in kwargs:
+        params = kwargs["params"]
+        if hasattr(params, "stream"):
+            return params.stream
+
+    return False
+
+
+async def maybe_await(value):
+    if inspect.iscoroutine(value):
+        return await value
+    return value
+
+
+async def sse_generator(event_gen_coroutine):
+    event_gen = None
+    try:
+        event_gen = await event_gen_coroutine
+        async for item in event_gen:
+            yield create_sse_event(item)
+    except asyncio.CancelledError:
+        logger.info("Generator cancelled")
+        if event_gen:
+            await event_gen.aclose()
+    except Exception as e:
+        logger.exception("Error in sse_generator")
+        yield create_sse_event(
+            {
+                "error": {
+                    "message": str(translate_exception(e)),
+                },
+            }
+        )
+
+
+async def log_request_pre_validation(request: Request):
+    if request.method in ("POST", "PUT", "PATCH"):
+        try:
+            body_bytes = await request.body()
+            if body_bytes:
+                try:
+                    parsed_body = json.loads(body_bytes.decode())
+                    log_output = rich.pretty.pretty_repr(parsed_body)
+                except (json.JSONDecodeError, UnicodeDecodeError):
+                    log_output = repr(body_bytes)
+                logger.debug(f"Incoming raw request body for {request.method} {request.url.path}:\n{log_output}")
+            else:
+                logger.debug(f"Incoming {request.method} {request.url.path} request with empty body.")
+        except Exception as e:
+            logger.warning(f"Could not read or log request body for {request.method} {request.url.path}: {e}")
+
+
+def create_dynamic_typed_route(func: Any, method: str, route: str) -> Callable:
+    @functools.wraps(func)
+    async def route_handler(request: Request, **kwargs):
+        # Get auth attributes from the request scope
+        user = user_from_scope(request.scope)
+
+        await log_request_pre_validation(request)
+
+        test_context_token = None
+        test_context_var = None
+        reset_test_context_fn = None
+
+        # Use context manager with both provider data and auth attributes
+        with request_provider_data_context(request.headers, user):
+            if os.environ.get("LLAMA_STACK_TEST_INFERENCE_MODE"):
+                from llama_stack.core.testing_context import (
+                    TEST_CONTEXT,
+                    reset_test_context,
+                    sync_test_context_from_provider_data,
+                )
+
+                test_context_token = sync_test_context_from_provider_data()
+                test_context_var = TEST_CONTEXT
+                reset_test_context_fn = reset_test_context
+
+            is_streaming = is_streaming_request(func.__name__, request, **kwargs)
+
+            try:
+                if is_streaming:
+                    context_vars = [CURRENT_TRACE_CONTEXT, PROVIDER_DATA_VAR]
+                    if test_context_var is not None:
+                        context_vars.append(test_context_var)
+                    gen = preserve_contexts_async_generator(sse_generator(func(**kwargs)), context_vars)
+                    return StreamingResponse(gen, media_type="text/event-stream")
+                else:
+                    value = func(**kwargs)
+                    result = await maybe_await(value)
+                    if isinstance(result, PaginatedResponse) and result.url is None:
+                        result.url = route
+
+                    if method.upper() == "DELETE" and result is None:
+                        return Response(status_code=httpx.codes.NO_CONTENT)
+
+                    return result
+            except Exception as e:
+                if logger.isEnabledFor(logging.INFO):
+                    logger.exception(f"Error executing endpoint {route=} {method=}")
+                else:
+                    logger.error(f"Error executing endpoint {route=} {method=}: {str(e)}")
+                raise translate_exception(e) from e
+            finally:
+                if test_context_token is not None and reset_test_context_fn is not None:
+                    reset_test_context_fn(test_context_token)
+
+    sig = inspect.signature(func)
+
+    new_params = [inspect.Parameter("request", inspect.Parameter.POSITIONAL_OR_KEYWORD, annotation=Request)]
+    new_params.extend(sig.parameters.values())
+
+    path_params = extract_path_params(route)
+    if method == "post":
+        # Annotate parameters that are in the path with Path(...) and others with Body(...),
+        # but preserve existing File() and Form() annotations for multipart form data
+        new_params = (
+            [new_params[0]]
+            + [
+                (
+                    param.replace(annotation=Annotated[param.annotation, FastapiPath(..., title=param.name)])
+                    if param.name in path_params
+                    else (
+                        param  # Keep original annotation if it's already an Annotated type
+                        if get_origin(param.annotation) is Annotated
+                        else param.replace(annotation=Annotated[param.annotation, Body(..., embed=True)])
+                    )
+                )
+                for param in new_params[1:]
+            ]
+        )
+
+    route_handler.__signature__ = sig.replace(parameters=new_params)
+
+    return route_handler
+
+
+class ClientVersionMiddleware:
+    def __init__(self, app):
+        self.app = app
+        self.server_version = parse_version("llama-stack")
+
+    async def __call__(self, scope, receive, send):
+        if scope["type"] == "http":
+            headers = dict(scope.get("headers", []))
+            client_version = headers.get(b"x-llamastack-client-version", b"").decode()
+            if client_version:
+                try:
+                    client_version_parts = tuple(map(int, client_version.split(".")[:2]))
+                    server_version_parts = tuple(map(int, self.server_version.split(".")[:2]))
+                    if client_version_parts != server_version_parts:
+
+                        async def send_version_error(send):
+                            await send(
+                                {
+                                    "type": "http.response.start",
+                                    "status": httpx.codes.UPGRADE_REQUIRED,
+                                    "headers": [[b"content-type", b"application/json"]],
+                                }
+                            )
+                            error_msg = json.dumps(
+                                {
+                                    "error": {
+                                        "message": f"Client version {client_version} is not compatible with server version {self.server_version}. Please update your client."
+                                    }
+                                }
+                            ).encode()
+                            await send({"type": "http.response.body", "body": error_msg})
+
+                        return await send_version_error(send)
+                except (ValueError, IndexError):
+                    # If version parsing fails, let the request through
+                    pass
+
+        return await self.app(scope, receive, send)
+
+
+def create_app() -> StackApp:
+    """Create and configure the FastAPI application.
+
+    This factory function reads configuration from environment variables:
+    - LLAMA_STACK_CONFIG: Path to config file (required)
+
+    Returns:
+        Configured StackApp instance.
+    """
+    # Initialize logging from environment variables first
+    setup_logging()
+
+    config_file = os.getenv("LLAMA_STACK_CONFIG")
+    if config_file is None:
+        raise ValueError("LLAMA_STACK_CONFIG environment variable is required")
+
+    config_file = resolve_config_or_distro(config_file, Mode.RUN)
+
+    # Load and process configuration
+    logger_config = None
+    with open(config_file) as fp:
+        config_contents = yaml.safe_load(fp)
+        if isinstance(config_contents, dict) and (cfg := config_contents.get("logging_config")):
+            logger_config = LoggingConfig(**cfg)
+        logger = get_logger(name=__name__, category="core::server", config=logger_config)
+
+        config = replace_env_vars(config_contents)
+        config = StackRunConfig(**cast_image_name_to_string(config))
+
+    _log_run_config(run_config=config)
+
+    app = StackApp(
+        lifespan=lifespan,
+        docs_url="/docs",
+        redoc_url="/redoc",
+        openapi_url="/openapi.json",
+        config=config,
+    )
+
+    if not os.environ.get("LLAMA_STACK_DISABLE_VERSION_CHECK"):
+        app.add_middleware(ClientVersionMiddleware)
+
+    impls = app.stack.impls
+
+    if config.server.auth:
+        logger.info(f"Enabling authentication with provider: {config.server.auth.provider_config.type.value}")
+        app.add_middleware(AuthenticationMiddleware, auth_config=config.server.auth, impls=impls)
+    else:
+        if config.server.quota:
+            quota = config.server.quota
+            logger.warning(
+                "Configured authenticated_max_requests (%d) but no auth is enabled; "
+                "falling back to anonymous_max_requests (%d) for all the requests",
+                quota.authenticated_max_requests,
+                quota.anonymous_max_requests,
+            )
+
+    if config.server.quota:
+        logger.info("Enabling quota middleware for authenticated and anonymous clients")
+
+        quota = config.server.quota
+        anonymous_max_requests = quota.anonymous_max_requests
+        # if auth is disabled, use the anonymous max requests
+        authenticated_max_requests = quota.authenticated_max_requests if config.server.auth else anonymous_max_requests
+
+        kv_config = quota.kvstore
+        window_map = {"day": 86400}
+        window_seconds = window_map[quota.period.value]
+
+        app.add_middleware(
+            QuotaMiddleware,
+            kv_config=kv_config,
+            anonymous_max_requests=anonymous_max_requests,
+            authenticated_max_requests=authenticated_max_requests,
+            window_seconds=window_seconds,
+        )
+
+    if config.server.cors:
+        logger.info("Enabling CORS")
+        cors_config = process_cors_config(config.server.cors)
+        if cors_config:
+            app.add_middleware(CORSMiddleware, **cors_config.model_dump())
+
+    if config.telemetry.enabled:
+        setup_logger(Telemetry())
+
+    # Load external APIs if configured
+    external_apis = load_external_apis(config)
+    all_routes = get_all_api_routes(external_apis)
+
+    if config.apis:
+        apis_to_serve = set(config.apis)
+    else:
+        apis_to_serve = set(impls.keys())
+
+    for inf in builtin_automatically_routed_apis():
+        # if we do not serve the corresponding router API, we should not serve the routing table API
+        if inf.router_api.value not in apis_to_serve:
+            continue
+        apis_to_serve.add(inf.routing_table_api.value)
+
+    apis_to_serve.add("inspect")
+    apis_to_serve.add("providers")
+    apis_to_serve.add("prompts")
+    apis_to_serve.add("conversations")
+    for api_str in apis_to_serve:
+        api = Api(api_str)
+
+        routes = all_routes[api]
+        try:
+            impl = impls[api]
+        except KeyError as e:
+            raise ValueError(f"Could not find provider implementation for {api} API") from e
+
+        for route, _ in routes:
+            if not hasattr(impl, route.name):
+                # ideally this should be a typing violation already
+                raise ValueError(f"Could not find method {route.name} on {impl}!")
+
+            impl_method = getattr(impl, route.name)
+            # Filter out HEAD method since it's automatically handled by FastAPI for GET routes
+            available_methods = [m for m in route.methods if m != "HEAD"]
+            if not available_methods:
+                raise ValueError(f"No methods found for {route.name} on {impl}")
+            method = available_methods[0]
+            logger.debug(f"{method} {route.path}")
+
+            with warnings.catch_warnings():
+                warnings.filterwarnings("ignore", category=UserWarning, module="pydantic._internal._fields")
+                getattr(app, method.lower())(route.path, response_model=None)(
+                    create_dynamic_typed_route(
+                        impl_method,
+                        method.lower(),
+                        route.path,
+                    )
+                )
+
+    logger.debug(f"serving APIs: {apis_to_serve}")
+
+    app.exception_handler(RequestValidationError)(global_exception_handler)
+    app.exception_handler(Exception)(global_exception_handler)
+
+    if config.telemetry.enabled:
+        app.add_middleware(TracingMiddleware, impls=impls, external_apis=external_apis)
+
+    return app
+
+
+def _log_run_config(run_config: StackRunConfig):
+    """Logs the run config with redacted fields and disabled providers removed."""
+    logger.info("Run configuration:")
+    safe_config = redact_sensitive_fields(run_config.model_dump(mode="json"))
+    clean_config = remove_disabled_providers(safe_config)
+    logger.info(yaml.dump(clean_config, indent=2))
+
+
+def extract_path_params(route: str) -> list[str]:
+    segments = route.split("/")
+    params = [seg[1:-1] for seg in segments if seg.startswith("{") and seg.endswith("}")]
+    # to handle path params like {param:path}
+    params = [param.split(":")[0] for param in params]
+    return params
+
+
+def remove_disabled_providers(obj):
+    if isinstance(obj, dict):
+        keys = ["provider_id", "shield_id", "provider_model_id", "model_id"]
+        if any(k in obj and obj[k] in ("__disabled__", "", None) for k in keys):
+            return None
+        return {k: v for k, v in ((k, remove_disabled_providers(v)) for k, v in obj.items()) if v is not None}
+    elif isinstance(obj, list):
+        return [item for item in (remove_disabled_providers(i) for i in obj) if item is not None]
+    else:
+        return obj
--- a/src/llama_stack/core/server/tracing.py
+++ b/src/llama_stack/core/server/tracing.py
@ -0,0 +1,80 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from aiohttp import hdrs
+
+from llama_stack.core.external import ExternalApiSpec
+from llama_stack.core.server.routes import find_matching_route, initialize_route_impls
+from llama_stack.core.telemetry.tracing import end_trace, start_trace
+from llama_stack.log import get_logger
+
+logger = get_logger(name=__name__, category="core::server")
+
+
+class TracingMiddleware:
+    def __init__(self, app, impls, external_apis: dict[str, ExternalApiSpec]):
+        self.app = app
+        self.impls = impls
+        self.external_apis = external_apis
+        # FastAPI built-in paths that should bypass custom routing
+        self.fastapi_paths = ("/docs", "/redoc", "/openapi.json", "/favicon.ico", "/static")
+
+    async def __call__(self, scope, receive, send):
+        if scope.get("type") == "lifespan":
+            return await self.app(scope, receive, send)
+
+        path = scope.get("path", "")
+
+        # Check if the path is a FastAPI built-in path
+        if path.startswith(self.fastapi_paths):
+            # Pass through to FastAPI's built-in handlers
+            logger.debug(f"Bypassing custom routing for FastAPI built-in path: {path}")
+            return await self.app(scope, receive, send)
+
+        if not hasattr(self, "route_impls"):
+            self.route_impls = initialize_route_impls(self.impls, self.external_apis)
+
+        try:
+            _, _, route_path, webmethod = find_matching_route(
+                scope.get("method", hdrs.METH_GET), path, self.route_impls
+            )
+        except ValueError:
+            # If no matching endpoint is found, pass through to FastAPI
+            logger.debug(f"No matching route found for path: {path}, falling back to FastAPI")
+            return await self.app(scope, receive, send)
+
+        # Log deprecation warning if route is deprecated
+        if getattr(webmethod, "deprecated", False):
+            logger.warning(
+                f"DEPRECATED ROUTE USED: {scope.get('method', 'GET')} {path} - "
+                f"This route is deprecated and may be removed in a future version. "
+                f"Please check the docs for the supported version."
+            )
+
+        trace_attributes = {"__location__": "server", "raw_path": path}
+
+        # Extract W3C trace context headers and store as trace attributes
+        headers = dict(scope.get("headers", []))
+        traceparent = headers.get(b"traceparent", b"").decode()
+        if traceparent:
+            trace_attributes["traceparent"] = traceparent
+        tracestate = headers.get(b"tracestate", b"").decode()
+        if tracestate:
+            trace_attributes["tracestate"] = tracestate
+
+        trace_path = webmethod.descriptive_name or route_path
+        trace_context = await start_trace(trace_path, trace_attributes)
+
+        async def send_with_trace_id(message):
+            if message["type"] == "http.response.start":
+                headers = message.get("headers", [])
+                headers.append([b"x-trace-id", str(trace_context.trace_id).encode()])
+                message["headers"] = headers
+            await send(message)
+
+        try:
+            return await self.app(scope, receive, send_with_trace_id)
+        finally:
+            await end_trace()