chore: enable pyupgrade fixes (#1806)

# What does this PR do? The goal of this PR is code base modernization. Schema reflection code needed a minor adjustment to handle UnionTypes and collections.abc.AsyncIterator. (Both are preferred for latest Python releases.) Note to reviewers: almost all changes here are automatically generated by pyupgrade. Some additional unused imports were cleaned up. The only change worth of note can be found under `docs/openapi_generator` and `llama_stack/strong_typing/schema.py` where reflection code was updated to deal with "newer" types. Signed-off-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com>
2025-05-01 17:23:50 -04:00 · 2025-05-01 17:23:50 -04:00 · 9e6561a1ec
commit 9e6561a1ec
parent ffe3d0b2cd
319 changed files with 2843 additions and 3033 deletions
--- a/llama_stack/providers/remote/datasetio/huggingface/config.py
+++ b/llama_stack/providers/remote/datasetio/huggingface/config.py
@ -3,7 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Dict
+from typing import Any

 from pydantic import BaseModel

@ -17,7 +17,7 @@ class HuggingfaceDatasetIOConfig(BaseModel):
    kvstore: KVStoreConfig

    @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
        return {
            "kvstore": SqliteKVStoreConfig.sample_run_config(
                __distro_dir__=__distro_dir__,
--- a/llama_stack/providers/remote/datasetio/huggingface/huggingface.py
+++ b/llama_stack/providers/remote/datasetio/huggingface/huggingface.py
@ -3,7 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Dict, List, Optional
+from typing import Any
 from urllib.parse import parse_qs, urlparse

 import datasets as hf_datasets
@ -70,8 +70,8 @@ class HuggingfaceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
    async def iterrows(
        self,
        dataset_id: str,
-        start_index: Optional[int] = None,
-        limit: Optional[int] = None,
+        start_index: int | None = None,
+        limit: int | None = None,
    ) -> PaginatedResponse:
        dataset_def = self.dataset_infos[dataset_id]
        path, params = parse_hf_params(dataset_def)
@ -80,7 +80,7 @@ class HuggingfaceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
        records = [loaded_dataset[i] for i in range(len(loaded_dataset))]
        return paginate_records(records, start_index, limit)

-    async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None:
+    async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None:
        dataset_def = self.dataset_infos[dataset_id]
        path, params = parse_hf_params(dataset_def)
        loaded_dataset = hf_datasets.load_dataset(path, **params)
--- a/llama_stack/providers/remote/datasetio/nvidia/config.py
+++ b/llama_stack/providers/remote/datasetio/nvidia/config.py
@ -6,7 +6,7 @@

 import os
 import warnings
-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel, Field

@ -14,17 +14,17 @@ from pydantic import BaseModel, Field
 class NvidiaDatasetIOConfig(BaseModel):
    """Configuration for NVIDIA DatasetIO implementation."""

-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
        default_factory=lambda: os.getenv("NVIDIA_API_KEY"),
        description="The NVIDIA API key.",
    )

-    dataset_namespace: Optional[str] = Field(
+    dataset_namespace: str | None = Field(
        default_factory=lambda: os.getenv("NVIDIA_DATASET_NAMESPACE", "default"),
        description="The NVIDIA dataset namespace.",
    )

-    project_id: Optional[str] = Field(
+    project_id: str | None = Field(
        default_factory=lambda: os.getenv("NVIDIA_PROJECT_ID", "test-project"),
        description="The NVIDIA project ID.",
    )
@ -52,7 +52,7 @@ class NvidiaDatasetIOConfig(BaseModel):
            )

    @classmethod
-    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
        return {
            "api_key": "${env.NVIDIA_API_KEY:}",
            "dataset_namespace": "${env.NVIDIA_DATASET_NAMESPACE:default}",
--- a/llama_stack/providers/remote/datasetio/nvidia/datasetio.py
+++ b/llama_stack/providers/remote/datasetio/nvidia/datasetio.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, List, Optional
+from typing import Any

 import aiohttp

@ -27,11 +27,11 @@ class NvidiaDatasetIOAdapter:
        self,
        method: str,
        path: str,
-        headers: Optional[Dict[str, Any]] = None,
-        params: Optional[Dict[str, Any]] = None,
-        json: Optional[Dict[str, Any]] = None,
+        headers: dict[str, Any] | None = None,
+        params: dict[str, Any] | None = None,
+        json: dict[str, Any] | None = None,
        **kwargs,
-    ) -> Dict[str, Any]:
+    ) -> dict[str, Any]:
        """Helper method to make HTTP requests to the Customizer API."""
        url = f"{self.config.datasets_url}{path}"
        request_headers = self.headers.copy()
@ -82,11 +82,11 @@ class NvidiaDatasetIOAdapter:
    async def update_dataset(
        self,
        dataset_id: str,
-        dataset_schema: Dict[str, ParamType],
+        dataset_schema: dict[str, ParamType],
        url: URL,
-        provider_dataset_id: Optional[str] = None,
-        provider_id: Optional[str] = None,
-        metadata: Optional[Dict[str, Any]] = None,
+        provider_dataset_id: str | None = None,
+        provider_id: str | None = None,
+        metadata: dict[str, Any] | None = None,
    ) -> None:
        raise NotImplementedError("Not implemented")

@ -103,10 +103,10 @@ class NvidiaDatasetIOAdapter:
    async def iterrows(
        self,
        dataset_id: str,
-        start_index: Optional[int] = None,
-        limit: Optional[int] = None,
+        start_index: int | None = None,
+        limit: int | None = None,
    ) -> PaginatedResponse:
        raise NotImplementedError("Not implemented")

-    async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None:
+    async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None:
        raise NotImplementedError("Not implemented")
--- a/llama_stack/providers/remote/eval/nvidia/init.py
+++ b/llama_stack/providers/remote/eval/nvidia/init.py
@ -3,7 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Dict
+from typing import Any

 from llama_stack.distribution.datatypes import Api

@ -12,7 +12,7 @@ from .config import NVIDIAEvalConfig

 async def get_adapter_impl(
    config: NVIDIAEvalConfig,
-    deps: Dict[Api, Any],
+    deps: dict[Api, Any],
 ):
    from .eval import NVIDIAEvalImpl

--- a/llama_stack/providers/remote/eval/nvidia/config.py
+++ b/llama_stack/providers/remote/eval/nvidia/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import os
-from typing import Any, Dict
+from typing import Any

 from pydantic import BaseModel, Field

@ -23,7 +23,7 @@ class NVIDIAEvalConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
        return {
            "evaluator_url": "${env.NVIDIA_EVALUATOR_URL:http://localhost:7331}",
        }
--- a/llama_stack/providers/remote/eval/nvidia/eval.py
+++ b/llama_stack/providers/remote/eval/nvidia/eval.py
@ -3,7 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import Any, Dict, List
+from typing import Any

 import requests

@ -101,8 +101,8 @@ class NVIDIAEvalImpl(
    async def evaluate_rows(
        self,
        benchmark_id: str,
-        input_rows: List[Dict[str, Any]],
-        scoring_functions: List[str],
+        input_rows: list[dict[str, Any]],
+        scoring_functions: list[str],
        benchmark_config: BenchmarkConfig,
    ) -> EvaluateResponse:
        raise NotImplementedError()
--- a/llama_stack/providers/remote/inference/anthropic/init.py
+++ b/llama_stack/providers/remote/inference/anthropic/init.py
@ -4,15 +4,13 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Optional
-
 from pydantic import BaseModel

 from .config import AnthropicConfig


 class AnthropicProviderDataValidator(BaseModel):
-    anthropic_api_key: Optional[str] = None
+    anthropic_api_key: str | None = None


 async def get_adapter_impl(config: AnthropicConfig, _deps):
--- a/llama_stack/providers/remote/inference/anthropic/config.py
+++ b/llama_stack/providers/remote/inference/anthropic/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel, Field

@ -12,7 +12,7 @@ from llama_stack.schema_utils import json_schema_type


 class AnthropicProviderDataValidator(BaseModel):
-    anthropic_api_key: Optional[str] = Field(
+    anthropic_api_key: str | None = Field(
        default=None,
        description="API key for Anthropic models",
    )
@ -20,13 +20,13 @@ class AnthropicProviderDataValidator(BaseModel):

@json_schema_type
 class AnthropicConfig(BaseModel):
-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
        default=None,
        description="API key for Anthropic models",
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.ANTHROPIC_API_KEY}", **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.ANTHROPIC_API_KEY}", **kwargs) -> dict[str, Any]:
        return {
            "api_key": api_key,
        }
--- a/llama_stack/providers/remote/inference/bedrock/bedrock.py
+++ b/llama_stack/providers/remote/inference/bedrock/bedrock.py
@ -5,7 +5,7 @@
 # the root directory of this source tree.

 import json
-from typing import AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
+from collections.abc import AsyncGenerator, AsyncIterator

 from botocore.client import BaseClient

@ -79,26 +79,26 @@ class BedrockInferenceAdapter(
        self,
        model_id: str,
        content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
    ) -> AsyncGenerator:
        raise NotImplementedError()

    async def chat_completion(
        self,
        model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
-    ) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]:
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
+    ) -> ChatCompletionResponse | AsyncIterator[ChatCompletionResponseStreamChunk]:
        if sampling_params is None:
            sampling_params = SamplingParams()
        model = await self.model_store.get_model(model_id)
@ -151,7 +151,7 @@ class BedrockInferenceAdapter(
        async for chunk in process_chat_completion_stream_response(stream, request):
            yield chunk

-    async def _get_params_for_chat_completion(self, request: ChatCompletionRequest) -> Dict:
+    async def _get_params_for_chat_completion(self, request: ChatCompletionRequest) -> dict:
        bedrock_model = request.model

        sampling_params = request.sampling_params
@ -176,10 +176,10 @@ class BedrockInferenceAdapter(
    async def embeddings(
        self,
        model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
    ) -> EmbeddingsResponse:
        model = await self.model_store.get_model(model_id)
        embeddings = []
--- a/llama_stack/providers/remote/inference/cerebras/cerebras.py
+++ b/llama_stack/providers/remote/inference/cerebras/cerebras.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import AsyncGenerator, List, Optional, Union
+from collections.abc import AsyncGenerator

 from cerebras.cloud.sdk import AsyncCerebras

@ -79,10 +79,10 @@ class CerebrasInferenceAdapter(
        self,
        model_id: str,
        content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
    ) -> AsyncGenerator:
        if sampling_params is None:
            sampling_params = SamplingParams()
@ -120,15 +120,15 @@ class CerebrasInferenceAdapter(
    async def chat_completion(
        self,
        model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
    ) -> AsyncGenerator:
        if sampling_params is None:
            sampling_params = SamplingParams()
@ -166,7 +166,7 @@ class CerebrasInferenceAdapter(
        async for chunk in process_chat_completion_stream_response(stream, request):
            yield chunk

-    async def _get_params(self, request: Union[ChatCompletionRequest, CompletionRequest]) -> dict:
+    async def _get_params(self, request: ChatCompletionRequest | CompletionRequest) -> dict:
        if request.sampling_params and isinstance(request.sampling_params.strategy, TopKSamplingStrategy):
            raise ValueError("`top_k` not supported by Cerebras")

@ -188,9 +188,9 @@ class CerebrasInferenceAdapter(
    async def embeddings(
        self,
        model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
    ) -> EmbeddingsResponse:
        raise NotImplementedError()
--- a/llama_stack/providers/remote/inference/cerebras/config.py
+++ b/llama_stack/providers/remote/inference/cerebras/config.py
@ -5,7 +5,7 @@
 # the root directory of this source tree.

 import os
-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel, Field, SecretStr

@ -20,13 +20,13 @@ class CerebrasImplConfig(BaseModel):
        default=os.environ.get("CEREBRAS_BASE_URL", DEFAULT_BASE_URL),
        description="Base URL for the Cerebras API",
    )
-    api_key: Optional[SecretStr] = Field(
+    api_key: SecretStr | None = Field(
        default=os.environ.get("CEREBRAS_API_KEY"),
        description="Cerebras API Key",
    )

    @classmethod
-    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
        return {
            "base_url": DEFAULT_BASE_URL,
            "api_key": "${env.CEREBRAS_API_KEY}",
--- a/llama_stack/providers/remote/inference/cerebras_openai_compat/config.py
+++ b/llama_stack/providers/remote/inference/cerebras_openai_compat/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel, Field

@ -12,7 +12,7 @@ from llama_stack.schema_utils import json_schema_type


 class CerebrasProviderDataValidator(BaseModel):
-    cerebras_api_key: Optional[str] = Field(
+    cerebras_api_key: str | None = Field(
        default=None,
        description="API key for Cerebras models",
    )
@ -20,7 +20,7 @@ class CerebrasProviderDataValidator(BaseModel):

@json_schema_type
 class CerebrasCompatConfig(BaseModel):
-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
        default=None,
        description="The Cerebras API key",
    )
@ -31,7 +31,7 @@ class CerebrasCompatConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.CEREBRAS_API_KEY}", **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.CEREBRAS_API_KEY}", **kwargs) -> dict[str, Any]:
        return {
            "openai_compat_api_base": "https://api.cerebras.ai/v1",
            "api_key": api_key,
--- a/llama_stack/providers/remote/inference/databricks/config.py
+++ b/llama_stack/providers/remote/inference/databricks/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict
+from typing import Any

 from pydantic import BaseModel, Field

@ -28,7 +28,7 @@ class DatabricksImplConfig(BaseModel):
        url: str = "${env.DATABRICKS_URL}",
        api_token: str = "${env.DATABRICKS_API_TOKEN}",
        **kwargs: Any,
-    ) -> Dict[str, Any]:
+    ) -> dict[str, Any]:
        return {
            "url": url,
            "api_token": api_token,
--- a/llama_stack/providers/remote/inference/databricks/databricks.py
+++ b/llama_stack/providers/remote/inference/databricks/databricks.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import AsyncGenerator, List, Optional
+from collections.abc import AsyncGenerator

 from openai import OpenAI

@ -78,25 +78,25 @@ class DatabricksInferenceAdapter(
        self,
        model: str,
        content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
    ) -> AsyncGenerator:
        raise NotImplementedError()

    async def chat_completion(
        self,
        model: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
    ) -> AsyncGenerator:
        if sampling_params is None:
            sampling_params = SamplingParams()
@ -146,9 +146,9 @@ class DatabricksInferenceAdapter(
    async def embeddings(
        self,
        model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
    ) -> EmbeddingsResponse:
        raise NotImplementedError()
--- a/llama_stack/providers/remote/inference/fireworks/config.py
+++ b/llama_stack/providers/remote/inference/fireworks/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel, Field, SecretStr

@ -17,13 +17,13 @@ class FireworksImplConfig(BaseModel):
        default="https://api.fireworks.ai/inference/v1",
        description="The URL for the Fireworks server",
    )
-    api_key: Optional[SecretStr] = Field(
+    api_key: SecretStr | None = Field(
        default=None,
        description="The Fireworks.ai API Key",
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.FIREWORKS_API_KEY}", **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.FIREWORKS_API_KEY}", **kwargs) -> dict[str, Any]:
        return {
            "url": "https://api.fireworks.ai/inference/v1",
            "api_key": api_key,
--- a/llama_stack/providers/remote/inference/fireworks/fireworks.py
+++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py
@ -4,7 +4,8 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
+from collections.abc import AsyncGenerator, AsyncIterator
+from typing import Any

 from fireworks.client import Fireworks
 from openai import AsyncOpenAI
@ -105,10 +106,10 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
        self,
        model_id: str,
        content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
    ) -> AsyncGenerator:
        if sampling_params is None:
            sampling_params = SamplingParams()
@ -146,9 +147,9 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv

    def _build_options(
        self,
-        sampling_params: Optional[SamplingParams],
+        sampling_params: SamplingParams | None,
        fmt: ResponseFormat,
-        logprobs: Optional[LogProbConfig],
+        logprobs: LogProbConfig | None,
    ) -> dict:
        options = get_sampling_options(sampling_params)
        options.setdefault("max_tokens", 512)
@ -177,15 +178,15 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
    async def chat_completion(
        self,
        model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
    ) -> AsyncGenerator:
        if sampling_params is None:
            sampling_params = SamplingParams()
@ -229,7 +230,7 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
        async for chunk in process_chat_completion_stream_response(stream, request):
            yield chunk

-    async def _get_params(self, request: Union[ChatCompletionRequest, CompletionRequest]) -> dict:
+    async def _get_params(self, request: ChatCompletionRequest | CompletionRequest) -> dict:
        input_dict = {}
        media_present = request_has_media(request)

@ -263,10 +264,10 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
    async def embeddings(
        self,
        model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
    ) -> EmbeddingsResponse:
        model = await self.model_store.get_model(model_id)

@ -288,24 +289,24 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
    async def openai_completion(
        self,
        model: str,
-        prompt: Union[str, List[str], List[int], List[List[int]]],
-        best_of: Optional[int] = None,
-        echo: Optional[bool] = None,
-        frequency_penalty: Optional[float] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        presence_penalty: Optional[float] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-        guided_choice: Optional[List[str]] = None,
-        prompt_logprobs: Optional[int] = None,
+        prompt: str | list[str] | list[int] | list[list[int]],
+        best_of: int | None = None,
+        echo: bool | None = None,
+        frequency_penalty: float | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        presence_penalty: float | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+        guided_choice: list[str] | None = None,
+        prompt_logprobs: int | None = None,
    ) -> OpenAICompletion:
        model_obj = await self.model_store.get_model(model)

@ -338,29 +339,29 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProv
    async def openai_chat_completion(
        self,
        model: str,
-        messages: List[OpenAIMessageParam],
-        frequency_penalty: Optional[float] = None,
-        function_call: Optional[Union[str, Dict[str, Any]]] = None,
-        functions: Optional[List[Dict[str, Any]]] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_completion_tokens: Optional[int] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        parallel_tool_calls: Optional[bool] = None,
-        presence_penalty: Optional[float] = None,
-        response_format: Optional[OpenAIResponseFormatParam] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        top_logprobs: Optional[int] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        messages: list[OpenAIMessageParam],
+        frequency_penalty: float | None = None,
+        function_call: str | dict[str, Any] | None = None,
+        functions: list[dict[str, Any]] | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_completion_tokens: int | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        parallel_tool_calls: bool | None = None,
+        presence_penalty: float | None = None,
+        response_format: OpenAIResponseFormatParam | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+        tools: list[dict[str, Any]] | None = None,
+        top_logprobs: int | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
        model_obj = await self.model_store.get_model(model)

        # Divert Llama Models through Llama Stack inference APIs because
--- a/llama_stack/providers/remote/inference/fireworks_openai_compat/config.py
+++ b/llama_stack/providers/remote/inference/fireworks_openai_compat/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel, Field

@ -12,7 +12,7 @@ from llama_stack.schema_utils import json_schema_type


 class FireworksProviderDataValidator(BaseModel):
-    fireworks_api_key: Optional[str] = Field(
+    fireworks_api_key: str | None = Field(
        default=None,
        description="API key for Fireworks models",
    )
@ -20,7 +20,7 @@ class FireworksProviderDataValidator(BaseModel):

@json_schema_type
 class FireworksCompatConfig(BaseModel):
-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
        default=None,
        description="The Fireworks API key",
    )
@ -31,7 +31,7 @@ class FireworksCompatConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.FIREWORKS_API_KEY}", **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.FIREWORKS_API_KEY}", **kwargs) -> dict[str, Any]:
        return {
            "openai_compat_api_base": "https://api.fireworks.ai/inference/v1",
            "api_key": api_key,
--- a/llama_stack/providers/remote/inference/gemini/init.py
+++ b/llama_stack/providers/remote/inference/gemini/init.py
@ -4,15 +4,13 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Optional
-
 from pydantic import BaseModel

 from .config import GeminiConfig


 class GeminiProviderDataValidator(BaseModel):
-    gemini_api_key: Optional[str] = None
+    gemini_api_key: str | None = None


 async def get_adapter_impl(config: GeminiConfig, _deps):
--- a/llama_stack/providers/remote/inference/gemini/config.py
+++ b/llama_stack/providers/remote/inference/gemini/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel, Field

@ -12,7 +12,7 @@ from llama_stack.schema_utils import json_schema_type


 class GeminiProviderDataValidator(BaseModel):
-    gemini_api_key: Optional[str] = Field(
+    gemini_api_key: str | None = Field(
        default=None,
        description="API key for Gemini models",
    )
@ -20,13 +20,13 @@ class GeminiProviderDataValidator(BaseModel):

@json_schema_type
 class GeminiConfig(BaseModel):
-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
        default=None,
        description="API key for Gemini models",
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.GEMINI_API_KEY}", **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.GEMINI_API_KEY}", **kwargs) -> dict[str, Any]:
        return {
            "api_key": api_key,
        }
--- a/llama_stack/providers/remote/inference/groq/config.py
+++ b/llama_stack/providers/remote/inference/groq/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel, Field

@ -12,7 +12,7 @@ from llama_stack.schema_utils import json_schema_type


 class GroqProviderDataValidator(BaseModel):
-    groq_api_key: Optional[str] = Field(
+    groq_api_key: str | None = Field(
        default=None,
        description="API key for Groq models",
    )
@ -20,7 +20,7 @@ class GroqProviderDataValidator(BaseModel):

@json_schema_type
 class GroqConfig(BaseModel):
-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
        # The Groq client library loads the GROQ_API_KEY environment variable by default
        default=None,
        description="The Groq API key",
@ -32,7 +32,7 @@ class GroqConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.GROQ_API_KEY}", **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.GROQ_API_KEY}", **kwargs) -> dict[str, Any]:
        return {
            "url": "https://api.groq.com",
            "api_key": api_key,
--- a/llama_stack/providers/remote/inference/groq/groq.py
+++ b/llama_stack/providers/remote/inference/groq/groq.py
@ -4,7 +4,8 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, AsyncIterator, Dict, List, Optional, Union
+from collections.abc import AsyncIterator
+from typing import Any

 from openai import AsyncOpenAI

@ -59,29 +60,29 @@ class GroqInferenceAdapter(LiteLLMOpenAIMixin):
    async def openai_chat_completion(
        self,
        model: str,
-        messages: List[OpenAIMessageParam],
-        frequency_penalty: Optional[float] = None,
-        function_call: Optional[Union[str, Dict[str, Any]]] = None,
-        functions: Optional[List[Dict[str, Any]]] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_completion_tokens: Optional[int] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        parallel_tool_calls: Optional[bool] = None,
-        presence_penalty: Optional[float] = None,
-        response_format: Optional[OpenAIResponseFormatParam] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        top_logprobs: Optional[int] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        messages: list[OpenAIMessageParam],
+        frequency_penalty: float | None = None,
+        function_call: str | dict[str, Any] | None = None,
+        functions: list[dict[str, Any]] | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_completion_tokens: int | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        parallel_tool_calls: bool | None = None,
+        presence_penalty: float | None = None,
+        response_format: OpenAIResponseFormatParam | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+        tools: list[dict[str, Any]] | None = None,
+        top_logprobs: int | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
        model_obj = await self.model_store.get_model(model)

        # Groq does not support json_schema response format, so we need to convert it to json_object
--- a/llama_stack/providers/remote/inference/groq_openai_compat/config.py
+++ b/llama_stack/providers/remote/inference/groq_openai_compat/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel, Field

@ -12,7 +12,7 @@ from llama_stack.schema_utils import json_schema_type


 class GroqProviderDataValidator(BaseModel):
-    groq_api_key: Optional[str] = Field(
+    groq_api_key: str | None = Field(
        default=None,
        description="API key for Groq models",
    )
@ -20,7 +20,7 @@ class GroqProviderDataValidator(BaseModel):

@json_schema_type
 class GroqCompatConfig(BaseModel):
-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
        default=None,
        description="The Groq API key",
    )
@ -31,7 +31,7 @@ class GroqCompatConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.GROQ_API_KEY}", **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.GROQ_API_KEY}", **kwargs) -> dict[str, Any]:
        return {
            "openai_compat_api_base": "https://api.groq.com/openai/v1",
            "api_key": api_key,
--- a/llama_stack/providers/remote/inference/llama_openai_compat/config.py
+++ b/llama_stack/providers/remote/inference/llama_openai_compat/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel, Field

@ -12,7 +12,7 @@ from llama_stack.schema_utils import json_schema_type


 class LlamaProviderDataValidator(BaseModel):
-    llama_api_key: Optional[str] = Field(
+    llama_api_key: str | None = Field(
        default=None,
        description="API key for api.llama models",
    )
@ -20,7 +20,7 @@ class LlamaProviderDataValidator(BaseModel):

@json_schema_type
 class LlamaCompatConfig(BaseModel):
-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
        default=None,
        description="The Llama API key",
    )
@ -31,7 +31,7 @@ class LlamaCompatConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.LLAMA_API_KEY}", **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.LLAMA_API_KEY}", **kwargs) -> dict[str, Any]:
        return {
            "openai_compat_api_base": "https://api.llama.com/compat/v1/",
            "api_key": api_key,
--- a/llama_stack/providers/remote/inference/nvidia/config.py
+++ b/llama_stack/providers/remote/inference/nvidia/config.py
@ -5,7 +5,7 @@
 # the root directory of this source tree.

 import os
-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel, Field, SecretStr

@ -39,7 +39,7 @@ class NVIDIAConfig(BaseModel):
        default_factory=lambda: os.getenv("NVIDIA_BASE_URL", "https://integrate.api.nvidia.com"),
        description="A base url for accessing the NVIDIA NIM",
    )
-    api_key: Optional[SecretStr] = Field(
+    api_key: SecretStr | None = Field(
        default_factory=lambda: os.getenv("NVIDIA_API_KEY"),
        description="The NVIDIA API key, only needed of using the hosted service",
    )
@ -53,7 +53,7 @@ class NVIDIAConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
        return {
            "url": "${env.NVIDIA_BASE_URL:https://integrate.api.nvidia.com}",
            "api_key": "${env.NVIDIA_API_KEY:}",
--- a/llama_stack/providers/remote/inference/nvidia/nvidia.py
+++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py
@ -6,8 +6,9 @@

 import logging
 import warnings
+from collections.abc import AsyncIterator
 from functools import lru_cache
-from typing import Any, AsyncIterator, Dict, List, Optional, Union
+from typing import Any

 from openai import APIConnectionError, AsyncOpenAI, BadRequestError

@ -141,11 +142,11 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
        self,
        model_id: str,
        content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-    ) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]:
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+    ) -> CompletionResponse | AsyncIterator[CompletionResponseStreamChunk]:
        if sampling_params is None:
            sampling_params = SamplingParams()
        if content_has_media(content):
@ -182,20 +183,20 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
    async def embeddings(
        self,
        model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
    ) -> EmbeddingsResponse:
        if any(content_has_media(content) for content in contents):
            raise NotImplementedError("Media is not supported")

        #
-        # Llama Stack: contents = List[str] | List[InterleavedContentItem]
+        # Llama Stack: contents = list[str] | list[InterleavedContentItem]
        #  ->
-        # OpenAI: input = str | List[str]
+        # OpenAI: input = str | list[str]
        #
-        # we can ignore str and always pass List[str] to OpenAI
+        # we can ignore str and always pass list[str] to OpenAI
        #
        flat_contents = [content.text if isinstance(content, TextContentItem) else content for content in contents]
        input = [content.text if isinstance(content, TextContentItem) else content for content in flat_contents]
@ -231,25 +232,25 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
            raise ValueError(f"Failed to get embeddings: {e}") from e

        #
-        # OpenAI: CreateEmbeddingResponse(data=[Embedding(embedding=List[float], ...)], ...)
+        # OpenAI: CreateEmbeddingResponse(data=[Embedding(embedding=list[float], ...)], ...)
        #  ->
-        # Llama Stack: EmbeddingsResponse(embeddings=List[List[float]])
+        # Llama Stack: EmbeddingsResponse(embeddings=list[list[float]])
        #
        return EmbeddingsResponse(embeddings=[embedding.embedding for embedding in response.data])

    async def chat_completion(
        self,
        model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
-    ) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]:
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
+    ) -> ChatCompletionResponse | AsyncIterator[ChatCompletionResponseStreamChunk]:
        if sampling_params is None:
            sampling_params = SamplingParams()
        if tool_prompt_format:
@ -286,24 +287,24 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
    async def openai_completion(
        self,
        model: str,
-        prompt: Union[str, List[str], List[int], List[List[int]]],
-        best_of: Optional[int] = None,
-        echo: Optional[bool] = None,
-        frequency_penalty: Optional[float] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        presence_penalty: Optional[float] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-        guided_choice: Optional[List[str]] = None,
-        prompt_logprobs: Optional[int] = None,
+        prompt: str | list[str] | list[int] | list[list[int]],
+        best_of: int | None = None,
+        echo: bool | None = None,
+        frequency_penalty: float | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        presence_penalty: float | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+        guided_choice: list[str] | None = None,
+        prompt_logprobs: int | None = None,
    ) -> OpenAICompletion:
        provider_model_id = await self._get_provider_model_id(model)

@ -335,29 +336,29 @@ class NVIDIAInferenceAdapter(Inference, ModelRegistryHelper):
    async def openai_chat_completion(
        self,
        model: str,
-        messages: List[OpenAIMessageParam],
-        frequency_penalty: Optional[float] = None,
-        function_call: Optional[Union[str, Dict[str, Any]]] = None,
-        functions: Optional[List[Dict[str, Any]]] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_completion_tokens: Optional[int] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        parallel_tool_calls: Optional[bool] = None,
-        presence_penalty: Optional[float] = None,
-        response_format: Optional[OpenAIResponseFormatParam] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        top_logprobs: Optional[int] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        messages: list[OpenAIMessageParam],
+        frequency_penalty: float | None = None,
+        function_call: str | dict[str, Any] | None = None,
+        functions: list[dict[str, Any]] | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_completion_tokens: int | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        parallel_tool_calls: bool | None = None,
+        presence_penalty: float | None = None,
+        response_format: OpenAIResponseFormatParam | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+        tools: list[dict[str, Any]] | None = None,
+        top_logprobs: int | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
        provider_model_id = await self._get_provider_model_id(model)

        params = await prepare_openai_completion_params(
--- a/llama_stack/providers/remote/inference/nvidia/openai_utils.py
+++ b/llama_stack/providers/remote/inference/nvidia/openai_utils.py
@ -5,7 +5,8 @@
 # the root directory of this source tree.

 import warnings
-from typing import Any, AsyncGenerator, Dict, List, Optional
+from collections.abc import AsyncGenerator
+from typing import Any

 from openai import AsyncStream
 from openai.types.chat.chat_completion import (
@ -64,7 +65,7 @@ async def convert_chat_completion_request(
        )

    nvext = {}
-    payload: Dict[str, Any] = dict(
+    payload: dict[str, Any] = dict(
        model=request.model,
        messages=[await convert_message_to_openai_dict_new(message) for message in request.messages],
        stream=request.stream,
@ -137,7 +138,7 @@ def convert_completion_request(
    # logprobs.top_k -> logprobs

    nvext = {}
-    payload: Dict[str, Any] = dict(
+    payload: dict[str, Any] = dict(
        model=request.model,
        prompt=request.content,
        stream=request.stream,
@ -176,8 +177,8 @@ def convert_completion_request(


 def _convert_openai_completion_logprobs(
-    logprobs: Optional[OpenAICompletionLogprobs],
-) -> Optional[List[TokenLogProbs]]:
+    logprobs: OpenAICompletionLogprobs | None,
+) -> list[TokenLogProbs] | None:
    """
    Convert an OpenAI CompletionLogprobs into a list of TokenLogProbs.
    """
--- a/llama_stack/providers/remote/inference/nvidia/utils.py
+++ b/llama_stack/providers/remote/inference/nvidia/utils.py
@ -5,7 +5,6 @@
 # the root directory of this source tree.

 import logging
-from typing import Tuple

 import httpx

@ -18,7 +17,7 @@ def _is_nvidia_hosted(config: NVIDIAConfig) -> bool:
    return "integrate.api.nvidia.com" in config.url


-async def _get_health(url: str) -> Tuple[bool, bool]:
+async def _get_health(url: str) -> tuple[bool, bool]:
    """
    Query {url}/v1/health/{live,ready} to check if the server is running and ready

--- a/llama_stack/providers/remote/inference/ollama/config.py
+++ b/llama_stack/providers/remote/inference/ollama/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict
+from typing import Any

 from pydantic import BaseModel

@ -15,5 +15,5 @@ class OllamaImplConfig(BaseModel):
    url: str = DEFAULT_OLLAMA_URL

    @classmethod
-    def sample_run_config(cls, url: str = "${env.OLLAMA_URL:http://localhost:11434}", **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, url: str = "${env.OLLAMA_URL:http://localhost:11434}", **kwargs) -> dict[str, Any]:
        return {"url": url}
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@ -5,7 +5,8 @@
 # the root directory of this source tree.


-from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
+from collections.abc import AsyncGenerator, AsyncIterator
+from typing import Any

 import httpx
 from ollama import AsyncClient  # type: ignore[attr-defined]
@ -130,10 +131,10 @@ class OllamaInferenceAdapter(
        self,
        model_id: str,
        content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
    ) -> CompletionResponse | AsyncGenerator[CompletionResponseStreamChunk, None]:
        if sampling_params is None:
            sampling_params = SamplingParams()
@ -188,15 +189,15 @@ class OllamaInferenceAdapter(
    async def chat_completion(
        self,
        model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
    ) -> ChatCompletionResponse | AsyncGenerator[ChatCompletionResponseStreamChunk, None]:
        if sampling_params is None:
            sampling_params = SamplingParams()
@ -216,7 +217,7 @@ class OllamaInferenceAdapter(
        else:
            return await self._nonstream_chat_completion(request)

-    async def _get_params(self, request: Union[ChatCompletionRequest, CompletionRequest]) -> dict:
+    async def _get_params(self, request: ChatCompletionRequest | CompletionRequest) -> dict:
        sampling_options = get_sampling_options(request.sampling_params)
        # This is needed since the Ollama API expects num_predict to be set
        # for early truncation instead of max_tokens.
@ -314,10 +315,10 @@ class OllamaInferenceAdapter(
    async def embeddings(
        self,
        model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
    ) -> EmbeddingsResponse:
        model = await self._get_model(model_id)

@ -365,24 +366,24 @@ class OllamaInferenceAdapter(
    async def openai_completion(
        self,
        model: str,
-        prompt: Union[str, List[str], List[int], List[List[int]]],
-        best_of: Optional[int] = None,
-        echo: Optional[bool] = None,
-        frequency_penalty: Optional[float] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        presence_penalty: Optional[float] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-        guided_choice: Optional[List[str]] = None,
-        prompt_logprobs: Optional[int] = None,
+        prompt: str | list[str] | list[int] | list[list[int]],
+        best_of: int | None = None,
+        echo: bool | None = None,
+        frequency_penalty: float | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        presence_penalty: float | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+        guided_choice: list[str] | None = None,
+        prompt_logprobs: int | None = None,
    ) -> OpenAICompletion:
        if not isinstance(prompt, str):
            raise ValueError("Ollama does not support non-string prompts for completion")
@ -416,29 +417,29 @@ class OllamaInferenceAdapter(
    async def openai_chat_completion(
        self,
        model: str,
-        messages: List[OpenAIMessageParam],
-        frequency_penalty: Optional[float] = None,
-        function_call: Optional[Union[str, Dict[str, Any]]] = None,
-        functions: Optional[List[Dict[str, Any]]] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_completion_tokens: Optional[int] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        parallel_tool_calls: Optional[bool] = None,
-        presence_penalty: Optional[float] = None,
-        response_format: Optional[OpenAIResponseFormatParam] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        top_logprobs: Optional[int] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        messages: list[OpenAIMessageParam],
+        frequency_penalty: float | None = None,
+        function_call: str | dict[str, Any] | None = None,
+        functions: list[dict[str, Any]] | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_completion_tokens: int | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        parallel_tool_calls: bool | None = None,
+        presence_penalty: float | None = None,
+        response_format: OpenAIResponseFormatParam | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+        tools: list[dict[str, Any]] | None = None,
+        top_logprobs: int | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
        model_obj = await self._get_model(model)

        # ollama still makes tool calls even when tool_choice is "none"
@ -480,27 +481,27 @@ class OllamaInferenceAdapter(
    async def batch_completion(
        self,
        model_id: str,
-        content_batch: List[InterleavedContent],
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        logprobs: Optional[LogProbConfig] = None,
+        content_batch: list[InterleavedContent],
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        logprobs: LogProbConfig | None = None,
    ):
        raise NotImplementedError("Batch completion is not supported for Ollama")

    async def batch_chat_completion(
        self,
        model_id: str,
-        messages_batch: List[List[Message]],
-        sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_config: Optional[ToolConfig] = None,
-        response_format: Optional[ResponseFormat] = None,
-        logprobs: Optional[LogProbConfig] = None,
+        messages_batch: list[list[Message]],
+        sampling_params: SamplingParams | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_config: ToolConfig | None = None,
+        response_format: ResponseFormat | None = None,
+        logprobs: LogProbConfig | None = None,
    ):
        raise NotImplementedError("Batch chat completion is not supported for Ollama")


-async def convert_message_to_openai_dict_for_ollama(message: Message) -> List[dict]:
+async def convert_message_to_openai_dict_for_ollama(message: Message) -> list[dict]:
    async def _convert_content(content) -> dict:
        if isinstance(content, ImageContentItem):
            return {
--- a/llama_stack/providers/remote/inference/openai/init.py
+++ b/llama_stack/providers/remote/inference/openai/init.py
@ -4,15 +4,13 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Optional
-
 from pydantic import BaseModel

 from .config import OpenAIConfig


 class OpenAIProviderDataValidator(BaseModel):
-    openai_api_key: Optional[str] = None
+    openai_api_key: str | None = None


 async def get_adapter_impl(config: OpenAIConfig, _deps):
--- a/llama_stack/providers/remote/inference/openai/config.py
+++ b/llama_stack/providers/remote/inference/openai/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel, Field

@ -12,7 +12,7 @@ from llama_stack.schema_utils import json_schema_type


 class OpenAIProviderDataValidator(BaseModel):
-    openai_api_key: Optional[str] = Field(
+    openai_api_key: str | None = Field(
        default=None,
        description="API key for OpenAI models",
    )
@ -20,13 +20,13 @@ class OpenAIProviderDataValidator(BaseModel):

@json_schema_type
 class OpenAIConfig(BaseModel):
-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
        default=None,
        description="API key for OpenAI models",
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.OPENAI_API_KEY}", **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.OPENAI_API_KEY}", **kwargs) -> dict[str, Any]:
        return {
            "api_key": api_key,
        }
--- a/llama_stack/providers/remote/inference/passthrough/config.py
+++ b/llama_stack/providers/remote/inference/passthrough/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel, Field, SecretStr

@ -18,13 +18,13 @@ class PassthroughImplConfig(BaseModel):
        description="The URL for the passthrough endpoint",
    )

-    api_key: Optional[SecretStr] = Field(
+    api_key: SecretStr | None = Field(
        default=None,
        description="API Key for the passthrouth endpoint",
    )

    @classmethod
-    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
        return {
            "url": "${env.PASSTHROUGH_URL}",
            "api_key": "${env.PASSTHROUGH_API_KEY}",
--- a/llama_stack/providers/remote/inference/passthrough/passthrough.py
+++ b/llama_stack/providers/remote/inference/passthrough/passthrough.py
@ -4,7 +4,8 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
+from collections.abc import AsyncGenerator, AsyncIterator
+from typing import Any

 from llama_stack_client import AsyncLlamaStackClient

@ -93,10 +94,10 @@ class PassthroughInferenceAdapter(Inference):
        self,
        model_id: str,
        content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
    ) -> AsyncGenerator:
        if sampling_params is None:
            sampling_params = SamplingParams()
@ -123,15 +124,15 @@ class PassthroughInferenceAdapter(Inference):
    async def chat_completion(
        self,
        model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
    ) -> AsyncGenerator:
        if sampling_params is None:
            sampling_params = SamplingParams()
@ -165,7 +166,7 @@ class PassthroughInferenceAdapter(Inference):
        else:
            return await self._nonstream_chat_completion(json_params)

-    async def _nonstream_chat_completion(self, json_params: Dict[str, Any]) -> ChatCompletionResponse:
+    async def _nonstream_chat_completion(self, json_params: dict[str, Any]) -> ChatCompletionResponse:
        client = self._get_client()
        response = await client.inference.chat_completion(**json_params)

@ -178,7 +179,7 @@ class PassthroughInferenceAdapter(Inference):
            logprobs=response.logprobs,
        )

-    async def _stream_chat_completion(self, json_params: Dict[str, Any]) -> AsyncGenerator:
+    async def _stream_chat_completion(self, json_params: dict[str, Any]) -> AsyncGenerator:
        client = self._get_client()
        stream_response = await client.inference.chat_completion(**json_params)

@ -193,10 +194,10 @@ class PassthroughInferenceAdapter(Inference):
    async def embeddings(
        self,
        model_id: str,
-        contents: List[InterleavedContent],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[InterleavedContent],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
    ) -> EmbeddingsResponse:
        client = self._get_client()
        model = await self.model_store.get_model(model_id)
@ -212,24 +213,24 @@ class PassthroughInferenceAdapter(Inference):
    async def openai_completion(
        self,
        model: str,
-        prompt: Union[str, List[str], List[int], List[List[int]]],
-        best_of: Optional[int] = None,
-        echo: Optional[bool] = None,
-        frequency_penalty: Optional[float] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        presence_penalty: Optional[float] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-        guided_choice: Optional[List[str]] = None,
-        prompt_logprobs: Optional[int] = None,
+        prompt: str | list[str] | list[int] | list[list[int]],
+        best_of: int | None = None,
+        echo: bool | None = None,
+        frequency_penalty: float | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        presence_penalty: float | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+        guided_choice: list[str] | None = None,
+        prompt_logprobs: int | None = None,
    ) -> OpenAICompletion:
        client = self._get_client()
        model_obj = await self.model_store.get_model(model)
@ -261,29 +262,29 @@ class PassthroughInferenceAdapter(Inference):
    async def openai_chat_completion(
        self,
        model: str,
-        messages: List[OpenAIMessageParam],
-        frequency_penalty: Optional[float] = None,
-        function_call: Optional[Union[str, Dict[str, Any]]] = None,
-        functions: Optional[List[Dict[str, Any]]] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_completion_tokens: Optional[int] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        parallel_tool_calls: Optional[bool] = None,
-        presence_penalty: Optional[float] = None,
-        response_format: Optional[OpenAIResponseFormatParam] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        top_logprobs: Optional[int] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        messages: list[OpenAIMessageParam],
+        frequency_penalty: float | None = None,
+        function_call: str | dict[str, Any] | None = None,
+        functions: list[dict[str, Any]] | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_completion_tokens: int | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        parallel_tool_calls: bool | None = None,
+        presence_penalty: float | None = None,
+        response_format: OpenAIResponseFormatParam | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+        tools: list[dict[str, Any]] | None = None,
+        top_logprobs: int | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
        client = self._get_client()
        model_obj = await self.model_store.get_model(model)

@ -315,7 +316,7 @@ class PassthroughInferenceAdapter(Inference):

        return await client.inference.openai_chat_completion(**params)

-    def cast_value_to_json_dict(self, request_params: Dict[str, Any]) -> Dict[str, Any]:
+    def cast_value_to_json_dict(self, request_params: dict[str, Any]) -> dict[str, Any]:
        json_params = {}
        for key, value in request_params.items():
            json_input = convert_pydantic_to_json_value(value)
--- a/llama_stack/providers/remote/inference/runpod/config.py
+++ b/llama_stack/providers/remote/inference/runpod/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel, Field

@ -13,17 +13,17 @@ from llama_stack.schema_utils import json_schema_type

@json_schema_type
 class RunpodImplConfig(BaseModel):
-    url: Optional[str] = Field(
+    url: str | None = Field(
        default=None,
        description="The URL for the Runpod model serving endpoint",
    )
-    api_token: Optional[str] = Field(
+    api_token: str | None = Field(
        default=None,
        description="The API token",
    )

    @classmethod
-    def sample_run_config(cls, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs: Any) -> dict[str, Any]:
        return {
            "url": "${env.RUNPOD_URL:}",
            "api_token": "${env.RUNPOD_API_TOKEN:}",
--- a/llama_stack/providers/remote/inference/runpod/runpod.py
+++ b/llama_stack/providers/remote/inference/runpod/runpod.py
@ -3,7 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from typing import AsyncGenerator
+from collections.abc import AsyncGenerator

 from openai import OpenAI

--- a/llama_stack/providers/remote/inference/sambanova/config.py
+++ b/llama_stack/providers/remote/inference/sambanova/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel, Field

@ -17,13 +17,13 @@ class SambaNovaImplConfig(BaseModel):
        default="https://api.sambanova.ai/v1",
        description="The URL for the SambaNova AI server",
    )
-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
        default=None,
        description="The SambaNova.ai API Key",
    )

    @classmethod
-    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
        return {
            "url": "https://api.sambanova.ai/v1",
            "api_key": "${env.SAMBANOVA_API_KEY}",
--- a/llama_stack/providers/remote/inference/sambanova/sambanova.py
+++ b/llama_stack/providers/remote/inference/sambanova/sambanova.py
@ -5,7 +5,7 @@
 # the root directory of this source tree.

 import json
-from typing import AsyncGenerator, List, Optional
+from collections.abc import AsyncGenerator

 from openai import OpenAI

@ -77,25 +77,25 @@ class SambaNovaInferenceAdapter(
        self,
        model_id: str,
        content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
    ) -> AsyncGenerator:
        raise NotImplementedError()

    async def chat_completion(
        self,
        model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
-        stream: Optional[bool] = False,
-        tool_config: Optional[ToolConfig] = None,
-        logprobs: Optional[LogProbConfig] = None,
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = ToolPromptFormat.json,
+        stream: bool | None = False,
+        tool_config: ToolConfig | None = None,
+        logprobs: LogProbConfig | None = None,
    ) -> AsyncGenerator:
        if sampling_params is None:
            sampling_params = SamplingParams()
@ -146,10 +146,10 @@ class SambaNovaInferenceAdapter(
    async def embeddings(
        self,
        model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
    ) -> EmbeddingsResponse:
        raise NotImplementedError()

@ -186,7 +186,7 @@ class SambaNovaInferenceAdapter(

        return params

-    async def convert_to_sambanova_messages(self, messages: List[Message]) -> List[dict]:
+    async def convert_to_sambanova_messages(self, messages: list[Message]) -> list[dict]:
        conversation = []
        for message in messages:
            content = {}
@ -244,7 +244,7 @@ class SambaNovaInferenceAdapter(

        return content

-    def convert_to_sambanova_tool(self, tools: List[ToolDefinition]) -> List[dict]:
+    def convert_to_sambanova_tool(self, tools: list[ToolDefinition]) -> list[dict]:
        if tools is None:
            return tools

@ -292,7 +292,7 @@ class SambaNovaInferenceAdapter(
    def convert_to_sambanova_tool_calls(
        self,
        tool_calls,
-    ) -> List[ToolCall]:
+    ) -> list[ToolCall]:
        if not tool_calls:
            return []

--- a/llama_stack/providers/remote/inference/sambanova_openai_compat/config.py
+++ b/llama_stack/providers/remote/inference/sambanova_openai_compat/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel, Field

@ -12,7 +12,7 @@ from llama_stack.schema_utils import json_schema_type


 class SambaNovaProviderDataValidator(BaseModel):
-    sambanova_api_key: Optional[str] = Field(
+    sambanova_api_key: str | None = Field(
        default=None,
        description="API key for SambaNova models",
    )
@ -20,7 +20,7 @@ class SambaNovaProviderDataValidator(BaseModel):

@json_schema_type
 class SambaNovaCompatConfig(BaseModel):
-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
        default=None,
        description="The SambaNova API key",
    )
@ -31,7 +31,7 @@ class SambaNovaCompatConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY}", **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY}", **kwargs) -> dict[str, Any]:
        return {
            "openai_compat_api_base": "https://api.sambanova.ai/v1",
            "api_key": api_key,
--- a/llama_stack/providers/remote/inference/tgi/init.py
+++ b/llama_stack/providers/remote/inference/tgi/init.py
@ -4,13 +4,11 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Union
-
 from .config import InferenceAPIImplConfig, InferenceEndpointImplConfig, TGIImplConfig


 async def get_adapter_impl(
-    config: Union[InferenceAPIImplConfig, InferenceEndpointImplConfig, TGIImplConfig],
+    config: InferenceAPIImplConfig | InferenceEndpointImplConfig | TGIImplConfig,
    _deps,
 ):
    from .tgi import InferenceAPIAdapter, InferenceEndpointAdapter, TGIAdapter
--- a/llama_stack/providers/remote/inference/tgi/config.py
+++ b/llama_stack/providers/remote/inference/tgi/config.py
@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Optional

 from pydantic import BaseModel, Field, SecretStr

@ -29,7 +28,7 @@ class InferenceEndpointImplConfig(BaseModel):
    endpoint_name: str = Field(
        description="The name of the Hugging Face Inference Endpoint in the format of '{namespace}/{endpoint_name}' (e.g. 'my-cool-org/meta-llama-3-1-8b-instruct-rce'). Namespace is optional and will default to the user account if not provided.",
    )
-    api_token: Optional[SecretStr] = Field(
+    api_token: SecretStr | None = Field(
        default=None,
        description="Your Hugging Face user access token (will default to locally saved token if not provided)",
    )
@ -52,7 +51,7 @@ class InferenceAPIImplConfig(BaseModel):
    huggingface_repo: str = Field(
        description="The model ID of the model on the Hugging Face Hub (e.g. 'meta-llama/Meta-Llama-3.1-70B-Instruct')",
    )
-    api_token: Optional[SecretStr] = Field(
+    api_token: SecretStr | None = Field(
        default=None,
        description="Your Hugging Face user access token (will default to locally saved token if not provided)",
    )
--- a/llama_stack/providers/remote/inference/tgi/tgi.py
+++ b/llama_stack/providers/remote/inference/tgi/tgi.py
@ -6,7 +6,7 @@


 import logging
-from typing import AsyncGenerator, List, Optional
+from collections.abc import AsyncGenerator

 from huggingface_hub import AsyncInferenceClient, HfApi

@ -105,10 +105,10 @@ class _HfAdapter(
        self,
        model_id: str,
        content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
    ) -> AsyncGenerator:
        if sampling_params is None:
            sampling_params = SamplingParams()
@ -134,7 +134,7 @@ class _HfAdapter(

    def _build_options(
        self,
-        sampling_params: Optional[SamplingParams] = None,
+        sampling_params: SamplingParams | None = None,
        fmt: ResponseFormat = None,
    ):
        options = get_sampling_options(sampling_params)
@ -209,15 +209,15 @@ class _HfAdapter(
    async def chat_completion(
        self,
        model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
    ) -> AsyncGenerator:
        if sampling_params is None:
            sampling_params = SamplingParams()
@ -284,10 +284,10 @@ class _HfAdapter(
    async def embeddings(
        self,
        model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
    ) -> EmbeddingsResponse:
        raise NotImplementedError()

--- a/llama_stack/providers/remote/inference/together/config.py
+++ b/llama_stack/providers/remote/inference/together/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel, Field, SecretStr

@ -17,13 +17,13 @@ class TogetherImplConfig(BaseModel):
        default="https://api.together.xyz/v1",
        description="The URL for the Together AI server",
    )
-    api_key: Optional[SecretStr] = Field(
+    api_key: SecretStr | None = Field(
        default=None,
        description="The Together AI API Key",
    )

    @classmethod
-    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
        return {
            "url": "https://api.together.xyz/v1",
            "api_key": "${env.TOGETHER_API_KEY:}",
--- a/llama_stack/providers/remote/inference/together/together.py
+++ b/llama_stack/providers/remote/inference/together/together.py
@ -4,7 +4,8 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
+from collections.abc import AsyncGenerator, AsyncIterator
+from typing import Any

 from openai import AsyncOpenAI
 from together import AsyncTogether
@ -86,10 +87,10 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
        self,
        model_id: str,
        content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
    ) -> AsyncGenerator:
        if sampling_params is None:
            sampling_params = SamplingParams()
@ -147,8 +148,8 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi

    def _build_options(
        self,
-        sampling_params: Optional[SamplingParams],
-        logprobs: Optional[LogProbConfig],
+        sampling_params: SamplingParams | None,
+        logprobs: LogProbConfig | None,
        fmt: ResponseFormat,
    ) -> dict:
        options = get_sampling_options(sampling_params)
@ -175,15 +176,15 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
    async def chat_completion(
        self,
        model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
    ) -> AsyncGenerator:
        if sampling_params is None:
            sampling_params = SamplingParams()
@ -224,7 +225,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
        async for chunk in process_chat_completion_stream_response(stream, request):
            yield chunk

-    async def _get_params(self, request: Union[ChatCompletionRequest, CompletionRequest]) -> dict:
+    async def _get_params(self, request: ChatCompletionRequest | CompletionRequest) -> dict:
        input_dict = {}
        media_present = request_has_media(request)
        llama_model = self.get_llama_model(request.model)
@ -249,10 +250,10 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
    async def embeddings(
        self,
        model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
    ) -> EmbeddingsResponse:
        model = await self.model_store.get_model(model_id)
        assert all(not content_has_media(content) for content in contents), (
@ -269,24 +270,24 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
    async def openai_completion(
        self,
        model: str,
-        prompt: Union[str, List[str], List[int], List[List[int]]],
-        best_of: Optional[int] = None,
-        echo: Optional[bool] = None,
-        frequency_penalty: Optional[float] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        presence_penalty: Optional[float] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-        guided_choice: Optional[List[str]] = None,
-        prompt_logprobs: Optional[int] = None,
+        prompt: str | list[str] | list[int] | list[list[int]],
+        best_of: int | None = None,
+        echo: bool | None = None,
+        frequency_penalty: float | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        presence_penalty: float | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+        guided_choice: list[str] | None = None,
+        prompt_logprobs: int | None = None,
    ) -> OpenAICompletion:
        model_obj = await self.model_store.get_model(model)
        params = await prepare_openai_completion_params(
@ -313,29 +314,29 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
    async def openai_chat_completion(
        self,
        model: str,
-        messages: List[OpenAIMessageParam],
-        frequency_penalty: Optional[float] = None,
-        function_call: Optional[Union[str, Dict[str, Any]]] = None,
-        functions: Optional[List[Dict[str, Any]]] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_completion_tokens: Optional[int] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        parallel_tool_calls: Optional[bool] = None,
-        presence_penalty: Optional[float] = None,
-        response_format: Optional[OpenAIResponseFormatParam] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        top_logprobs: Optional[int] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        messages: list[OpenAIMessageParam],
+        frequency_penalty: float | None = None,
+        function_call: str | dict[str, Any] | None = None,
+        functions: list[dict[str, Any]] | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_completion_tokens: int | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        parallel_tool_calls: bool | None = None,
+        presence_penalty: float | None = None,
+        response_format: OpenAIResponseFormatParam | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+        tools: list[dict[str, Any]] | None = None,
+        top_logprobs: int | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
        model_obj = await self.model_store.get_model(model)
        params = await prepare_openai_completion_params(
            model=model_obj.provider_resource_id,
--- a/llama_stack/providers/remote/inference/together_openai_compat/config.py
+++ b/llama_stack/providers/remote/inference/together_openai_compat/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel, Field

@ -12,7 +12,7 @@ from llama_stack.schema_utils import json_schema_type


 class TogetherProviderDataValidator(BaseModel):
-    together_api_key: Optional[str] = Field(
+    together_api_key: str | None = Field(
        default=None,
        description="API key for Together models",
    )
@ -20,7 +20,7 @@ class TogetherProviderDataValidator(BaseModel):

@json_schema_type
 class TogetherCompatConfig(BaseModel):
-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
        default=None,
        description="The Together API key",
    )
@ -31,7 +31,7 @@ class TogetherCompatConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, api_key: str = "${env.TOGETHER_API_KEY}", **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, api_key: str = "${env.TOGETHER_API_KEY}", **kwargs) -> dict[str, Any]:
        return {
            "openai_compat_api_base": "https://api.together.xyz/v1",
            "api_key": api_key,
--- a/llama_stack/providers/remote/inference/vllm/config.py
+++ b/llama_stack/providers/remote/inference/vllm/config.py
@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Optional

 from pydantic import BaseModel, Field

@ -13,7 +12,7 @@ from llama_stack.schema_utils import json_schema_type

@json_schema_type
 class VLLMInferenceAdapterConfig(BaseModel):
-    url: Optional[str] = Field(
+    url: str | None = Field(
        default=None,
        description="The URL for the vLLM model serving endpoint",
    )
@ -21,7 +20,7 @@ class VLLMInferenceAdapterConfig(BaseModel):
        default=4096,
        description="Maximum number of tokens to generate.",
    )
-    api_token: Optional[str] = Field(
+    api_token: str | None = Field(
        default="fake",
        description="The API token",
    )
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@ -5,7 +5,8 @@
 # the root directory of this source tree.
 import json
 import logging
-from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
+from collections.abc import AsyncGenerator, AsyncIterator
+from typing import Any

 import httpx
 from openai import AsyncOpenAI
@ -94,7 +95,7 @@ def build_hf_repo_model_entries():

 def _convert_to_vllm_tool_calls_in_response(
    tool_calls,
-) -> List[ToolCall]:
+) -> list[ToolCall]:
    if not tool_calls:
        return []

@ -109,7 +110,7 @@ def _convert_to_vllm_tool_calls_in_response(
    ]


-def _convert_to_vllm_tools_in_request(tools: List[ToolDefinition]) -> List[dict]:
+def _convert_to_vllm_tools_in_request(tools: list[ToolDefinition]) -> list[dict]:
    compat_tools = []

    for tool in tools:
@ -262,10 +263,10 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
        self,
        model_id: str,
        content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
    ) -> CompletionResponse | AsyncGenerator[CompletionResponseStreamChunk, None]:
        self._lazy_initialize_client()
        if sampling_params is None:
@ -287,15 +288,15 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
    async def chat_completion(
        self,
        model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
    ) -> ChatCompletionResponse | AsyncGenerator[ChatCompletionResponseStreamChunk, None]:
        self._lazy_initialize_client()
        if sampling_params is None:
@ -385,7 +386,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
            )
        return model

-    async def _get_params(self, request: Union[ChatCompletionRequest, CompletionRequest]) -> dict:
+    async def _get_params(self, request: ChatCompletionRequest | CompletionRequest) -> dict:
        options = get_sampling_options(request.sampling_params)
        if "max_tokens" not in options:
            options["max_tokens"] = self.config.max_tokens
@ -422,10 +423,10 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
    async def embeddings(
        self,
        model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
    ) -> EmbeddingsResponse:
        self._lazy_initialize_client()
        assert self.client is not None
@ -448,29 +449,29 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
    async def openai_completion(
        self,
        model: str,
-        prompt: Union[str, List[str], List[int], List[List[int]]],
-        best_of: Optional[int] = None,
-        echo: Optional[bool] = None,
-        frequency_penalty: Optional[float] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        presence_penalty: Optional[float] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-        guided_choice: Optional[List[str]] = None,
-        prompt_logprobs: Optional[int] = None,
+        prompt: str | list[str] | list[int] | list[list[int]],
+        best_of: int | None = None,
+        echo: bool | None = None,
+        frequency_penalty: float | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        presence_penalty: float | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+        guided_choice: list[str] | None = None,
+        prompt_logprobs: int | None = None,
    ) -> OpenAICompletion:
        self._lazy_initialize_client()
        model_obj = await self._get_model(model)

-        extra_body: Dict[str, Any] = {}
+        extra_body: dict[str, Any] = {}
        if prompt_logprobs is not None and prompt_logprobs >= 0:
            extra_body["prompt_logprobs"] = prompt_logprobs
        if guided_choice:
@ -501,29 +502,29 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
    async def openai_chat_completion(
        self,
        model: str,
-        messages: List[OpenAIMessageParam],
-        frequency_penalty: Optional[float] = None,
-        function_call: Optional[Union[str, Dict[str, Any]]] = None,
-        functions: Optional[List[Dict[str, Any]]] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_completion_tokens: Optional[int] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        parallel_tool_calls: Optional[bool] = None,
-        presence_penalty: Optional[float] = None,
-        response_format: Optional[OpenAIResponseFormatParam] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        top_logprobs: Optional[int] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        messages: list[OpenAIMessageParam],
+        frequency_penalty: float | None = None,
+        function_call: str | dict[str, Any] | None = None,
+        functions: list[dict[str, Any]] | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_completion_tokens: int | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        parallel_tool_calls: bool | None = None,
+        presence_penalty: float | None = None,
+        response_format: OpenAIResponseFormatParam | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+        tools: list[dict[str, Any]] | None = None,
+        top_logprobs: int | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
        self._lazy_initialize_client()
        model_obj = await self._get_model(model)
        params = await prepare_openai_completion_params(
@ -556,21 +557,21 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
    async def batch_completion(
        self,
        model_id: str,
-        content_batch: List[InterleavedContent],
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        logprobs: Optional[LogProbConfig] = None,
+        content_batch: list[InterleavedContent],
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        logprobs: LogProbConfig | None = None,
    ):
        raise NotImplementedError("Batch completion is not supported for Ollama")

    async def batch_chat_completion(
        self,
        model_id: str,
-        messages_batch: List[List[Message]],
-        sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_config: Optional[ToolConfig] = None,
-        response_format: Optional[ResponseFormat] = None,
-        logprobs: Optional[LogProbConfig] = None,
+        messages_batch: list[list[Message]],
+        sampling_params: SamplingParams | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_config: ToolConfig | None = None,
+        response_format: ResponseFormat | None = None,
+        logprobs: LogProbConfig | None = None,
    ):
        raise NotImplementedError("Batch chat completion is not supported for Ollama")
--- a/llama_stack/providers/remote/inference/watsonx/config.py
+++ b/llama_stack/providers/remote/inference/watsonx/config.py
@ -5,7 +5,7 @@
 # the root directory of this source tree.

 import os
-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel, Field, SecretStr

@ -24,11 +24,11 @@ class WatsonXConfig(BaseModel):
        default_factory=lambda: os.getenv("WATSONX_BASE_URL", "https://us-south.ml.cloud.ibm.com"),
        description="A base url for accessing the watsonx.ai",
    )
-    api_key: Optional[SecretStr] = Field(
+    api_key: SecretStr | None = Field(
        default_factory=lambda: os.getenv("WATSONX_API_KEY"),
        description="The watsonx API key, only needed of using the hosted service",
    )
-    project_id: Optional[str] = Field(
+    project_id: str | None = Field(
        default_factory=lambda: os.getenv("WATSONX_PROJECT_ID"),
        description="The Project ID key, only needed of using the hosted service",
    )
@ -38,7 +38,7 @@ class WatsonXConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
        return {
            "url": "${env.WATSONX_BASE_URL:https://us-south.ml.cloud.ibm.com}",
            "api_key": "${env.WATSONX_API_KEY:}",
--- a/llama_stack/providers/remote/inference/watsonx/watsonx.py
+++ b/llama_stack/providers/remote/inference/watsonx/watsonx.py
@ -4,7 +4,8 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union
+from collections.abc import AsyncGenerator, AsyncIterator
+from typing import Any

 from ibm_watson_machine_learning.foundation_models import Model
 from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
@ -78,10 +79,10 @@ class WatsonXInferenceAdapter(Inference, ModelRegistryHelper):
        self,
        model_id: str,
        content: InterleavedContent,
-        sampling_params: Optional[SamplingParams] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
+        sampling_params: SamplingParams | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
    ) -> AsyncGenerator:
        if sampling_params is None:
            sampling_params = SamplingParams()
@ -152,15 +153,15 @@ class WatsonXInferenceAdapter(Inference, ModelRegistryHelper):
    async def chat_completion(
        self,
        model_id: str,
-        messages: List[Message],
-        sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = None,
-        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
-        tool_prompt_format: Optional[ToolPromptFormat] = None,
-        response_format: Optional[ResponseFormat] = None,
-        stream: Optional[bool] = False,
-        logprobs: Optional[LogProbConfig] = None,
-        tool_config: Optional[ToolConfig] = None,
+        messages: list[Message],
+        sampling_params: SamplingParams | None = None,
+        tools: list[ToolDefinition] | None = None,
+        tool_choice: ToolChoice | None = ToolChoice.auto,
+        tool_prompt_format: ToolPromptFormat | None = None,
+        response_format: ResponseFormat | None = None,
+        stream: bool | None = False,
+        logprobs: LogProbConfig | None = None,
+        tool_config: ToolConfig | None = None,
    ) -> AsyncGenerator:
        if sampling_params is None:
            sampling_params = SamplingParams()
@ -217,7 +218,7 @@ class WatsonXInferenceAdapter(Inference, ModelRegistryHelper):
        async for chunk in process_chat_completion_stream_response(stream, request):
            yield chunk

-    async def _get_params(self, request: Union[ChatCompletionRequest, CompletionRequest]) -> dict:
+    async def _get_params(self, request: ChatCompletionRequest | CompletionRequest) -> dict:
        input_dict = {"params": {}}
        media_present = request_has_media(request)
        llama_model = self.get_llama_model(request.model)
@ -252,34 +253,34 @@ class WatsonXInferenceAdapter(Inference, ModelRegistryHelper):
    async def embeddings(
        self,
        model_id: str,
-        contents: List[str] | List[InterleavedContentItem],
-        text_truncation: Optional[TextTruncation] = TextTruncation.none,
-        output_dimension: Optional[int] = None,
-        task_type: Optional[EmbeddingTaskType] = None,
+        contents: list[str] | list[InterleavedContentItem],
+        text_truncation: TextTruncation | None = TextTruncation.none,
+        output_dimension: int | None = None,
+        task_type: EmbeddingTaskType | None = None,
    ) -> EmbeddingsResponse:
        raise NotImplementedError("embedding is not supported for watsonx")

    async def openai_completion(
        self,
        model: str,
-        prompt: Union[str, List[str], List[int], List[List[int]]],
-        best_of: Optional[int] = None,
-        echo: Optional[bool] = None,
-        frequency_penalty: Optional[float] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        presence_penalty: Optional[float] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-        guided_choice: Optional[List[str]] = None,
-        prompt_logprobs: Optional[int] = None,
+        prompt: str | list[str] | list[int] | list[list[int]],
+        best_of: int | None = None,
+        echo: bool | None = None,
+        frequency_penalty: float | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        presence_penalty: float | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+        guided_choice: list[str] | None = None,
+        prompt_logprobs: int | None = None,
    ) -> OpenAICompletion:
        model_obj = await self.model_store.get_model(model)
        params = await prepare_openai_completion_params(
@ -306,29 +307,29 @@ class WatsonXInferenceAdapter(Inference, ModelRegistryHelper):
    async def openai_chat_completion(
        self,
        model: str,
-        messages: List[OpenAIMessageParam],
-        frequency_penalty: Optional[float] = None,
-        function_call: Optional[Union[str, Dict[str, Any]]] = None,
-        functions: Optional[List[Dict[str, Any]]] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
-        logprobs: Optional[bool] = None,
-        max_completion_tokens: Optional[int] = None,
-        max_tokens: Optional[int] = None,
-        n: Optional[int] = None,
-        parallel_tool_calls: Optional[bool] = None,
-        presence_penalty: Optional[float] = None,
-        response_format: Optional[OpenAIResponseFormatParam] = None,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = None,
-        stream_options: Optional[Dict[str, Any]] = None,
-        temperature: Optional[float] = None,
-        tool_choice: Optional[Union[str, Dict[str, Any]]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        top_logprobs: Optional[int] = None,
-        top_p: Optional[float] = None,
-        user: Optional[str] = None,
-    ) -> Union[OpenAIChatCompletion, AsyncIterator[OpenAIChatCompletionChunk]]:
+        messages: list[OpenAIMessageParam],
+        frequency_penalty: float | None = None,
+        function_call: str | dict[str, Any] | None = None,
+        functions: list[dict[str, Any]] | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_completion_tokens: int | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        parallel_tool_calls: bool | None = None,
+        presence_penalty: float | None = None,
+        response_format: OpenAIResponseFormatParam | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+        tools: list[dict[str, Any]] | None = None,
+        top_logprobs: int | None = None,
+        top_p: float | None = None,
+        user: str | None = None,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
        model_obj = await self.model_store.get_model(model)
        params = await prepare_openai_completion_params(
            model=model_obj.provider_resource_id,
--- a/llama_stack/providers/remote/post_training/nvidia/config.py
+++ b/llama_stack/providers/remote/post_training/nvidia/config.py
@ -5,7 +5,7 @@
 # the root directory of this source tree.

 import os
-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel, Field

@ -15,23 +15,23 @@ from pydantic import BaseModel, Field
 class NvidiaPostTrainingConfig(BaseModel):
    """Configuration for NVIDIA Post Training implementation."""

-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
        default_factory=lambda: os.getenv("NVIDIA_API_KEY"),
        description="The NVIDIA API key.",
    )

-    dataset_namespace: Optional[str] = Field(
+    dataset_namespace: str | None = Field(
        default_factory=lambda: os.getenv("NVIDIA_DATASET_NAMESPACE", "default"),
        description="The NVIDIA dataset namespace.",
    )

-    project_id: Optional[str] = Field(
+    project_id: str | None = Field(
        default_factory=lambda: os.getenv("NVIDIA_PROJECT_ID", "test-example-model@v1"),
        description="The NVIDIA project ID.",
    )

    # ToDO: validate this, add default value
-    customizer_url: Optional[str] = Field(
+    customizer_url: str | None = Field(
        default_factory=lambda: os.getenv("NVIDIA_CUSTOMIZER_URL"),
        description="Base URL for the NeMo Customizer API",
    )
@ -53,7 +53,7 @@ class NvidiaPostTrainingConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
        return {
            "api_key": "${env.NVIDIA_API_KEY:}",
            "dataset_namespace": "${env.NVIDIA_DATASET_NAMESPACE:default}",
@ -71,27 +71,27 @@ class SFTLoRADefaultConfig(BaseModel):
    n_epochs: int = 50

    # NeMo customizer specific parameters
-    log_every_n_steps: Optional[int] = None
+    log_every_n_steps: int | None = None
    val_check_interval: float = 0.25
    sequence_packing_enabled: bool = False
    weight_decay: float = 0.01
    lr: float = 0.0001

    # SFT specific parameters
-    hidden_dropout: Optional[float] = None
-    attention_dropout: Optional[float] = None
-    ffn_dropout: Optional[float] = None
+    hidden_dropout: float | None = None
+    attention_dropout: float | None = None
+    ffn_dropout: float | None = None

    # LoRA default parameters
    lora_adapter_dim: int = 8
-    lora_adapter_dropout: Optional[float] = None
+    lora_adapter_dropout: float | None = None
    lora_alpha: int = 16

    # Data config
    batch_size: int = 8

    @classmethod
-    def sample_config(cls) -> Dict[str, Any]:
+    def sample_config(cls) -> dict[str, Any]:
        """Return a sample configuration for NVIDIA training."""
        return {
            "n_epochs": 50,
--- a/llama_stack/providers/remote/post_training/nvidia/models.py
+++ b/llama_stack/providers/remote/post_training/nvidia/models.py
@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import List

 from llama_stack.models.llama.sku_types import CoreModelId
 from llama_stack.providers.utils.inference.model_registry import (
@ -24,5 +23,5 @@ _MODEL_ENTRIES = [
 ]


-def get_model_entries() -> List[ProviderModelEntry]:
+def get_model_entries() -> list[ProviderModelEntry]:
    return _MODEL_ENTRIES
--- a/llama_stack/providers/remote/post_training/nvidia/post_training.py
+++ b/llama_stack/providers/remote/post_training/nvidia/post_training.py
@ -5,7 +5,7 @@
 # the root directory of this source tree.
 import warnings
 from datetime import datetime
-from typing import Any, Dict, List, Literal, Optional
+from typing import Any, Literal

 import aiohttp
 from pydantic import BaseModel, ConfigDict
@ -50,7 +50,7 @@ class NvidiaPostTrainingJob(PostTrainingJob):


 class ListNvidiaPostTrainingJobs(BaseModel):
-    data: List[NvidiaPostTrainingJob]
+    data: list[NvidiaPostTrainingJob]


 class NvidiaPostTrainingJobStatusResponse(PostTrainingJobStatusResponse):
@ -83,11 +83,11 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):
        self,
        method: str,
        path: str,
-        headers: Optional[Dict[str, Any]] = None,
-        params: Optional[Dict[str, Any]] = None,
-        json: Optional[Dict[str, Any]] = None,
+        headers: dict[str, Any] | None = None,
+        params: dict[str, Any] | None = None,
+        json: dict[str, Any] | None = None,
        **kwargs,
-    ) -> Dict[str, Any]:
+    ) -> dict[str, Any]:
        """Helper method to make HTTP requests to the Customizer API."""
        url = f"{self.customizer_url}{path}"
        request_headers = self.headers.copy()
@ -109,9 +109,9 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):

    async def get_training_jobs(
        self,
-        page: Optional[int] = 1,
-        page_size: Optional[int] = 10,
-        sort: Optional[Literal["created_at", "-created_at"]] = "created_at",
+        page: int | None = 1,
+        page_size: int | None = 10,
+        sort: Literal["created_at", "-created_at"] | None = "created_at",
    ) -> ListNvidiaPostTrainingJobs:
        """Get all customization jobs.
        Updated the base class return type from ListPostTrainingJobsResponse to ListNvidiaPostTrainingJobs.
@ -207,12 +207,12 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):
    async def supervised_fine_tune(
        self,
        job_uuid: str,
-        training_config: Dict[str, Any],
-        hyperparam_search_config: Dict[str, Any],
-        logger_config: Dict[str, Any],
+        training_config: dict[str, Any],
+        hyperparam_search_config: dict[str, Any],
+        logger_config: dict[str, Any],
        model: str,
-        checkpoint_dir: Optional[str],
-        algorithm_config: Optional[AlgorithmConfig] = None,
+        checkpoint_dir: str | None,
+        algorithm_config: AlgorithmConfig | None = None,
    ) -> NvidiaPostTrainingJob:
        """
        Fine-tunes a model on a dataset.
@ -423,8 +423,8 @@ class NvidiaPostTrainingAdapter(ModelRegistryHelper):
        finetuned_model: str,
        algorithm_config: DPOAlignmentConfig,
        training_config: TrainingConfig,
-        hyperparam_search_config: Dict[str, Any],
-        logger_config: Dict[str, Any],
+        hyperparam_search_config: dict[str, Any],
+        logger_config: dict[str, Any],
    ) -> PostTrainingJob:
        """Optimize a model based on preference data."""
        raise NotImplementedError("Preference optimization is not implemented yet")
--- a/llama_stack/providers/remote/post_training/nvidia/utils.py
+++ b/llama_stack/providers/remote/post_training/nvidia/utils.py
@ -6,7 +6,7 @@

 import logging
 import warnings
-from typing import Any, Dict, Set, Tuple
+from typing import Any

 from pydantic import BaseModel

@ -18,7 +18,7 @@ from .config import NvidiaPostTrainingConfig
 logger = logging.getLogger(__name__)


-def warn_unsupported_params(config_dict: Any, supported_keys: Set[str], config_name: str) -> None:
+def warn_unsupported_params(config_dict: Any, supported_keys: set[str], config_name: str) -> None:
    keys = set(config_dict.__annotations__.keys()) if isinstance(config_dict, BaseModel) else config_dict.keys()
    unsupported_params = [k for k in keys if k not in supported_keys]
    if unsupported_params:
@ -28,7 +28,7 @@ def warn_unsupported_params(config_dict: Any, supported_keys: Set[str], config_n


 def validate_training_params(
-    training_config: Dict[str, Any], supported_keys: Set[str], config_name: str = "TrainingConfig"
+    training_config: dict[str, Any], supported_keys: set[str], config_name: str = "TrainingConfig"
 ) -> None:
    """
    Validates training parameters against supported keys.
@ -57,7 +57,7 @@ def validate_training_params(


 # ToDo: implement post health checks for customizer are enabled
-async def _get_health(url: str) -> Tuple[bool, bool]: ...
+async def _get_health(url: str) -> tuple[bool, bool]: ...


 async def check_health(config: NvidiaPostTrainingConfig) -> None: ...
--- a/llama_stack/providers/remote/safety/bedrock/bedrock.py
+++ b/llama_stack/providers/remote/safety/bedrock/bedrock.py
@ -6,7 +6,7 @@

 import json
 import logging
-from typing import Any, Dict, List
+from typing import Any

 from llama_stack.apis.inference import Message
 from llama_stack.apis.safety import (
@ -53,7 +53,7 @@ class BedrockSafetyAdapter(Safety, ShieldsProtocolPrivate):
            )

    async def run_shield(
-        self, shield_id: str, messages: List[Message], params: Dict[str, Any] = None
+        self, shield_id: str, messages: list[Message], params: dict[str, Any] = None
    ) -> RunShieldResponse:
        shield = await self.shield_store.get_shield(shield_id)
        if not shield:
--- a/llama_stack/providers/remote/safety/nvidia/config.py
+++ b/llama_stack/providers/remote/safety/nvidia/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import os
-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel, Field

@ -27,10 +27,10 @@ class NVIDIASafetyConfig(BaseModel):
        default_factory=lambda: os.getenv("GUARDRAILS_SERVICE_URL", "http://0.0.0.0:7331"),
        description="The url for accessing the guardrails service",
    )
-    config_id: Optional[str] = Field(default="self-check", description="Config ID to use from the config store")
+    config_id: str | None = Field(default="self-check", description="Config ID to use from the config store")

    @classmethod
-    def sample_run_config(cls, **kwargs) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
        return {
            "guardrails_service_url": "${env.GUARDRAILS_SERVICE_URL:http://localhost:7331}",
            "config_id": "self-check",
--- a/llama_stack/providers/remote/safety/nvidia/nvidia.py
+++ b/llama_stack/providers/remote/safety/nvidia/nvidia.py
@ -5,7 +5,7 @@
 # the root directory of this source tree.

 import logging
-from typing import Any, List, Optional
+from typing import Any

 import requests

@ -41,7 +41,7 @@ class NVIDIASafetyAdapter(Safety, ShieldsProtocolPrivate):
            raise ValueError("Shield model not provided.")

    async def run_shield(
-        self, shield_id: str, messages: List[Message], params: Optional[dict[str, Any]] = None
+        self, shield_id: str, messages: list[Message], params: dict[str, Any] | None = None
    ) -> RunShieldResponse:
        """
        Run a safety shield check against the provided messages.
@ -112,7 +112,7 @@ class NeMoGuardrails:
        response.raise_for_status()
        return response.json()

-    async def run(self, messages: List[Message]) -> RunShieldResponse:
+    async def run(self, messages: list[Message]) -> RunShieldResponse:
        """
        Queries the /v1/guardrails/checks endpoint of the NeMo guardrails deployed API.

--- a/llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py
+++ b/llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py
@ -5,7 +5,7 @@
 # the root directory of this source tree.

 import json
-from typing import Any, Dict, Optional
+from typing import Any

 import httpx

@ -50,7 +50,7 @@ class BingSearchToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsRequestP
        return provider_data.bing_search_api_key

    async def list_runtime_tools(
-        self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
+        self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None
    ) -> ListToolDefsResponse:
        return ListToolDefsResponse(
            data=[
@ -68,7 +68,7 @@ class BingSearchToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsRequestP
            ]
        )

-    async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult:
+    async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> ToolInvocationResult:
        api_key = self._get_api_key()
        headers = {
            "Ocp-Apim-Subscription-Key": api_key,
--- a/llama_stack/providers/remote/tool_runtime/bing_search/config.py
+++ b/llama_stack/providers/remote/tool_runtime/bing_search/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel

@ -12,11 +12,11 @@ from pydantic import BaseModel
 class BingSearchToolConfig(BaseModel):
    """Configuration for Bing Search Tool Runtime"""

-    api_key: Optional[str] = None
+    api_key: str | None = None
    top_k: int = 3

    @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
        return {
            "api_key": "${env.BING_API_KEY:}",
        }
--- a/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py
+++ b/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, Optional
+from typing import Any

 import httpx

@ -49,7 +49,7 @@ class BraveSearchToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsRequest
        return provider_data.brave_search_api_key

    async def list_runtime_tools(
-        self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
+        self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None
    ) -> ListToolDefsResponse:
        return ListToolDefsResponse(
            data=[
@ -68,7 +68,7 @@ class BraveSearchToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsRequest
            ]
        )

-    async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult:
+    async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> ToolInvocationResult:
        api_key = self._get_api_key()
        url = "https://api.search.brave.com/res/v1/web/search"
        headers = {
--- a/llama_stack/providers/remote/tool_runtime/brave_search/config.py
+++ b/llama_stack/providers/remote/tool_runtime/brave_search/config.py
@ -4,13 +4,13 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel, Field


 class BraveSearchToolConfig(BaseModel):
-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
        default=None,
        description="The Brave Search API Key",
    )
@ -20,7 +20,7 @@ class BraveSearchToolConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, __distro_dir__: str) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str) -> dict[str, Any]:
        return {
            "api_key": "${env.BRAVE_SEARCH_API_KEY:}",
            "max_results": 3,
--- a/llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py
+++ b/llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py
@ -4,12 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict
+from typing import Any

 from pydantic import BaseModel


 class ModelContextProtocolConfig(BaseModel):
    @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
        return {}
--- a/llama_stack/providers/remote/tool_runtime/model_context_protocol/model_context_protocol.py
+++ b/llama_stack/providers/remote/tool_runtime/model_context_protocol/model_context_protocol.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, Optional
+from typing import Any
 from urllib.parse import urlparse

 from mcp import ClientSession
@ -31,7 +31,7 @@ class ModelContextProtocolToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime):
        pass

    async def list_runtime_tools(
-        self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
+        self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None
    ) -> ListToolDefsResponse:
        if mcp_endpoint is None:
            raise ValueError("mcp_endpoint is required")
@ -63,7 +63,7 @@ class ModelContextProtocolToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime):
                    )
        return ListToolDefsResponse(data=tools)

-    async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult:
+    async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> ToolInvocationResult:
        tool = await self.tool_store.get_tool(tool_name)
        if tool.metadata is None or tool.metadata.get("endpoint") is None:
            raise ValueError(f"Tool {tool_name} does not have metadata")
--- a/llama_stack/providers/remote/tool_runtime/tavily_search/config.py
+++ b/llama_stack/providers/remote/tool_runtime/tavily_search/config.py
@ -4,13 +4,13 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel, Field


 class TavilySearchToolConfig(BaseModel):
-    api_key: Optional[str] = Field(
+    api_key: str | None = Field(
        default=None,
        description="The Tavily Search API Key",
    )
@ -20,7 +20,7 @@ class TavilySearchToolConfig(BaseModel):
    )

    @classmethod
-    def sample_run_config(cls, __distro_dir__: str) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str) -> dict[str, Any]:
        return {
            "api_key": "${env.TAVILY_SEARCH_API_KEY:}",
            "max_results": 3,
--- a/llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py
+++ b/llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py
@ -5,7 +5,7 @@
 # the root directory of this source tree.

 import json
-from typing import Any, Dict, Optional
+from typing import Any

 import httpx

@ -49,7 +49,7 @@ class TavilySearchToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsReques
        return provider_data.tavily_search_api_key

    async def list_runtime_tools(
-        self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
+        self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None
    ) -> ListToolDefsResponse:
        return ListToolDefsResponse(
            data=[
@ -67,7 +67,7 @@ class TavilySearchToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsReques
            ]
        )

-    async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult:
+    async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> ToolInvocationResult:
        api_key = self._get_api_key()
        async with httpx.AsyncClient() as client:
            response = await client.post(
--- a/llama_stack/providers/remote/tool_runtime/wolfram_alpha/config.py
+++ b/llama_stack/providers/remote/tool_runtime/wolfram_alpha/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel

@ -12,10 +12,10 @@ from pydantic import BaseModel
 class WolframAlphaToolConfig(BaseModel):
    """Configuration for WolframAlpha Tool Runtime"""

-    api_key: Optional[str] = None
+    api_key: str | None = None

    @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
        return {
            "api_key": "${env.WOLFRAM_ALPHA_API_KEY:}",
        }
--- a/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py
+++ b/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py
@ -5,7 +5,7 @@
 # the root directory of this source tree.

 import json
-from typing import Any, Dict, Optional
+from typing import Any

 import httpx

@ -50,7 +50,7 @@ class WolframAlphaToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsReques
        return provider_data.wolfram_alpha_api_key

    async def list_runtime_tools(
-        self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
+        self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None
    ) -> ListToolDefsResponse:
        return ListToolDefsResponse(
            data=[
@ -68,7 +68,7 @@ class WolframAlphaToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime, NeedsReques
            ]
        )

-    async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult:
+    async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> ToolInvocationResult:
        api_key = self._get_api_key()
        params = {
            "input": kwargs["query"],
--- a/llama_stack/providers/remote/vector_io/chroma/init.py
+++ b/llama_stack/providers/remote/vector_io/chroma/init.py
@ -4,14 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Dict
-
 from llama_stack.providers.datatypes import Api, ProviderSpec

 from .config import ChromaVectorIOConfig


-async def get_adapter_impl(config: ChromaVectorIOConfig, deps: Dict[Api, ProviderSpec]):
+async def get_adapter_impl(config: ChromaVectorIOConfig, deps: dict[Api, ProviderSpec]):
    from .chroma import ChromaVectorIOAdapter

    impl = ChromaVectorIOAdapter(config, deps[Api.inference])
--- a/llama_stack/providers/remote/vector_io/chroma/chroma.py
+++ b/llama_stack/providers/remote/vector_io/chroma/chroma.py
@ -6,7 +6,7 @@
 import asyncio
 import json
 import logging
-from typing import Any, Dict, List, Optional, Union
+from typing import Any
 from urllib.parse import urlparse

 import chromadb
@ -27,7 +27,7 @@ from .config import ChromaVectorIOConfig as RemoteChromaVectorIOConfig
 log = logging.getLogger(__name__)


-ChromaClientType = Union[chromadb.AsyncHttpClient, chromadb.PersistentClient]
+ChromaClientType = chromadb.AsyncHttpClient | chromadb.PersistentClient


 # this is a helper to allow us to use async and non-async chroma clients interchangeably
@ -42,7 +42,7 @@ class ChromaIndex(EmbeddingIndex):
        self.client = client
        self.collection = collection

-    async def add_chunks(self, chunks: List[Chunk], embeddings: NDArray):
+    async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray):
        assert len(chunks) == len(embeddings), (
            f"Chunk length {len(chunks)} does not match embedding length {len(embeddings)}"
        )
@ -89,7 +89,7 @@ class ChromaIndex(EmbeddingIndex):
 class ChromaVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
    def __init__(
        self,
-        config: Union[RemoteChromaVectorIOConfig, InlineChromaVectorIOConfig],
+        config: RemoteChromaVectorIOConfig | InlineChromaVectorIOConfig,
        inference_api: Api.inference,
    ) -> None:
        log.info(f"Initializing ChromaVectorIOAdapter with url: {config}")
@ -137,8 +137,8 @@ class ChromaVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
    async def insert_chunks(
        self,
        vector_db_id: str,
-        chunks: List[Chunk],
-        ttl_seconds: Optional[int] = None,
+        chunks: list[Chunk],
+        ttl_seconds: int | None = None,
    ) -> None:
        index = await self._get_and_cache_vector_db_index(vector_db_id)

@ -148,7 +148,7 @@ class ChromaVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
        self,
        vector_db_id: str,
        query: InterleavedContent,
-        params: Optional[Dict[str, Any]] = None,
+        params: dict[str, Any] | None = None,
    ) -> QueryChunksResponse:
        index = await self._get_and_cache_vector_db_index(vector_db_id)

--- a/llama_stack/providers/remote/vector_io/chroma/config.py
+++ b/llama_stack/providers/remote/vector_io/chroma/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict
+from typing import Any

 from pydantic import BaseModel

@ -13,5 +13,5 @@ class ChromaVectorIOConfig(BaseModel):
    url: str

    @classmethod
-    def sample_run_config(cls, url: str = "${env.CHROMADB_URL}", **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, url: str = "${env.CHROMADB_URL}", **kwargs: Any) -> dict[str, Any]:
        return {"url": url}
--- a/llama_stack/providers/remote/vector_io/milvus/init.py
+++ b/llama_stack/providers/remote/vector_io/milvus/init.py
@ -4,14 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Dict
-
 from llama_stack.providers.datatypes import Api, ProviderSpec

 from .config import MilvusVectorIOConfig


-async def get_adapter_impl(config: MilvusVectorIOConfig, deps: Dict[Api, ProviderSpec]):
+async def get_adapter_impl(config: MilvusVectorIOConfig, deps: dict[Api, ProviderSpec]):
    from .milvus import MilvusVectorIOAdapter

    assert isinstance(config, MilvusVectorIOConfig), f"Unexpected config type: {type(config)}"
--- a/llama_stack/providers/remote/vector_io/milvus/config.py
+++ b/llama_stack/providers/remote/vector_io/milvus/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel

@ -14,9 +14,9 @@ from llama_stack.schema_utils import json_schema_type
@json_schema_type
 class MilvusVectorIOConfig(BaseModel):
    uri: str
-    token: Optional[str] = None
+    token: str | None = None
    consistency_level: str = "Strong"

    @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
        return {"uri": "${env.MILVUS_ENDPOINT}", "token": "${env.MILVUS_TOKEN}"}
--- a/llama_stack/providers/remote/vector_io/milvus/milvus.py
+++ b/llama_stack/providers/remote/vector_io/milvus/milvus.py
@ -9,7 +9,7 @@ import hashlib
 import logging
 import os
 import uuid
-from typing import Any, Dict, List, Optional, Union
+from typing import Any

 from numpy.typing import NDArray
 from pymilvus import MilvusClient
@ -39,7 +39,7 @@ class MilvusIndex(EmbeddingIndex):
        if await asyncio.to_thread(self.client.has_collection, self.collection_name):
            await asyncio.to_thread(self.client.drop_collection, collection_name=self.collection_name)

-    async def add_chunks(self, chunks: List[Chunk], embeddings: NDArray):
+    async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray):
        assert len(chunks) == len(embeddings), (
            f"Chunk length {len(chunks)} does not match embedding length {len(embeddings)}"
        )
@ -89,7 +89,7 @@ class MilvusIndex(EmbeddingIndex):

 class MilvusVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
    def __init__(
-        self, config: Union[RemoteMilvusVectorIOConfig, InlineMilvusVectorIOConfig], inference_api: Api.inference
+        self, config: RemoteMilvusVectorIOConfig | InlineMilvusVectorIOConfig, inference_api: Api.inference
    ) -> None:
        self.config = config
        self.cache = {}
@ -124,7 +124,7 @@ class MilvusVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):

        self.cache[vector_db.identifier] = index

-    async def _get_and_cache_vector_db_index(self, vector_db_id: str) -> Optional[VectorDBWithIndex]:
+    async def _get_and_cache_vector_db_index(self, vector_db_id: str) -> VectorDBWithIndex | None:
        if vector_db_id in self.cache:
            return self.cache[vector_db_id]

@ -148,8 +148,8 @@ class MilvusVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
    async def insert_chunks(
        self,
        vector_db_id: str,
-        chunks: List[Chunk],
-        ttl_seconds: Optional[int] = None,
+        chunks: list[Chunk],
+        ttl_seconds: int | None = None,
    ) -> None:
        index = await self._get_and_cache_vector_db_index(vector_db_id)
        if not index:
@ -161,7 +161,7 @@ class MilvusVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
        self,
        vector_db_id: str,
        query: InterleavedContent,
-        params: Optional[Dict[str, Any]] = None,
+        params: dict[str, Any] | None = None,
    ) -> QueryChunksResponse:
        index = await self._get_and_cache_vector_db_index(vector_db_id)
        if not index:
@ -172,7 +172,7 @@ class MilvusVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):

 def generate_chunk_id(document_id: str, chunk_text: str) -> str:
    """Generate a unique chunk ID using a hash of document ID and chunk text."""
-    hash_input = f"{document_id}:{chunk_text}".encode("utf-8")
+    hash_input = f"{document_id}:{chunk_text}".encode()
    return str(uuid.UUID(hashlib.md5(hash_input).hexdigest()))


--- a/llama_stack/providers/remote/vector_io/pgvector/init.py
+++ b/llama_stack/providers/remote/vector_io/pgvector/init.py
@ -4,14 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Dict
-
 from llama_stack.providers.datatypes import Api, ProviderSpec

 from .config import PGVectorVectorIOConfig


-async def get_adapter_impl(config: PGVectorVectorIOConfig, deps: Dict[Api, ProviderSpec]):
+async def get_adapter_impl(config: PGVectorVectorIOConfig, deps: dict[Api, ProviderSpec]):
    from .pgvector import PGVectorVectorIOAdapter

    impl = PGVectorVectorIOAdapter(config, deps[Api.inference])
--- a/llama_stack/providers/remote/vector_io/pgvector/config.py
+++ b/llama_stack/providers/remote/vector_io/pgvector/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict
+from typing import Any

 from pydantic import BaseModel, Field

@ -28,5 +28,5 @@ class PGVectorVectorIOConfig(BaseModel):
        user: str = "${env.PGVECTOR_USER}",
        password: str = "${env.PGVECTOR_PASSWORD}",
        **kwargs: Any,
-    ) -> Dict[str, Any]:
+    ) -> dict[str, Any]:
        return {"host": host, "port": port, "db": db, "user": user, "password": password}
--- a/llama_stack/providers/remote/vector_io/pgvector/pgvector.py
+++ b/llama_stack/providers/remote/vector_io/pgvector/pgvector.py
@ -5,7 +5,7 @@
 # the root directory of this source tree.

 import logging
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any

 import psycopg2
 from numpy.typing import NDArray
@ -33,7 +33,7 @@ def check_extension_version(cur):
    return result[0] if result else None


-def upsert_models(conn, keys_models: List[Tuple[str, BaseModel]]):
+def upsert_models(conn, keys_models: list[tuple[str, BaseModel]]):
    with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        query = sql.SQL(
            """
@ -74,7 +74,7 @@ class PGVectorIndex(EmbeddingIndex):
            """
            )

-    async def add_chunks(self, chunks: List[Chunk], embeddings: NDArray):
+    async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray):
        assert len(chunks) == len(embeddings), (
            f"Chunk length {len(chunks)} does not match embedding length {len(embeddings)}"
        )
@ -180,8 +180,8 @@ class PGVectorVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
    async def insert_chunks(
        self,
        vector_db_id: str,
-        chunks: List[Chunk],
-        ttl_seconds: Optional[int] = None,
+        chunks: list[Chunk],
+        ttl_seconds: int | None = None,
    ) -> None:
        index = await self._get_and_cache_vector_db_index(vector_db_id)
        await index.insert_chunks(chunks)
@ -190,7 +190,7 @@ class PGVectorVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
        self,
        vector_db_id: str,
        query: InterleavedContent,
-        params: Optional[Dict[str, Any]] = None,
+        params: dict[str, Any] | None = None,
    ) -> QueryChunksResponse:
        index = await self._get_and_cache_vector_db_index(vector_db_id)
        return await index.query_chunks(query, params)
--- a/llama_stack/providers/remote/vector_io/qdrant/init.py
+++ b/llama_stack/providers/remote/vector_io/qdrant/init.py
@ -4,14 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Dict
-
 from llama_stack.providers.datatypes import Api, ProviderSpec

 from .config import QdrantVectorIOConfig


-async def get_adapter_impl(config: QdrantVectorIOConfig, deps: Dict[Api, ProviderSpec]):
+async def get_adapter_impl(config: QdrantVectorIOConfig, deps: dict[Api, ProviderSpec]):
    from .qdrant import QdrantVectorIOAdapter

    impl = QdrantVectorIOAdapter(config, deps[Api.inference])
--- a/llama_stack/providers/remote/vector_io/qdrant/config.py
+++ b/llama_stack/providers/remote/vector_io/qdrant/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict, Optional
+from typing import Any

 from pydantic import BaseModel

@ -13,19 +13,19 @@ from llama_stack.schema_utils import json_schema_type

@json_schema_type
 class QdrantVectorIOConfig(BaseModel):
-    location: Optional[str] = None
-    url: Optional[str] = None
-    port: Optional[int] = 6333
+    location: str | None = None
+    url: str | None = None
+    port: int | None = 6333
    grpc_port: int = 6334
    prefer_grpc: bool = False
-    https: Optional[bool] = None
-    api_key: Optional[str] = None
-    prefix: Optional[str] = None
-    timeout: Optional[int] = None
-    host: Optional[str] = None
+    https: bool | None = None
+    api_key: str | None = None
+    prefix: str | None = None
+    timeout: int | None = None
+    host: str | None = None

    @classmethod
-    def sample_run_config(cls, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs: Any) -> dict[str, Any]:
        return {
            "api_key": "${env.QDRANT_API_KEY}",
        }
--- a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py
+++ b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py
@ -6,7 +6,7 @@

 import logging
 import uuid
-from typing import Any, Dict, List, Optional, Union
+from typing import Any

 from numpy.typing import NDArray
 from qdrant_client import AsyncQdrantClient, models
@ -44,7 +44,7 @@ class QdrantIndex(EmbeddingIndex):
        self.client = client
        self.collection_name = collection_name

-    async def add_chunks(self, chunks: List[Chunk], embeddings: NDArray):
+    async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray):
        assert len(chunks) == len(embeddings), (
            f"Chunk length {len(chunks)} does not match embedding length {len(embeddings)}"
        )
@ -101,7 +101,7 @@ class QdrantIndex(EmbeddingIndex):

 class QdrantVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
    def __init__(
-        self, config: Union[RemoteQdrantVectorIOConfig, InlineQdrantVectorIOConfig], inference_api: Api.inference
+        self, config: RemoteQdrantVectorIOConfig | InlineQdrantVectorIOConfig, inference_api: Api.inference
    ) -> None:
        self.config = config
        self.client: AsyncQdrantClient = None
@ -131,7 +131,7 @@ class QdrantVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
            await self.cache[vector_db_id].index.delete()
            del self.cache[vector_db_id]

-    async def _get_and_cache_vector_db_index(self, vector_db_id: str) -> Optional[VectorDBWithIndex]:
+    async def _get_and_cache_vector_db_index(self, vector_db_id: str) -> VectorDBWithIndex | None:
        if vector_db_id in self.cache:
            return self.cache[vector_db_id]

@ -150,8 +150,8 @@ class QdrantVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
    async def insert_chunks(
        self,
        vector_db_id: str,
-        chunks: List[Chunk],
-        ttl_seconds: Optional[int] = None,
+        chunks: list[Chunk],
+        ttl_seconds: int | None = None,
    ) -> None:
        index = await self._get_and_cache_vector_db_index(vector_db_id)
        if not index:
@ -163,7 +163,7 @@ class QdrantVectorIOAdapter(VectorIO, VectorDBsProtocolPrivate):
        self,
        vector_db_id: str,
        query: InterleavedContent,
-        params: Optional[Dict[str, Any]] = None,
+        params: dict[str, Any] | None = None,
    ) -> QueryChunksResponse:
        index = await self._get_and_cache_vector_db_index(vector_db_id)
        if not index:
--- a/llama_stack/providers/remote/vector_io/weaviate/init.py
+++ b/llama_stack/providers/remote/vector_io/weaviate/init.py
@ -4,14 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Dict
-
 from llama_stack.providers.datatypes import Api, ProviderSpec

-from .config import WeaviateRequestProviderData, WeaviateVectorIOConfig  # noqa: F401
+from .config import WeaviateVectorIOConfig


-async def get_adapter_impl(config: WeaviateVectorIOConfig, deps: Dict[Api, ProviderSpec]):
+async def get_adapter_impl(config: WeaviateVectorIOConfig, deps: dict[Api, ProviderSpec]):
    from .weaviate import WeaviateVectorIOAdapter

    impl = WeaviateVectorIOAdapter(config, deps[Api.inference])
--- a/llama_stack/providers/remote/vector_io/weaviate/config.py
+++ b/llama_stack/providers/remote/vector_io/weaviate/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Any, Dict
+from typing import Any

 from pydantic import BaseModel

@ -16,5 +16,5 @@ class WeaviateRequestProviderData(BaseModel):

 class WeaviateVectorIOConfig(BaseModel):
    @classmethod
-    def sample_run_config(cls, **kwargs: Any) -> Dict[str, Any]:
+    def sample_run_config(cls, **kwargs: Any) -> dict[str, Any]:
        return {}
--- a/llama_stack/providers/remote/vector_io/weaviate/weaviate.py
+++ b/llama_stack/providers/remote/vector_io/weaviate/weaviate.py
@ -5,7 +5,7 @@
 # the root directory of this source tree.
 import json
 import logging
-from typing import Any, Dict, List, Optional
+from typing import Any

 import weaviate
 import weaviate.classes as wvc
@ -33,7 +33,7 @@ class WeaviateIndex(EmbeddingIndex):
        self.client = client
        self.collection_name = collection_name

-    async def add_chunks(self, chunks: List[Chunk], embeddings: NDArray):
+    async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray):
        assert len(chunks) == len(embeddings), (
            f"Chunk length {len(chunks)} does not match embedding length {len(embeddings)}"
        )
@ -80,7 +80,7 @@ class WeaviateIndex(EmbeddingIndex):

        return QueryChunksResponse(chunks=chunks, scores=scores)

-    async def delete(self, chunk_ids: List[str]) -> None:
+    async def delete(self, chunk_ids: list[str]) -> None:
        collection = self.client.collections.get(self.collection_name)
        collection.data.delete_many(where=Filter.by_property("id").contains_any(chunk_ids))

@ -144,7 +144,7 @@ class WeaviateVectorIOAdapter(
            self.inference_api,
        )

-    async def _get_and_cache_vector_db_index(self, vector_db_id: str) -> Optional[VectorDBWithIndex]:
+    async def _get_and_cache_vector_db_index(self, vector_db_id: str) -> VectorDBWithIndex | None:
        if vector_db_id in self.cache:
            return self.cache[vector_db_id]

@ -167,8 +167,8 @@ class WeaviateVectorIOAdapter(
    async def insert_chunks(
        self,
        vector_db_id: str,
-        chunks: List[Chunk],
-        ttl_seconds: Optional[int] = None,
+        chunks: list[Chunk],
+        ttl_seconds: int | None = None,
    ) -> None:
        index = await self._get_and_cache_vector_db_index(vector_db_id)
        if not index:
@ -180,7 +180,7 @@ class WeaviateVectorIOAdapter(
        self,
        vector_db_id: str,
        query: InterleavedContent,
-        params: Optional[Dict[str, Any]] = None,
+        params: dict[str, Any] | None = None,
    ) -> QueryChunksResponse:
        index = await self._get_and_cache_vector_db_index(vector_db_id)
        if not index: