Updated with vLLM-based values

rh-pre-commit.version: 2.3.2 rh-pre-commit.check-secrets: ENABLED
2025-12-12 04:00:42 +00:00 · 2025-10-22 18:20:32 +08:00 · 2025-10-22 18:20:32 +08:00 · 17e74251e2
commit 17e74251e2
parent a701f68bd7
11 changed files with 551 additions and 102 deletions
--- a/llama_stack/providers/registry/inference.py
+++ b/llama_stack/providers/registry/inference.py
@@ -41,7 +41,7 @@ def available_providers() -> list[ProviderSpec]:
            provider_type="inline::sentence-transformers",
            # CrossEncoder depends on torchao.quantization
            pip_packages=[
-                "torch torchvision torchao>=0.12.0 --extra-index-url https://download.pytorch.org/whl/cpu",
+                "torch torchvision torchao>=0.12.0 --extra-index-url https://download.pytorch.org/whl/cu130",
                "sentence-transformers --no-deps",
                # required by some SentenceTransformers architectures for tensor rearrange/merge ops
                "einops",
--- a/llama_stack/providers/registry/post_training.py
+++ b/llama_stack/providers/registry/post_training.py
@@ -32,7 +32,7 @@ def available_providers() -> list[ProviderSpec]:
                "provider_type": "inline::torchtune-cpu",
                "pip_packages": (
                    cast(list[str], torchtune_def["pip_packages"])
-                    + ["torch torchtune>=0.5.0 torchao>=0.12.0 --extra-index-url https://download.pytorch.org/whl/cpu"]
+                    + ["torch torchtune>=0.5.0 torchao>=0.12.0 --extra-index-url https://download.pytorch.org/whl/cu130"]
                ),
            },
        ),
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -98,7 +98,7 @@ class VLLMInferenceAdapter(OpenAIMixin):
        params = params.model_copy()

        # Apply vLLM-specific defaults
-        if params.max_tokens is None and self.config.max_tokens:
+        if (params.max_tokens is None or params.max_tokens == 0) and self.config.max_tokens:
            params.max_tokens = self.config.max_tokens

        # This is to be consistent with OpenAI API and support vLLM <= v0.6.3