Merge branch 'main' into HuggingfacePostTrainingConfig-branch

commit 66f4af7fec
Sarthak Deshpande, 2025-08-28 14:53:07 +05:30, committed by GitHub
GPG key ID: B5690EEEBB952194 (no known key found for this signature in database)
31 changed files with 118 additions and 187 deletions


@@ -80,7 +80,7 @@ def get_provider_dependencies(
     normal_deps = []
     special_deps = []
     for package in deps:
-        if "--no-deps" in package or "--index-url" in package:
+        if any(f in package for f in ["--no-deps", "--index-url", "--extra-index-url"]):
            special_deps.append(package)
        else:
            normal_deps.append(package)
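
The widened check is what routes flag-carrying requirement lines through the special path, so each keeps its pip flags and can later be installed with its own pip invocation. A minimal standalone sketch of the split (the helper name and sample inputs are illustrative, not the repo's API):

# Split dependency strings that carry pip flags ("special" deps) from
# plain requirements, mirroring the hunk above.
SPECIAL_FLAGS = ["--no-deps", "--index-url", "--extra-index-url"]

def split_deps(deps: list[str]) -> tuple[list[str], list[str]]:
    normal: list[str] = []
    special: list[str] = []
    for package in deps:
        # A line like "torch ... --extra-index-url https://..." must stay
        # intact so the flag applies only to that install.
        if any(flag in package for flag in SPECIAL_FLAGS):
            special.append(package)
        else:
            normal.append(package)
    return normal, special

normal, special = split_deps([
    "numpy",
    "sentence-transformers --no-deps",
    "torch torchvision torchao>=0.12.0 --extra-index-url https://download.pytorch.org/whl/cpu",
])
assert normal == ["numpy"] and len(special) == 2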


@@ -34,7 +34,7 @@ distribution_spec:
     telemetry:
     - provider_type: inline::meta-reference
     post_training:
-    - provider_type: inline::huggingface-cpu
+    - provider_type: inline::torchtune-cpu
     eval:
     - provider_type: inline::meta-reference
     datasetio:


@@ -156,13 +156,10 @@ providers:
       sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/trace_store.db
       otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
   post_training:
-  - provider_id: huggingface-cpu
-    provider_type: inline::huggingface-cpu
+  - provider_id: torchtune-cpu
+    provider_type: inline::torchtune-cpu
     config:
-      checkpoint_format: huggingface
-      distributed_backend: null
-      device: cpu
-      dpo_output_dir: ~/.llama/distributions/ci-tests/dpo_output
+      checkpoint_format: meta
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference


@@ -1,7 +1,7 @@
 ---
 orphan: true
 ---
-# Meta Reference Distribution
+# Meta Reference GPU Distribution
 
 ```{toctree}
 :maxdepth: 2
@@ -29,7 +29,7 @@ The following environment variables can be configured:
 
 ## Prerequisite: Downloading Models
 
-Please use `llama model list --downloaded` to check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
+Please use `llama model list --downloaded` to check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](../../references/llama_cli_reference/download_models.md) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
 
 ```
 $ llama model list --downloaded


@@ -35,7 +35,7 @@ distribution_spec:
     telemetry:
     - provider_type: inline::meta-reference
     post_training:
-    - provider_type: inline::torchtune-gpu
+    - provider_type: inline::huggingface-gpu
     eval:
     - provider_type: inline::meta-reference
     datasetio:


@@ -156,10 +156,13 @@ providers:
       sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/trace_store.db
       otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
   post_training:
-  - provider_id: torchtune-gpu
-    provider_type: inline::torchtune-gpu
+  - provider_id: huggingface-gpu
+    provider_type: inline::huggingface-gpu
     config:
-      checkpoint_format: meta
+      checkpoint_format: huggingface
+      distributed_backend: null
+      device: cpu
+      dpo_output_dir: ~/.llama/distributions/starter-gpu/dpo_output
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference


@@ -17,6 +17,6 @@ def get_distribution_template() -> DistributionTemplate:
     template.description = "Quick start template for running Llama Stack with several popular providers. This distribution is intended for GPU-enabled environments."
 
     template.providers["post_training"] = [
-        BuildProvider(provider_type="inline::torchtune-gpu"),
+        BuildProvider(provider_type="inline::huggingface-gpu"),
     ]
     return template
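
The GPU starter is built by taking the base template and replacing just the post_training provider list. A toy, self-contained version of that override pattern (the dataclasses stand in for the real BuildProvider and DistributionTemplate types, and base_template() stands in for the base starter template; none of these definitions are the repo's actual code):

from dataclasses import dataclass, field

@dataclass
class BuildProvider:
    provider_type: str

@dataclass
class DistributionTemplate:
    description: str = ""
    providers: dict[str, list[BuildProvider]] = field(default_factory=dict)

def base_template() -> DistributionTemplate:
    # Stand-in for the CPU starter template this file builds on.
    return DistributionTemplate(
        description="Quick start template",
        providers={"post_training": [BuildProvider("inline::torchtune-cpu")]},
    )

def get_distribution_template() -> DistributionTemplate:
    template = base_template()
    template.description = "GPU-enabled variant of the starter distribution."
    template.providers["post_training"] = [BuildProvider(provider_type="inline::huggingface-gpu")]
    return template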


@@ -35,7 +35,7 @@ distribution_spec:
     telemetry:
     - provider_type: inline::meta-reference
     post_training:
-    - provider_type: inline::huggingface-cpu
+    - provider_type: inline::torchtune-cpu
     eval:
     - provider_type: inline::meta-reference
     datasetio:


@@ -156,13 +156,10 @@ providers:
       sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/trace_store.db
       otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
   post_training:
-  - provider_id: huggingface-cpu
-    provider_type: inline::huggingface-cpu
+  - provider_id: torchtune-cpu
+    provider_type: inline::torchtune-cpu
     config:
-      checkpoint_format: huggingface
-      distributed_backend: null
-      device: cpu
-      dpo_output_dir: ~/.llama/distributions/starter/dpo_output
+      checkpoint_format: meta
   eval:
   - provider_id: meta-reference
     provider_type: inline::meta-reference


@@ -120,7 +120,7 @@ def get_distribution_template() -> DistributionTemplate:
         ],
         "agents": [BuildProvider(provider_type="inline::meta-reference")],
         "telemetry": [BuildProvider(provider_type="inline::meta-reference")],
-        "post_training": [BuildProvider(provider_type="inline::huggingface-cpu")],
+        "post_training": [BuildProvider(provider_type="inline::torchtune-cpu")],
         "eval": [BuildProvider(provider_type="inline::meta-reference")],
         "datasetio": [
             BuildProvider(provider_type="remote::huggingface"),


@@ -40,8 +40,9 @@ def available_providers() -> list[ProviderSpec]:
         InlineProviderSpec(
             api=Api.inference,
             provider_type="inline::sentence-transformers",
+            # CrossEncoder depends on torchao.quantization
             pip_packages=[
-                "torch torchvision --index-url https://download.pytorch.org/whl/cpu",
+                "torch torchvision torchao>=0.12.0 --extra-index-url https://download.pytorch.org/whl/cpu",
                 "sentence-transformers --no-deps",
             ],
             module="llama_stack.providers.inline.inference.sentence_transformers",


@@ -13,7 +13,7 @@ from llama_stack.providers.datatypes import AdapterSpec, Api, InlineProviderSpec
 # The CPU version is used for distributions that don't have GPU support -- they result in smaller container images.
 torchtune_def = dict(
     api=Api.post_training,
-    pip_packages=["torchtune==0.5.0", "torchao==0.8.0", "numpy"],
+    pip_packages=["numpy"],
     module="llama_stack.providers.inline.post_training.torchtune",
     config_class="llama_stack.providers.inline.post_training.torchtune.TorchtunePostTrainingConfig",
     api_dependencies=[
@@ -23,56 +23,39 @@ torchtune_def = dict(
     description="TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework.",
 )
 
 
-huggingface_def = dict(
-    api=Api.post_training,
-    pip_packages=["trl", "transformers", "peft", "datasets"],
-    module="llama_stack.providers.inline.post_training.huggingface",
-    config_class="llama_stack.providers.inline.post_training.huggingface.HuggingFacePostTrainingConfig",
-    api_dependencies=[
-        Api.datasetio,
-        Api.datasets,
-    ],
-    description="HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem.",
-)
-
 def available_providers() -> list[ProviderSpec]:
     return [
         InlineProviderSpec(
-            **{
+            **{  # type: ignore
                 **torchtune_def,
                 "provider_type": "inline::torchtune-cpu",
                 "pip_packages": (
                     cast(list[str], torchtune_def["pip_packages"])
-                    + ["torch torchtune==0.5.0 torchao==0.8.0 --index-url https://download.pytorch.org/whl/cpu"]
+                    + ["torch torchtune>=0.5.0 torchao>=0.12.0 --extra-index-url https://download.pytorch.org/whl/cpu"]
                 ),
             },
         ),
-        InlineProviderSpec(
-            **{
-                **huggingface_def,
-                "provider_type": "inline::huggingface-cpu",
-                "pip_packages": (
-                    cast(list[str], huggingface_def["pip_packages"])
-                    + ["torch --index-url https://download.pytorch.org/whl/cpu"]
-                ),
-            },
-        ),
         InlineProviderSpec(
-            **{
+            **{  # type: ignore
                 **torchtune_def,
                 "provider_type": "inline::torchtune-gpu",
                 "pip_packages": (
-                    cast(list[str], torchtune_def["pip_packages"]) + ["torch torchtune==0.5.0 torchao==0.8.0"]
+                    cast(list[str], torchtune_def["pip_packages"]) + ["torch torchtune>=0.5.0 torchao>=0.12.0"]
                ),
            },
        ),
        InlineProviderSpec(
-            **{
-                **huggingface_def,
-                "provider_type": "inline::huggingface-gpu",
-                "pip_packages": (cast(list[str], huggingface_def["pip_packages"]) + ["torch"]),
-            },
+            api=Api.post_training,
+            provider_type="inline::huggingface-gpu",
+            pip_packages=["trl", "transformers", "peft", "datasets", "torch"],
+            module="llama_stack.providers.inline.post_training.huggingface",
+            config_class="llama_stack.providers.inline.post_training.huggingface.HuggingFacePostTrainingConfig",
+            api_dependencies=[
+                Api.datasetio,
+                Api.datasets,
+            ],
+            description="HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem.",
        ),
        remote_provider_spec(
            api=Api.post_training,
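
The surviving torchtune specs lean on Python's dict-unpacking semantics: in `**{**torchtune_def, "provider_type": ...}`, later keys override earlier ones, so each variant shares the base definition and swaps in only its provider_type and pip_packages (the added `# type: ignore` quiets mypy, which cannot type a heterogeneous dict unpacked into keyword arguments). A toy reduction of the pattern:

# Later keys win when a dict is spread, which is how the CPU and GPU
# variants specialize a shared base definition.
base = {
    "pip_packages": ["numpy"],
    "description": "TorchTune-based post-training provider",
}

cpu = {
    **base,
    "provider_type": "inline::torchtune-cpu",
    "pip_packages": base["pip_packages"]
    + ["torch torchtune>=0.5.0 torchao>=0.12.0 --extra-index-url https://download.pytorch.org/whl/cpu"],
}
gpu = {
    **base,
    "provider_type": "inline::torchtune-gpu",
    "pip_packages": base["pip_packages"] + ["torch torchtune>=0.5.0 torchao>=0.12.0"],
}

assert cpu["description"] == gpu["description"]  # shared base survives
assert cpu["pip_packages"] != gpu["pip_packages"]  # override wins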


@@ -9,7 +9,6 @@ from __future__ import annotations  # for forward references
 import hashlib
 import json
 import os
-import sqlite3
 from collections.abc import Generator
 from contextlib import contextmanager
 from enum import StrEnum
@@ -125,28 +124,13 @@ class ResponseStorage:
     def __init__(self, test_dir: Path):
         self.test_dir = test_dir
         self.responses_dir = self.test_dir / "responses"
-        self.db_path = self.test_dir / "index.sqlite"
 
         self._ensure_directories()
-        self._init_database()
 
     def _ensure_directories(self):
         self.test_dir.mkdir(parents=True, exist_ok=True)
         self.responses_dir.mkdir(exist_ok=True)
 
-    def _init_database(self):
-        with sqlite3.connect(self.db_path) as conn:
-            conn.execute("""
-                CREATE TABLE IF NOT EXISTS recordings (
-                    request_hash TEXT PRIMARY KEY,
-                    response_file TEXT,
-                    endpoint TEXT,
-                    model TEXT,
-                    timestamp TEXT,
-                    is_streaming BOOLEAN
-                )
-            """)
-
     def store_recording(self, request_hash: str, request: dict[str, Any], response: dict[str, Any]):
         """Store a request/response pair."""
         # Generate unique response filename
@@ -169,34 +153,9 @@ class ResponseStorage:
             f.write("\n")
             f.flush()
 
-        # Update SQLite index
-        with sqlite3.connect(self.db_path) as conn:
-            conn.execute(
-                """
-                INSERT OR REPLACE INTO recordings
-                (request_hash, response_file, endpoint, model, timestamp, is_streaming)
-                VALUES (?, ?, ?, ?, datetime('now'), ?)
-                """,
-                (
-                    request_hash,
-                    response_file,
-                    request.get("endpoint", ""),
-                    request.get("model", ""),
-                    response.get("is_streaming", False),
-                ),
-            )
-
     def find_recording(self, request_hash: str) -> dict[str, Any] | None:
         """Find a recorded response by request hash."""
-        with sqlite3.connect(self.db_path) as conn:
-            result = conn.execute(
-                "SELECT response_file FROM recordings WHERE request_hash = ?", (request_hash,)
-            ).fetchone()
-            if not result:
-                return None
-
-            response_file = result[0]
-
+        response_file = f"{request_hash[:12]}.json"
         response_path = self.responses_dir / response_file
 
         if not response_path.exists():
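
With the SQLite index gone, the response filename is a pure function of the request hash, so lookup reduces to a path check instead of a SELECT. A self-contained sketch of the content-addressed scheme (only the f"{request_hash[:12]}.json" naming comes from the diff; the exact request-hashing shown is an assumption):

import hashlib
import json
from pathlib import Path

def request_hash(request: dict) -> str:
    # Assumed canonicalization: hash the sorted-key JSON of the request.
    canonical = json.dumps(request, sort_keys=True)
    return hashlib.sha256(canonical.encode()).hexdigest()

def find_recording(responses_dir: Path, req_hash: str) -> dict | None:
    # The filename is derived from the hash, as in the new code above.
    response_path = responses_dir / f"{req_hash[:12]}.json"
    if not response_path.exists():
        return None
    return json.loads(response_path.read_text())

The trade-off: metadata queries (by endpoint or model) lose their index, but the recording store no longer needs schema management or a second source of truth.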


@@ -18,7 +18,7 @@
         "class-variance-authority": "^0.7.1",
         "clsx": "^2.1.1",
         "framer-motion": "^11.18.2",
-        "llama-stack-client": "^0.2.18",
+        "llama-stack-client": "^0.2.19",
         "lucide-react": "^0.510.0",
         "next": "15.3.3",
         "next-auth": "^4.24.11",
@@ -36,7 +36,7 @@
         "@eslint/eslintrc": "^3",
         "@tailwindcss/postcss": "^4",
         "@testing-library/dom": "^10.4.1",
-        "@testing-library/jest-dom": "^6.6.3",
+        "@testing-library/jest-dom": "^6.8.0",
         "@testing-library/react": "^16.3.0",
         "@types/jest": "^29.5.14",
         "@types/node": "^20",
@@ -3597,18 +3597,17 @@
       }
     },
     "node_modules/@testing-library/jest-dom": {
-      "version": "6.6.3",
-      "resolved": "https://registry.npmjs.org/@testing-library/jest-dom/-/jest-dom-6.6.3.tgz",
-      "integrity": "sha512-IteBhl4XqYNkM54f4ejhLRJiZNqcSCoXUOG2CPK7qbD322KjQozM4kHQOfkG2oln9b9HTYqs+Sae8vBATubxxA==",
+      "version": "6.8.0",
+      "resolved": "https://registry.npmjs.org/@testing-library/jest-dom/-/jest-dom-6.8.0.tgz",
+      "integrity": "sha512-WgXcWzVM6idy5JaftTVC8Vs83NKRmGJz4Hqs4oyOuO2J4r/y79vvKZsb+CaGyCSEbUPI6OsewfPd0G1A0/TUZQ==",
       "dev": true,
       "license": "MIT",
       "dependencies": {
         "@adobe/css-tools": "^4.4.0",
         "aria-query": "^5.0.0",
-        "chalk": "^3.0.0",
         "css.escape": "^1.5.1",
         "dom-accessibility-api": "^0.6.3",
-        "lodash": "^4.17.21",
+        "picocolors": "^1.1.1",
         "redent": "^3.0.0"
       },
       "engines": {
@@ -3617,20 +3616,6 @@
         "yarn": ">=1"
       }
     },
-    "node_modules/@testing-library/jest-dom/node_modules/chalk": {
-      "version": "3.0.0",
-      "resolved": "https://registry.npmjs.org/chalk/-/chalk-3.0.0.tgz",
-      "integrity": "sha512-4D3B6Wf41KOYRFdszmDqMCGq5VV/uMAB273JILmO+3jAlh8X4qDtdtgCR3fxtbLEMzSx22QdhnDcJvu2u1fVwg==",
-      "dev": true,
-      "license": "MIT",
-      "dependencies": {
-        "ansi-styles": "^4.1.0",
-        "supports-color": "^7.1.0"
-      },
-      "engines": {
-        "node": ">=8"
-      }
-    },
     "node_modules/@testing-library/jest-dom/node_modules/dom-accessibility-api": {
       "version": "0.6.3",
       "resolved": "https://registry.npmjs.org/dom-accessibility-api/-/dom-accessibility-api-0.6.3.tgz",
@@ -10021,9 +10006,9 @@
       "license": "MIT"
     },
     "node_modules/llama-stack-client": {
-      "version": "0.2.18",
-      "resolved": "https://registry.npmjs.org/llama-stack-client/-/llama-stack-client-0.2.18.tgz",
-      "integrity": "sha512-k+xQOz/TIU0cINP4Aih8q6xs7f/6qs0fLDMXTTKQr5C0F1jtCjRiwsas7bTsDfpKfYhg/7Xy/wPw/uZgi6aIVg==",
+      "version": "0.2.19",
+      "resolved": "https://registry.npmjs.org/llama-stack-client/-/llama-stack-client-0.2.19.tgz",
+      "integrity": "sha512-sDuAhUdEGlERZ3jlMUzPXcQTgMv/pGbDrPX0ifbE5S+gr7Q+7ohuQYrIXe+hXgIipFjq+y4b2c5laZ76tmAyEA==",
       "license": "MIT",
       "dependencies": {
         "@types/node": "^18.11.18",
@@ -10066,13 +10051,6 @@
         "url": "https://github.com/sponsors/sindresorhus"
       }
     },
-    "node_modules/lodash": {
-      "version": "4.17.21",
-      "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz",
-      "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==",
-      "dev": true,
-      "license": "MIT"
-    },
     "node_modules/lodash.merge": {
       "version": "4.6.2",
       "resolved": "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz",


@@ -23,7 +23,7 @@
     "class-variance-authority": "^0.7.1",
     "clsx": "^2.1.1",
     "framer-motion": "^11.18.2",
-    "llama-stack-client": "^0.2.18",
+    "llama-stack-client": "^0.2.19",
     "lucide-react": "^0.510.0",
     "next": "15.3.3",
     "next-auth": "^4.24.11",
@@ -41,7 +41,7 @@
     "@eslint/eslintrc": "^3",
     "@tailwindcss/postcss": "^4",
     "@testing-library/dom": "^10.4.1",
-    "@testing-library/jest-dom": "^6.6.3",
+    "@testing-library/jest-dom": "^6.8.0",
     "@testing-library/react": "^16.3.0",
     "@types/jest": "^29.5.14",
     "@types/node": "^20",