[torchtune integration] post training + eval (#670)

## What does this PR do? - Add related Apis in experimental-post-training template to enable eval on the finetuned checkpoint in the template - A small bug fix on meta reference eval - A small error handle improvement on post training ## Test Plan From client side issued an E2E post training request https://github.com/meta-llama/llama-stack-client-python/pull/70 and get eval results successfully <img width="1315" alt="Screenshot 2024-12-20 at 12 06 59 PM" src="https://github.com/user-attachments/assets/a09bd524-59ae-490c-908f-2e36ccf27c0a" />
2024-12-20 13:43:13 -08:00 · 2024-12-20 13:43:13 -08:00 · 06cb0c837e
commit 06cb0c837e
parent c8be0bf1c9
4 changed files with 52 additions and 3 deletions
--- a/llama_stack/providers/inline/eval/meta_reference/eval.py
+++ b/llama_stack/providers/inline/eval/meta_reference/eval.py
@ -15,7 +15,7 @@ from llama_stack.apis.agents import Agents
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
 from llama_stack.apis.eval_tasks import EvalTask
-from llama_stack.apis.inference import Inference
+from llama_stack.apis.inference import Inference, UserMessage
 from llama_stack.apis.scoring import Scoring
 from llama_stack.providers.datatypes import EvalTasksProtocolPrivate
 from llama_stack.providers.utils.kvstore import kvstore_impl
--- a/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py
+++ b/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py
@ -110,6 +110,10 @@ class LoraFinetuningSingleDevice:
            self.checkpoint_dir = config.checkpoint_dir
        else:
            model = resolve_model(self.model_id)
            if model is None:
                raise ValueError(
                    f"{self.model_id} not found. Your model id should be in the llama models SKU list"
                )
            self.checkpoint_dir = model_checkpoint_dir(model)
        self._output_dir = str(DEFAULT_CHECKPOINT_DIR)
--- a/llama_stack/templates/experimental-post-training/build.yaml
+++ b/llama_stack/templates/experimental-post-training/build.yaml
@ -4,10 +4,22 @@ distribution_spec:
  description: Experimental template for post training
  docker_image: null
  providers:
    inference:
    - inline::meta-reference
    eval:
    - inline::meta-reference
    scoring:
    - inline::basic
    post_training:
    - inline::torchtune
    datasetio:
    - remote::huggingface
    telemetry:
    - inline::meta-reference
    agents:
    - inline::meta-reference
    safety:
    - inline::llama-guard
    memory:
    - inline::faiss
 image_type: conda
--- a/llama_stack/templates/experimental-post-training/run.yaml
+++ b/llama_stack/templates/experimental-post-training/run.yaml
@ -3,9 +3,14 @@ image_name: experimental-post-training
 docker_image: null
 conda_env: experimental-post-training
 apis:
- inference
+- agents
 - telemetry
 - datasetio
 - eval
 - inference
 - memory
 - safety
 - scoring
 - telemetry
 - post_training
 providers:
  inference:
@ -14,6 +19,14 @@ providers:
    config:
      max_seq_len: 4096
      checkpoint_dir: null
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  datasetio:
  - provider_id: huggingface-0
    provider_type: remote::huggingface
@ -26,6 +39,26 @@ providers:
  - provider_id: torchtune-post-training
    provider_type: inline::torchtune
    config: {}
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/agents_store.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config: {}
  memory:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/faiss_store.db
 metadata_store:
  namespace: null