From 1e2faa461fd5843f83fc3db75cab5c10a7353194 Mon Sep 17 00:00:00 2001
From: Dinesh Yeduguru
Date: Mon, 2 Dec 2024 16:10:16 -0800
Subject: [PATCH 1/5] update client cli docs (#560)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Test plan:
make html
sphinx-autobuild source build/html

![Screenshot 2024-12-02 at 3 32 18 PM](https://github.com/user-attachments/assets/061d5ca6-178f-463a-854c-acb96ca3bb0d)
---
 .../llama_stack_client_cli_reference.md | 75 +++++++++++++++++--
 1 file changed, 68 insertions(+), 7 deletions(-)

diff --git a/docs/source/references/llama_stack_client_cli_reference.md b/docs/source/references/llama_stack_client_cli_reference.md
index d3835e488..b35aa189d 100644
--- a/docs/source/references/llama_stack_client_cli_reference.md
+++ b/docs/source/references/llama_stack_client_cli_reference.md
@@ -27,8 +27,6 @@ $ llama-stack-client configure
Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:5000
```

-## Provider Commands
-
### `llama-stack-client providers list`
```bash
$ llama-stack-client providers list
@@ -119,8 +117,25 @@ $ llama-stack-client memory_banks list
+--------------+----------------+--------+-------------------+------------------------+--------------------------+
```

-## Shield Management
+### `llama-stack-client memory_banks register`
+```bash
+$ llama-stack-client memory_banks register <memory-bank-id> --type <type> [--provider-id <provider-id>] [--provider-memory-bank-id <provider-memory-bank-id>] [--chunk-size <chunk-size>] [--embedding-model <embedding-model>] [--overlap-size <overlap-size>]
+```
+Options:
+- `--type`: Required. Type of memory bank. Choices: "vector", "keyvalue", "keyword", "graph"
+- `--provider-id`: Optional. Provider ID for the memory bank
+- `--provider-memory-bank-id`: Optional. Provider's memory bank ID
+- `--chunk-size`: Optional. Chunk size in tokens (for vector type). Default: 512
+- `--embedding-model`: Optional. Embedding model (for vector type). Default: "all-MiniLM-L6-v2"
+- `--overlap-size`: Optional. Overlap size in tokens (for vector type). Default: 64
+
+### `llama-stack-client memory_banks unregister`
+```bash
+$ llama-stack-client memory_banks unregister <memory-bank-id>
+```
+
+## Shield Management
### `llama-stack-client shields list`
```bash
$ llama-stack-client shields list
@@ -134,16 +149,51 @@
+--------------+----------+----------------+-------------+
| identifier   | params   | provider_id    | type        |
+--------------+----------+----------------+-------------+
| llama_guard  | {}       | meta-reference | llama_guard |
+--------------+----------+----------------+-------------+
```

-## Evaluation Tasks
+### `llama-stack-client shields register`
+```bash
+$ llama-stack-client shields register --shield-id <shield-id> [--provider-id <provider-id>] [--provider-shield-id <provider-shield-id>] [--params <params>]
+```
+
+Options:
+- `--shield-id`: Required. ID of the shield
+- `--provider-id`: Optional. Provider ID for the shield
+- `--provider-shield-id`: Optional. Provider's shield ID
+- `--params`: Optional. JSON configuration parameters for the shield
+
+## Eval Task Management
### `llama-stack-client eval_tasks list`
```bash
-$ llama-stack-client eval run_benchmark --num-examples 10 --output-dir ./ --eval-task-config ~/eval_task_config.json
+$ llama-stack-client eval_tasks list
```
-where `eval_task_config.json` is the path to the eval task config file in JSON format. An example eval_task_config
+### `llama-stack-client eval_tasks register`
+```bash
+$ llama-stack-client eval_tasks register --eval-task-id <eval-task-id> --dataset-id <dataset-id> --scoring-functions <function1> [<function2> ...] [--provider-id <provider-id>] [--provider-eval-task-id <provider-eval-task-id>] [--metadata <metadata>]
```
-
-$ cat ~/eval_task_config.json
+
+Options:
+- `--eval-task-id`: Required. ID of the eval task
+- `--dataset-id`: Required. ID of the dataset to evaluate
+- `--scoring-functions`: Required. One or more scoring functions to use for evaluation
+- `--provider-id`: Optional. Provider ID for the eval task
+- `--provider-eval-task-id`: Optional. Provider's eval task ID
+- `--metadata`: Optional. Metadata for the eval task in JSON format
+
+## Eval execution
+### `llama-stack-client eval run-benchmark`
+```bash
+$ llama-stack-client eval run-benchmark <eval-task-id1> [<eval-task-id2> ...] --eval-task-config <config-file> --output-dir <output-dir> [--num-examples <num>] [--visualize]
+```
+
+Options:
+- `--eval-task-config`: Required. Path to the eval task config file in JSON format
+- `--output-dir`: Required. Path to the directory where evaluation results will be saved
+- `--num-examples`: Optional. Number of examples to evaluate (useful for debugging)
+- `--visualize`: Optional flag. If set, visualizes evaluation results after completion
+
+Example eval_task_config.json:
+```json
{
  "type": "benchmark",
  "eval_candidate": {
    "type": "model",
    "model": "Llama3.1-405B-Instruct",
    "sampling_params": {
      "strategy": "greedy",
      "temperature": 0,
      "top_p": 0.95,
      "top_k": 0,
      "max_tokens": 0,
      "repetition_penalty": 1.0
    }
  }
}
```
+
+### `llama-stack-client eval run-scoring`
+```bash
+$ llama-stack-client eval run-scoring <eval-task-id> --eval-task-config <config-file> --output-dir <output-dir> [--num-examples <num>] [--visualize]
+```
+
+Options:
+- `--eval-task-config`: Required. Path to the eval task config file in JSON format
+- `--output-dir`: Required. Path to the directory where scoring results will be saved
+- `--num-examples`: Optional. Number of examples to evaluate (useful for debugging)
+- `--visualize`: Optional flag. If set, visualizes scoring results after completion

From 4c7b1a8fb3acb8f65dac9c2f066f86e31d6cd805 Mon Sep 17 00:00:00 2001
From: dltn <6599399+dltn@users.noreply.github.com>
Date: Mon, 2 Dec 2024 19:48:46 -0800
Subject: [PATCH 2/5] Bump version to 0.0.57

---
 requirements.txt | 4 ++--
 setup.py         | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 0ff43e246..8698495b1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,8 +2,8 @@ blobfile
fire
httpx
huggingface-hub
-llama-models>=0.0.56
-llama-stack-client>=0.0.56
+llama-models>=0.0.57
+llama-stack-client>=0.0.57
prompt-toolkit
python-dotenv
pydantic>=2
diff --git a/setup.py b/setup.py
index 842cbb30d..3d68021dd 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@ def read_requirements():
 setup(
     name="llama_stack",
-    version="0.0.56",
+    version="0.0.57",
     author="Meta Llama",
     author_email="llama-oss@meta.com",
     description="Llama Stack",

From 435f34b05e84f1747b28570234f25878cf0b31c4 Mon Sep 17 00:00:00 2001
From: Matthew Farrellee
Date: Tue, 3 Dec 2024 05:55:14 -0500
Subject: [PATCH 3/5] reduce the accuracy requirements to pass the chat completion structured output test (#522)

i find `test_structured_output` to be flakey. it's both a functionality and accuracy test -
```
answer = AnswerFormat.model_validate_json(response.completion_message.content)
assert answer.first_name == "Michael"
assert answer.last_name == "Jordan"
assert answer.year_of_birth == 1963
assert answer.num_seasons_in_nba == 15
```
it's an accuracy test because it checks the value of first/last name, birth year, and num seasons.

i find that -
- llama-3.1-8b-instruct and llama-3.2-3b-instruct pass the functionality portion
- llama-3.2-3b-instruct consistently fails the accuracy portion (thinking MJ was in the NBA for 14 seasons)
- llama-3.1-8b-instruct occasionally fails the accuracy portion

suggestions (not mutually exclusive) -
1. turn the test into functionality only, skip the value checks
2. split the test into a functionality version and an xfail accuracy version
3. add context to the prompt so the llm can answer without accessing embedded memory

# What does this PR do?

implements option (3) by adding context to the system prompt.

## Test Plan

`pytest -s -v ... llama_stack/providers/tests/inference/ ... -k structured_output`

## Before submitting

- [x] Ran pre-commit to handle lint / formatting issues.
- [x] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section?
- [x] Updated relevant documentation.
- [x] Wrote necessary unit or integration tests.
---
 .../providers/tests/inference/test_text_inference.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/llama_stack/providers/tests/inference/test_text_inference.py b/llama_stack/providers/tests/inference/test_text_inference.py
index f0f1d0eb2..9e5c67375 100644
--- a/llama_stack/providers/tests/inference/test_text_inference.py
+++ b/llama_stack/providers/tests/inference/test_text_inference.py
@@ -211,7 +211,15 @@ class TestInference:
         response = await inference_impl.chat_completion(
             model_id=inference_model,
             messages=[
-                SystemMessage(content="You are a helpful assistant."),
+                # we include context about Michael Jordan in the prompt so that the test is
+                # focused on the functionality of the model and not on the information embedded
+                # in the model. Llama 3.2 3B Instruct tends to think MJ played for 14 seasons.
+                SystemMessage(
+                    content=(
+                        "You are a helpful assistant.\n\n"
+                        "Michael Jordan was born in 1963. He played basketball for the Chicago Bulls for 15 seasons."
+                    )
+                ),
                 UserMessage(content="Please give me information about Michael Jordan."),
             ],
             stream=False,

From fd19a8a517fc22975b9b93faa5b997117a5cf2e8 Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Tue, 3 Dec 2024 18:50:18 -0800
Subject: [PATCH 4/5] add missing __init__

---
 llama_stack/providers/inline/scoring/__init__.py | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 llama_stack/providers/inline/scoring/__init__.py

diff --git a/llama_stack/providers/inline/scoring/__init__.py b/llama_stack/providers/inline/scoring/__init__.py
new file mode 100644
index 000000000..756f351d8
--- /dev/null
+++ b/llama_stack/providers/inline/scoring/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.

From 6e10d0b23eb662776586f30c476902791a1089d9 Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Tue, 3 Dec 2024 18:52:43 -0800
Subject: [PATCH 5/5] precommit

---
 llama_stack/providers/inline/scoring/braintrust/__init__.py   | 3 ++-
 llama_stack/providers/inline/scoring/braintrust/braintrust.py | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/llama_stack/providers/inline/scoring/braintrust/__init__.py b/llama_stack/providers/inline/scoring/braintrust/__init__.py
index dc4ea4951..2ddc58bd2 100644
--- a/llama_stack/providers/inline/scoring/braintrust/__init__.py
+++ b/llama_stack/providers/inline/scoring/braintrust/__init__.py
@@ -5,9 +5,10 @@
 # the root directory of this source tree.
 from typing import Dict
-from llama_stack.distribution.datatypes import Api, ProviderSpec
 from pydantic import BaseModel
+from llama_stack.distribution.datatypes import Api, ProviderSpec
+
 from .config import BraintrustScoringConfig

diff --git a/llama_stack/providers/inline/scoring/braintrust/braintrust.py b/llama_stack/providers/inline/scoring/braintrust/braintrust.py
index cf6e22a29..ee515d588 100644
--- a/llama_stack/providers/inline/scoring/braintrust/braintrust.py
+++ b/llama_stack/providers/inline/scoring/braintrust/braintrust.py
@@ -16,6 +16,7 @@ import os
 from autoevals.llm import Factuality
 from autoevals.ragas import AnswerCorrectness
+
 from llama_stack.distribution.request_headers import NeedsRequestProviderData
 from llama_stack.providers.datatypes import ScoringFunctionsProtocolPrivate