From 1e2faa461fd5843f83fc3db75cab5c10a7353194 Mon Sep 17 00:00:00 2001
From: Dinesh Yeduguru
Date: Mon, 2 Dec 2024 16:10:16 -0800
Subject: [PATCH 1/5] update client cli docs (#560)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Test plan:
make html
sphinx-autobuild source build/html

![Screenshot 2024-12-02 at 3 32 18 PM](https://github.com/user-attachments/assets/061d5ca6-178f-463a-854c-acb96ca3bb0d)
---
 .../llama_stack_client_cli_reference.md | 75 +++++++++++++++++--
 1 file changed, 68 insertions(+), 7 deletions(-)

diff --git a/docs/source/references/llama_stack_client_cli_reference.md b/docs/source/references/llama_stack_client_cli_reference.md
index d3835e488..b35aa189d 100644
--- a/docs/source/references/llama_stack_client_cli_reference.md
+++ b/docs/source/references/llama_stack_client_cli_reference.md
@@ -27,8 +27,6 @@ $ llama-stack-client configure
Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:5000
```

-## Provider Commands
-
### `llama-stack-client providers list`
```bash
$ llama-stack-client providers list
@@ -119,8 +117,25 @@ $ llama-stack-client memory_banks list
+--------------+----------------+--------+-------------------+------------------------+--------------------------+
```

-## Shield Management
+### `llama-stack-client memory_banks register`
+```bash
+$ llama-stack-client memory_banks register <memory-bank-id> --type <type> [--provider-id <provider-id>] [--provider-memory-bank-id <provider-memory-bank-id>] [--chunk-size <chunk-size>] [--embedding-model <embedding-model>] [--overlap-size <overlap-size>]
+```
+Options:
+- `--type`: Required. Type of memory bank. Choices: "vector", "keyvalue", "keyword", "graph"
+- `--provider-id`: Optional. Provider ID for the memory bank
+- `--provider-memory-bank-id`: Optional. Provider's memory bank ID
+- `--chunk-size`: Optional. Chunk size in tokens (for vector type). Default: 512
+- `--embedding-model`: Optional. Embedding model (for vector type). Default: "all-MiniLM-L6-v2"
+- `--overlap-size`: Optional. Overlap size in tokens (for vector type). Default: 64
+
+### `llama-stack-client memory_banks unregister`
+```bash
+$ llama-stack-client memory_banks unregister <memory-bank-id>
+```
+
+## Shield Management
### `llama-stack-client shields list`
```bash
$ llama-stack-client shields list
@@ -134,16 +149,51 @@
+--------------+----------+----------------+-------------+
| identifier   | params   | provider_id    | type        |
+--------------+----------+----------------+-------------+
| llama_guard  | {}       | meta-reference | llama_guard |
+--------------+----------+----------------+-------------+
```

-## Evaluation Tasks
+### `llama-stack-client shields register`
+```bash
+$ llama-stack-client shields register --shield-id <shield-id> [--provider-id <provider-id>] [--provider-shield-id <provider-shield-id>] [--params <params>]
+```
+
+Options:
+- `--shield-id`: Required. ID of the shield
+- `--provider-id`: Optional. Provider ID for the shield
+- `--provider-shield-id`: Optional. Provider's shield ID
+- `--params`: Optional. JSON configuration parameters for the shield
+
+## Eval Task Management
### `llama-stack-client eval_tasks list`
```bash
-$ llama-stack-client eval run_benchmark --num-examples 10 --output-dir ./ --eval-task-config ~/eval_task_config.json
+$ llama-stack-client eval_tasks list
```
-where `eval_task_config.json` is the path to the eval task config file in JSON format. An example eval_task_config
+### `llama-stack-client eval_tasks register`
+```bash
+$ llama-stack-client eval_tasks register --eval-task-id <eval-task-id> --dataset-id <dataset-id> --scoring-functions <function1> [<function2> ...] [--provider-id <provider-id>] [--provider-eval-task-id <provider-eval-task-id>] [--metadata <metadata>]
```
-
-$ cat ~/eval_task_config.json
+
+Options:
+- `--eval-task-id`: Required. ID of the eval task
+- `--dataset-id`: Required. ID of the dataset to evaluate
+- `--scoring-functions`: Required. One or more scoring functions to use for evaluation
+- `--provider-id`: Optional. Provider ID for the eval task
+- `--provider-eval-task-id`: Optional. Provider's eval task ID
+- `--metadata`: Optional. Metadata for the eval task in JSON format
+
+## Eval execution
+### `llama-stack-client eval run-benchmark`
+```bash
+$ llama-stack-client eval run-benchmark <eval-task-id1> [<eval-task-id2> ...] --eval-task-config <config-file> --output-dir <output-dir> [--num-examples <num>] [--visualize]
+```
+
+Options:
+- `--eval-task-config`: Required. Path to the eval task config file in JSON format
+- `--output-dir`: Required. Path to the directory where evaluation results will be saved
+- `--num-examples`: Optional. Number of examples to evaluate (useful for debugging)
+- `--visualize`: Optional flag. If set, visualizes evaluation results after completion
+
+Example eval_task_config.json:
+```json
{
  "type": "benchmark",
  "eval_candidate": {
    "type": "model",
    "model": "Llama3.1-405B-Instruct",
    "sampling_params": {
      "strategy": "greedy",
      "temperature": 0,
      "top_p": 0.95,
      "top_k": 0,
      "max_tokens": 0,
      "repetition_penalty": 1.0
    }
  }
}
```
+
+### `llama-stack-client eval run-scoring`
+```bash
+$ llama-stack-client eval run-scoring <eval-task-id> --eval-task-config <config-file> --output-dir <output-dir> [--num-examples <num>] [--visualize]
+```
+
+Options:
+- `--eval-task-config`: Required. Path to the eval task config file in JSON format
+- `--output-dir`: Required. Path to the directory where scoring results will be saved
+- `--num-examples`: Optional. Number of examples to evaluate (useful for debugging)
+- `--visualize`: Optional flag. If set, visualizes scoring results after completion

From 4c7b1a8fb3acb8f65dac9c2f066f86e31d6cd805 Mon Sep 17 00:00:00 2001
From: dltn <6599399+dltn@users.noreply.github.com>
Date: Mon, 2 Dec 2024 19:48:46 -0800
Subject: [PATCH 2/5] Bump version to 0.0.57

---
 requirements.txt | 4 ++--
 setup.py         | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 0ff43e246..8698495b1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,8 +2,8 @@ blobfile
fire
httpx
huggingface-hub
-llama-models>=0.0.56
-llama-stack-client>=0.0.56
+llama-models>=0.0.57
+llama-stack-client>=0.0.57
prompt-toolkit
python-dotenv
pydantic>=2
diff --git a/setup.py b/setup.py
index 842cbb30d..3d68021dd 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@ def read_requirements():
 setup(
     name="llama_stack",
-    version="0.0.56",
+    version="0.0.57",
     author="Meta Llama",
     author_email="llama-oss@meta.com",
     description="Llama Stack",

From 435f34b05e84f1747b28570234f25878cf0b31c4 Mon Sep 17 00:00:00 2001
From: Matthew Farrellee
Date: Tue, 3 Dec 2024 05:55:14 -0500
Subject: [PATCH 3/5] reduce the accuracy requirements to pass the chat completion structured output test (#522)

i find `test_structured_output` to be flakey. it's both a functionality and accuracy test -
```
answer = AnswerFormat.model_validate_json(response.completion_message.content)
assert answer.first_name == "Michael"
assert answer.last_name == "Jordan"
assert answer.year_of_birth == 1963
assert answer.num_seasons_in_nba == 15
```
it's an accuracy test because it checks the value of first/last name, birth year, and num seasons.

i find that -
- llama-3.1-8b-instruct and llama-3.2-3b-instruct pass the functionality portion
- llama-3.2-3b-instruct consistently fails the accuracy portion (thinking MJ was in the NBA for 14 seasons)
- llama-3.1-8b-instruct occasionally fails the accuracy portion

suggestions (not mutually exclusive) -
1. turn the test into functionality only, skip the value checks
2. split the test into a functionality version and an xfail accuracy version
3. add context to the prompt so the llm can answer without accessing embedded memory

# What does this PR do?

implements option (3) by adding context to the system prompt.

## Test Plan

`pytest -s -v ... llama_stack/providers/tests/inference/ ... -k structured_output`

## Before submitting

- [x] Ran pre-commit to handle lint / formatting issues.
- [x] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section?
- [x] Updated relevant documentation.
- [x] Wrote necessary unit or integration tests.
---
 .../providers/tests/inference/test_text_inference.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/llama_stack/providers/tests/inference/test_text_inference.py b/llama_stack/providers/tests/inference/test_text_inference.py
index f0f1d0eb2..9e5c67375 100644
--- a/llama_stack/providers/tests/inference/test_text_inference.py
+++ b/llama_stack/providers/tests/inference/test_text_inference.py
@@ -211,7 +211,15 @@ class TestInference:
         response = await inference_impl.chat_completion(
             model_id=inference_model,
             messages=[
-                SystemMessage(content="You are a helpful assistant."),
+                # we include context about Michael Jordan in the prompt so that the test is
+                # focused on the functionality of the model and not on the information embedded
+                # in the model. Llama 3.2 3B Instruct tends to think MJ played for 14 seasons.
+                SystemMessage(
+                    content=(
+                        "You are a helpful assistant.\n\n"
+                        "Michael Jordan was born in 1963. He played basketball for the Chicago Bulls for 15 seasons."
+                    )
+                ),
                 UserMessage(content="Please give me information about Michael Jordan."),
             ],
             stream=False,

From fd19a8a517fc22975b9b93faa5b997117a5cf2e8 Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Tue, 3 Dec 2024 18:50:18 -0800
Subject: [PATCH 4/5] add missing __init__

---
 llama_stack/providers/inline/scoring/__init__.py | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 llama_stack/providers/inline/scoring/__init__.py

diff --git a/llama_stack/providers/inline/scoring/__init__.py b/llama_stack/providers/inline/scoring/__init__.py
new file mode 100644
index 000000000..756f351d8
--- /dev/null
+++ b/llama_stack/providers/inline/scoring/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.

From 6e10d0b23eb662776586f30c476902791a1089d9 Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Tue, 3 Dec 2024 18:52:43 -0800
Subject: [PATCH 5/5] precommit

---
 llama_stack/providers/inline/scoring/braintrust/__init__.py   | 3 ++-
 llama_stack/providers/inline/scoring/braintrust/braintrust.py | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/llama_stack/providers/inline/scoring/braintrust/__init__.py b/llama_stack/providers/inline/scoring/braintrust/__init__.py
index dc4ea4951..2ddc58bd2 100644
--- a/llama_stack/providers/inline/scoring/braintrust/__init__.py
+++ b/llama_stack/providers/inline/scoring/braintrust/__init__.py
@@ -5,9 +5,10 @@
 # the root directory of this source tree.
 from typing import Dict
-from llama_stack.distribution.datatypes import Api, ProviderSpec
 from pydantic import BaseModel
+from llama_stack.distribution.datatypes import Api, ProviderSpec
+
 from .config import BraintrustScoringConfig

diff --git a/llama_stack/providers/inline/scoring/braintrust/braintrust.py b/llama_stack/providers/inline/scoring/braintrust/braintrust.py
index cf6e22a29..ee515d588 100644
--- a/llama_stack/providers/inline/scoring/braintrust/braintrust.py
+++ b/llama_stack/providers/inline/scoring/braintrust/braintrust.py
@@ -16,6 +16,7 @@ import os
 from autoevals.llm import Factuality
 from autoevals.ragas import AnswerCorrectness
+
 from llama_stack.distribution.request_headers import NeedsRequestProviderData
 from llama_stack.providers.datatypes import ScoringFunctionsProtocolPrivate