diff --git a/docs/source/references/llama_stack_client_cli_reference.md b/docs/source/references/llama_stack_client_cli_reference.md
index d3835e488..b35aa189d 100644
--- a/docs/source/references/llama_stack_client_cli_reference.md
+++ b/docs/source/references/llama_stack_client_cli_reference.md
@@ -27,8 +27,6 @@ $ llama-stack-client configure
 Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:5000
 ```
 
-## Provider Commands
-
 ### `llama-stack-client providers list`
 ```bash
 $ llama-stack-client providers list
@@ -119,8 +117,25 @@ $ llama-stack-client memory_banks list
 +--------------+----------------+--------+-------------------+------------------------+--------------------------+
 ```
 
-## Shield Management
+### `llama-stack-client memory_banks register`
+```bash
+$ llama-stack-client memory_banks register <memory-bank-id> --type <type> [--provider-id <provider-id>] [--provider-memory-bank-id <provider-memory-bank-id>] [--chunk-size <chunk-size>] [--embedding-model <embedding-model>] [--overlap-size <overlap-size>]
+```
+Options:
+- `--type`: Required. Type of memory bank. Choices: "vector", "keyvalue", "keyword", "graph"
+- `--provider-id`: Optional. Provider ID for the memory bank
+- `--provider-memory-bank-id`: Optional. Provider's memory bank ID
+- `--chunk-size`: Optional. Chunk size in tokens (for vector type). Default: 512
+- `--embedding-model`: Optional. Embedding model (for vector type). Default: "all-MiniLM-L6-v2"
+- `--overlap-size`: Optional. Overlap size in tokens (for vector type). Default: 64
+
+### `llama-stack-client memory_banks unregister`
+```bash
+$ llama-stack-client memory_banks unregister <memory-bank-id>
+```
+
+## Shield Management
 ### `llama-stack-client shields list`
 ```bash
 $ llama-stack-client shields list
@@ -134,16 +149,51 @@ $ llama-stack-client shields list
 +--------------+----------+----------------+-------------+
 ```
 
-## Evaluation Tasks
+### `llama-stack-client shields register`
+```bash
+$ llama-stack-client shields register --shield-id <shield-id> [--provider-id <provider-id>] [--provider-shield-id <provider-shield-id>] [--params <params>]
+```
+
+Options:
+- `--shield-id`: Required. ID of the shield
+- `--provider-id`: Optional. Provider ID for the shield
+- `--provider-shield-id`: Optional. Provider's shield ID
+- `--params`: Optional. JSON configuration parameters for the shield
+
+## Eval Task Management
 ### `llama-stack-client eval_tasks list`
 ```bash
-$ llama-stack-client eval run_benchmark --num-examples 10 --output-dir ./ --eval-task-config ~/eval_task_config.json
+$ llama-stack-client eval_tasks list
 ```
 
-where `eval_task_config.json` is the path to the eval task config file in JSON format. An example eval_task_config
+### `llama-stack-client eval_tasks register`
+```bash
+$ llama-stack-client eval_tasks register --eval-task-id <eval-task-id> --dataset-id <dataset-id> --scoring-functions <function1> [<function2> ...] [--provider-id <provider-id>] [--provider-eval-task-id <provider-eval-task-id>] [--metadata <metadata>]
 ```
-$ cat ~/eval_task_config.json
+
+Options:
+- `--eval-task-id`: Required. ID of the eval task
+- `--dataset-id`: Required. ID of the dataset to evaluate
+- `--scoring-functions`: Required. One or more scoring functions to use for evaluation
+- `--provider-id`: Optional. Provider ID for the eval task
+- `--provider-eval-task-id`: Optional. Provider's eval task ID
+- `--metadata`: Optional. Metadata for the eval task in JSON format
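+
+For example, a registration might look like this (the task, dataset, and scoring-function IDs below are illustrative placeholders; use IDs that exist on your server):
+```bash
+# Hypothetical registration; all IDs are placeholders
+$ llama-stack-client eval_tasks register \
+   --eval-task-id my-eval-task \
+   --dataset-id my-dataset \
+   --scoring-functions basic::equality
+```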
+
+## Eval execution
+### `llama-stack-client eval run-benchmark`
+```bash
+$ llama-stack-client eval run-benchmark <eval-task-id1> [<eval-task-id2> ...] --eval-task-config <config-file> --output-dir <output-dir> [--num-examples <num>] [--visualize]
+```
+
+Options:
+- `--eval-task-config`: Required. Path to the eval task config file in JSON format
+- `--output-dir`: Required. Path to the directory where evaluation results will be saved
+- `--num-examples`: Optional. Number of examples to evaluate (useful for debugging)
+- `--visualize`: Optional flag. If set, visualizes evaluation results after completion
+
+Example eval_task_config.json:
+```json
 {
     "type": "benchmark",
     "eval_candidate": {
@@ -160,3 +210,14 @@ $ cat ~/eval_task_config.json
     }
 }
 ```
+
+### `llama-stack-client eval run-scoring`
+```bash
+$ llama-stack-client eval run-scoring <eval-task-id> --eval-task-config <config-file> --output-dir <output-dir> [--num-examples <num>] [--visualize]
+```
+
+Options:
+- `--eval-task-config`: Required. Path to the eval task config file in JSON format
+- `--output-dir`: Required. Path to the directory where scoring results will be saved
+- `--num-examples`: Optional. Number of examples to evaluate (useful for debugging)
+- `--visualize`: Optional flag. If set, visualizes scoring results after completion
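+
+For example, a scoring run might look like this (the task ID and paths are illustrative placeholders):
+```bash
+# Hypothetical scoring run; the task ID and paths are placeholders
+$ llama-stack-client eval run-scoring my-eval-task \
+   --eval-task-config ~/eval_task_config.json \
+   --output-dir ./scoring-results \
+   --visualize
+```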
diff --git a/llama_stack/providers/inline/scoring/__init__.py b/llama_stack/providers/inline/scoring/__init__.py
new file mode 100644
index 000000000..756f351d8
--- /dev/null
+++ b/llama_stack/providers/inline/scoring/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
diff --git a/llama_stack/providers/inline/scoring/braintrust/__init__.py b/llama_stack/providers/inline/scoring/braintrust/__init__.py
index dc4ea4951..2ddc58bd2 100644
--- a/llama_stack/providers/inline/scoring/braintrust/__init__.py
+++ b/llama_stack/providers/inline/scoring/braintrust/__init__.py
@@ -5,9 +5,10 @@
 # the root directory of this source tree.
 from typing import Dict
 
-from llama_stack.distribution.datatypes import Api, ProviderSpec
 from pydantic import BaseModel
 
+from llama_stack.distribution.datatypes import Api, ProviderSpec
+
 from .config import BraintrustScoringConfig
diff --git a/llama_stack/providers/inline/scoring/braintrust/braintrust.py b/llama_stack/providers/inline/scoring/braintrust/braintrust.py
index cf6e22a29..ee515d588 100644
--- a/llama_stack/providers/inline/scoring/braintrust/braintrust.py
+++ b/llama_stack/providers/inline/scoring/braintrust/braintrust.py
@@ -16,6 +16,7 @@ import os
 
 from autoevals.llm import Factuality
 from autoevals.ragas import AnswerCorrectness
+
 from llama_stack.distribution.request_headers import NeedsRequestProviderData
 from llama_stack.providers.datatypes import ScoringFunctionsProtocolPrivate
diff --git a/llama_stack/providers/tests/inference/test_text_inference.py b/llama_stack/providers/tests/inference/test_text_inference.py
index f0f1d0eb2..9e5c67375 100644
--- a/llama_stack/providers/tests/inference/test_text_inference.py
+++ b/llama_stack/providers/tests/inference/test_text_inference.py
@@ -211,7 +211,15 @@ class TestInference:
         response = await inference_impl.chat_completion(
             model_id=inference_model,
             messages=[
-                SystemMessage(content="You are a helpful assistant."),
+                # We include context about Michael Jordan in the prompt so that the test is
+                # focused on the functionality of the model and not on the information embedded
+                # in the model. Llama 3.2 3B Instruct tends to think MJ played for 14 seasons.
+                SystemMessage(
+                    content=(
+                        "You are a helpful assistant.\n\n"
+                        "Michael Jordan was born in 1963. He played basketball for the Chicago Bulls for 15 seasons."
+                    )
+                ),
                 UserMessage(content="Please give me information about Michael Jordan."),
             ],
             stream=False,
diff --git a/requirements.txt b/requirements.txt
index 0ff43e246..8698495b1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,8 +2,8 @@ blobfile
 fire
 httpx
 huggingface-hub
-llama-models>=0.0.56
-llama-stack-client>=0.0.56
+llama-models>=0.0.57
+llama-stack-client>=0.0.57
 prompt-toolkit
 python-dotenv
 pydantic>=2
diff --git a/setup.py b/setup.py
index 842cbb30d..3d68021dd 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@ def read_requirements():
 
 setup(
     name="llama_stack",
-    version="0.0.56",
+    version="0.0.57",
     author="Meta Llama",
     author_email="llama-oss@meta.com",
     description="Llama Stack",
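
To pick up the bumped pins in an existing environment, something like the following should work (a minimal sketch; the upgrade commands are an assumption for illustration, not part of this change):

```bash
# Sketch: upgrade the pinned dependencies to match the 0.0.57 release
pip install --upgrade "llama-models>=0.0.57" "llama-stack-client>=0.0.57"
pip show llama-stack-client  # confirm the installed version is 0.0.57 or newer
```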