From 60cb7f64affb1306be9dc072bb69ea1b05361b91 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 25 Nov 2024 09:42:27 -0800 Subject: [PATCH 01/17] add missing __init__ --- llama_stack/providers/utils/scoring/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 llama_stack/providers/utils/scoring/__init__.py diff --git a/llama_stack/providers/utils/scoring/__init__.py b/llama_stack/providers/utils/scoring/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/llama_stack/providers/utils/scoring/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. From de7af28756e6558fae2679b8034d4664cd1ce776 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Mon, 25 Nov 2024 13:17:02 -0800 Subject: [PATCH 02/17] Tgi fixture (#519) # What does this PR do? * Add a test fixture for tgi * Fixes the logic to correctly pass the llama model for chat completion Fixes #514 ## Test Plan pytest -k "tgi" llama_stack/providers/tests/inference/test_text_inference.py --env TGI_URL=http://localhost:$INFERENCE_PORT --env TGI_API_TOKEN=$HF_TOKEN --- .../providers/remote/inference/tgi/tgi.py | 8 +++++--- .../providers/tests/inference/fixtures.py | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/llama_stack/providers/remote/inference/tgi/tgi.py b/llama_stack/providers/remote/inference/tgi/tgi.py index 621188284..01981c62b 100644 --- a/llama_stack/providers/remote/inference/tgi/tgi.py +++ b/llama_stack/providers/remote/inference/tgi/tgi.py @@ -89,8 +89,9 @@ class _HfAdapter(Inference, ModelsProtocolPrivate): stream: Optional[bool] = False, logprobs: Optional[LogProbConfig] = None, ) -> AsyncGenerator: + model = await self.model_store.get_model(model_id) request = CompletionRequest( - model=model_id, + model=model.provider_resource_id, content=content, sampling_params=sampling_params, response_format=response_format, @@ -194,8 +195,9 @@ class _HfAdapter(Inference, ModelsProtocolPrivate): stream: Optional[bool] = False, logprobs: Optional[LogProbConfig] = None, ) -> AsyncGenerator: + model = await self.model_store.get_model(model_id) request = ChatCompletionRequest( - model=model_id, + model=model.provider_resource_id, messages=messages, sampling_params=sampling_params, tools=tools or [], @@ -249,7 +251,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate): def _get_params(self, request: ChatCompletionRequest) -> dict: prompt, input_tokens = chat_completion_request_to_model_input_info( - request, self.formatter + request, self.register_helper.get_llama_model(request.model), self.formatter ) return dict( prompt=prompt, diff --git a/llama_stack/providers/tests/inference/fixtures.py b/llama_stack/providers/tests/inference/fixtures.py index 2007818e5..a427eef12 100644 --- a/llama_stack/providers/tests/inference/fixtures.py +++ b/llama_stack/providers/tests/inference/fixtures.py @@ -20,6 +20,7 @@ from llama_stack.providers.remote.inference.bedrock import BedrockConfig from llama_stack.providers.remote.inference.fireworks import FireworksImplConfig from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig from llama_stack.providers.remote.inference.ollama import OllamaImplConfig +from llama_stack.providers.remote.inference.tgi import TGIImplConfig from llama_stack.providers.remote.inference.together import TogetherImplConfig from 
llama_stack.providers.remote.inference.vllm import VLLMInferenceAdapterConfig from llama_stack.providers.tests.resolver import construct_stack_for_test @@ -156,6 +157,22 @@ def inference_nvidia() -> ProviderFixture: ) +@pytest.fixture(scope="session") +def inference_tgi() -> ProviderFixture: + return ProviderFixture( + providers=[ + Provider( + provider_id="tgi", + provider_type="remote::tgi", + config=TGIImplConfig( + url=get_env_or_fail("TGI_URL"), + api_token=os.getenv("TGI_API_TOKEN", None), + ).model_dump(), + ) + ], + ) + + def get_model_short_name(model_name: str) -> str: """Convert model name to a short test identifier. @@ -190,6 +207,7 @@ INFERENCE_FIXTURES = [ "remote", "bedrock", "nvidia", + "tgi", ] From bbd81231ce4032a6cfc8f7fb2df0b258a003cc31 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 25 Nov 2024 17:23:27 -0800 Subject: [PATCH 03/17] add missing __init__ --- llama_stack/providers/inline/datasetio/__init__.py | 5 +++++ llama_stack/providers/remote/datasetio/__init__.py | 5 +++++ 2 files changed, 10 insertions(+) create mode 100644 llama_stack/providers/inline/datasetio/__init__.py create mode 100644 llama_stack/providers/remote/datasetio/__init__.py diff --git a/llama_stack/providers/inline/datasetio/__init__.py b/llama_stack/providers/inline/datasetio/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/llama_stack/providers/inline/datasetio/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/llama_stack/providers/remote/datasetio/__init__.py b/llama_stack/providers/remote/datasetio/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/llama_stack/providers/remote/datasetio/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. From 2936133f95b5b5bb90e34e27630643434c53a7da Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 25 Nov 2024 18:55:54 -0800 Subject: [PATCH 04/17] precommit --- llama_stack/providers/remote/datasetio/huggingface/config.py | 3 ++- .../providers/remote/datasetio/huggingface/huggingface.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_stack/providers/remote/datasetio/huggingface/config.py b/llama_stack/providers/remote/datasetio/huggingface/config.py index 46470ce49..1cdae0625 100644 --- a/llama_stack/providers/remote/datasetio/huggingface/config.py +++ b/llama_stack/providers/remote/datasetio/huggingface/config.py @@ -3,12 +3,13 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+from pydantic import BaseModel + from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR from llama_stack.providers.utils.kvstore.config import ( KVStoreConfig, SqliteKVStoreConfig, ) -from pydantic import BaseModel class HuggingfaceDatasetIOConfig(BaseModel): diff --git a/llama_stack/providers/remote/datasetio/huggingface/huggingface.py b/llama_stack/providers/remote/datasetio/huggingface/huggingface.py index 8d34df672..c2e4506bf 100644 --- a/llama_stack/providers/remote/datasetio/huggingface/huggingface.py +++ b/llama_stack/providers/remote/datasetio/huggingface/huggingface.py @@ -9,6 +9,7 @@ from llama_stack.apis.datasetio import * # noqa: F403 import datasets as hf_datasets + from llama_stack.providers.datatypes import DatasetsProtocolPrivate from llama_stack.providers.utils.datasetio.url_utils import get_dataframe_from_url from llama_stack.providers.utils.kvstore import kvstore_impl From d3956a1d22bbd480f4e14fd3f79b01cab7a23661 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 25 Nov 2024 22:02:45 -0800 Subject: [PATCH 05/17] fix description --- .../scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py index 554590f12..dc5df8e78 100644 --- a/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py +++ b/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py @@ -10,7 +10,7 @@ from llama_stack.apis.scoring_functions import ScoringFn answer_correctness_fn_def = ScoringFn( identifier="braintrust::answer-correctness", - description="Test whether an output is factual, compared to an original (`expected`) value. One of Braintrust LLM basd scorer https://github.com/braintrustdata/autoevals/blob/main/py/autoevals/llm.py", + description="Scores the correctness of the answer based on the ground truth.. One of Braintrust LLM basd scorer https://github.com/braintrustdata/autoevals/blob/main/py/autoevals/llm.py", params=None, provider_id="braintrust", provider_resource_id="answer-correctness", From 50cc165077acc76021a61a280b0c28cbefd96c12 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 26 Nov 2024 13:11:21 -0800 Subject: [PATCH 06/17] fixes tests & move braintrust api_keys to request headers (#535) # What does this PR do? - braintrust scoring provider requires OPENAI_API_KEY env variable to be set - move this to be able to be set as request headers (e.g. like together / fireworks api keys) - fixes pytest with agents dependency ## Test Plan **E2E** ``` llama stack run ``` ```yaml scoring: - provider_id: braintrust-0 provider_type: inline::braintrust config: {} ``` **Client** ```python self.client = LlamaStackClient( base_url=os.environ.get("LLAMA_STACK_ENDPOINT", "http://localhost:5000"), provider_data={ "openai_api_key": os.environ.get("OPENAI_API_KEY", ""), }, ) ``` - run `llama-stack-client eval run_scoring` **Unit Test** ``` pytest -v -s -m meta_reference_eval_together_inference eval/test_eval.py ``` ``` pytest -v -s -m braintrust_scoring_together_inference scoring/test_scoring.py --env OPENAI_API_KEY=$OPENAI_API_KEY ``` image ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. 
- [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. --- llama_stack/distribution/request_headers.py | 2 +- .../inline/scoring/braintrust/__init__.py | 5 ++++ .../inline/scoring/braintrust/braintrust.py | 23 +++++++++++++++++-- .../inline/scoring/braintrust/config.py | 6 ++++- llama_stack/providers/registry/scoring.py | 1 + llama_stack/providers/tests/eval/conftest.py | 16 +++++++++++++ llama_stack/providers/tests/eval/fixtures.py | 20 ++++++++++++++-- .../providers/tests/scoring/fixtures.py | 7 ++++-- 8 files changed, 72 insertions(+), 8 deletions(-) diff --git a/llama_stack/distribution/request_headers.py b/llama_stack/distribution/request_headers.py index 27ef3046a..41952edfd 100644 --- a/llama_stack/distribution/request_headers.py +++ b/llama_stack/distribution/request_headers.py @@ -35,7 +35,7 @@ class NeedsRequestProviderData: provider_data = validator(**val) return provider_data except Exception as e: - log.error("Error parsing provider data", e) + log.error(f"Error parsing provider data: {e}") def set_request_provider_data(headers: Dict[str, str]): diff --git a/llama_stack/providers/inline/scoring/braintrust/__init__.py b/llama_stack/providers/inline/scoring/braintrust/__init__.py index f442a6c3b..dc4ea4951 100644 --- a/llama_stack/providers/inline/scoring/braintrust/__init__.py +++ b/llama_stack/providers/inline/scoring/braintrust/__init__.py @@ -6,10 +6,15 @@ from typing import Dict from llama_stack.distribution.datatypes import Api, ProviderSpec +from pydantic import BaseModel from .config import BraintrustScoringConfig +class BraintrustProviderDataValidator(BaseModel): + openai_api_key: str + + async def get_provider_impl( config: BraintrustScoringConfig, deps: Dict[Api, ProviderSpec], diff --git a/llama_stack/providers/inline/scoring/braintrust/braintrust.py b/llama_stack/providers/inline/scoring/braintrust/braintrust.py index 00817bb33..cf6e22a29 100644 --- a/llama_stack/providers/inline/scoring/braintrust/braintrust.py +++ b/llama_stack/providers/inline/scoring/braintrust/braintrust.py @@ -12,9 +12,11 @@ from llama_stack.apis.common.type_system import * # noqa: F403 from llama_stack.apis.datasetio import * # noqa: F403 from llama_stack.apis.datasets import * # noqa: F403 -# from .scoring_fn.braintrust_scoring_fn import BraintrustScoringFn +import os + from autoevals.llm import Factuality from autoevals.ragas import AnswerCorrectness +from llama_stack.distribution.request_headers import NeedsRequestProviderData from llama_stack.providers.datatypes import ScoringFunctionsProtocolPrivate from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_average @@ -24,7 +26,9 @@ from .scoring_fn.fn_defs.answer_correctness import answer_correctness_fn_def from .scoring_fn.fn_defs.factuality import factuality_fn_def -class BraintrustScoringImpl(Scoring, ScoringFunctionsProtocolPrivate): +class BraintrustScoringImpl( + Scoring, ScoringFunctionsProtocolPrivate, NeedsRequestProviderData +): def __init__( self, config: BraintrustScoringConfig, @@ -79,12 +83,25 @@ class BraintrustScoringImpl(Scoring, ScoringFunctionsProtocolPrivate): f"Dataset {dataset_id} does not have a '{required_column}' column of type 'string'." 
) + async def set_api_key(self) -> None: + # api key is in the request headers + if self.config.openai_api_key is None: + provider_data = self.get_request_provider_data() + if provider_data is None or not provider_data.openai_api_key: + raise ValueError( + 'Pass OpenAI API Key in the header X-LlamaStack-ProviderData as { "openai_api_key": }' + ) + self.config.openai_api_key = provider_data.openai_api_key + + os.environ["OPENAI_API_KEY"] = self.config.openai_api_key + async def score_batch( self, dataset_id: str, scoring_functions: List[str], save_results_dataset: bool = False, ) -> ScoreBatchResponse: + await self.set_api_key() await self.validate_scoring_input_dataset_schema(dataset_id=dataset_id) all_rows = await self.datasetio_api.get_rows_paginated( dataset_id=dataset_id, @@ -105,6 +122,7 @@ class BraintrustScoringImpl(Scoring, ScoringFunctionsProtocolPrivate): async def score_row( self, input_row: Dict[str, Any], scoring_fn_identifier: Optional[str] = None ) -> ScoringResultRow: + await self.set_api_key() assert scoring_fn_identifier is not None, "scoring_fn_identifier cannot be None" expected_answer = input_row["expected_answer"] generated_answer = input_row["generated_answer"] @@ -118,6 +136,7 @@ class BraintrustScoringImpl(Scoring, ScoringFunctionsProtocolPrivate): async def score( self, input_rows: List[Dict[str, Any]], scoring_functions: List[str] ) -> ScoreResponse: + await self.set_api_key() res = {} for scoring_fn_id in scoring_functions: if scoring_fn_id not in self.supported_fn_defs_registry: diff --git a/llama_stack/providers/inline/scoring/braintrust/config.py b/llama_stack/providers/inline/scoring/braintrust/config.py index fef6df5c8..fae0b17eb 100644 --- a/llama_stack/providers/inline/scoring/braintrust/config.py +++ b/llama_stack/providers/inline/scoring/braintrust/config.py @@ -6,4 +6,8 @@ from llama_stack.apis.scoring import * # noqa: F401, F403 -class BraintrustScoringConfig(BaseModel): ... 
+class BraintrustScoringConfig(BaseModel): + openai_api_key: Optional[str] = Field( + default=None, + description="The OpenAI API Key", + ) diff --git a/llama_stack/providers/registry/scoring.py b/llama_stack/providers/registry/scoring.py index 2da9797bc..f31ff44d7 100644 --- a/llama_stack/providers/registry/scoring.py +++ b/llama_stack/providers/registry/scoring.py @@ -44,5 +44,6 @@ def available_providers() -> List[ProviderSpec]: Api.datasetio, Api.datasets, ], + provider_data_validator="llama_stack.providers.inline.scoring.braintrust.BraintrustProviderDataValidator", ), ] diff --git a/llama_stack/providers/tests/eval/conftest.py b/llama_stack/providers/tests/eval/conftest.py index 171fae51a..b310439ce 100644 --- a/llama_stack/providers/tests/eval/conftest.py +++ b/llama_stack/providers/tests/eval/conftest.py @@ -6,10 +6,14 @@ import pytest +from ..agents.fixtures import AGENTS_FIXTURES + from ..conftest import get_provider_fixture_overrides from ..datasetio.fixtures import DATASETIO_FIXTURES from ..inference.fixtures import INFERENCE_FIXTURES +from ..memory.fixtures import MEMORY_FIXTURES +from ..safety.fixtures import SAFETY_FIXTURES from ..scoring.fixtures import SCORING_FIXTURES from .fixtures import EVAL_FIXTURES @@ -20,6 +24,9 @@ DEFAULT_PROVIDER_COMBINATIONS = [ "scoring": "basic", "datasetio": "localfs", "inference": "fireworks", + "agents": "meta_reference", + "safety": "llama_guard", + "memory": "faiss", }, id="meta_reference_eval_fireworks_inference", marks=pytest.mark.meta_reference_eval_fireworks_inference, @@ -30,6 +37,9 @@ DEFAULT_PROVIDER_COMBINATIONS = [ "scoring": "basic", "datasetio": "localfs", "inference": "together", + "agents": "meta_reference", + "safety": "llama_guard", + "memory": "faiss", }, id="meta_reference_eval_together_inference", marks=pytest.mark.meta_reference_eval_together_inference, @@ -40,6 +50,9 @@ DEFAULT_PROVIDER_COMBINATIONS = [ "scoring": "basic", "datasetio": "huggingface", "inference": "together", + "agents": "meta_reference", + "safety": "llama_guard", + "memory": "faiss", }, id="meta_reference_eval_together_inference_huggingface_datasetio", marks=pytest.mark.meta_reference_eval_together_inference_huggingface_datasetio, @@ -75,6 +88,9 @@ def pytest_generate_tests(metafunc): "scoring": SCORING_FIXTURES, "datasetio": DATASETIO_FIXTURES, "inference": INFERENCE_FIXTURES, + "agents": AGENTS_FIXTURES, + "safety": SAFETY_FIXTURES, + "memory": MEMORY_FIXTURES, } combinations = ( get_provider_fixture_overrides(metafunc.config, available_fixtures) diff --git a/llama_stack/providers/tests/eval/fixtures.py b/llama_stack/providers/tests/eval/fixtures.py index a6b404d0c..50dc9c16e 100644 --- a/llama_stack/providers/tests/eval/fixtures.py +++ b/llama_stack/providers/tests/eval/fixtures.py @@ -40,14 +40,30 @@ async def eval_stack(request): providers = {} provider_data = {} - for key in ["datasetio", "eval", "scoring", "inference"]: + for key in [ + "datasetio", + "eval", + "scoring", + "inference", + "agents", + "safety", + "memory", + ]: fixture = request.getfixturevalue(f"{key}_{fixture_dict[key]}") providers[key] = fixture.providers if fixture.provider_data: provider_data.update(fixture.provider_data) test_stack = await construct_stack_for_test( - [Api.eval, Api.datasetio, Api.inference, Api.scoring], + [ + Api.eval, + Api.datasetio, + Api.inference, + Api.scoring, + Api.agents, + Api.safety, + Api.memory, + ], providers, provider_data, ) diff --git a/llama_stack/providers/tests/scoring/fixtures.py b/llama_stack/providers/tests/scoring/fixtures.py index 
d89b211ef..a9f088e07 100644 --- a/llama_stack/providers/tests/scoring/fixtures.py +++ b/llama_stack/providers/tests/scoring/fixtures.py @@ -10,9 +10,10 @@ import pytest_asyncio from llama_stack.apis.models import ModelInput from llama_stack.distribution.datatypes import Api, Provider - +from llama_stack.providers.inline.scoring.braintrust import BraintrustScoringConfig from llama_stack.providers.tests.resolver import construct_stack_for_test from ..conftest import ProviderFixture, remote_stack_fixture +from ..env import get_env_or_fail @pytest.fixture(scope="session") @@ -40,7 +41,9 @@ def scoring_braintrust() -> ProviderFixture: Provider( provider_id="braintrust", provider_type="inline::braintrust", - config={}, + config=BraintrustScoringConfig( + openai_api_key=get_env_or_fail("OPENAI_API_KEY"), + ).model_dump(), ) ], ) From 060b4eb776f1bd5a816ee882f5c475a3555f8816 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Tue, 26 Nov 2024 20:46:44 -0500 Subject: [PATCH 07/17] allow env NVIDIA_BASE_URL to set NVIDIAConfig.url (#531) # What does this PR do? this allows setting an NVIDIA_BASE_URL variable to control the NVIDIAConfig.url option ## Test Plan `pytest -s -v --providers inference=nvidia llama_stack/providers/tests/inference/ --env NVIDIA_BASE_URL=http://localhost:8000` ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [x] Ran pre-commit to handle lint / formatting issues. - [x] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. --- llama_stack/providers/remote/inference/nvidia/config.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_stack/providers/remote/inference/nvidia/config.py b/llama_stack/providers/remote/inference/nvidia/config.py index c50143043..28be43f4c 100644 --- a/llama_stack/providers/remote/inference/nvidia/config.py +++ b/llama_stack/providers/remote/inference/nvidia/config.py @@ -35,7 +35,9 @@ class NVIDIAConfig(BaseModel): """ url: str = Field( - default="https://integrate.api.nvidia.com", + default_factory=lambda: os.getenv( + "NVIDIA_BASE_URL", "https://integrate.api.nvidia.com" + ), description="A base url for accessing the NVIDIA NIM", ) api_key: Optional[str] = Field( From b1a63df8cdae6e45d1db10f8c73eca6cd75ba68e Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 26 Nov 2024 22:04:21 -0800 Subject: [PATCH 08/17] move playground ui to llama-stack repo (#536) # What does this PR do? - Move Llama Stack Playground UI to llama-stack repo under llama_stack/distribution - Original PR in llama-stack-apps: https://github.com/meta-llama/llama-stack-apps/pull/127 ## Test Plan ``` cd llama-stack/llama_stack/distribution/ui streamlit run app.py ``` ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. 
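For reference, a minimal sketch of driving the new `modules/api.py` helper outside Streamlit. This is illustrative only: it assumes `llama-stack-client` is installed, that it is run from `llama_stack/distribution/ui`, that a Llama Stack server is reachable at `LLAMA_STACK_ENDPOINT`, and that any provider keys (e.g. `OPENAI_API_KEY`) are exported; the example row and scoring-function choice are made up for the sketch.

```python
# Hypothetical driver for the UI helper module added in this PR (modules/api.py).
# Assumes: run from llama_stack/distribution/ui, llama-stack-client installed,
# a Llama Stack server at LLAMA_STACK_ENDPOINT, provider API keys exported.
from modules.api import LlamaStackEvaluation

api = LlamaStackEvaluation()

# These two calls back the scoring-function and judge-model dropdowns in app.py.
scoring_fn_ids = [fn.identifier for fn in api.list_scoring_functions()]
judge_models = [m.identifier for m in api.list_models()]
print(scoring_fn_ids, judge_models)

# Score a single illustrative row, letting run_scoring fall back to None params.
row = {
    "input_query": "What is the capital of France?",
    "generated_answer": "Paris",
    "expected_answer": "Paris",
}
res = api.run_scoring(row, scoring_function_ids=scoring_fn_ids[:1], scoring_params=None)
print(res.results)
```

This mirrors what `app.py` does for each uploaded row inside its progress loop.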
--- llama_stack/distribution/ui/README.md | 11 ++ llama_stack/distribution/ui/__init__.py | 5 + llama_stack/distribution/ui/app.py | 173 +++++++++++++++++++ llama_stack/distribution/ui/modules/api.py | 41 +++++ llama_stack/distribution/ui/modules/utils.py | 31 ++++ llama_stack/distribution/ui/requirements.txt | 3 + 6 files changed, 264 insertions(+) create mode 100644 llama_stack/distribution/ui/README.md create mode 100644 llama_stack/distribution/ui/__init__.py create mode 100644 llama_stack/distribution/ui/app.py create mode 100644 llama_stack/distribution/ui/modules/api.py create mode 100644 llama_stack/distribution/ui/modules/utils.py create mode 100644 llama_stack/distribution/ui/requirements.txt diff --git a/llama_stack/distribution/ui/README.md b/llama_stack/distribution/ui/README.md new file mode 100644 index 000000000..a91883067 --- /dev/null +++ b/llama_stack/distribution/ui/README.md @@ -0,0 +1,11 @@ +# LLama Stack UI + +[!NOTE] This is a work in progress. + +## Running Streamlit App + +``` +cd llama_stack/distribution/ui +pip install -r requirements.txt +streamlit run app.py +``` diff --git a/llama_stack/distribution/ui/__init__.py b/llama_stack/distribution/ui/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/llama_stack/distribution/ui/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/llama_stack/distribution/ui/app.py b/llama_stack/distribution/ui/app.py new file mode 100644 index 000000000..763b126a7 --- /dev/null +++ b/llama_stack/distribution/ui/app.py @@ -0,0 +1,173 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +import json + +import pandas as pd + +import streamlit as st + +from modules.api import LlamaStackEvaluation + +from modules.utils import process_dataset + +EVALUATION_API = LlamaStackEvaluation() + + +def main(): + # Add collapsible sidebar + with st.sidebar: + # Add collapse button + if "sidebar_state" not in st.session_state: + st.session_state.sidebar_state = True + + if st.session_state.sidebar_state: + st.title("Navigation") + page = st.radio( + "Select a Page", + ["Application Evaluation"], + index=0, + ) + else: + page = "Application Evaluation" # Default page when sidebar is collapsed + + # Main content area + st.title("🦙 Llama Stack Evaluations") + + if page == "Application Evaluation": + application_evaluation_page() + + +def application_evaluation_page(): + # File uploader + uploaded_file = st.file_uploader("Upload Dataset", type=["csv", "xlsx", "xls"]) + + if uploaded_file is None: + st.error("No file uploaded") + return + + # Process uploaded file + df = process_dataset(uploaded_file) + if df is None: + st.error("Error processing file") + return + + # Display dataset information + st.success("Dataset loaded successfully!") + + # Display dataframe preview + st.subheader("Dataset Preview") + st.dataframe(df) + + # Select Scoring Functions to Run Evaluation On + st.subheader("Select Scoring Functions") + scoring_functions = EVALUATION_API.list_scoring_functions() + scoring_functions = {sf.identifier: sf for sf in scoring_functions} + scoring_functions_names = list(scoring_functions.keys()) + selected_scoring_functions = st.multiselect( + "Choose one or more scoring functions", + options=scoring_functions_names, + help="Choose one or more scoring functions.", + ) + + available_models = EVALUATION_API.list_models() + available_models = [m.identifier for m in available_models] + + scoring_params = {} + if selected_scoring_functions: + st.write("Selected:") + for scoring_fn_id in selected_scoring_functions: + scoring_fn = scoring_functions[scoring_fn_id] + st.write(f"- **{scoring_fn_id}**: {scoring_fn.description}") + new_params = None + if scoring_fn.params: + new_params = {} + for param_name, param_value in scoring_fn.params.to_dict().items(): + if param_name == "type": + new_params[param_name] = param_value + continue + + if param_name == "judge_model": + value = st.selectbox( + f"Select **{param_name}** for {scoring_fn_id}", + options=available_models, + index=0, + key=f"{scoring_fn_id}_{param_name}", + ) + new_params[param_name] = value + else: + value = st.text_area( + f"Enter value for **{param_name}** in {scoring_fn_id} in valid JSON format", + value=json.dumps(param_value, indent=2), + height=80, + ) + try: + new_params[param_name] = json.loads(value) + except json.JSONDecodeError: + st.error( + f"Invalid JSON for **{param_name}** in {scoring_fn_id}" + ) + + st.json(new_params) + scoring_params[scoring_fn_id] = new_params + + # Add run evaluation button & slider + total_rows = len(df) + num_rows = st.slider("Number of rows to evaluate", 1, total_rows, total_rows) + + if st.button("Run Evaluation"): + progress_text = "Running evaluation..." 
+ progress_bar = st.progress(0, text=progress_text) + rows = df.to_dict(orient="records") + if num_rows < total_rows: + rows = rows[:num_rows] + + # Create separate containers for progress text and results + progress_text_container = st.empty() + results_container = st.empty() + output_res = {} + for i, r in enumerate(rows): + # Update progress + progress = i / len(rows) + progress_bar.progress(progress, text=progress_text) + + # Run evaluation for current row + score_res = EVALUATION_API.run_scoring( + r, + scoring_function_ids=selected_scoring_functions, + scoring_params=scoring_params, + ) + + for k in r.keys(): + if k not in output_res: + output_res[k] = [] + output_res[k].append(r[k]) + + for fn_id in selected_scoring_functions: + if fn_id not in output_res: + output_res[fn_id] = [] + output_res[fn_id].append(score_res.results[fn_id].score_rows[0]) + + # Display current row results using separate containers + progress_text_container.write( + f"Expand to see current processed result ({i+1}/{len(rows)})" + ) + results_container.json( + score_res.to_json(), + expanded=2, + ) + + progress_bar.progress(1.0, text="Evaluation complete!") + + # Display results in dataframe + if output_res: + output_df = pd.DataFrame(output_res) + st.subheader("Evaluation Results") + st.dataframe(output_df) + + +if __name__ == "__main__": + main() diff --git a/llama_stack/distribution/ui/modules/api.py b/llama_stack/distribution/ui/modules/api.py new file mode 100644 index 000000000..a8d8bf37d --- /dev/null +++ b/llama_stack/distribution/ui/modules/api.py @@ -0,0 +1,41 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import os + +from typing import Optional + +from llama_stack_client import LlamaStackClient + + +class LlamaStackEvaluation: + def __init__(self): + self.client = LlamaStackClient( + base_url=os.environ.get("LLAMA_STACK_ENDPOINT", "http://localhost:5000"), + provider_data={ + "fireworks_api_key": os.environ.get("FIREWORKS_API_KEY", ""), + "together_api_key": os.environ.get("TOGETHER_API_KEY", ""), + "openai_api_key": os.environ.get("OPENAI_API_KEY", ""), + }, + ) + + def list_scoring_functions(self): + """List all available scoring functions""" + return self.client.scoring_functions.list() + + def list_models(self): + """List all available judge models""" + return self.client.models.list() + + def run_scoring( + self, row, scoring_function_ids: list[str], scoring_params: Optional[dict] + ): + """Run scoring on a single row""" + if not scoring_params: + scoring_params = {fn_id: None for fn_id in scoring_function_ids} + return self.client.scoring.score( + input_rows=[row], scoring_functions=scoring_params + ) diff --git a/llama_stack/distribution/ui/modules/utils.py b/llama_stack/distribution/ui/modules/utils.py new file mode 100644 index 000000000..f8da2e54e --- /dev/null +++ b/llama_stack/distribution/ui/modules/utils.py @@ -0,0 +1,31 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +import os + +import pandas as pd +import streamlit as st + + +def process_dataset(file): + if file is None: + return "No file uploaded", None + + try: + # Determine file type and read accordingly + file_ext = os.path.splitext(file.name)[1].lower() + if file_ext == ".csv": + df = pd.read_csv(file) + elif file_ext in [".xlsx", ".xls"]: + df = pd.read_excel(file) + else: + return "Unsupported file format. Please upload a CSV or Excel file.", None + + return df + + except Exception as e: + st.error(f"Error processing file: {str(e)}") + return None diff --git a/llama_stack/distribution/ui/requirements.txt b/llama_stack/distribution/ui/requirements.txt new file mode 100644 index 000000000..c03959444 --- /dev/null +++ b/llama_stack/distribution/ui/requirements.txt @@ -0,0 +1,3 @@ +streamlit +pandas +llama-stack-client>=0.0.55 From 9088206eda1fecdfe2d643c9acb68a20c97460e0 Mon Sep 17 00:00:00 2001 From: Sean Date: Fri, 29 Nov 2024 21:43:56 +0800 Subject: [PATCH 09/17] fix[documentation]: Update links to point to correct pages (#549) # What does this PR do? In short, provide a summary of what this PR does and why. Usually, the relevant context should be present in a linked issue. - [x] Addresses issue (#548) ## Test Plan Please describe: No automated tests. Clicked on each link to ensure I was directed to the right page. ## Sources ## Before submitting - [x] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [x] Ran pre-commit to handle lint / formatting issues. - [x] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [x] Updated relevant documentation. - [ ] ~Wrote necessary unit or integration tests.~ --- .../01_Local_Cloud_Inference101.ipynb | 2 +- .../02_Prompt_Engineering101.ipynb | 2 +- docs/zero_to_hero_guide/03_Image_Chat101.ipynb | 2 +- docs/zero_to_hero_guide/05_Memory101.ipynb | 2 +- docs/zero_to_hero_guide/06_Safety101.ipynb | 2 +- docs/zero_to_hero_guide/README.md | 14 +++++++------- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb b/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb index 7225f0741..bdfd3520f 100644 --- a/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb +++ b/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb @@ -231,7 +231,7 @@ "source": [ "Thanks for checking out this notebook! \n", "\n", - "The next one will be a guide on [Prompt Engineering](./01_Prompt_Engineering101.ipynb), please continue learning!" + "The next one will be a guide on [Prompt Engineering](./02_Prompt_Engineering101.ipynb), please continue learning!" ] } ], diff --git a/docs/zero_to_hero_guide/02_Prompt_Engineering101.ipynb b/docs/zero_to_hero_guide/02_Prompt_Engineering101.ipynb index c66192d81..c1c8a5aa9 100644 --- a/docs/zero_to_hero_guide/02_Prompt_Engineering101.ipynb +++ b/docs/zero_to_hero_guide/02_Prompt_Engineering101.ipynb @@ -276,7 +276,7 @@ "source": [ "Thanks for checking out this notebook! \n", "\n", - "The next one will be a guide on how to chat with images, continue to the notebook [here](./02_Image_Chat101.ipynb). Happy learning!" + "The next one will be a guide on how to chat with images, continue to the notebook [here](./03_Image_Chat101.ipynb). Happy learning!" 
] } ], diff --git a/docs/zero_to_hero_guide/03_Image_Chat101.ipynb b/docs/zero_to_hero_guide/03_Image_Chat101.ipynb index 93042f3fc..02c32191f 100644 --- a/docs/zero_to_hero_guide/03_Image_Chat101.ipynb +++ b/docs/zero_to_hero_guide/03_Image_Chat101.ipynb @@ -175,7 +175,7 @@ "source": [ "Thanks for checking out this notebook! \n", "\n", - "The next one in the series will teach you one of the favorite applications of Large Language Models: [Tool Calling](./03_Tool_Calling101.ipynb). Enjoy!" + "The next one in the series will teach you one of the favorite applications of Large Language Models: [Tool Calling](./04_Tool_Calling101.ipynb). Enjoy!" ] } ], diff --git a/docs/zero_to_hero_guide/05_Memory101.ipynb b/docs/zero_to_hero_guide/05_Memory101.ipynb index e7e64d8fa..21678fd55 100644 --- a/docs/zero_to_hero_guide/05_Memory101.ipynb +++ b/docs/zero_to_hero_guide/05_Memory101.ipynb @@ -373,7 +373,7 @@ "source": [ "Awesome, now we can embed all our notes with Llama-stack and ask it about the meaning of life :)\n", "\n", - "Next up, we will learn about the safety features and how to use them: [notebook link](./05_Safety101.ipynb)" + "Next up, we will learn about the safety features and how to use them: [notebook link](./06_Safety101.ipynb)." ] } ], diff --git a/docs/zero_to_hero_guide/06_Safety101.ipynb b/docs/zero_to_hero_guide/06_Safety101.ipynb index bf37e83ea..6b5bd53bf 100644 --- a/docs/zero_to_hero_guide/06_Safety101.ipynb +++ b/docs/zero_to_hero_guide/06_Safety101.ipynb @@ -107,7 +107,7 @@ "source": [ "Thanks for leaning about the Safety API of Llama-Stack. \n", "\n", - "Finally, we learn about the Agents API, [here](./06_Agents101.ipynb)" + "Finally, we learn about the Agents API, [here](./07_Agents101.ipynb)." ] } ], diff --git a/docs/zero_to_hero_guide/README.md b/docs/zero_to_hero_guide/README.md index 449e40430..9b373fd9a 100644 --- a/docs/zero_to_hero_guide/README.md +++ b/docs/zero_to_hero_guide/README.md @@ -229,13 +229,13 @@ This command initializes the model to interact with your local Llama Stack insta **Explore Other Guides**: Dive deeper into specific topics by following these guides: - [Understanding Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html#decide-your-inference-provider) - [Inference 101](00_Inference101.ipynb) -- [Local and Cloud Model Toggling 101](00_Local_Cloud_Inference101.ipynb) -- [Prompt Engineering](01_Prompt_Engineering101.ipynb) -- [Chat with Image - LlamaStack Vision API](02_Image_Chat101.ipynb) -- [Tool Calling: How to and Details](03_Tool_Calling101.ipynb) -- [Memory API: Show Simple In-Memory Retrieval](04_Memory101.ipynb) -- [Using Safety API in Conversation](05_Safety101.ipynb) -- [Agents API: Explain Components](06_Agents101.ipynb) +- [Local and Cloud Model Toggling 101](01_Local_Cloud_Inference101.ipynb) +- [Prompt Engineering](02_Prompt_Engineering101.ipynb) +- [Chat with Image - LlamaStack Vision API](03_Image_Chat101.ipynb) +- [Tool Calling: How to and Details](04_Tool_Calling101.ipynb) +- [Memory API: Show Simple In-Memory Retrieval](05_Memory101.ipynb) +- [Using Safety API in Conversation](06_Safety101.ipynb) +- [Agents API: Explain Components](07_Agents101.ipynb) **Explore Client SDKs**: Utilize our client SDKs for various languages to integrate Llama Stack into your applications: From 5fc2ee6f77e96d84c668c400ff742b153d2e5e8e Mon Sep 17 00:00:00 2001 From: Jeffrey Lind <124309394+JeffreyLind3@users.noreply.github.com> Date: Fri, 29 Nov 2024 11:11:50 -0500 Subject: [PATCH 10/17] Fix URLs to Llama Stack 
Read the Docs Webpages (#547) # What does this PR do? Many of the URLs pointing to the Llama Stack's Read The Docs webpages were broken, presumably due to recent refactor of the documentation. This PR fixes all effected URLs throughout the repository. --- README.md | 16 ++++++++-------- docs/source/contributing/new_api_provider.md | 2 +- .../self_hosted_distro/meta-reference-gpu.md | 2 +- .../meta-reference-quantized-gpu.md | 2 +- docs/to_situate/developer_cookbook.md | 6 +++--- docs/zero_to_hero_guide/README.md | 4 ++-- .../templates/meta-reference-gpu/doc_template.md | 2 +- .../meta-reference-quantized-gpu/doc_template.md | 2 +- 8 files changed, 18 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 27f1d3614..8e57292c3 100644 --- a/README.md +++ b/README.md @@ -93,12 +93,12 @@ Additionally, we have designed every element of the Stack such that APIs as well | **Distribution** | **Llama Stack Docker** | Start This Distribution | |:----------------: |:------------------------------------------: |:-----------------------: | -| Meta Reference | [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-gpu.html) | -| Meta Reference Quantized | [llamastack/distribution-meta-reference-quantized-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-quantized-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-quantized-gpu.html) | -| Ollama | [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/ollama.html) | -| TGI | [llamastack/distribution-tgi](https://hub.docker.com/repository/docker/llamastack/distribution-tgi/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/tgi.html) | -| Together | [llamastack/distribution-together](https://hub.docker.com/repository/docker/llamastack/distribution-together/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/remote_hosted_distro/together.html) | -| Fireworks | [llamastack/distribution-fireworks](https://hub.docker.com/repository/docker/llamastack/distribution-fireworks/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/remote_hosted_distro/fireworks.html) | +| Meta Reference | [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-gpu.html) | +| Meta Reference Quantized | [llamastack/distribution-meta-reference-quantized-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-quantized-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-quantized-gpu.html) | +| Ollama | [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/ollama.html) | +| TGI | 
[llamastack/distribution-tgi](https://hub.docker.com/repository/docker/llamastack/distribution-tgi/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/tgi.html) | +| Together | [llamastack/distribution-together](https://hub.docker.com/repository/docker/llamastack/distribution-together/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/together.html) | +| Fireworks | [llamastack/distribution-fireworks](https://hub.docker.com/repository/docker/llamastack/distribution-fireworks/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/fireworks.html) | ## Installation @@ -128,7 +128,7 @@ You have two ways to install this repository: Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest/index.html) page for more details. -* [CLI reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html) +* [CLI reference](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/index.html) * Guide using `llama` CLI to work with Llama models (download, study prompts), and building/starting a Llama Stack distribution. * [Getting Started](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) * Quick guide to start a Llama Stack server. @@ -136,7 +136,7 @@ Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest * The complete Llama Stack lesson [Colab notebook](https://colab.research.google.com/drive/1dtVmxotBsI4cGZQNsJRYPrLiDeT0Wnwt) of the new [Llama 3.2 course on Deeplearning.ai](https://learn.deeplearning.ai/courses/introducing-multimodal-llama-3-2/lesson/8/llama-stack). * A [Zero-to-Hero Guide](https://github.com/meta-llama/llama-stack/tree/main/docs/zero_to_hero_guide) that guide you through all the key components of llama stack with code samples. * [Contributing](CONTRIBUTING.md) - * [Adding a new API Provider](https://llama-stack.readthedocs.io/en/latest/api_providers/new_api_provider.html) to walk-through how to add a new API provider. + * [Adding a new API Provider](https://llama-stack.readthedocs.io/en/latest/contributing/new_api_provider.html) to walk-through how to add a new API provider. ## Llama Stack Client SDKs diff --git a/docs/source/contributing/new_api_provider.md b/docs/source/contributing/new_api_provider.md index 9fea31d87..e0a35e946 100644 --- a/docs/source/contributing/new_api_provider.md +++ b/docs/source/contributing/new_api_provider.md @@ -8,7 +8,7 @@ This guide contains references to walk you through adding a new API provider. - {repopath}`Remote Providers::llama_stack/providers/remote` - {repopath}`Inline Providers::llama_stack/providers/inline` -3. [Build a Llama Stack distribution](https://llama-stack.readthedocs.io/en/latest/distribution_dev/building_distro.html) with your API provider. +3. [Build a Llama Stack distribution](https://llama-stack.readthedocs.io/en/latest/distributions/building_distro.html) with your API provider. 4. Test your code! 
## Testing your newly added API providers diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md index 084e90dfb..f9717894f 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md @@ -36,7 +36,7 @@ The following environment variables can be configured: ## Prerequisite: Downloading Models -Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. +Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. ``` $ ls ~/.llama/checkpoints diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md index 0c679788c..3ca161d07 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md @@ -36,7 +36,7 @@ The following environment variables can be configured: ## Prerequisite: Downloading Models -Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. +Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. ``` $ ls ~/.llama/checkpoints diff --git a/docs/to_situate/developer_cookbook.md b/docs/to_situate/developer_cookbook.md index 152035e9f..56ebd7a76 100644 --- a/docs/to_situate/developer_cookbook.md +++ b/docs/to_situate/developer_cookbook.md @@ -13,13 +13,13 @@ Based on your developer needs, below are references to guides to help you get st * Developer Need: I want to start a local Llama Stack server with my GPU using meta-reference implementations. * Effort: 5min * Guide: - - Please see our [meta-reference-gpu](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/meta-reference-gpu.html) on starting up a meta-reference Llama Stack server. + - Please see our [meta-reference-gpu](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-gpu.html) on starting up a meta-reference Llama Stack server. ### Llama Stack Server with Remote Providers * Developer need: I want a Llama Stack distribution with a remote provider. 
* Effort: 10min * Guide - - Please see our [Distributions Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/index.html) on starting up distributions with remote providers. + - Please see our [Distributions Guide](https://llama-stack.readthedocs.io/en/latest/concepts/index.html#distributions) on starting up distributions with remote providers. ### On-Device (iOS) Llama Stack @@ -38,4 +38,4 @@ Based on your developer needs, below are references to guides to help you get st * Developer Need: I want to add a new API provider to Llama Stack. * Effort: 3hr * Guide - - Please see our [Adding a New API Provider](https://llama-stack.readthedocs.io/en/latest/api_providers/new_api_provider.html) guide for adding a new API provider. + - Please see our [Adding a New API Provider](https://llama-stack.readthedocs.io/en/latest/contributing/new_api_provider.html) guide for adding a new API provider. diff --git a/docs/zero_to_hero_guide/README.md b/docs/zero_to_hero_guide/README.md index 9b373fd9a..09a4a6d50 100644 --- a/docs/zero_to_hero_guide/README.md +++ b/docs/zero_to_hero_guide/README.md @@ -227,7 +227,7 @@ This command initializes the model to interact with your local Llama Stack insta ## Next Steps **Explore Other Guides**: Dive deeper into specific topics by following these guides: -- [Understanding Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html#decide-your-inference-provider) +- [Understanding Distribution](https://llama-stack.readthedocs.io/en/latest/concepts/index.html#distributions) - [Inference 101](00_Inference101.ipynb) - [Local and Cloud Model Toggling 101](01_Local_Cloud_Inference101.ipynb) - [Prompt Engineering](02_Prompt_Engineering101.ipynb) @@ -244,7 +244,7 @@ This command initializes the model to interact with your local Llama Stack insta - [Swift SDK](https://github.com/meta-llama/llama-stack-client-swift) - [Kotlin SDK](https://github.com/meta-llama/llama-stack-client-kotlin) -**Advanced Configuration**: Learn how to customize your Llama Stack distribution by referring to the [Building a Llama Stack Distribution](https://llama-stack.readthedocs.io/en/latest/distributions/index.html#building-your-own-distribution) guide. +**Advanced Configuration**: Learn how to customize your Llama Stack distribution by referring to the [Building a Llama Stack Distribution](https://llama-stack.readthedocs.io/en/latest/distributions/building_distro.html) guide. **Explore Example Apps**: Check out [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) for example applications built using Llama Stack. diff --git a/llama_stack/templates/meta-reference-gpu/doc_template.md b/llama_stack/templates/meta-reference-gpu/doc_template.md index 865944476..f9870adbd 100644 --- a/llama_stack/templates/meta-reference-gpu/doc_template.md +++ b/llama_stack/templates/meta-reference-gpu/doc_template.md @@ -29,7 +29,7 @@ The following environment variables can be configured: ## Prerequisite: Downloading Models -Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. +Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. 
See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. ``` $ ls ~/.llama/checkpoints diff --git a/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md b/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md index 567d83941..9e3c56d92 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md +++ b/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md @@ -31,7 +31,7 @@ The following environment variables can be configured: ## Prerequisite: Downloading Models -Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. +Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. ``` $ ls ~/.llama/checkpoints From 2fc1c16d5864a3a0a82b0e1d5048465dfb74f12c Mon Sep 17 00:00:00 2001 From: Jeffrey Lind <124309394+JeffreyLind3@users.noreply.github.com> Date: Fri, 29 Nov 2024 11:12:53 -0500 Subject: [PATCH 11/17] Fix Zero to Hero README.md Formatting (#546) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? The formatting shown in the picture below in the Zero to Hero README.md was fixed with this PR (also shown in a picture below). **Before** Screenshot 2024-11-28 at 1 47 32 PM **After** Screenshot 2024-11-28 at 1 50 19 PM --- docs/zero_to_hero_guide/README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/zero_to_hero_guide/README.md b/docs/zero_to_hero_guide/README.md index 09a4a6d50..5490f767f 100644 --- a/docs/zero_to_hero_guide/README.md +++ b/docs/zero_to_hero_guide/README.md @@ -120,13 +120,13 @@ export SAFETY_MODEL="meta-llama/Llama-Guard-3-1B" 3. **Run the Llama Stack**: - Run the stack with command shared by the API from earlier: - ```bash - llama stack run ollama \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env SAFETY_MODEL=$SAFETY_MODEL \ - --env OLLAMA_URL=http://localhost:11434 - ``` + ```bash + llama stack run ollama \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env SAFETY_MODEL=$SAFETY_MODEL \ + --env OLLAMA_URL=http://localhost:11434 + ``` Note: Everytime you run a new model with `ollama run`, you will need to restart the llama stack. Otherwise it won't see the new model From 8a3887c7eb8781ab12b9ed7df3f23debee01e199 Mon Sep 17 00:00:00 2001 From: raghotham Date: Sat, 30 Nov 2024 12:28:03 -0600 Subject: [PATCH 12/17] Guide readme fix (#552) # What does this PR do? Fixes readme to remove redundant information and added llama-stack-client cli instructions. ## Before submitting - [ X] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ X] Ran pre-commit to handle lint / formatting issues. 
- [ X] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ X] Updated relevant documentation. --- docs/zero_to_hero_guide/README.md | 201 ++++++++++++++++-------------- 1 file changed, 109 insertions(+), 92 deletions(-) diff --git a/docs/zero_to_hero_guide/README.md b/docs/zero_to_hero_guide/README.md index 5490f767f..68c012164 100644 --- a/docs/zero_to_hero_guide/README.md +++ b/docs/zero_to_hero_guide/README.md @@ -1,37 +1,21 @@ # Llama Stack: from Zero to Hero -Llama-Stack allows you to configure your distribution from various providers, allowing you to focus on going from zero to production super fast. +Llama Stack defines and standardizes the set of core building blocks needed to bring generative AI applications to market. These building blocks are presented in the form of interoperable APIs with a broad set of Providers providing their implementations. These building blocks are assembled into Distributions which are easy for developers to get from zero to production. -This guide will walk you through how to build a local distribution, using Ollama as an inference provider. +This guide will walk you through an end-to-end workflow with Llama Stack with Ollama as the inference provider and ChromaDB as the memory provider. Please note the steps for configuring your provider and distribution will vary a little depending on the services you use. However, the user experience will remain universal - this is the power of Llama-Stack. -We also have a set of notebooks walking you through how to use Llama-Stack APIs: +If you're looking for more specific topics, we have a [Zero to Hero Guide](#next-steps) that covers everything from Tool Calling to Agents in detail. Feel free to skip to the end to explore the advanced topics you're interested in. -- Inference -- Prompt Engineering -- Chatting with Images -- Tool Calling -- Memory API for RAG -- Safety API -- Agentic API - -Below, we will learn how to get started with Ollama as an inference provider, please note the steps for configuring your provider will vary a little depending on the service. However, the user experience will remain universal-this is the power of Llama-Stack. - -Prototype locally using Ollama, deploy to the cloud with your favorite provider or own deployment. Use any API from any provider while focussing on development. - -# Ollama Quickstart Guide - -This guide will walk you through setting up an end-to-end workflow with Llama Stack with ollama, enabling you to perform text generation using the `Llama3.2-3B-Instruct` model. Follow these steps to get started quickly. - -If you're looking for more specific topics like tool calling or agent setup, we have a [Zero to Hero Guide](#next-steps) that covers everything from Tool Calling to Agents in detail. Feel free to skip to the end to explore the advanced topics you're interested in. - -> If you'd prefer not to set up a local server, explore our notebook on [tool calling with the Together API](Tool_Calling101_Using_Together's_Llama_Stack_Server.ipynb). This guide will show you how to leverage Together.ai's Llama Stack Server API, allowing you to get started with Llama Stack without the need for a locally built and running server. +> If you'd prefer not to set up a local server, explore our notebook on [tool calling with the Together API](Tool_Calling101_Using_Together's_Llama_Stack_Server.ipynb). 
This notebook will show you how to leverage together.ai's Llama Stack Server API, allowing you to get started with Llama Stack without the need for a locally built and running server. ## Table of Contents -1. [Setup ollama](#setup-ollama) +1. [Setup and run ollama](#setup-ollama) 2. [Install Dependencies and Set Up Environment](#install-dependencies-and-set-up-environment) 3. [Build, Configure, and Run Llama Stack](#build-configure-and-run-llama-stack) -4. [Run Ollama Model](#run-ollama-model) -5. [Next Steps](#next-steps) +4. [Test with llama-stack-client CLI](#test-with-llama-stack-client-cli) +5. [Test with curl](#test-with-curl) +6. [Test with Python](#test-with-python) +7. [Next Steps](#next-steps) --- @@ -39,107 +23,137 @@ If you're looking for more specific topics like tool calling or agent setup, we 1. **Download Ollama App**: - Go to [https://ollama.com/download](https://ollama.com/download). - - Download and unzip `Ollama-darwin.zip`. + - Follow instructions based on the OS you are on. For example, if you are on a Mac, download and unzip `Ollama-darwin.zip`. - Run the `Ollama` application. 1. **Download the Ollama CLI**: - - Ensure you have the `ollama` command line tool by downloading and installing it from the same website. + Ensure you have the `ollama` command line tool by downloading and installing it from the same website. 1. **Start ollama server**: - - Open the terminal and run: - ``` - ollama serve - ``` - + Open the terminal and run: + ``` + ollama serve + ``` 1. **Run the model**: - - Open the terminal and run: - ```bash - ollama run llama3.2:3b-instruct-fp16 - ``` - **Note**: The supported models for llama stack for now is listed in [here](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L43) - + Open the terminal and run: + ```bash + ollama run llama3.2:3b-instruct-fp16 --keepalive -1m + ``` + **Note**: + - The supported models for llama stack for now is listed in [here](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L43) + - `keepalive -1m` is used so that ollama continues to keep the model in memory indefinitely. Otherwise, ollama frees up memory and you would have to run `ollama run` again. --- ## Install Dependencies and Set Up Environment 1. **Create a Conda Environment**: - - Create a new Conda environment with Python 3.10: - ```bash - conda create -n ollama python=3.10 - ``` - - Activate the environment: - ```bash - conda activate ollama - ``` + Create a new Conda environment with Python 3.10: + ```bash + conda create -n ollama python=3.10 + ``` + Activate the environment: + ```bash + conda activate ollama + ``` 2. **Install ChromaDB**: - - Install `chromadb` using `pip`: - ```bash - pip install chromadb - ``` + Install `chromadb` using `pip`: + ```bash + pip install chromadb + ``` 3. **Run ChromaDB**: - - Start the ChromaDB server: - ```bash - chroma run --host localhost --port 8000 --path ./my_chroma_data - ``` + Start the ChromaDB server: + ```bash + chroma run --host localhost --port 8000 --path ./my_chroma_data + ``` 4. **Install Llama Stack**: - - Open a new terminal and install `llama-stack`: - ```bash - conda activate hack - pip install llama-stack==0.0.53 - ``` + Open a new terminal and install `llama-stack`: + ```bash + conda activate ollama + pip install llama-stack==0.0.55 + ``` --- ## Build, Configure, and Run Llama Stack 1. 
**Build the Llama Stack**: - - Build the Llama Stack using the `ollama` template: - ```bash - llama stack build --template ollama --image-type conda - ``` - -After this step, you will see the console output: - -``` -Build Successful! Next steps: + Build the Llama Stack using the `ollama` template: + ```bash + llama stack build --template ollama --image-type conda + ``` + **Expected Output:** + ``` + ... + Build Successful! Next steps: 1. Set the environment variables: LLAMASTACK_PORT, OLLAMA_URL, INFERENCE_MODEL, SAFETY_MODEL - 2. `llama stack run /Users/username/.llama/distributions/llamastack-ollama/ollama-run.yaml` -``` + 2. `llama stack run /Users//.llama/distributions/llamastack-ollama/ollama-run.yaml + ``` -2. **Set the ENV variables by exporting them to the terminal**: -```bash -export OLLAMA_URL="http://localhost:11434" -export LLAMA_STACK_PORT=5001 -export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" -export SAFETY_MODEL="meta-llama/Llama-Guard-3-1B" -``` +3. **Set the ENV variables by exporting them to the terminal**: + ```bash + export OLLAMA_URL="http://localhost:11434" + export LLAMA_STACK_PORT=5051 + export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" + export SAFETY_MODEL="meta-llama/Llama-Guard-3-1B" + ``` 3. **Run the Llama Stack**: - - Run the stack with command shared by the API from earlier: - ```bash - llama stack run ollama \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env SAFETY_MODEL=$SAFETY_MODEL \ - --env OLLAMA_URL=http://localhost:11434 - ``` - -Note: Everytime you run a new model with `ollama run`, you will need to restart the llama stack. Otherwise it won't see the new model + Run the stack with command shared by the API from earlier: + ```bash + llama stack run ollama \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env SAFETY_MODEL=$SAFETY_MODEL \ + --env OLLAMA_URL=$OLLAMA_URL + ``` + Note: Everytime you run a new model with `ollama run`, you will need to restart the llama stack. Otherwise it won't see the new model. The server will start and listen on `http://localhost:5051`. --- +## Test with `llama-stack-client` CLI +After setting up the server, open a new terminal window and install the llama-stack-client package. -## Testing with `curl` +1. Install the llama-stack-client package + ```bash + conda activate ollama + pip install llama-stack-client + ``` +2. Configure the CLI to point to the llama-stack server. + ```bash + llama-stack-client configure --endpoint http://localhost:5051 + ``` + **Expected Output:** + ```bash + Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:5051 + ``` +3. 
Test the CLI by running inference: + ```bash + llama-stack-client inference chat-completion --message "Write me a 2-sentence poem about the moon" + ``` + **Expected Output:** + ```bash + ChatCompletionResponse( + completion_message=CompletionMessage( + content='Here is a 2-sentence poem about the moon:\n\nSilver crescent shining bright in the night,\nA beacon of wonder, full of gentle light.', + role='assistant', + stop_reason='end_of_turn', + tool_calls=[] + ), + logprobs=None + ) + ``` + +## Test with `curl` After setting up the server, open a new terminal window and verify it's working by sending a `POST` request using `curl`: ```bash -curl http://localhost:5051/inference/chat_completion \ +curl http://localhost:$LLAMA_STACK_PORT/inference/chat_completion \ -H "Content-Type: application/json" \ -d '{ "model": "Llama3.2-3B-Instruct", @@ -168,15 +182,16 @@ You can check the available models with the command `llama-stack-client models l --- -## Testing with Python +## Test with Python You can also interact with the Llama Stack server using a simple Python script. Below is an example: -### 1. Active Conda Environment and Install Required Python Packages +### 1. Activate Conda Environment and Install Required Python Packages The `llama-stack-client` library offers a robust and efficient python methods for interacting with the Llama Stack server. ```bash -conda activate your-llama-stack-conda-env +conda activate ollama +pip install llama-stack-client ``` Note, the client library gets installed by default if you install the server library @@ -188,6 +203,8 @@ touch test_llama_stack.py ### 3. Create a Chat Completion Request in Python +In `test_llama_stack.py`, write the following code: + ```python from llama_stack_client import LlamaStackClient From fe48b9fb8c4df70f6566f14726194f9fbe325414 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Sat, 30 Nov 2024 12:27:31 -0800 Subject: [PATCH 13/17] Bump version to 0.0.56 --- requirements.txt | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index b5b7587d0..0ff43e246 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,8 +2,8 @@ blobfile fire httpx huggingface-hub -llama-models>=0.0.55 -llama-stack-client>=0.0.55 +llama-models>=0.0.56 +llama-stack-client>=0.0.56 prompt-toolkit python-dotenv pydantic>=2 diff --git a/setup.py b/setup.py index a4efd08c6..842cbb30d 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ def read_requirements(): setup( name="llama_stack", - version="0.0.55", + version="0.0.56", author="Meta Llama", author_email="llama-oss@meta.com", description="Llama Stack", From 6bcd1bd9f10a7bdda040e9549828770d5793145b Mon Sep 17 00:00:00 2001 From: Aidan Do Date: Tue, 3 Dec 2024 06:06:20 +1100 Subject: [PATCH 14/17] Fix broken Ollama link (#554) # What does this PR do? Fixes a broken Ollama link and formatting on this page: https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/ollama.html Screenshot 2024-12-02 at 21 04 17 image To: Screenshot 2024-12-02 at 21 05 07 image ## Before submitting - [x] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). 
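
For readers following the Zero to Hero guide changes earlier in this series: the diff truncates the `test_llama_stack.py` example after its import line, because the rest of that script was unchanged and therefore not part of the patch. A minimal sketch of what such a script might look like is below. The port, model ID, dict-style messages, and the `model_id` keyword are assumptions inferred from the surrounding docs (the exported environment variables and the `ChatCompletionResponse` output shown in the CLI test), not lines taken from this patch series.

```python
# Hypothetical test_llama_stack.py — a sketch, not the committed file.
import os

from llama_stack_client import LlamaStackClient

# Assumes the stack started earlier is listening on the exported port (5051 in the guide).
client = LlamaStackClient(
    base_url=f"http://localhost:{os.environ.get('LLAMA_STACK_PORT', '5051')}"
)

# Keyword name model_id and plain-dict messages are assumptions for this client version.
response = client.inference.chat_completion(
    model_id=os.environ.get("INFERENCE_MODEL", "meta-llama/Llama-3.2-3B-Instruct"),
    messages=[
        {"role": "system", "content": "You are a friendly assistant."},
        {"role": "user", "content": "Write a two-sentence poem about llamas."},
    ],
)

# Mirrors the ChatCompletionResponse shape shown in the llama-stack-client CLI test above.
print(response.completion_message.content)
```

If the sketch matches your installed client, running `python test_llama_stack.py` against the running stack should print the assistant's reply, analogous to the CLI output shown earlier.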
Co-authored-by: Aidan Do --- docs/source/distributions/self_hosted_distro/ollama.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/distributions/self_hosted_distro/ollama.md b/docs/source/distributions/self_hosted_distro/ollama.md index 0eb245483..9f81d9329 100644 --- a/docs/source/distributions/self_hosted_distro/ollama.md +++ b/docs/source/distributions/self_hosted_distro/ollama.md @@ -118,9 +118,9 @@ llama stack run ./run-with-safety.yaml \ ### (Optional) Update Model Serving Configuration -> [!NOTE] -> Please check the [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers.remote/inference/ollama/ollama.py) for the supported Ollama models. - +```{note} +Please check the [model_aliases](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L45) variable for supported Ollama models. +``` To serve a new model with `ollama` ```bash From 1e2faa461fd5843f83fc3db75cab5c10a7353194 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Mon, 2 Dec 2024 16:10:16 -0800 Subject: [PATCH 15/17] update client cli docs (#560) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test plan: make html sphinx-autobuild source build/html ![Screenshot 2024-12-02 at 3 32 18 PM](https://github.com/user-attachments/assets/061d5ca6-178f-463a-854c-acb96ca3bb0d) --- .../llama_stack_client_cli_reference.md | 75 +++++++++++++++++-- 1 file changed, 68 insertions(+), 7 deletions(-) diff --git a/docs/source/references/llama_stack_client_cli_reference.md b/docs/source/references/llama_stack_client_cli_reference.md index d3835e488..b35aa189d 100644 --- a/docs/source/references/llama_stack_client_cli_reference.md +++ b/docs/source/references/llama_stack_client_cli_reference.md @@ -27,8 +27,6 @@ $ llama-stack-client configure Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:5000 ``` -## Provider Commands - ### `llama-stack-client providers list` ```bash $ llama-stack-client providers list @@ -119,8 +117,25 @@ $ llama-stack-client memory_banks list +--------------+----------------+--------+-------------------+------------------------+--------------------------+ ``` -## Shield Management +### `llama-stack-client memory_banks register` +```bash +$ llama-stack-client memory_banks register --type [--provider-id ] [--provider-memory-bank-id ] [--chunk-size ] [--embedding-model ] [--overlap-size ] +``` +Options: +- `--type`: Required. Type of memory bank. Choices: "vector", "keyvalue", "keyword", "graph" +- `--provider-id`: Optional. Provider ID for the memory bank +- `--provider-memory-bank-id`: Optional. Provider's memory bank ID +- `--chunk-size`: Optional. Chunk size in tokens (for vector type). Default: 512 +- `--embedding-model`: Optional. Embedding model (for vector type). Default: "all-MiniLM-L6-v2" +- `--overlap-size`: Optional. Overlap size in tokens (for vector type). 
Default: 64 + +### `llama-stack-client memory_banks unregister` +```bash +$ llama-stack-client memory_banks unregister +``` + +## Shield Management ### `llama-stack-client shields list` ```bash $ llama-stack-client shields list @@ -134,16 +149,51 @@ $ llama-stack-client shields list +--------------+----------+----------------+-------------+ ``` -## Evaluation Tasks +### `llama-stack-client shields register` +```bash +$ llama-stack-client shields register --shield-id [--provider-id ] [--provider-shield-id ] [--params ] +``` + +Options: +- `--shield-id`: Required. ID of the shield +- `--provider-id`: Optional. Provider ID for the shield +- `--provider-shield-id`: Optional. Provider's shield ID +- `--params`: Optional. JSON configuration parameters for the shield + +## Eval Task Management ### `llama-stack-client eval_tasks list` ```bash -$ llama-stack-client eval run_benchmark --num-examples 10 --output-dir ./ --eval-task-config ~/eval_task_config.json +$ llama-stack-client eval_tasks list ``` -where `eval_task_config.json` is the path to the eval task config file in JSON format. An example eval_task_config +### `llama-stack-client eval_tasks register` +```bash +$ llama-stack-client eval_tasks register --eval-task-id --dataset-id --scoring-functions [ ...] [--provider-id ] [--provider-eval-task-id ] [--metadata ] ``` -$ cat ~/eval_task_config.json + +Options: +- `--eval-task-id`: Required. ID of the eval task +- `--dataset-id`: Required. ID of the dataset to evaluate +- `--scoring-functions`: Required. One or more scoring functions to use for evaluation +- `--provider-id`: Optional. Provider ID for the eval task +- `--provider-eval-task-id`: Optional. Provider's eval task ID +- `--metadata`: Optional. Metadata for the eval task in JSON format + +## Eval execution +### `llama-stack-client eval run-benchmark` +```bash +$ llama-stack-client eval run-benchmark [ ...] --eval-task-config --output-dir [--num-examples ] [--visualize] +``` + +Options: +- `--eval-task-config`: Required. Path to the eval task config file in JSON format +- `--output-dir`: Required. Path to the directory where evaluation results will be saved +- `--num-examples`: Optional. Number of examples to evaluate (useful for debugging) +- `--visualize`: Optional flag. If set, visualizes evaluation results after completion + +Example eval_task_config.json: +```json { "type": "benchmark", "eval_candidate": { @@ -160,3 +210,14 @@ $ cat ~/eval_task_config.json } } ``` + +### `llama-stack-client eval run-scoring` +```bash +$ llama-stack-client eval run-scoring --eval-task-config --output-dir [--num-examples ] [--visualize] +``` + +Options: +- `--eval-task-config`: Required. Path to the eval task config file in JSON format +- `--output-dir`: Required. Path to the directory where scoring results will be saved +- `--num-examples`: Optional. Number of examples to evaluate (useful for debugging) +- `--visualize`: Optional flag. 
If set, visualizes scoring results after completion From 4c7b1a8fb3acb8f65dac9c2f066f86e31d6cd805 Mon Sep 17 00:00:00 2001 From: dltn <6599399+dltn@users.noreply.github.com> Date: Mon, 2 Dec 2024 19:48:46 -0800 Subject: [PATCH 16/17] Bump version to 0.0.57 --- requirements.txt | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0ff43e246..8698495b1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,8 +2,8 @@ blobfile fire httpx huggingface-hub -llama-models>=0.0.56 -llama-stack-client>=0.0.56 +llama-models>=0.0.57 +llama-stack-client>=0.0.57 prompt-toolkit python-dotenv pydantic>=2 diff --git a/setup.py b/setup.py index 842cbb30d..3d68021dd 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ def read_requirements(): setup( name="llama_stack", - version="0.0.56", + version="0.0.57", author="Meta Llama", author_email="llama-oss@meta.com", description="Llama Stack", From 435f34b05e84f1747b28570234f25878cf0b31c4 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Tue, 3 Dec 2024 05:55:14 -0500 Subject: [PATCH 17/17] reduce the accuracy requirements to pass the chat completion structured output test (#522) i find `test_structured_output` to be flakey. it's both a functionality and accuracy test - ``` answer = AnswerFormat.model_validate_json(response.completion_message.content) assert answer.first_name == "Michael" assert answer.last_name == "Jordan" assert answer.year_of_birth == 1963 assert answer.num_seasons_in_nba == 15 ``` it's an accuracy test because it checks the value of first/last name, birth year, and num seasons. i find that - - llama-3.1-8b-instruct and llama-3.2-3b-instruct pass the functionality portion - llama-3.2-3b-instruct consistently fails the accuracy portion (thinking MJ was in the NBA for 14 seasons) - llama-3.1-8b-instruct occasionally fails the accuracy portion suggestions (not mutually exclusive) - 1. turn the test into functionality only, skip the value checks 2. split the test into a functionality version and an xfail accuracy version 3. add context to the prompt so the llm can answer without accessing embedded memory # What does this PR do? implements option (3) by adding context to the system prompt. ## Test Plan `pytest -s -v ... llama_stack/providers/tests/inference/ ... -k structured_output` ## Before submitting - [x] Ran pre-commit to handle lint / formatting issues. - [x] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [x] Updated relevant documentation. - [x] Wrote necessary unit or integration tests. --- .../providers/tests/inference/test_text_inference.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/llama_stack/providers/tests/inference/test_text_inference.py b/llama_stack/providers/tests/inference/test_text_inference.py index f0f1d0eb2..9e5c67375 100644 --- a/llama_stack/providers/tests/inference/test_text_inference.py +++ b/llama_stack/providers/tests/inference/test_text_inference.py @@ -211,7 +211,15 @@ class TestInference: response = await inference_impl.chat_completion( model_id=inference_model, messages=[ - SystemMessage(content="You are a helpful assistant."), + # we include context about Michael Jordan in the prompt so that the test is + # focused on the funtionality of the model and not on the information embedded + # in the model. Llama 3.2 3B Instruct tends to think MJ played for 14 seasons. 
+ SystemMessage( + content=( + "You are a helpful assistant.\n\n" + "Michael Jordan was born in 1963. He played basketball for the Chicago Bulls for 15 seasons." + ) + ), UserMessage(content="Please give me information about Michael Jordan."), ], stream=False,