From 50cc165077acc76021a61a280b0c28cbefd96c12 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 26 Nov 2024 13:11:21 -0800 Subject: [PATCH 01/69] fixes tests & move braintrust api_keys to request headers (#535) # What does this PR do? - braintrust scoring provider requires OPENAI_API_KEY env variable to be set - move this to be able to be set as request headers (e.g. like together / fireworks api keys) - fixes pytest with agents dependency ## Test Plan **E2E** ``` llama stack run ``` ```yaml scoring: - provider_id: braintrust-0 provider_type: inline::braintrust config: {} ``` **Client** ```python self.client = LlamaStackClient( base_url=os.environ.get("LLAMA_STACK_ENDPOINT", "http://localhost:5000"), provider_data={ "openai_api_key": os.environ.get("OPENAI_API_KEY", ""), }, ) ``` - run `llama-stack-client eval run_scoring` **Unit Test** ``` pytest -v -s -m meta_reference_eval_together_inference eval/test_eval.py ``` ``` pytest -v -s -m braintrust_scoring_together_inference scoring/test_scoring.py --env OPENAI_API_KEY=$OPENAI_API_KEY ``` image ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. --- llama_stack/distribution/request_headers.py | 2 +- .../inline/scoring/braintrust/__init__.py | 5 ++++ .../inline/scoring/braintrust/braintrust.py | 23 +++++++++++++++++-- .../inline/scoring/braintrust/config.py | 6 ++++- llama_stack/providers/registry/scoring.py | 1 + llama_stack/providers/tests/eval/conftest.py | 16 +++++++++++++ llama_stack/providers/tests/eval/fixtures.py | 20 ++++++++++++++-- .../providers/tests/scoring/fixtures.py | 7 ++++-- 8 files changed, 72 insertions(+), 8 deletions(-) diff --git a/llama_stack/distribution/request_headers.py b/llama_stack/distribution/request_headers.py index 27ef3046a..41952edfd 100644 --- a/llama_stack/distribution/request_headers.py +++ b/llama_stack/distribution/request_headers.py @@ -35,7 +35,7 @@ class NeedsRequestProviderData: provider_data = validator(**val) return provider_data except Exception as e: - log.error("Error parsing provider data", e) + log.error(f"Error parsing provider data: {e}") def set_request_provider_data(headers: Dict[str, str]): diff --git a/llama_stack/providers/inline/scoring/braintrust/__init__.py b/llama_stack/providers/inline/scoring/braintrust/__init__.py index f442a6c3b..dc4ea4951 100644 --- a/llama_stack/providers/inline/scoring/braintrust/__init__.py +++ b/llama_stack/providers/inline/scoring/braintrust/__init__.py @@ -6,10 +6,15 @@ from typing import Dict from llama_stack.distribution.datatypes import Api, ProviderSpec +from pydantic import BaseModel from .config import BraintrustScoringConfig +class BraintrustProviderDataValidator(BaseModel): + openai_api_key: str + + async def get_provider_impl( config: BraintrustScoringConfig, deps: Dict[Api, ProviderSpec], diff --git a/llama_stack/providers/inline/scoring/braintrust/braintrust.py b/llama_stack/providers/inline/scoring/braintrust/braintrust.py index 00817bb33..cf6e22a29 100644 --- a/llama_stack/providers/inline/scoring/braintrust/braintrust.py +++ b/llama_stack/providers/inline/scoring/braintrust/braintrust.py @@ -12,9 +12,11 @@ from llama_stack.apis.common.type_system import * # noqa: F403 from llama_stack.apis.datasetio 
import * # noqa: F403 from llama_stack.apis.datasets import * # noqa: F403 -# from .scoring_fn.braintrust_scoring_fn import BraintrustScoringFn +import os + from autoevals.llm import Factuality from autoevals.ragas import AnswerCorrectness +from llama_stack.distribution.request_headers import NeedsRequestProviderData from llama_stack.providers.datatypes import ScoringFunctionsProtocolPrivate from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_average @@ -24,7 +26,9 @@ from .scoring_fn.fn_defs.answer_correctness import answer_correctness_fn_def from .scoring_fn.fn_defs.factuality import factuality_fn_def -class BraintrustScoringImpl(Scoring, ScoringFunctionsProtocolPrivate): +class BraintrustScoringImpl( + Scoring, ScoringFunctionsProtocolPrivate, NeedsRequestProviderData +): def __init__( self, config: BraintrustScoringConfig, @@ -79,12 +83,25 @@ class BraintrustScoringImpl(Scoring, ScoringFunctionsProtocolPrivate): f"Dataset {dataset_id} does not have a '{required_column}' column of type 'string'." ) + async def set_api_key(self) -> None: + # api key is in the request headers + if self.config.openai_api_key is None: + provider_data = self.get_request_provider_data() + if provider_data is None or not provider_data.openai_api_key: + raise ValueError( + 'Pass OpenAI API Key in the header X-LlamaStack-ProviderData as { "openai_api_key": }' + ) + self.config.openai_api_key = provider_data.openai_api_key + + os.environ["OPENAI_API_KEY"] = self.config.openai_api_key + async def score_batch( self, dataset_id: str, scoring_functions: List[str], save_results_dataset: bool = False, ) -> ScoreBatchResponse: + await self.set_api_key() await self.validate_scoring_input_dataset_schema(dataset_id=dataset_id) all_rows = await self.datasetio_api.get_rows_paginated( dataset_id=dataset_id, @@ -105,6 +122,7 @@ class BraintrustScoringImpl(Scoring, ScoringFunctionsProtocolPrivate): async def score_row( self, input_row: Dict[str, Any], scoring_fn_identifier: Optional[str] = None ) -> ScoringResultRow: + await self.set_api_key() assert scoring_fn_identifier is not None, "scoring_fn_identifier cannot be None" expected_answer = input_row["expected_answer"] generated_answer = input_row["generated_answer"] @@ -118,6 +136,7 @@ class BraintrustScoringImpl(Scoring, ScoringFunctionsProtocolPrivate): async def score( self, input_rows: List[Dict[str, Any]], scoring_functions: List[str] ) -> ScoreResponse: + await self.set_api_key() res = {} for scoring_fn_id in scoring_functions: if scoring_fn_id not in self.supported_fn_defs_registry: diff --git a/llama_stack/providers/inline/scoring/braintrust/config.py b/llama_stack/providers/inline/scoring/braintrust/config.py index fef6df5c8..fae0b17eb 100644 --- a/llama_stack/providers/inline/scoring/braintrust/config.py +++ b/llama_stack/providers/inline/scoring/braintrust/config.py @@ -6,4 +6,8 @@ from llama_stack.apis.scoring import * # noqa: F401, F403 -class BraintrustScoringConfig(BaseModel): ... 
+class BraintrustScoringConfig(BaseModel): + openai_api_key: Optional[str] = Field( + default=None, + description="The OpenAI API Key", + ) diff --git a/llama_stack/providers/registry/scoring.py b/llama_stack/providers/registry/scoring.py index 2da9797bc..f31ff44d7 100644 --- a/llama_stack/providers/registry/scoring.py +++ b/llama_stack/providers/registry/scoring.py @@ -44,5 +44,6 @@ def available_providers() -> List[ProviderSpec]: Api.datasetio, Api.datasets, ], + provider_data_validator="llama_stack.providers.inline.scoring.braintrust.BraintrustProviderDataValidator", ), ] diff --git a/llama_stack/providers/tests/eval/conftest.py b/llama_stack/providers/tests/eval/conftest.py index 171fae51a..b310439ce 100644 --- a/llama_stack/providers/tests/eval/conftest.py +++ b/llama_stack/providers/tests/eval/conftest.py @@ -6,10 +6,14 @@ import pytest +from ..agents.fixtures import AGENTS_FIXTURES + from ..conftest import get_provider_fixture_overrides from ..datasetio.fixtures import DATASETIO_FIXTURES from ..inference.fixtures import INFERENCE_FIXTURES +from ..memory.fixtures import MEMORY_FIXTURES +from ..safety.fixtures import SAFETY_FIXTURES from ..scoring.fixtures import SCORING_FIXTURES from .fixtures import EVAL_FIXTURES @@ -20,6 +24,9 @@ DEFAULT_PROVIDER_COMBINATIONS = [ "scoring": "basic", "datasetio": "localfs", "inference": "fireworks", + "agents": "meta_reference", + "safety": "llama_guard", + "memory": "faiss", }, id="meta_reference_eval_fireworks_inference", marks=pytest.mark.meta_reference_eval_fireworks_inference, @@ -30,6 +37,9 @@ DEFAULT_PROVIDER_COMBINATIONS = [ "scoring": "basic", "datasetio": "localfs", "inference": "together", + "agents": "meta_reference", + "safety": "llama_guard", + "memory": "faiss", }, id="meta_reference_eval_together_inference", marks=pytest.mark.meta_reference_eval_together_inference, @@ -40,6 +50,9 @@ DEFAULT_PROVIDER_COMBINATIONS = [ "scoring": "basic", "datasetio": "huggingface", "inference": "together", + "agents": "meta_reference", + "safety": "llama_guard", + "memory": "faiss", }, id="meta_reference_eval_together_inference_huggingface_datasetio", marks=pytest.mark.meta_reference_eval_together_inference_huggingface_datasetio, @@ -75,6 +88,9 @@ def pytest_generate_tests(metafunc): "scoring": SCORING_FIXTURES, "datasetio": DATASETIO_FIXTURES, "inference": INFERENCE_FIXTURES, + "agents": AGENTS_FIXTURES, + "safety": SAFETY_FIXTURES, + "memory": MEMORY_FIXTURES, } combinations = ( get_provider_fixture_overrides(metafunc.config, available_fixtures) diff --git a/llama_stack/providers/tests/eval/fixtures.py b/llama_stack/providers/tests/eval/fixtures.py index a6b404d0c..50dc9c16e 100644 --- a/llama_stack/providers/tests/eval/fixtures.py +++ b/llama_stack/providers/tests/eval/fixtures.py @@ -40,14 +40,30 @@ async def eval_stack(request): providers = {} provider_data = {} - for key in ["datasetio", "eval", "scoring", "inference"]: + for key in [ + "datasetio", + "eval", + "scoring", + "inference", + "agents", + "safety", + "memory", + ]: fixture = request.getfixturevalue(f"{key}_{fixture_dict[key]}") providers[key] = fixture.providers if fixture.provider_data: provider_data.update(fixture.provider_data) test_stack = await construct_stack_for_test( - [Api.eval, Api.datasetio, Api.inference, Api.scoring], + [ + Api.eval, + Api.datasetio, + Api.inference, + Api.scoring, + Api.agents, + Api.safety, + Api.memory, + ], providers, provider_data, ) diff --git a/llama_stack/providers/tests/scoring/fixtures.py b/llama_stack/providers/tests/scoring/fixtures.py index 
d89b211ef..a9f088e07 100644 --- a/llama_stack/providers/tests/scoring/fixtures.py +++ b/llama_stack/providers/tests/scoring/fixtures.py @@ -10,9 +10,10 @@ import pytest_asyncio from llama_stack.apis.models import ModelInput from llama_stack.distribution.datatypes import Api, Provider - +from llama_stack.providers.inline.scoring.braintrust import BraintrustScoringConfig from llama_stack.providers.tests.resolver import construct_stack_for_test from ..conftest import ProviderFixture, remote_stack_fixture +from ..env import get_env_or_fail @pytest.fixture(scope="session") @@ -40,7 +41,9 @@ def scoring_braintrust() -> ProviderFixture: Provider( provider_id="braintrust", provider_type="inline::braintrust", - config={}, + config=BraintrustScoringConfig( + openai_api_key=get_env_or_fail("OPENAI_API_KEY"), + ).model_dump(), ) ], ) From 060b4eb776f1bd5a816ee882f5c475a3555f8816 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Tue, 26 Nov 2024 20:46:44 -0500 Subject: [PATCH 02/69] allow env NVIDIA_BASE_URL to set NVIDIAConfig.url (#531) # What does this PR do? this allows setting an NVIDIA_BASE_URL variable to control the NVIDIAConfig.url option ## Test Plan `pytest -s -v --providers inference=nvidia llama_stack/providers/tests/inference/ --env NVIDIA_BASE_URL=http://localhost:8000` ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [x] Ran pre-commit to handle lint / formatting issues. - [x] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. --- llama_stack/providers/remote/inference/nvidia/config.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_stack/providers/remote/inference/nvidia/config.py b/llama_stack/providers/remote/inference/nvidia/config.py index c50143043..28be43f4c 100644 --- a/llama_stack/providers/remote/inference/nvidia/config.py +++ b/llama_stack/providers/remote/inference/nvidia/config.py @@ -35,7 +35,9 @@ class NVIDIAConfig(BaseModel): """ url: str = Field( - default="https://integrate.api.nvidia.com", + default_factory=lambda: os.getenv( + "NVIDIA_BASE_URL", "https://integrate.api.nvidia.com" + ), description="A base url for accessing the NVIDIA NIM", ) api_key: Optional[str] = Field( From b1a63df8cdae6e45d1db10f8c73eca6cd75ba68e Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 26 Nov 2024 22:04:21 -0800 Subject: [PATCH 03/69] move playground ui to llama-stack repo (#536) # What does this PR do? - Move Llama Stack Playground UI to llama-stack repo under llama_stack/distribution - Original PR in llama-stack-apps: https://github.com/meta-llama/llama-stack-apps/pull/127 ## Test Plan ``` cd llama-stack/llama_stack/distribution/ui streamlit run app.py ``` ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. 
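
For reference, here is a minimal sketch of the raw client calls that the playground's `LlamaStackEvaluation` helper (added below in `modules/api.py`) wraps. The endpoint, provider-data key, and example row are illustrative assumptions, not part of this patch:

```python
import os

from llama_stack_client import LlamaStackClient

# Point at a running Llama Stack server (endpoint is illustrative).
client = LlamaStackClient(
    base_url=os.environ.get("LLAMA_STACK_ENDPOINT", "http://localhost:5000"),
    provider_data={"openai_api_key": os.environ.get("OPENAI_API_KEY", "")},
)

# Discover registered scoring functions, then score one example row with default params.
scoring_fn_ids = [sf.identifier for sf in client.scoring_functions.list()]
row = {
    "input_query": "What is 2 + 2?",  # example keys only; real datasets define their own columns
    "generated_answer": "2 + 2 equals 4.",
    "expected_answer": "4",
}
response = client.scoring.score(
    input_rows=[row],
    scoring_functions={fn_id: None for fn_id in scoring_fn_ids},
)
for fn_id in scoring_fn_ids:
    print(fn_id, response.results[fn_id].score_rows[0])
```

The Streamlit app drives these same calls once per uploaded row and collects `score_rows` into the results dataframe.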
--- llama_stack/distribution/ui/README.md | 11 ++ llama_stack/distribution/ui/__init__.py | 5 + llama_stack/distribution/ui/app.py | 173 +++++++++++++++++++ llama_stack/distribution/ui/modules/api.py | 41 +++++ llama_stack/distribution/ui/modules/utils.py | 31 ++++ llama_stack/distribution/ui/requirements.txt | 3 + 6 files changed, 264 insertions(+) create mode 100644 llama_stack/distribution/ui/README.md create mode 100644 llama_stack/distribution/ui/__init__.py create mode 100644 llama_stack/distribution/ui/app.py create mode 100644 llama_stack/distribution/ui/modules/api.py create mode 100644 llama_stack/distribution/ui/modules/utils.py create mode 100644 llama_stack/distribution/ui/requirements.txt diff --git a/llama_stack/distribution/ui/README.md b/llama_stack/distribution/ui/README.md new file mode 100644 index 000000000..a91883067 --- /dev/null +++ b/llama_stack/distribution/ui/README.md @@ -0,0 +1,11 @@ +# LLama Stack UI + +[!NOTE] This is a work in progress. + +## Running Streamlit App + +``` +cd llama_stack/distribution/ui +pip install -r requirements.txt +streamlit run app.py +``` diff --git a/llama_stack/distribution/ui/__init__.py b/llama_stack/distribution/ui/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/llama_stack/distribution/ui/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/llama_stack/distribution/ui/app.py b/llama_stack/distribution/ui/app.py new file mode 100644 index 000000000..763b126a7 --- /dev/null +++ b/llama_stack/distribution/ui/app.py @@ -0,0 +1,173 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +import json + +import pandas as pd + +import streamlit as st + +from modules.api import LlamaStackEvaluation + +from modules.utils import process_dataset + +EVALUATION_API = LlamaStackEvaluation() + + +def main(): + # Add collapsible sidebar + with st.sidebar: + # Add collapse button + if "sidebar_state" not in st.session_state: + st.session_state.sidebar_state = True + + if st.session_state.sidebar_state: + st.title("Navigation") + page = st.radio( + "Select a Page", + ["Application Evaluation"], + index=0, + ) + else: + page = "Application Evaluation" # Default page when sidebar is collapsed + + # Main content area + st.title("🦙 Llama Stack Evaluations") + + if page == "Application Evaluation": + application_evaluation_page() + + +def application_evaluation_page(): + # File uploader + uploaded_file = st.file_uploader("Upload Dataset", type=["csv", "xlsx", "xls"]) + + if uploaded_file is None: + st.error("No file uploaded") + return + + # Process uploaded file + df = process_dataset(uploaded_file) + if df is None: + st.error("Error processing file") + return + + # Display dataset information + st.success("Dataset loaded successfully!") + + # Display dataframe preview + st.subheader("Dataset Preview") + st.dataframe(df) + + # Select Scoring Functions to Run Evaluation On + st.subheader("Select Scoring Functions") + scoring_functions = EVALUATION_API.list_scoring_functions() + scoring_functions = {sf.identifier: sf for sf in scoring_functions} + scoring_functions_names = list(scoring_functions.keys()) + selected_scoring_functions = st.multiselect( + "Choose one or more scoring functions", + options=scoring_functions_names, + help="Choose one or more scoring functions.", + ) + + available_models = EVALUATION_API.list_models() + available_models = [m.identifier for m in available_models] + + scoring_params = {} + if selected_scoring_functions: + st.write("Selected:") + for scoring_fn_id in selected_scoring_functions: + scoring_fn = scoring_functions[scoring_fn_id] + st.write(f"- **{scoring_fn_id}**: {scoring_fn.description}") + new_params = None + if scoring_fn.params: + new_params = {} + for param_name, param_value in scoring_fn.params.to_dict().items(): + if param_name == "type": + new_params[param_name] = param_value + continue + + if param_name == "judge_model": + value = st.selectbox( + f"Select **{param_name}** for {scoring_fn_id}", + options=available_models, + index=0, + key=f"{scoring_fn_id}_{param_name}", + ) + new_params[param_name] = value + else: + value = st.text_area( + f"Enter value for **{param_name}** in {scoring_fn_id} in valid JSON format", + value=json.dumps(param_value, indent=2), + height=80, + ) + try: + new_params[param_name] = json.loads(value) + except json.JSONDecodeError: + st.error( + f"Invalid JSON for **{param_name}** in {scoring_fn_id}" + ) + + st.json(new_params) + scoring_params[scoring_fn_id] = new_params + + # Add run evaluation button & slider + total_rows = len(df) + num_rows = st.slider("Number of rows to evaluate", 1, total_rows, total_rows) + + if st.button("Run Evaluation"): + progress_text = "Running evaluation..." 
+ progress_bar = st.progress(0, text=progress_text) + rows = df.to_dict(orient="records") + if num_rows < total_rows: + rows = rows[:num_rows] + + # Create separate containers for progress text and results + progress_text_container = st.empty() + results_container = st.empty() + output_res = {} + for i, r in enumerate(rows): + # Update progress + progress = i / len(rows) + progress_bar.progress(progress, text=progress_text) + + # Run evaluation for current row + score_res = EVALUATION_API.run_scoring( + r, + scoring_function_ids=selected_scoring_functions, + scoring_params=scoring_params, + ) + + for k in r.keys(): + if k not in output_res: + output_res[k] = [] + output_res[k].append(r[k]) + + for fn_id in selected_scoring_functions: + if fn_id not in output_res: + output_res[fn_id] = [] + output_res[fn_id].append(score_res.results[fn_id].score_rows[0]) + + # Display current row results using separate containers + progress_text_container.write( + f"Expand to see current processed result ({i+1}/{len(rows)})" + ) + results_container.json( + score_res.to_json(), + expanded=2, + ) + + progress_bar.progress(1.0, text="Evaluation complete!") + + # Display results in dataframe + if output_res: + output_df = pd.DataFrame(output_res) + st.subheader("Evaluation Results") + st.dataframe(output_df) + + +if __name__ == "__main__": + main() diff --git a/llama_stack/distribution/ui/modules/api.py b/llama_stack/distribution/ui/modules/api.py new file mode 100644 index 000000000..a8d8bf37d --- /dev/null +++ b/llama_stack/distribution/ui/modules/api.py @@ -0,0 +1,41 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import os + +from typing import Optional + +from llama_stack_client import LlamaStackClient + + +class LlamaStackEvaluation: + def __init__(self): + self.client = LlamaStackClient( + base_url=os.environ.get("LLAMA_STACK_ENDPOINT", "http://localhost:5000"), + provider_data={ + "fireworks_api_key": os.environ.get("FIREWORKS_API_KEY", ""), + "together_api_key": os.environ.get("TOGETHER_API_KEY", ""), + "openai_api_key": os.environ.get("OPENAI_API_KEY", ""), + }, + ) + + def list_scoring_functions(self): + """List all available scoring functions""" + return self.client.scoring_functions.list() + + def list_models(self): + """List all available judge models""" + return self.client.models.list() + + def run_scoring( + self, row, scoring_function_ids: list[str], scoring_params: Optional[dict] + ): + """Run scoring on a single row""" + if not scoring_params: + scoring_params = {fn_id: None for fn_id in scoring_function_ids} + return self.client.scoring.score( + input_rows=[row], scoring_functions=scoring_params + ) diff --git a/llama_stack/distribution/ui/modules/utils.py b/llama_stack/distribution/ui/modules/utils.py new file mode 100644 index 000000000..f8da2e54e --- /dev/null +++ b/llama_stack/distribution/ui/modules/utils.py @@ -0,0 +1,31 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +import os + +import pandas as pd +import streamlit as st + + +def process_dataset(file): + if file is None: + return "No file uploaded", None + + try: + # Determine file type and read accordingly + file_ext = os.path.splitext(file.name)[1].lower() + if file_ext == ".csv": + df = pd.read_csv(file) + elif file_ext in [".xlsx", ".xls"]: + df = pd.read_excel(file) + else: + return "Unsupported file format. Please upload a CSV or Excel file.", None + + return df + + except Exception as e: + st.error(f"Error processing file: {str(e)}") + return None diff --git a/llama_stack/distribution/ui/requirements.txt b/llama_stack/distribution/ui/requirements.txt new file mode 100644 index 000000000..c03959444 --- /dev/null +++ b/llama_stack/distribution/ui/requirements.txt @@ -0,0 +1,3 @@ +streamlit +pandas +llama-stack-client>=0.0.55 From 9088206eda1fecdfe2d643c9acb68a20c97460e0 Mon Sep 17 00:00:00 2001 From: Sean Date: Fri, 29 Nov 2024 21:43:56 +0800 Subject: [PATCH 04/69] fix[documentation]: Update links to point to correct pages (#549) # What does this PR do? In short, provide a summary of what this PR does and why. Usually, the relevant context should be present in a linked issue. - [x] Addresses issue (#548) ## Test Plan Please describe: No automated tests. Clicked on each link to ensure I was directed to the right page. ## Sources ## Before submitting - [x] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [x] Ran pre-commit to handle lint / formatting issues. - [x] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [x] Updated relevant documentation. - [ ] ~Wrote necessary unit or integration tests.~ --- .../01_Local_Cloud_Inference101.ipynb | 2 +- .../02_Prompt_Engineering101.ipynb | 2 +- docs/zero_to_hero_guide/03_Image_Chat101.ipynb | 2 +- docs/zero_to_hero_guide/05_Memory101.ipynb | 2 +- docs/zero_to_hero_guide/06_Safety101.ipynb | 2 +- docs/zero_to_hero_guide/README.md | 14 +++++++------- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb b/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb index 7225f0741..bdfd3520f 100644 --- a/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb +++ b/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb @@ -231,7 +231,7 @@ "source": [ "Thanks for checking out this notebook! \n", "\n", - "The next one will be a guide on [Prompt Engineering](./01_Prompt_Engineering101.ipynb), please continue learning!" + "The next one will be a guide on [Prompt Engineering](./02_Prompt_Engineering101.ipynb), please continue learning!" ] } ], diff --git a/docs/zero_to_hero_guide/02_Prompt_Engineering101.ipynb b/docs/zero_to_hero_guide/02_Prompt_Engineering101.ipynb index c66192d81..c1c8a5aa9 100644 --- a/docs/zero_to_hero_guide/02_Prompt_Engineering101.ipynb +++ b/docs/zero_to_hero_guide/02_Prompt_Engineering101.ipynb @@ -276,7 +276,7 @@ "source": [ "Thanks for checking out this notebook! \n", "\n", - "The next one will be a guide on how to chat with images, continue to the notebook [here](./02_Image_Chat101.ipynb). Happy learning!" + "The next one will be a guide on how to chat with images, continue to the notebook [here](./03_Image_Chat101.ipynb). Happy learning!" 
] } ], diff --git a/docs/zero_to_hero_guide/03_Image_Chat101.ipynb b/docs/zero_to_hero_guide/03_Image_Chat101.ipynb index 93042f3fc..02c32191f 100644 --- a/docs/zero_to_hero_guide/03_Image_Chat101.ipynb +++ b/docs/zero_to_hero_guide/03_Image_Chat101.ipynb @@ -175,7 +175,7 @@ "source": [ "Thanks for checking out this notebook! \n", "\n", - "The next one in the series will teach you one of the favorite applications of Large Language Models: [Tool Calling](./03_Tool_Calling101.ipynb). Enjoy!" + "The next one in the series will teach you one of the favorite applications of Large Language Models: [Tool Calling](./04_Tool_Calling101.ipynb). Enjoy!" ] } ], diff --git a/docs/zero_to_hero_guide/05_Memory101.ipynb b/docs/zero_to_hero_guide/05_Memory101.ipynb index e7e64d8fa..21678fd55 100644 --- a/docs/zero_to_hero_guide/05_Memory101.ipynb +++ b/docs/zero_to_hero_guide/05_Memory101.ipynb @@ -373,7 +373,7 @@ "source": [ "Awesome, now we can embed all our notes with Llama-stack and ask it about the meaning of life :)\n", "\n", - "Next up, we will learn about the safety features and how to use them: [notebook link](./05_Safety101.ipynb)" + "Next up, we will learn about the safety features and how to use them: [notebook link](./06_Safety101.ipynb)." ] } ], diff --git a/docs/zero_to_hero_guide/06_Safety101.ipynb b/docs/zero_to_hero_guide/06_Safety101.ipynb index bf37e83ea..6b5bd53bf 100644 --- a/docs/zero_to_hero_guide/06_Safety101.ipynb +++ b/docs/zero_to_hero_guide/06_Safety101.ipynb @@ -107,7 +107,7 @@ "source": [ "Thanks for leaning about the Safety API of Llama-Stack. \n", "\n", - "Finally, we learn about the Agents API, [here](./06_Agents101.ipynb)" + "Finally, we learn about the Agents API, [here](./07_Agents101.ipynb)." ] } ], diff --git a/docs/zero_to_hero_guide/README.md b/docs/zero_to_hero_guide/README.md index 449e40430..9b373fd9a 100644 --- a/docs/zero_to_hero_guide/README.md +++ b/docs/zero_to_hero_guide/README.md @@ -229,13 +229,13 @@ This command initializes the model to interact with your local Llama Stack insta **Explore Other Guides**: Dive deeper into specific topics by following these guides: - [Understanding Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html#decide-your-inference-provider) - [Inference 101](00_Inference101.ipynb) -- [Local and Cloud Model Toggling 101](00_Local_Cloud_Inference101.ipynb) -- [Prompt Engineering](01_Prompt_Engineering101.ipynb) -- [Chat with Image - LlamaStack Vision API](02_Image_Chat101.ipynb) -- [Tool Calling: How to and Details](03_Tool_Calling101.ipynb) -- [Memory API: Show Simple In-Memory Retrieval](04_Memory101.ipynb) -- [Using Safety API in Conversation](05_Safety101.ipynb) -- [Agents API: Explain Components](06_Agents101.ipynb) +- [Local and Cloud Model Toggling 101](01_Local_Cloud_Inference101.ipynb) +- [Prompt Engineering](02_Prompt_Engineering101.ipynb) +- [Chat with Image - LlamaStack Vision API](03_Image_Chat101.ipynb) +- [Tool Calling: How to and Details](04_Tool_Calling101.ipynb) +- [Memory API: Show Simple In-Memory Retrieval](05_Memory101.ipynb) +- [Using Safety API in Conversation](06_Safety101.ipynb) +- [Agents API: Explain Components](07_Agents101.ipynb) **Explore Client SDKs**: Utilize our client SDKs for various languages to integrate Llama Stack into your applications: From 5fc2ee6f77e96d84c668c400ff742b153d2e5e8e Mon Sep 17 00:00:00 2001 From: Jeffrey Lind <124309394+JeffreyLind3@users.noreply.github.com> Date: Fri, 29 Nov 2024 11:11:50 -0500 Subject: [PATCH 05/69] Fix URLs to Llama Stack 
Read the Docs Webpages (#547) # What does this PR do? Many of the URLs pointing to the Llama Stack's Read The Docs webpages were broken, presumably due to recent refactor of the documentation. This PR fixes all effected URLs throughout the repository. --- README.md | 16 ++++++++-------- docs/source/contributing/new_api_provider.md | 2 +- .../self_hosted_distro/meta-reference-gpu.md | 2 +- .../meta-reference-quantized-gpu.md | 2 +- docs/to_situate/developer_cookbook.md | 6 +++--- docs/zero_to_hero_guide/README.md | 4 ++-- .../templates/meta-reference-gpu/doc_template.md | 2 +- .../meta-reference-quantized-gpu/doc_template.md | 2 +- 8 files changed, 18 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 27f1d3614..8e57292c3 100644 --- a/README.md +++ b/README.md @@ -93,12 +93,12 @@ Additionally, we have designed every element of the Stack such that APIs as well | **Distribution** | **Llama Stack Docker** | Start This Distribution | |:----------------: |:------------------------------------------: |:-----------------------: | -| Meta Reference | [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-gpu.html) | -| Meta Reference Quantized | [llamastack/distribution-meta-reference-quantized-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-quantized-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-quantized-gpu.html) | -| Ollama | [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/ollama.html) | -| TGI | [llamastack/distribution-tgi](https://hub.docker.com/repository/docker/llamastack/distribution-tgi/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/tgi.html) | -| Together | [llamastack/distribution-together](https://hub.docker.com/repository/docker/llamastack/distribution-together/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/remote_hosted_distro/together.html) | -| Fireworks | [llamastack/distribution-fireworks](https://hub.docker.com/repository/docker/llamastack/distribution-fireworks/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/remote_hosted_distro/fireworks.html) | +| Meta Reference | [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-gpu.html) | +| Meta Reference Quantized | [llamastack/distribution-meta-reference-quantized-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-quantized-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-quantized-gpu.html) | +| Ollama | [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/ollama.html) | +| TGI | 
[llamastack/distribution-tgi](https://hub.docker.com/repository/docker/llamastack/distribution-tgi/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/tgi.html) | +| Together | [llamastack/distribution-together](https://hub.docker.com/repository/docker/llamastack/distribution-together/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/together.html) | +| Fireworks | [llamastack/distribution-fireworks](https://hub.docker.com/repository/docker/llamastack/distribution-fireworks/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/fireworks.html) | ## Installation @@ -128,7 +128,7 @@ You have two ways to install this repository: Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest/index.html) page for more details. -* [CLI reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html) +* [CLI reference](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/index.html) * Guide using `llama` CLI to work with Llama models (download, study prompts), and building/starting a Llama Stack distribution. * [Getting Started](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) * Quick guide to start a Llama Stack server. @@ -136,7 +136,7 @@ Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest * The complete Llama Stack lesson [Colab notebook](https://colab.research.google.com/drive/1dtVmxotBsI4cGZQNsJRYPrLiDeT0Wnwt) of the new [Llama 3.2 course on Deeplearning.ai](https://learn.deeplearning.ai/courses/introducing-multimodal-llama-3-2/lesson/8/llama-stack). * A [Zero-to-Hero Guide](https://github.com/meta-llama/llama-stack/tree/main/docs/zero_to_hero_guide) that guide you through all the key components of llama stack with code samples. * [Contributing](CONTRIBUTING.md) - * [Adding a new API Provider](https://llama-stack.readthedocs.io/en/latest/api_providers/new_api_provider.html) to walk-through how to add a new API provider. + * [Adding a new API Provider](https://llama-stack.readthedocs.io/en/latest/contributing/new_api_provider.html) to walk-through how to add a new API provider. ## Llama Stack Client SDKs diff --git a/docs/source/contributing/new_api_provider.md b/docs/source/contributing/new_api_provider.md index 9fea31d87..e0a35e946 100644 --- a/docs/source/contributing/new_api_provider.md +++ b/docs/source/contributing/new_api_provider.md @@ -8,7 +8,7 @@ This guide contains references to walk you through adding a new API provider. - {repopath}`Remote Providers::llama_stack/providers/remote` - {repopath}`Inline Providers::llama_stack/providers/inline` -3. [Build a Llama Stack distribution](https://llama-stack.readthedocs.io/en/latest/distribution_dev/building_distro.html) with your API provider. +3. [Build a Llama Stack distribution](https://llama-stack.readthedocs.io/en/latest/distributions/building_distro.html) with your API provider. 4. Test your code! 
## Testing your newly added API providers diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md index 084e90dfb..f9717894f 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md @@ -36,7 +36,7 @@ The following environment variables can be configured: ## Prerequisite: Downloading Models -Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. +Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. ``` $ ls ~/.llama/checkpoints diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md index 0c679788c..3ca161d07 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md @@ -36,7 +36,7 @@ The following environment variables can be configured: ## Prerequisite: Downloading Models -Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. +Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. ``` $ ls ~/.llama/checkpoints diff --git a/docs/to_situate/developer_cookbook.md b/docs/to_situate/developer_cookbook.md index 152035e9f..56ebd7a76 100644 --- a/docs/to_situate/developer_cookbook.md +++ b/docs/to_situate/developer_cookbook.md @@ -13,13 +13,13 @@ Based on your developer needs, below are references to guides to help you get st * Developer Need: I want to start a local Llama Stack server with my GPU using meta-reference implementations. * Effort: 5min * Guide: - - Please see our [meta-reference-gpu](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/meta-reference-gpu.html) on starting up a meta-reference Llama Stack server. + - Please see our [meta-reference-gpu](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-gpu.html) on starting up a meta-reference Llama Stack server. ### Llama Stack Server with Remote Providers * Developer need: I want a Llama Stack distribution with a remote provider. 
* Effort: 10min * Guide - - Please see our [Distributions Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/index.html) on starting up distributions with remote providers. + - Please see our [Distributions Guide](https://llama-stack.readthedocs.io/en/latest/concepts/index.html#distributions) on starting up distributions with remote providers. ### On-Device (iOS) Llama Stack @@ -38,4 +38,4 @@ Based on your developer needs, below are references to guides to help you get st * Developer Need: I want to add a new API provider to Llama Stack. * Effort: 3hr * Guide - - Please see our [Adding a New API Provider](https://llama-stack.readthedocs.io/en/latest/api_providers/new_api_provider.html) guide for adding a new API provider. + - Please see our [Adding a New API Provider](https://llama-stack.readthedocs.io/en/latest/contributing/new_api_provider.html) guide for adding a new API provider. diff --git a/docs/zero_to_hero_guide/README.md b/docs/zero_to_hero_guide/README.md index 9b373fd9a..09a4a6d50 100644 --- a/docs/zero_to_hero_guide/README.md +++ b/docs/zero_to_hero_guide/README.md @@ -227,7 +227,7 @@ This command initializes the model to interact with your local Llama Stack insta ## Next Steps **Explore Other Guides**: Dive deeper into specific topics by following these guides: -- [Understanding Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html#decide-your-inference-provider) +- [Understanding Distribution](https://llama-stack.readthedocs.io/en/latest/concepts/index.html#distributions) - [Inference 101](00_Inference101.ipynb) - [Local and Cloud Model Toggling 101](01_Local_Cloud_Inference101.ipynb) - [Prompt Engineering](02_Prompt_Engineering101.ipynb) @@ -244,7 +244,7 @@ This command initializes the model to interact with your local Llama Stack insta - [Swift SDK](https://github.com/meta-llama/llama-stack-client-swift) - [Kotlin SDK](https://github.com/meta-llama/llama-stack-client-kotlin) -**Advanced Configuration**: Learn how to customize your Llama Stack distribution by referring to the [Building a Llama Stack Distribution](https://llama-stack.readthedocs.io/en/latest/distributions/index.html#building-your-own-distribution) guide. +**Advanced Configuration**: Learn how to customize your Llama Stack distribution by referring to the [Building a Llama Stack Distribution](https://llama-stack.readthedocs.io/en/latest/distributions/building_distro.html) guide. **Explore Example Apps**: Check out [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) for example applications built using Llama Stack. diff --git a/llama_stack/templates/meta-reference-gpu/doc_template.md b/llama_stack/templates/meta-reference-gpu/doc_template.md index 865944476..f9870adbd 100644 --- a/llama_stack/templates/meta-reference-gpu/doc_template.md +++ b/llama_stack/templates/meta-reference-gpu/doc_template.md @@ -29,7 +29,7 @@ The following environment variables can be configured: ## Prerequisite: Downloading Models -Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. +Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. 
See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. ``` $ ls ~/.llama/checkpoints diff --git a/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md b/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md index 567d83941..9e3c56d92 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md +++ b/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md @@ -31,7 +31,7 @@ The following environment variables can be configured: ## Prerequisite: Downloading Models -Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. +Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. ``` $ ls ~/.llama/checkpoints From 2fc1c16d5864a3a0a82b0e1d5048465dfb74f12c Mon Sep 17 00:00:00 2001 From: Jeffrey Lind <124309394+JeffreyLind3@users.noreply.github.com> Date: Fri, 29 Nov 2024 11:12:53 -0500 Subject: [PATCH 06/69] Fix Zero to Hero README.md Formatting (#546) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? The formatting shown in the picture below in the Zero to Hero README.md was fixed with this PR (also shown in a picture below). **Before** Screenshot 2024-11-28 at 1 47 32 PM **After** Screenshot 2024-11-28 at 1 50 19 PM --- docs/zero_to_hero_guide/README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/zero_to_hero_guide/README.md b/docs/zero_to_hero_guide/README.md index 09a4a6d50..5490f767f 100644 --- a/docs/zero_to_hero_guide/README.md +++ b/docs/zero_to_hero_guide/README.md @@ -120,13 +120,13 @@ export SAFETY_MODEL="meta-llama/Llama-Guard-3-1B" 3. **Run the Llama Stack**: - Run the stack with command shared by the API from earlier: - ```bash - llama stack run ollama \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env SAFETY_MODEL=$SAFETY_MODEL \ - --env OLLAMA_URL=http://localhost:11434 - ``` + ```bash + llama stack run ollama \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env SAFETY_MODEL=$SAFETY_MODEL \ + --env OLLAMA_URL=http://localhost:11434 + ``` Note: Everytime you run a new model with `ollama run`, you will need to restart the llama stack. Otherwise it won't see the new model From 8a3887c7eb8781ab12b9ed7df3f23debee01e199 Mon Sep 17 00:00:00 2001 From: raghotham Date: Sat, 30 Nov 2024 12:28:03 -0600 Subject: [PATCH 07/69] Guide readme fix (#552) # What does this PR do? Fixes readme to remove redundant information and added llama-stack-client cli instructions. ## Before submitting - [ X] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ X] Ran pre-commit to handle lint / formatting issues. 
- [ X] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ X] Updated relevant documentation. --- docs/zero_to_hero_guide/README.md | 201 ++++++++++++++++-------------- 1 file changed, 109 insertions(+), 92 deletions(-) diff --git a/docs/zero_to_hero_guide/README.md b/docs/zero_to_hero_guide/README.md index 5490f767f..68c012164 100644 --- a/docs/zero_to_hero_guide/README.md +++ b/docs/zero_to_hero_guide/README.md @@ -1,37 +1,21 @@ # Llama Stack: from Zero to Hero -Llama-Stack allows you to configure your distribution from various providers, allowing you to focus on going from zero to production super fast. +Llama Stack defines and standardizes the set of core building blocks needed to bring generative AI applications to market. These building blocks are presented in the form of interoperable APIs with a broad set of Providers providing their implementations. These building blocks are assembled into Distributions which are easy for developers to get from zero to production. -This guide will walk you through how to build a local distribution, using Ollama as an inference provider. +This guide will walk you through an end-to-end workflow with Llama Stack with Ollama as the inference provider and ChromaDB as the memory provider. Please note the steps for configuring your provider and distribution will vary a little depending on the services you use. However, the user experience will remain universal - this is the power of Llama-Stack. -We also have a set of notebooks walking you through how to use Llama-Stack APIs: +If you're looking for more specific topics, we have a [Zero to Hero Guide](#next-steps) that covers everything from Tool Calling to Agents in detail. Feel free to skip to the end to explore the advanced topics you're interested in. -- Inference -- Prompt Engineering -- Chatting with Images -- Tool Calling -- Memory API for RAG -- Safety API -- Agentic API - -Below, we will learn how to get started with Ollama as an inference provider, please note the steps for configuring your provider will vary a little depending on the service. However, the user experience will remain universal-this is the power of Llama-Stack. - -Prototype locally using Ollama, deploy to the cloud with your favorite provider or own deployment. Use any API from any provider while focussing on development. - -# Ollama Quickstart Guide - -This guide will walk you through setting up an end-to-end workflow with Llama Stack with ollama, enabling you to perform text generation using the `Llama3.2-3B-Instruct` model. Follow these steps to get started quickly. - -If you're looking for more specific topics like tool calling or agent setup, we have a [Zero to Hero Guide](#next-steps) that covers everything from Tool Calling to Agents in detail. Feel free to skip to the end to explore the advanced topics you're interested in. - -> If you'd prefer not to set up a local server, explore our notebook on [tool calling with the Together API](Tool_Calling101_Using_Together's_Llama_Stack_Server.ipynb). This guide will show you how to leverage Together.ai's Llama Stack Server API, allowing you to get started with Llama Stack without the need for a locally built and running server. +> If you'd prefer not to set up a local server, explore our notebook on [tool calling with the Together API](Tool_Calling101_Using_Together's_Llama_Stack_Server.ipynb). 
This notebook will show you how to leverage together.ai's Llama Stack Server API, allowing you to get started with Llama Stack without the need for a locally built and running server. ## Table of Contents -1. [Setup ollama](#setup-ollama) +1. [Setup and run ollama](#setup-ollama) 2. [Install Dependencies and Set Up Environment](#install-dependencies-and-set-up-environment) 3. [Build, Configure, and Run Llama Stack](#build-configure-and-run-llama-stack) -4. [Run Ollama Model](#run-ollama-model) -5. [Next Steps](#next-steps) +4. [Test with llama-stack-client CLI](#test-with-llama-stack-client-cli) +5. [Test with curl](#test-with-curl) +6. [Test with Python](#test-with-python) +7. [Next Steps](#next-steps) --- @@ -39,107 +23,137 @@ If you're looking for more specific topics like tool calling or agent setup, we 1. **Download Ollama App**: - Go to [https://ollama.com/download](https://ollama.com/download). - - Download and unzip `Ollama-darwin.zip`. + - Follow instructions based on the OS you are on. For example, if you are on a Mac, download and unzip `Ollama-darwin.zip`. - Run the `Ollama` application. 1. **Download the Ollama CLI**: - - Ensure you have the `ollama` command line tool by downloading and installing it from the same website. + Ensure you have the `ollama` command line tool by downloading and installing it from the same website. 1. **Start ollama server**: - - Open the terminal and run: - ``` - ollama serve - ``` - + Open the terminal and run: + ``` + ollama serve + ``` 1. **Run the model**: - - Open the terminal and run: - ```bash - ollama run llama3.2:3b-instruct-fp16 - ``` - **Note**: The supported models for llama stack for now is listed in [here](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L43) - + Open the terminal and run: + ```bash + ollama run llama3.2:3b-instruct-fp16 --keepalive -1m + ``` + **Note**: + - The supported models for llama stack for now is listed in [here](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L43) + - `keepalive -1m` is used so that ollama continues to keep the model in memory indefinitely. Otherwise, ollama frees up memory and you would have to run `ollama run` again. --- ## Install Dependencies and Set Up Environment 1. **Create a Conda Environment**: - - Create a new Conda environment with Python 3.10: - ```bash - conda create -n ollama python=3.10 - ``` - - Activate the environment: - ```bash - conda activate ollama - ``` + Create a new Conda environment with Python 3.10: + ```bash + conda create -n ollama python=3.10 + ``` + Activate the environment: + ```bash + conda activate ollama + ``` 2. **Install ChromaDB**: - - Install `chromadb` using `pip`: - ```bash - pip install chromadb - ``` + Install `chromadb` using `pip`: + ```bash + pip install chromadb + ``` 3. **Run ChromaDB**: - - Start the ChromaDB server: - ```bash - chroma run --host localhost --port 8000 --path ./my_chroma_data - ``` + Start the ChromaDB server: + ```bash + chroma run --host localhost --port 8000 --path ./my_chroma_data + ``` 4. **Install Llama Stack**: - - Open a new terminal and install `llama-stack`: - ```bash - conda activate hack - pip install llama-stack==0.0.53 - ``` + Open a new terminal and install `llama-stack`: + ```bash + conda activate ollama + pip install llama-stack==0.0.55 + ``` --- ## Build, Configure, and Run Llama Stack 1. 
**Build the Llama Stack**: - - Build the Llama Stack using the `ollama` template: - ```bash - llama stack build --template ollama --image-type conda - ``` - -After this step, you will see the console output: - -``` -Build Successful! Next steps: + Build the Llama Stack using the `ollama` template: + ```bash + llama stack build --template ollama --image-type conda + ``` + **Expected Output:** + ``` + ... + Build Successful! Next steps: 1. Set the environment variables: LLAMASTACK_PORT, OLLAMA_URL, INFERENCE_MODEL, SAFETY_MODEL - 2. `llama stack run /Users/username/.llama/distributions/llamastack-ollama/ollama-run.yaml` -``` + 2. `llama stack run /Users//.llama/distributions/llamastack-ollama/ollama-run.yaml + ``` -2. **Set the ENV variables by exporting them to the terminal**: -```bash -export OLLAMA_URL="http://localhost:11434" -export LLAMA_STACK_PORT=5001 -export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" -export SAFETY_MODEL="meta-llama/Llama-Guard-3-1B" -``` +3. **Set the ENV variables by exporting them to the terminal**: + ```bash + export OLLAMA_URL="http://localhost:11434" + export LLAMA_STACK_PORT=5051 + export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" + export SAFETY_MODEL="meta-llama/Llama-Guard-3-1B" + ``` 3. **Run the Llama Stack**: - - Run the stack with command shared by the API from earlier: - ```bash - llama stack run ollama \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env SAFETY_MODEL=$SAFETY_MODEL \ - --env OLLAMA_URL=http://localhost:11434 - ``` - -Note: Everytime you run a new model with `ollama run`, you will need to restart the llama stack. Otherwise it won't see the new model + Run the stack with command shared by the API from earlier: + ```bash + llama stack run ollama \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env SAFETY_MODEL=$SAFETY_MODEL \ + --env OLLAMA_URL=$OLLAMA_URL + ``` + Note: Everytime you run a new model with `ollama run`, you will need to restart the llama stack. Otherwise it won't see the new model. The server will start and listen on `http://localhost:5051`. --- +## Test with `llama-stack-client` CLI +After setting up the server, open a new terminal window and install the llama-stack-client package. -## Testing with `curl` +1. Install the llama-stack-client package + ```bash + conda activate ollama + pip install llama-stack-client + ``` +2. Configure the CLI to point to the llama-stack server. + ```bash + llama-stack-client configure --endpoint http://localhost:5051 + ``` + **Expected Output:** + ```bash + Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:5051 + ``` +3. 
Test the CLI by running inference: + ```bash + llama-stack-client inference chat-completion --message "Write me a 2-sentence poem about the moon" + ``` + **Expected Output:** + ```bash + ChatCompletionResponse( + completion_message=CompletionMessage( + content='Here is a 2-sentence poem about the moon:\n\nSilver crescent shining bright in the night,\nA beacon of wonder, full of gentle light.', + role='assistant', + stop_reason='end_of_turn', + tool_calls=[] + ), + logprobs=None + ) + ``` + +## Test with `curl` After setting up the server, open a new terminal window and verify it's working by sending a `POST` request using `curl`: ```bash -curl http://localhost:5051/inference/chat_completion \ +curl http://localhost:$LLAMA_STACK_PORT/inference/chat_completion \ -H "Content-Type: application/json" \ -d '{ "model": "Llama3.2-3B-Instruct", @@ -168,15 +182,16 @@ You can check the available models with the command `llama-stack-client models l --- -## Testing with Python +## Test with Python You can also interact with the Llama Stack server using a simple Python script. Below is an example: -### 1. Active Conda Environment and Install Required Python Packages +### 1. Activate Conda Environment and Install Required Python Packages The `llama-stack-client` library offers a robust and efficient python methods for interacting with the Llama Stack server. ```bash -conda activate your-llama-stack-conda-env +conda activate ollama +pip install llama-stack-client ``` Note, the client library gets installed by default if you install the server library @@ -188,6 +203,8 @@ touch test_llama_stack.py ### 3. Create a Chat Completion Request in Python +In `test_llama_stack.py`, write the following code: + ```python from llama_stack_client import LlamaStackClient From fe48b9fb8c4df70f6566f14726194f9fbe325414 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Sat, 30 Nov 2024 12:27:31 -0800 Subject: [PATCH 08/69] Bump version to 0.0.56 --- requirements.txt | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index b5b7587d0..0ff43e246 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,8 +2,8 @@ blobfile fire httpx huggingface-hub -llama-models>=0.0.55 -llama-stack-client>=0.0.55 +llama-models>=0.0.56 +llama-stack-client>=0.0.56 prompt-toolkit python-dotenv pydantic>=2 diff --git a/setup.py b/setup.py index a4efd08c6..842cbb30d 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ def read_requirements(): setup( name="llama_stack", - version="0.0.55", + version="0.0.56", author="Meta Llama", author_email="llama-oss@meta.com", description="Llama Stack", From 6bcd1bd9f10a7bdda040e9549828770d5793145b Mon Sep 17 00:00:00 2001 From: Aidan Do Date: Tue, 3 Dec 2024 06:06:20 +1100 Subject: [PATCH 09/69] Fix broken Ollama link (#554) # What does this PR do? Fixes a broken Ollama link and formatting on this page: https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/ollama.html Screenshot 2024-12-02 at 21 04 17 image To: Screenshot 2024-12-02 at 21 05 07 image ## Before submitting - [x] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). 
Co-authored-by: Aidan Do --- docs/source/distributions/self_hosted_distro/ollama.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/distributions/self_hosted_distro/ollama.md b/docs/source/distributions/self_hosted_distro/ollama.md index 0eb245483..9f81d9329 100644 --- a/docs/source/distributions/self_hosted_distro/ollama.md +++ b/docs/source/distributions/self_hosted_distro/ollama.md @@ -118,9 +118,9 @@ llama stack run ./run-with-safety.yaml \ ### (Optional) Update Model Serving Configuration -> [!NOTE] -> Please check the [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers.remote/inference/ollama/ollama.py) for the supported Ollama models. - +```{note} +Please check the [model_aliases](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L45) variable for supported Ollama models. +``` To serve a new model with `ollama` ```bash From 1e2faa461fd5843f83fc3db75cab5c10a7353194 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Mon, 2 Dec 2024 16:10:16 -0800 Subject: [PATCH 10/69] update client cli docs (#560) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test plan: make html sphinx-autobuild source build/html ![Screenshot 2024-12-02 at 3 32 18 PM](https://github.com/user-attachments/assets/061d5ca6-178f-463a-854c-acb96ca3bb0d) --- .../llama_stack_client_cli_reference.md | 75 +++++++++++++++++-- 1 file changed, 68 insertions(+), 7 deletions(-) diff --git a/docs/source/references/llama_stack_client_cli_reference.md b/docs/source/references/llama_stack_client_cli_reference.md index d3835e488..b35aa189d 100644 --- a/docs/source/references/llama_stack_client_cli_reference.md +++ b/docs/source/references/llama_stack_client_cli_reference.md @@ -27,8 +27,6 @@ $ llama-stack-client configure Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:5000 ``` -## Provider Commands - ### `llama-stack-client providers list` ```bash $ llama-stack-client providers list @@ -119,8 +117,25 @@ $ llama-stack-client memory_banks list +--------------+----------------+--------+-------------------+------------------------+--------------------------+ ``` -## Shield Management +### `llama-stack-client memory_banks register` +```bash +$ llama-stack-client memory_banks register --type [--provider-id ] [--provider-memory-bank-id ] [--chunk-size ] [--embedding-model ] [--overlap-size ] +``` +Options: +- `--type`: Required. Type of memory bank. Choices: "vector", "keyvalue", "keyword", "graph" +- `--provider-id`: Optional. Provider ID for the memory bank +- `--provider-memory-bank-id`: Optional. Provider's memory bank ID +- `--chunk-size`: Optional. Chunk size in tokens (for vector type). Default: 512 +- `--embedding-model`: Optional. Embedding model (for vector type). Default: "all-MiniLM-L6-v2" +- `--overlap-size`: Optional. Overlap size in tokens (for vector type). 
Default: 64 + +### `llama-stack-client memory_banks unregister` +```bash +$ llama-stack-client memory_banks unregister +``` + +## Shield Management ### `llama-stack-client shields list` ```bash $ llama-stack-client shields list @@ -134,16 +149,51 @@ $ llama-stack-client shields list +--------------+----------+----------------+-------------+ ``` -## Evaluation Tasks +### `llama-stack-client shields register` +```bash +$ llama-stack-client shields register --shield-id [--provider-id ] [--provider-shield-id ] [--params ] +``` + +Options: +- `--shield-id`: Required. ID of the shield +- `--provider-id`: Optional. Provider ID for the shield +- `--provider-shield-id`: Optional. Provider's shield ID +- `--params`: Optional. JSON configuration parameters for the shield + +## Eval Task Management ### `llama-stack-client eval_tasks list` ```bash -$ llama-stack-client eval run_benchmark --num-examples 10 --output-dir ./ --eval-task-config ~/eval_task_config.json +$ llama-stack-client eval_tasks list ``` -where `eval_task_config.json` is the path to the eval task config file in JSON format. An example eval_task_config +### `llama-stack-client eval_tasks register` +```bash +$ llama-stack-client eval_tasks register --eval-task-id --dataset-id --scoring-functions [ ...] [--provider-id ] [--provider-eval-task-id ] [--metadata ] ``` -$ cat ~/eval_task_config.json + +Options: +- `--eval-task-id`: Required. ID of the eval task +- `--dataset-id`: Required. ID of the dataset to evaluate +- `--scoring-functions`: Required. One or more scoring functions to use for evaluation +- `--provider-id`: Optional. Provider ID for the eval task +- `--provider-eval-task-id`: Optional. Provider's eval task ID +- `--metadata`: Optional. Metadata for the eval task in JSON format + +## Eval execution +### `llama-stack-client eval run-benchmark` +```bash +$ llama-stack-client eval run-benchmark [ ...] --eval-task-config --output-dir [--num-examples ] [--visualize] +``` + +Options: +- `--eval-task-config`: Required. Path to the eval task config file in JSON format +- `--output-dir`: Required. Path to the directory where evaluation results will be saved +- `--num-examples`: Optional. Number of examples to evaluate (useful for debugging) +- `--visualize`: Optional flag. If set, visualizes evaluation results after completion + +Example eval_task_config.json: +```json { "type": "benchmark", "eval_candidate": { @@ -160,3 +210,14 @@ $ cat ~/eval_task_config.json } } ``` + +### `llama-stack-client eval run-scoring` +```bash +$ llama-stack-client eval run-scoring --eval-task-config --output-dir [--num-examples ] [--visualize] +``` + +Options: +- `--eval-task-config`: Required. Path to the eval task config file in JSON format +- `--output-dir`: Required. Path to the directory where scoring results will be saved +- `--num-examples`: Optional. Number of examples to evaluate (useful for debugging) +- `--visualize`: Optional flag. 
If set, visualizes scoring results after completion From 4c7b1a8fb3acb8f65dac9c2f066f86e31d6cd805 Mon Sep 17 00:00:00 2001 From: dltn <6599399+dltn@users.noreply.github.com> Date: Mon, 2 Dec 2024 19:48:46 -0800 Subject: [PATCH 11/69] Bump version to 0.0.57 --- requirements.txt | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0ff43e246..8698495b1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,8 +2,8 @@ blobfile fire httpx huggingface-hub -llama-models>=0.0.56 -llama-stack-client>=0.0.56 +llama-models>=0.0.57 +llama-stack-client>=0.0.57 prompt-toolkit python-dotenv pydantic>=2 diff --git a/setup.py b/setup.py index 842cbb30d..3d68021dd 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ def read_requirements(): setup( name="llama_stack", - version="0.0.56", + version="0.0.57", author="Meta Llama", author_email="llama-oss@meta.com", description="Llama Stack", From 435f34b05e84f1747b28570234f25878cf0b31c4 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Tue, 3 Dec 2024 05:55:14 -0500 Subject: [PATCH 12/69] reduce the accuracy requirements to pass the chat completion structured output test (#522) i find `test_structured_output` to be flakey. it's both a functionality and accuracy test - ``` answer = AnswerFormat.model_validate_json(response.completion_message.content) assert answer.first_name == "Michael" assert answer.last_name == "Jordan" assert answer.year_of_birth == 1963 assert answer.num_seasons_in_nba == 15 ``` it's an accuracy test because it checks the value of first/last name, birth year, and num seasons. i find that - - llama-3.1-8b-instruct and llama-3.2-3b-instruct pass the functionality portion - llama-3.2-3b-instruct consistently fails the accuracy portion (thinking MJ was in the NBA for 14 seasons) - llama-3.1-8b-instruct occasionally fails the accuracy portion suggestions (not mutually exclusive) - 1. turn the test into functionality only, skip the value checks 2. split the test into a functionality version and an xfail accuracy version 3. add context to the prompt so the llm can answer without accessing embedded memory # What does this PR do? implements option (3) by adding context to the system prompt. ## Test Plan `pytest -s -v ... llama_stack/providers/tests/inference/ ... -k structured_output` ## Before submitting - [x] Ran pre-commit to handle lint / formatting issues. - [x] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [x] Updated relevant documentation. - [x] Wrote necessary unit or integration tests. --- .../providers/tests/inference/test_text_inference.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/llama_stack/providers/tests/inference/test_text_inference.py b/llama_stack/providers/tests/inference/test_text_inference.py index f0f1d0eb2..9e5c67375 100644 --- a/llama_stack/providers/tests/inference/test_text_inference.py +++ b/llama_stack/providers/tests/inference/test_text_inference.py @@ -211,7 +211,15 @@ class TestInference: response = await inference_impl.chat_completion( model_id=inference_model, messages=[ - SystemMessage(content="You are a helpful assistant."), + # we include context about Michael Jordan in the prompt so that the test is + # focused on the funtionality of the model and not on the information embedded + # in the model. Llama 3.2 3B Instruct tends to think MJ played for 14 seasons. 
+ SystemMessage( + content=( + "You are a helpful assistant.\n\n" + "Michael Jordan was born in 1963. He played basketball for the Chicago Bulls for 15 seasons." + ) + ), UserMessage(content="Please give me information about Michael Jordan."), ], stream=False, From fd19a8a517fc22975b9b93faa5b997117a5cf2e8 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 3 Dec 2024 18:50:18 -0800 Subject: [PATCH 13/69] add missing __init__ --- llama_stack/providers/inline/scoring/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 llama_stack/providers/inline/scoring/__init__.py diff --git a/llama_stack/providers/inline/scoring/__init__.py b/llama_stack/providers/inline/scoring/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/llama_stack/providers/inline/scoring/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. From 6e10d0b23eb662776586f30c476902791a1089d9 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 3 Dec 2024 18:52:43 -0800 Subject: [PATCH 14/69] precommit --- llama_stack/providers/inline/scoring/braintrust/__init__.py | 3 ++- llama_stack/providers/inline/scoring/braintrust/braintrust.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_stack/providers/inline/scoring/braintrust/__init__.py b/llama_stack/providers/inline/scoring/braintrust/__init__.py index dc4ea4951..2ddc58bd2 100644 --- a/llama_stack/providers/inline/scoring/braintrust/__init__.py +++ b/llama_stack/providers/inline/scoring/braintrust/__init__.py @@ -5,9 +5,10 @@ # the root directory of this source tree. from typing import Dict -from llama_stack.distribution.datatypes import Api, ProviderSpec from pydantic import BaseModel +from llama_stack.distribution.datatypes import Api, ProviderSpec + from .config import BraintrustScoringConfig diff --git a/llama_stack/providers/inline/scoring/braintrust/braintrust.py b/llama_stack/providers/inline/scoring/braintrust/braintrust.py index cf6e22a29..ee515d588 100644 --- a/llama_stack/providers/inline/scoring/braintrust/braintrust.py +++ b/llama_stack/providers/inline/scoring/braintrust/braintrust.py @@ -16,6 +16,7 @@ import os from autoevals.llm import Factuality from autoevals.ragas import AnswerCorrectness + from llama_stack.distribution.request_headers import NeedsRequestProviderData from llama_stack.providers.datatypes import ScoringFunctionsProtocolPrivate From b6500974eca169ed053a7b95408ac756c160c004 Mon Sep 17 00:00:00 2001 From: Kai Wu Date: Tue, 3 Dec 2024 20:11:19 -0800 Subject: [PATCH 15/69] removed assertion in ollama.py and fixed typo in the readme (#563) # What does this PR do? 1. removed [incorrect assertion](https://github.com/meta-llama/llama-stack/blob/435f34b05e84f1747b28570234f25878cf0b31c4/llama_stack/providers/remote/inference/ollama/ollama.py#L183) in ollama.py 2. fixed a typo in [this line](https://github.com/meta-llama/llama-stack/blob/435f34b05e84f1747b28570234f25878cf0b31c4/docs/source/distributions/importing_as_library.md?plain=1#L24), as `model=` should be `model_id=` . 
- [x] Addresses issue ([#issue562](https://github.com/meta-llama/llama-stack/issues/562)) ## Test Plan tested with code: ```python import asyncio import os # pip install aiosqlite ollama faiss from llama_stack_client.lib.direct.direct import LlamaStackDirectClient from llama_stack_client.types import SystemMessage, UserMessage async def main(): os.environ["INFERENCE_MODEL"] = "meta-llama/Llama-3.2-1B-Instruct" client = await LlamaStackDirectClient.from_template("ollama") await client.initialize() response = await client.models.list() print(response) model_name = response[0].identifier response = await client.inference.chat_completion( messages=[ SystemMessage(content="You are a friendly assistant.", role="system"), UserMessage( content="hello world, write me a 2 sentence poem about the moon", role="user", ), ], model_id=model_name, stream=False, ) print("\nChat completion response:") print(response, type(response)) asyncio.run(main()) ``` OUTPUT: ``` python test.py Using template ollama with config: apis: - agents - inference - memory - safety - telemetry conda_env: ollama datasets: [] docker_image: null eval_tasks: [] image_name: ollama memory_banks: [] metadata_store: db_path: /Users/kaiwu/.llama/distributions/ollama/registry.db namespace: null type: sqlite models: - metadata: {} model_id: meta-llama/Llama-3.2-1B-Instruct provider_id: ollama provider_model_id: null providers: agents: - config: persistence_store: db_path: /Users/kaiwu/.llama/distributions/ollama/agents_store.db namespace: null type: sqlite provider_id: meta-reference provider_type: inline::meta-reference inference: - config: url: http://localhost:11434 provider_id: ollama provider_type: remote::ollama memory: - config: kvstore: db_path: /Users/kaiwu/.llama/distributions/ollama/faiss_store.db namespace: null type: sqlite provider_id: faiss provider_type: inline::faiss safety: - config: {} provider_id: llama-guard provider_type: inline::llama-guard telemetry: - config: {} provider_id: meta-reference provider_type: inline::meta-reference scoring_fns: [] shields: [] version: '2' [Model(identifier='meta-llama/Llama-3.2-1B-Instruct', provider_resource_id='llama3.2:1b-instruct-fp16', provider_id='ollama', type='model', metadata={})] Chat completion response: completion_message=CompletionMessage(role='assistant', content='Here is a short poem about the moon:\n\nThe moon glows bright in the midnight sky,\nA silver crescent shining, catching the eye.', stop_reason=, tool_calls=[]) logprobs=None ``` ## Sources Please link relevant resources if necessary. ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. 
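For quick reference, a condensed sketch of the corrected library-mode call, distilled from the test script above and the doc change in the diff below (this is illustrative only; it assumes the `ollama` template is built and an Ollama server is running locally with the 1B Instruct model pulled):

```python
import asyncio
import os

# pip install aiosqlite ollama faiss llama-stack-client
from llama_stack_client.lib.direct.direct import LlamaStackDirectClient
from llama_stack_client.types import UserMessage


async def main():
    os.environ["INFERENCE_MODEL"] = "meta-llama/Llama-3.2-1B-Instruct"

    client = await LlamaStackDirectClient.from_template("ollama")
    await client.initialize()

    response = await client.inference.chat_completion(
        messages=[UserMessage(content="What is the capital of France?", role="user")],
        model_id="meta-llama/Llama-3.2-1B-Instruct",  # keyword is model_id, not model
        stream=False,
    )
    print(response.completion_message.content)


asyncio.run(main())
```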
--- docs/source/distributions/importing_as_library.md | 2 +- llama_stack/providers/remote/inference/ollama/ollama.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/source/distributions/importing_as_library.md b/docs/source/distributions/importing_as_library.md index 815660fd4..7e15062df 100644 --- a/docs/source/distributions/importing_as_library.md +++ b/docs/source/distributions/importing_as_library.md @@ -21,7 +21,7 @@ print(response) ```python response = await client.inference.chat_completion( messages=[UserMessage(content="What is the capital of France?", role="user")], - model="Llama3.1-8B-Instruct", + model_id="Llama3.1-8B-Instruct", stream=False, ) print("\nChat completion response:") diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py index 74c0b8601..f89629afc 100644 --- a/llama_stack/providers/remote/inference/ollama/ollama.py +++ b/llama_stack/providers/remote/inference/ollama/ollama.py @@ -180,7 +180,6 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate): async def _nonstream_completion(self, request: CompletionRequest) -> AsyncGenerator: params = await self._get_params(request) r = await self.client.generate(**params) - assert isinstance(r, dict) choice = OpenAICompatCompletionChoice( finish_reason=r["done_reason"] if r["done"] else None, From 64c6df8392c8ceea321375bca12af2b025f6693e Mon Sep 17 00:00:00 2001 From: Henry Tu Date: Wed, 4 Dec 2024 00:15:32 -0500 Subject: [PATCH 16/69] Cerebras Inference Integration (#265) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding Cerebras Inference as an API provider. ## Testing ### Conda ``` $ llama stack build --template cerebras --image-type conda $ llama stack run ~/.llama/distributions/llamastack-cerebras/cerebras-run.yaml ... Listening on ['::', '0.0.0.0']:5000 INFO: Started server process [12443] INFO: Waiting for application startup. INFO: Application startup complete. INFO: Uvicorn running on http://['::', '0.0.0.0']:5000 (Press CTRL+C to quit) ``` ### Chat Completion ``` $ curl --location 'http://localhost:5000/alpha/inference/chat-completion' --header 'Content-Type: application/json' --data '{ "model_id": "meta-llama/Llama-3.1-8B-Instruct", "messages": [ { "role": "user", "content": "What is the temperature in Seattle right now?" 
} ], "stream": false, "sampling_params": { "strategy": "top_p", "temperature": 0.5, "max_tokens": 100 }, "tool_choice": "auto", "tool_prompt_format": "json", "tools": [ { "tool_name": "getTemperature", "description": "Gets the current temperature of a location.", "parameters": { "location": { "param_type": "string", "description": "The name of the place to get the temperature from in degress celsius.", "required": true } } } ] }' ``` #### Non-Streaming Response ``` { "completion_message": { "role": "assistant", "content": "", "stop_reason": "end_of_message", "tool_calls": [ { "call_id": "6f42fdcc-6cbb-46ad-a17b-5d20ac64b678", "tool_name": "getTemperature", "arguments": { "location": "Seattle" } } ] }, "logprobs": null } ``` #### Streaming Response ``` data: {"event":{"event_type":"start","delta":"","logprobs":null,"stop_reason":null}} data: {"event":{"event_type":"progress","delta":{"content":"","parse_status":"started"},"logprobs":null,"stop_reason":null}} data: {"event":{"event_type":"progress","delta":{"content":"{\"","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}} data: {"event":{"event_type":"progress","delta":{"content":"type","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}} data: {"event":{"event_type":"progress","delta":{"content":"\":","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}} data: {"event":{"event_type":"progress","delta":{"content":" \"","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}} data: {"event":{"event_type":"progress","delta":{"content":"function","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}} data: {"event":{"event_type":"progress","delta":{"content":"\",","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}} data: {"event":{"event_type":"progress","delta":{"content":" \"","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}} data: {"event":{"event_type":"progress","delta":{"content":"name","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}} data: {"event":{"event_type":"progress","delta":{"content":"\":","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}} data: {"event":{"event_type":"progress","delta":{"content":" \"","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}} data: {"event":{"event_type":"progress","delta":{"content":"get","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}} data: {"event":{"event_type":"progress","delta":{"content":"Temperature","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}} data: {"event":{"event_type":"progress","delta":{"content":"\",","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}} data: {"event":{"event_type":"progress","delta":{"content":" \"","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}} data: {"event":{"event_type":"progress","delta":{"content":"parameters","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}} data: {"event":{"event_type":"progress","delta":{"content":"\":","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}} data: {"event":{"event_type":"progress","delta":{"content":" {\"","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}} data: {"event":{"event_type":"progress","delta":{"content":"location","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}} data: {"event":{"event_type":"progress","delta":{"content":"\":","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}} data: 
{"event":{"event_type":"progress","delta":{"content":" \"","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}} data: {"event":{"event_type":"progress","delta":{"content":"Seattle","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}} data: {"event":{"event_type":"progress","delta":{"content":"\"}}","parse_status":"in_progress"},"logprobs":null,"stop_reason":null}} data: {"event":{"event_type":"progress","delta":{"content":{"call_id":"e742df1f-0ae9-40ad-a49e-18e5c905484f","tool_name":"getTemperature","arguments":{"location":"Seattle"}},"parse_status":"success"},"logprobs":null,"stop_reason":"end_of_message"}} data: {"event":{"event_type":"complete","delta":"","logprobs":null,"stop_reason":"end_of_message"}} ``` ### Completion ``` $ curl --location 'http://localhost:5000/alpha/inference/completion' --header 'Content-Type: application/json' --data '{ "model_id": "meta-llama/Llama-3.1-8B-Instruct", "content": "1,2,3,", "stream": true, "sampling_params": { "strategy": "top_p", "temperature": 0.5, "max_tokens": 10 }, "tool_choice": "auto", "tool_prompt_format": "json", "tools": [ { "tool_name": "getTemperature", "description": "Gets the current temperature of a location.", "parameters": { "location": { "param_type": "string", "description": "The name of the place to get the temperature from in degress celsius.", "required": true } } } ] }' ``` #### Non-Streaming Response ``` { "content": "4,5,6,7,8,", "stop_reason": "out_of_tokens", "logprobs": null } ``` #### Streaming Response ``` data: {"delta":"4","stop_reason":null,"logprobs":null} data: {"delta":",","stop_reason":null,"logprobs":null} data: {"delta":"5","stop_reason":null,"logprobs":null} data: {"delta":",","stop_reason":null,"logprobs":null} data: {"delta":"6","stop_reason":null,"logprobs":null} data: {"delta":",","stop_reason":null,"logprobs":null} data: {"delta":"7","stop_reason":null,"logprobs":null} data: {"delta":",","stop_reason":null,"logprobs":null} data: {"delta":"8","stop_reason":null,"logprobs":null} data: {"delta":",","stop_reason":null,"logprobs":null} data: {"delta":"","stop_reason":null,"logprobs":null} data: {"delta":"","stop_reason":"out_of_tokens","logprobs":null} ``` ### Pre-Commit Checks ``` trim trailing whitespace.................................................Passed check python ast.........................................................Passed check for merge conflicts................................................Passed check for added large files..............................................Passed fix end of files.........................................................Passed Insert license in comments...............................................Passed flake8...................................................................Passed Format files with µfmt...................................................Passed ``` ### Testing with `test_inference.py` ``` $ export CEREBRAS_API_KEY= $ pytest -v -s llama_stack/providers/tests/inference/test_text_inference.py -m "cerebras and llama_8b" /net/henryt-dev/srv/nfs/henryt-data/ws/llama-stack/.venv/lib/python3.12/site-packages/pytest_asyncio/plugin.py:208: PytestDeprecationWarning: The configuration option "asyncio_default_fixture_loop_scope" is unset. The event loop scope for asynchronous fixtures will default to the fixture caching scope. Future versions of pytest-asyncio will default the loop scope for asynchronous fixtures to function scope. Set the default fixture loop scope explicitly in order to avoid unexpected behavior in the future. 
Valid fixture loop scopes are: "function", "class", "module", "package", "session" warnings.warn(PytestDeprecationWarning(_DEFAULT_FIXTURE_LOOP_SCOPE_UNSET)) =================================================== test session starts =================================================== platform linux -- Python 3.12.3, pytest-8.3.3, pluggy-1.5.0 -- /net/henryt-dev/srv/nfs/henryt-data/ws/llama-stack/.venv/bin/python3.12 cachedir: .pytest_cache rootdir: /net/henryt-dev/srv/nfs/henryt-data/ws/llama-stack configfile: pyproject.toml plugins: anyio-4.6.2.post1, asyncio-0.24.0 asyncio: mode=Mode.STRICT, default_loop_scope=None collected 128 items / 120 deselected / 8 selected llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_model_list[llama_8b-cerebras] Resolved 4 providers inner-inference => cerebras models => __routing_table__ inference => __autorouted__ inspect => __builtin__ Models: meta-llama/Llama-3.1-8B-Instruct served by cerebras PASSED llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_completion[llama_8b-cerebras] PASSED llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_completions_structured_output[llama_8b-cerebras] SKIPPED llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_chat_completion_non_streaming[llama_8b-cerebras] PASSED llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_structured_output[llama_8b-cerebras] SKIPPED llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_chat_completion_streaming[llama_8b-cerebras] PASSED llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_chat_completion_with_tool_calling[llama_8b-cerebras] PASSED llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_chat_completion_with_tool_calling_streaming[llama_8b-cerebras] PASSED ================================ 6 passed, 2 skipped, 120 deselected, 6 warnings in 3.95s ================================= ``` I ran `python llama_stack/scripts/distro_codegen.py` to run codegen. 
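For parity with the curl examples above, a minimal Python sketch of the same non-streaming chat completion through `llama-stack-client` (not part of this PR; it assumes the Cerebras distribution built above is serving on `localhost:5000` and that `CEREBRAS_API_KEY` is configured on the server side):

```python
import os

from llama_stack_client import LlamaStackClient
from llama_stack_client.types import UserMessage

client = LlamaStackClient(
    base_url=os.environ.get("LLAMA_STACK_ENDPOINT", "http://localhost:5000"),
)

# Non-streaming chat completion against the Cerebras-served Llama 3.1 8B model.
response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.1-8B-Instruct",
    messages=[
        UserMessage(
            content="What is the temperature in Seattle right now?",
            role="user",
        ),
    ],
    stream=False,
)
print(response.completion_message.content)
```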
--- README.md | 2 + distributions/cerebras/build.yaml | 1 + distributions/cerebras/compose.yaml | 16 + distributions/cerebras/run.yaml | 1 + distributions/dependencies.json | 380 ++++++++++-------- docs/source/distributions/building_distro.md | 356 ++++++++++------ .../self_hosted_distro/cerebras.md | 61 +++ docs/source/index.md | 1 + llama_stack/providers/registry/inference.py | 11 + .../remote/inference/cerebras/__init__.py | 21 + .../remote/inference/cerebras/cerebras.py | 191 +++++++++ .../remote/inference/cerebras/config.py | 32 ++ .../providers/tests/inference/fixtures.py | 17 + .../tests/inference/test_text_inference.py | 2 + llama_stack/templates/cerebras/__init__.py | 7 + llama_stack/templates/cerebras/build.yaml | 17 + llama_stack/templates/cerebras/cerebras.py | 71 ++++ .../templates/cerebras/doc_template.md | 60 +++ llama_stack/templates/cerebras/run.yaml | 63 +++ 19 files changed, 1018 insertions(+), 292 deletions(-) create mode 120000 distributions/cerebras/build.yaml create mode 100644 distributions/cerebras/compose.yaml create mode 120000 distributions/cerebras/run.yaml create mode 100644 docs/source/distributions/self_hosted_distro/cerebras.md create mode 100644 llama_stack/providers/remote/inference/cerebras/__init__.py create mode 100644 llama_stack/providers/remote/inference/cerebras/cerebras.py create mode 100644 llama_stack/providers/remote/inference/cerebras/config.py create mode 100644 llama_stack/templates/cerebras/__init__.py create mode 100644 llama_stack/templates/cerebras/build.yaml create mode 100644 llama_stack/templates/cerebras/cerebras.py create mode 100644 llama_stack/templates/cerebras/doc_template.md create mode 100644 llama_stack/templates/cerebras/run.yaml diff --git a/README.md b/README.md index 8e57292c3..0dfb1306d 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,7 @@ Additionally, we have designed every element of the Stack such that APIs as well | **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | Meta Reference | Single Node | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | +| Cerebras | Single Node | | :heavy_check_mark: | | | | | Fireworks | Hosted | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | | | AWS Bedrock | Hosted | | :heavy_check_mark: | | :heavy_check_mark: | | | Together | Hosted | :heavy_check_mark: | :heavy_check_mark: | | :heavy_check_mark: | | @@ -95,6 +96,7 @@ Additionally, we have designed every element of the Stack such that APIs as well |:----------------: |:------------------------------------------: |:-----------------------: | | Meta Reference | [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-gpu.html) | | Meta Reference Quantized | [llamastack/distribution-meta-reference-quantized-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-quantized-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-quantized-gpu.html) | +| Cerebras | [llamastack/distribution-cerebras](https://hub.docker.com/repository/docker/llamastack/distribution-cerebras/general) | 
[Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/cerebras.html) | | Ollama | [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/ollama.html) | | TGI | [llamastack/distribution-tgi](https://hub.docker.com/repository/docker/llamastack/distribution-tgi/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/tgi.html) | | Together | [llamastack/distribution-together](https://hub.docker.com/repository/docker/llamastack/distribution-together/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/together.html) | diff --git a/distributions/cerebras/build.yaml b/distributions/cerebras/build.yaml new file mode 120000 index 000000000..bccbbcf60 --- /dev/null +++ b/distributions/cerebras/build.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/cerebras/build.yaml \ No newline at end of file diff --git a/distributions/cerebras/compose.yaml b/distributions/cerebras/compose.yaml new file mode 100644 index 000000000..f2e9a6f42 --- /dev/null +++ b/distributions/cerebras/compose.yaml @@ -0,0 +1,16 @@ +services: + llamastack: + image: llamastack/distribution-cerebras + network_mode: "host" + volumes: + - ~/.llama:/root/.llama + - ./run.yaml:/root/llamastack-run-cerebras.yaml + ports: + - "5000:5000" + entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-cerebras.yaml" + deploy: + restart_policy: + condition: on-failure + delay: 3s + max_attempts: 5 + window: 60s diff --git a/distributions/cerebras/run.yaml b/distributions/cerebras/run.yaml new file mode 120000 index 000000000..9f9d20b4b --- /dev/null +++ b/distributions/cerebras/run.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/cerebras/run.yaml \ No newline at end of file diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 36426e862..80468cc73 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -1,4 +1,152 @@ { + "tgi": [ + "aiohttp", + "aiosqlite", + "blobfile", + "chardet", + "chromadb-client", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "huggingface_hub", + "matplotlib", + "nltk", + "numpy", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], + "remote-vllm": [ + "aiosqlite", + "blobfile", + "chardet", + "chromadb-client", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "nltk", + "numpy", + "openai", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], + "vllm-gpu": [ + "aiosqlite", + "blobfile", + "chardet", + "chromadb-client", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "nltk", + "numpy", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "vllm", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], + "meta-reference-quantized-gpu": [ + "accelerate", + 
"aiosqlite", + "blobfile", + "chardet", + "chromadb-client", + "fairscale", + "faiss-cpu", + "fastapi", + "fbgemm-gpu", + "fire", + "httpx", + "lm-format-enforcer", + "matplotlib", + "nltk", + "numpy", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "torch", + "torchao==0.5.0", + "torchvision", + "tqdm", + "transformers", + "uvicorn", + "zmq", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], + "meta-reference-gpu": [ + "accelerate", + "aiosqlite", + "blobfile", + "chardet", + "chromadb-client", + "fairscale", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "lm-format-enforcer", + "matplotlib", + "nltk", + "numpy", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "torch", + "torchvision", + "tqdm", + "transformers", + "uvicorn", + "zmq", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], "hf-serverless": [ "aiohttp", "aiosqlite", @@ -54,88 +202,7 @@ "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], - "vllm-gpu": [ - "aiosqlite", - "blobfile", - "chardet", - "chromadb-client", - "faiss-cpu", - "fastapi", - "fire", - "httpx", - "matplotlib", - "nltk", - "numpy", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "scikit-learn", - "scipy", - "sentencepiece", - "tqdm", - "transformers", - "uvicorn", - "vllm", - "sentence-transformers --no-deps", - "torch --index-url https://download.pytorch.org/whl/cpu" - ], - "remote-vllm": [ - "aiosqlite", - "blobfile", - "chardet", - "chromadb-client", - "faiss-cpu", - "fastapi", - "fire", - "httpx", - "matplotlib", - "nltk", - "numpy", - "openai", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "scikit-learn", - "scipy", - "sentencepiece", - "tqdm", - "transformers", - "uvicorn", - "sentence-transformers --no-deps", - "torch --index-url https://download.pytorch.org/whl/cpu" - ], - "fireworks": [ - "aiosqlite", - "blobfile", - "chardet", - "chromadb-client", - "faiss-cpu", - "fastapi", - "fire", - "fireworks-ai", - "httpx", - "matplotlib", - "nltk", - "numpy", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "scikit-learn", - "scipy", - "sentencepiece", - "tqdm", - "transformers", - "uvicorn", - "sentence-transformers --no-deps", - "torch --index-url https://download.pytorch.org/whl/cpu" - ], - "tgi": [ + "ollama": [ "aiohttp", "aiosqlite", "blobfile", @@ -145,10 +212,10 @@ "fastapi", "fire", "httpx", - "huggingface_hub", "matplotlib", "nltk", "numpy", + "ollama", "pandas", "pillow", "psycopg2-binary", @@ -190,100 +257,6 @@ "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], - "meta-reference-gpu": [ - "accelerate", - "aiosqlite", - "blobfile", - "chardet", - "chromadb-client", - "fairscale", - "faiss-cpu", - "fastapi", - "fire", - "httpx", - "lm-format-enforcer", - "matplotlib", - "nltk", - "numpy", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "scikit-learn", - "scipy", - "sentencepiece", - "torch", - "torchvision", - "tqdm", - "transformers", - "uvicorn", - "zmq", - "sentence-transformers --no-deps", - "torch --index-url https://download.pytorch.org/whl/cpu" - ], - "meta-reference-quantized-gpu": [ - "accelerate", - "aiosqlite", - "blobfile", - "chardet", - "chromadb-client", - "fairscale", - "faiss-cpu", - "fastapi", - "fbgemm-gpu", - "fire", - 
"httpx", - "lm-format-enforcer", - "matplotlib", - "nltk", - "numpy", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "scikit-learn", - "scipy", - "sentencepiece", - "torch", - "torchao==0.5.0", - "torchvision", - "tqdm", - "transformers", - "uvicorn", - "zmq", - "sentence-transformers --no-deps", - "torch --index-url https://download.pytorch.org/whl/cpu" - ], - "ollama": [ - "aiohttp", - "aiosqlite", - "blobfile", - "chardet", - "chromadb-client", - "faiss-cpu", - "fastapi", - "fire", - "httpx", - "matplotlib", - "nltk", - "numpy", - "ollama", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "scikit-learn", - "scipy", - "sentencepiece", - "tqdm", - "transformers", - "uvicorn", - "sentence-transformers --no-deps", - "torch --index-url https://download.pytorch.org/whl/cpu" - ], "hf-endpoint": [ "aiohttp", "aiosqlite", @@ -311,5 +284,58 @@ "uvicorn", "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" + ], + "fireworks": [ + "aiosqlite", + "blobfile", + "chardet", + "chromadb-client", + "faiss-cpu", + "fastapi", + "fire", + "fireworks-ai", + "httpx", + "matplotlib", + "nltk", + "numpy", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], + "cerebras": [ + "aiosqlite", + "blobfile", + "cerebras_cloud_sdk", + "chardet", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "nltk", + "numpy", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" ] } diff --git a/docs/source/distributions/building_distro.md b/docs/source/distributions/building_distro.md index a45d07ebf..67d39159c 100644 --- a/docs/source/distributions/building_distro.md +++ b/docs/source/distributions/building_distro.md @@ -66,121 +66,247 @@ llama stack build --list-templates ``` ``` -+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+ -| Template Name | Providers | Description | -+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+ -| hf-serverless | { | Like local, but use Hugging Face Inference API (serverless) for running LLM | -| | "inference": "remote::hf::serverless", | inference. | -| | "memory": "meta-reference", | See https://hf.co/docs/api-inference. 
| -| | "safety": "meta-reference", | | -| | "agents": "meta-reference", | | -| | "telemetry": "meta-reference" | | -| | } | | -+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+ -| together | { | Use Together.ai for running LLM inference | -| | "inference": "remote::together", | | -| | "memory": [ | | -| | "meta-reference", | | -| | "remote::weaviate" | | -| | ], | | -| | "safety": "meta-reference", | | -| | "agents": "meta-reference", | | -| | "telemetry": "meta-reference" | | -| | } | | -+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+ -| fireworks | { | Use Fireworks.ai for running LLM inference | -| | "inference": "remote::fireworks", | | -| | "memory": [ | | -| | "meta-reference", | | -| | "remote::weaviate", | | -| | "remote::chromadb", | | -| | "remote::pgvector" | | -| | ], | | -| | "safety": "meta-reference", | | -| | "agents": "meta-reference", | | -| | "telemetry": "meta-reference" | | -| | } | | -+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+ -| databricks | { | Use Databricks for running LLM inference | -| | "inference": "remote::databricks", | | -| | "memory": "meta-reference", | | -| | "safety": "meta-reference", | | -| | "agents": "meta-reference", | | -| | "telemetry": "meta-reference" | | -| | } | | -+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+ -| vllm | { | Like local, but use vLLM for running LLM inference | -| | "inference": "vllm", | | -| | "memory": "meta-reference", | | -| | "safety": "meta-reference", | | -| | "agents": "meta-reference", | | -| | "telemetry": "meta-reference" | | -| | } | | -+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+ -| tgi | { | Use TGI for running LLM inference | -| | "inference": "remote::tgi", | | -| | "memory": [ | | -| | "meta-reference", | | -| | "remote::chromadb", | | -| | "remote::pgvector" | | -| | ], | | -| | "safety": "meta-reference", | | -| | "agents": "meta-reference", | | -| | "telemetry": "meta-reference" | | -| | } | | -+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+ -| bedrock | { | Use Amazon Bedrock APIs. 
| -| | "inference": "remote::bedrock", | | -| | "memory": "meta-reference", | | -| | "safety": "meta-reference", | | -| | "agents": "meta-reference", | | -| | "telemetry": "meta-reference" | | -| | } | | -+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+ -| meta-reference-gpu | { | Use code from `llama_stack` itself to serve all llama stack APIs | -| | "inference": "meta-reference", | | -| | "memory": [ | | -| | "meta-reference", | | -| | "remote::chromadb", | | -| | "remote::pgvector" | | -| | ], | | -| | "safety": "meta-reference", | | -| | "agents": "meta-reference", | | -| | "telemetry": "meta-reference" | | -| | } | | -+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+ -| meta-reference-quantized-gpu | { | Use code from `llama_stack` itself to serve all llama stack APIs | -| | "inference": "meta-reference-quantized", | | -| | "memory": [ | | -| | "meta-reference", | | -| | "remote::chromadb", | | -| | "remote::pgvector" | | -| | ], | | -| | "safety": "meta-reference", | | -| | "agents": "meta-reference", | | -| | "telemetry": "meta-reference" | | -| | } | | -+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+ -| ollama | { | Use ollama for running LLM inference | -| | "inference": "remote::ollama", | | -| | "memory": [ | | -| | "meta-reference", | | -| | "remote::chromadb", | | -| | "remote::pgvector" | | -| | ], | | -| | "safety": "meta-reference", | | -| | "agents": "meta-reference", | | -| | "telemetry": "meta-reference" | | -| | } | | -+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+ -| hf-endpoint | { | Like local, but use Hugging Face Inference Endpoints for running LLM inference. | -| | "inference": "remote::hf::endpoint", | See https://hf.co/docs/api-endpoints. 
| -| | "memory": "meta-reference", | | -| | "safety": "meta-reference", | | -| | "agents": "meta-reference", | | -| | "telemetry": "meta-reference" | | -| | } | | -+------------------------------+--------------------------------------------+----------------------------------------------------------------------------------+ ++------------------------------+----------------------------------------+-----------------------------------------------------------------------------+ +| Template Name | Providers | Description | ++------------------------------+----------------------------------------+-----------------------------------------------------------------------------+ +| tgi | { | Use (an external) TGI server for running LLM inference | +| | "inference": [ | | +| | "remote::tgi" | | +| | ], | | +| | "memory": [ | | +| | "inline::faiss", | | +| | "remote::chromadb", | | +| | "remote::pgvector" | | +| | ], | | +| | "safety": [ | | +| | "inline::llama-guard" | | +| | ], | | +| | "agents": [ | | +| | "inline::meta-reference" | | +| | ], | | +| | "telemetry": [ | | +| | "inline::meta-reference" | | +| | ] | | +| | } | | ++------------------------------+----------------------------------------+-----------------------------------------------------------------------------+ +| remote-vllm | { | Use (an external) vLLM server for running LLM inference | +| | "inference": [ | | +| | "remote::vllm" | | +| | ], | | +| | "memory": [ | | +| | "inline::faiss", | | +| | "remote::chromadb", | | +| | "remote::pgvector" | | +| | ], | | +| | "safety": [ | | +| | "inline::llama-guard" | | +| | ], | | +| | "agents": [ | | +| | "inline::meta-reference" | | +| | ], | | +| | "telemetry": [ | | +| | "inline::meta-reference" | | +| | ] | | +| | } | | ++------------------------------+----------------------------------------+-----------------------------------------------------------------------------+ +| vllm-gpu | { | Use a built-in vLLM engine for running LLM inference | +| | "inference": [ | | +| | "inline::vllm" | | +| | ], | | +| | "memory": [ | | +| | "inline::faiss", | | +| | "remote::chromadb", | | +| | "remote::pgvector" | | +| | ], | | +| | "safety": [ | | +| | "inline::llama-guard" | | +| | ], | | +| | "agents": [ | | +| | "inline::meta-reference" | | +| | ], | | +| | "telemetry": [ | | +| | "inline::meta-reference" | | +| | ] | | +| | } | | ++------------------------------+----------------------------------------+-----------------------------------------------------------------------------+ +| meta-reference-quantized-gpu | { | Use Meta Reference with fp8, int4 quantization for running LLM inference | +| | "inference": [ | | +| | "inline::meta-reference-quantized" | | +| | ], | | +| | "memory": [ | | +| | "inline::faiss", | | +| | "remote::chromadb", | | +| | "remote::pgvector" | | +| | ], | | +| | "safety": [ | | +| | "inline::llama-guard" | | +| | ], | | +| | "agents": [ | | +| | "inline::meta-reference" | | +| | ], | | +| | "telemetry": [ | | +| | "inline::meta-reference" | | +| | ] | | +| | } | | ++------------------------------+----------------------------------------+-----------------------------------------------------------------------------+ +| meta-reference-gpu | { | Use Meta Reference for running LLM inference | +| | "inference": [ | | +| | "inline::meta-reference" | | +| | ], | | +| | "memory": [ | | +| | "inline::faiss", | | +| | "remote::chromadb", | | +| | "remote::pgvector" | | +| | ], | | +| | "safety": [ | | +| | "inline::llama-guard" | | +| | ], | | +| | "agents": [ | | +| | 
"inline::meta-reference" | | +| | ], | | +| | "telemetry": [ | | +| | "inline::meta-reference" | | +| | ] | | +| | } | | ++------------------------------+----------------------------------------+-----------------------------------------------------------------------------+ +| hf-serverless | { | Use (an external) Hugging Face Inference Endpoint for running LLM inference | +| | "inference": [ | | +| | "remote::hf::serverless" | | +| | ], | | +| | "memory": [ | | +| | "inline::faiss", | | +| | "remote::chromadb", | | +| | "remote::pgvector" | | +| | ], | | +| | "safety": [ | | +| | "inline::llama-guard" | | +| | ], | | +| | "agents": [ | | +| | "inline::meta-reference" | | +| | ], | | +| | "telemetry": [ | | +| | "inline::meta-reference" | | +| | ] | | +| | } | | ++------------------------------+----------------------------------------+-----------------------------------------------------------------------------+ +| together | { | Use Together.AI for running LLM inference | +| | "inference": [ | | +| | "remote::together" | | +| | ], | | +| | "memory": [ | | +| | "inline::faiss", | | +| | "remote::chromadb", | | +| | "remote::pgvector" | | +| | ], | | +| | "safety": [ | | +| | "inline::llama-guard" | | +| | ], | | +| | "agents": [ | | +| | "inline::meta-reference" | | +| | ], | | +| | "telemetry": [ | | +| | "inline::meta-reference" | | +| | ] | | +| | } | | ++------------------------------+----------------------------------------+-----------------------------------------------------------------------------+ +| ollama | { | Use (an external) Ollama server for running LLM inference | +| | "inference": [ | | +| | "remote::ollama" | | +| | ], | | +| | "memory": [ | | +| | "inline::faiss", | | +| | "remote::chromadb", | | +| | "remote::pgvector" | | +| | ], | | +| | "safety": [ | | +| | "inline::llama-guard" | | +| | ], | | +| | "agents": [ | | +| | "inline::meta-reference" | | +| | ], | | +| | "telemetry": [ | | +| | "inline::meta-reference" | | +| | ] | | +| | } | | ++------------------------------+----------------------------------------+-----------------------------------------------------------------------------+ +| bedrock | { | Use AWS Bedrock for running LLM inference and safety | +| | "inference": [ | | +| | "remote::bedrock" | | +| | ], | | +| | "memory": [ | | +| | "inline::faiss", | | +| | "remote::chromadb", | | +| | "remote::pgvector" | | +| | ], | | +| | "safety": [ | | +| | "remote::bedrock" | | +| | ], | | +| | "agents": [ | | +| | "inline::meta-reference" | | +| | ], | | +| | "telemetry": [ | | +| | "inline::meta-reference" | | +| | ] | | +| | } | | ++------------------------------+----------------------------------------+-----------------------------------------------------------------------------+ +| hf-endpoint | { | Use (an external) Hugging Face Inference Endpoint for running LLM inference | +| | "inference": [ | | +| | "remote::hf::endpoint" | | +| | ], | | +| | "memory": [ | | +| | "inline::faiss", | | +| | "remote::chromadb", | | +| | "remote::pgvector" | | +| | ], | | +| | "safety": [ | | +| | "inline::llama-guard" | | +| | ], | | +| | "agents": [ | | +| | "inline::meta-reference" | | +| | ], | | +| | "telemetry": [ | | +| | "inline::meta-reference" | | +| | ] | | +| | } | | ++------------------------------+----------------------------------------+-----------------------------------------------------------------------------+ +| fireworks | { | Use Fireworks.AI for running LLM inference | +| | "inference": [ | | +| | "remote::fireworks" | | +| | ], | | +| | "memory": [ | 
| +| | "inline::faiss", | | +| | "remote::chromadb", | | +| | "remote::pgvector" | | +| | ], | | +| | "safety": [ | | +| | "inline::llama-guard" | | +| | ], | | +| | "agents": [ | | +| | "inline::meta-reference" | | +| | ], | | +| | "telemetry": [ | | +| | "inline::meta-reference" | | +| | ] | | +| | } | | ++------------------------------+----------------------------------------+-----------------------------------------------------------------------------+ +| cerebras | { | Use Cerebras for running LLM inference | +| | "inference": [ | | +| | "remote::cerebras" | | +| | ], | | +| | "safety": [ | | +| | "inline::llama-guard" | | +| | ], | | +| | "memory": [ | | +| | "inline::meta-reference" | | +| | ], | | +| | "agents": [ | | +| | "inline::meta-reference" | | +| | ], | | +| | "telemetry": [ | | +| | "inline::meta-reference" | | +| | ] | | +| | } | | ++------------------------------+----------------------------------------+-----------------------------------------------------------------------------+ ``` You may then pick a template to build your distribution with providers fitted to your liking. diff --git a/docs/source/distributions/self_hosted_distro/cerebras.md b/docs/source/distributions/self_hosted_distro/cerebras.md new file mode 100644 index 000000000..08b35809a --- /dev/null +++ b/docs/source/distributions/self_hosted_distro/cerebras.md @@ -0,0 +1,61 @@ +# Cerebras Distribution + +The `llamastack/distribution-cerebras` distribution consists of the following provider configurations. + +| API | Provider(s) | +|-----|-------------| +| agents | `inline::meta-reference` | +| inference | `remote::cerebras` | +| memory | `inline::meta-reference` | +| safety | `inline::llama-guard` | +| telemetry | `inline::meta-reference` | + + +### Environment Variables + +The following environment variables can be configured: + +- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`) +- `CEREBRAS_API_KEY`: Cerebras API Key (default: ``) + +### Models + +The following models are available by default: + +- `meta-llama/Llama-3.1-8B-Instruct (llama3.1-8b)` +- `meta-llama/Llama-3.1-70B-Instruct (llama3.1-70b)` + + +### Prerequisite: API Keys + +Make sure you have access to a Cerebras API Key. You can get one by visiting [cloud.cerebras.ai](https://cloud.cerebras.ai/). + + +## Running Llama Stack with Cerebras + +You can do this via Conda (build code) or Docker which has a pre-built image. + +### Via Docker + +This method allows you to get started quickly without having to build the distribution code. 
+ +```bash +LLAMA_STACK_PORT=5001 +docker run \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ./run.yaml:/root/my-run.yaml \ + llamastack/distribution-cerebras \ + --yaml-config /root/my-run.yaml \ + --port $LLAMA_STACK_PORT \ + --env CEREBRAS_API_KEY=$CEREBRAS_API_KEY +``` + +### Via Conda + +```bash +llama stack build --template cerebras --image-type conda +llama stack run ./run.yaml \ + --port 5001 \ + --env CEREBRAS_API_KEY=$CEREBRAS_API_KEY +``` diff --git a/docs/source/index.md b/docs/source/index.md index 291237843..abfaf51b4 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -45,6 +45,7 @@ Llama Stack already has a number of "adapters" available for some popular Infere | **API Provider** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | Meta Reference | Single Node | Y | Y | Y | Y | Y | +| Cerebras | Single Node | | Y | | | | | Fireworks | Hosted | Y | Y | Y | | | | AWS Bedrock | Hosted | | Y | | Y | | | Together | Hosted | Y | Y | | Y | | diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py index c8d061f6c..13d463ad8 100644 --- a/llama_stack/providers/registry/inference.py +++ b/llama_stack/providers/registry/inference.py @@ -61,6 +61,17 @@ def available_providers() -> List[ProviderSpec]: config_class="llama_stack.providers.remote.inference.sample.SampleConfig", ), ), + remote_provider_spec( + api=Api.inference, + adapter=AdapterSpec( + adapter_type="cerebras", + pip_packages=[ + "cerebras_cloud_sdk", + ], + module="llama_stack.providers.remote.inference.cerebras", + config_class="llama_stack.providers.remote.inference.cerebras.CerebrasImplConfig", + ), + ), remote_provider_spec( api=Api.inference, adapter=AdapterSpec( diff --git a/llama_stack/providers/remote/inference/cerebras/__init__.py b/llama_stack/providers/remote/inference/cerebras/__init__.py new file mode 100644 index 000000000..a24bb2c70 --- /dev/null +++ b/llama_stack/providers/remote/inference/cerebras/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .config import CerebrasImplConfig + + +async def get_adapter_impl(config: CerebrasImplConfig, _deps): + from .cerebras import CerebrasInferenceAdapter + + assert isinstance( + config, CerebrasImplConfig + ), f"Unexpected config type: {type(config)}" + + impl = CerebrasInferenceAdapter(config) + + await impl.initialize() + + return impl diff --git a/llama_stack/providers/remote/inference/cerebras/cerebras.py b/llama_stack/providers/remote/inference/cerebras/cerebras.py new file mode 100644 index 000000000..65022f85e --- /dev/null +++ b/llama_stack/providers/remote/inference/cerebras/cerebras.py @@ -0,0 +1,191 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from typing import AsyncGenerator + +from cerebras.cloud.sdk import AsyncCerebras + +from llama_models.llama3.api.chat_format import ChatFormat + +from llama_models.llama3.api.datatypes import Message +from llama_models.llama3.api.tokenizer import Tokenizer + +from llama_stack.apis.inference import * # noqa: F403 + +from llama_models.datatypes import CoreModelId + +from llama_stack.providers.utils.inference.model_registry import ( + build_model_alias, + ModelRegistryHelper, +) +from llama_stack.providers.utils.inference.openai_compat import ( + get_sampling_options, + process_chat_completion_response, + process_chat_completion_stream_response, + process_completion_response, + process_completion_stream_response, +) +from llama_stack.providers.utils.inference.prompt_adapter import ( + chat_completion_request_to_prompt, + completion_request_to_prompt, +) + +from .config import CerebrasImplConfig + + +model_aliases = [ + build_model_alias( + "llama3.1-8b", + CoreModelId.llama3_1_8b_instruct.value, + ), + build_model_alias( + "llama3.1-70b", + CoreModelId.llama3_1_70b_instruct.value, + ), +] + + +class CerebrasInferenceAdapter(ModelRegistryHelper, Inference): + def __init__(self, config: CerebrasImplConfig) -> None: + ModelRegistryHelper.__init__( + self, + model_aliases=model_aliases, + ) + self.config = config + self.formatter = ChatFormat(Tokenizer.get_instance()) + + self.client = AsyncCerebras( + base_url=self.config.base_url, api_key=self.config.api_key + ) + + async def initialize(self) -> None: + return + + async def shutdown(self) -> None: + pass + + async def completion( + self, + model_id: str, + content: InterleavedTextMedia, + sampling_params: Optional[SamplingParams] = SamplingParams(), + response_format: Optional[ResponseFormat] = None, + stream: Optional[bool] = False, + logprobs: Optional[LogProbConfig] = None, + ) -> AsyncGenerator: + model = await self.model_store.get_model(model_id) + request = CompletionRequest( + model=model.provider_resource_id, + content=content, + sampling_params=sampling_params, + response_format=response_format, + stream=stream, + logprobs=logprobs, + ) + if stream: + return self._stream_completion( + request, + ) + else: + return await self._nonstream_completion(request) + + async def _nonstream_completion( + self, request: CompletionRequest + ) -> CompletionResponse: + params = self._get_params(request) + + r = await self.client.completions.create(**params) + + return process_completion_response(r, self.formatter) + + async def _stream_completion(self, request: CompletionRequest) -> AsyncGenerator: + params = self._get_params(request) + + stream = await self.client.completions.create(**params) + + async for chunk in process_completion_stream_response(stream, self.formatter): + yield chunk + + async def chat_completion( + self, + model_id: str, + messages: List[Message], + sampling_params: Optional[SamplingParams] = SamplingParams(), + tools: Optional[List[ToolDefinition]] = None, + tool_choice: Optional[ToolChoice] = ToolChoice.auto, + tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json, + response_format: Optional[ResponseFormat] = None, + stream: Optional[bool] = False, + logprobs: Optional[LogProbConfig] = None, + ) -> AsyncGenerator: + model = await self.model_store.get_model(model_id) + request = ChatCompletionRequest( + model=model.provider_resource_id, + messages=messages, + sampling_params=sampling_params, + tools=tools or [], + tool_choice=tool_choice, + tool_prompt_format=tool_prompt_format, + 
response_format=response_format, + stream=stream, + logprobs=logprobs, + ) + + if stream: + return self._stream_chat_completion(request) + else: + return await self._nonstream_chat_completion(request) + + async def _nonstream_chat_completion( + self, request: CompletionRequest + ) -> CompletionResponse: + params = self._get_params(request) + + r = await self.client.completions.create(**params) + + return process_chat_completion_response(r, self.formatter) + + async def _stream_chat_completion( + self, request: CompletionRequest + ) -> AsyncGenerator: + params = self._get_params(request) + + stream = await self.client.completions.create(**params) + + async for chunk in process_chat_completion_stream_response( + stream, self.formatter + ): + yield chunk + + def _get_params( + self, request: Union[ChatCompletionRequest, CompletionRequest] + ) -> dict: + if request.sampling_params and request.sampling_params.top_k: + raise ValueError("`top_k` not supported by Cerebras") + + prompt = "" + if type(request) == ChatCompletionRequest: + prompt = chat_completion_request_to_prompt( + request, self.get_llama_model(request.model), self.formatter + ) + elif type(request) == CompletionRequest: + prompt = completion_request_to_prompt(request, self.formatter) + else: + raise ValueError(f"Unknown request type {type(request)}") + + return { + "model": request.model, + "prompt": prompt, + "stream": request.stream, + **get_sampling_options(request.sampling_params), + } + + async def embeddings( + self, + model_id: str, + contents: List[InterleavedTextMedia], + ) -> EmbeddingsResponse: + raise NotImplementedError() diff --git a/llama_stack/providers/remote/inference/cerebras/config.py b/llama_stack/providers/remote/inference/cerebras/config.py new file mode 100644 index 000000000..9bae6ca4d --- /dev/null +++ b/llama_stack/providers/remote/inference/cerebras/config.py @@ -0,0 +1,32 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +import os +from typing import Any, Dict, Optional + +from llama_models.schema_utils import json_schema_type +from pydantic import BaseModel, Field + +DEFAULT_BASE_URL = "https://api.cerebras.ai" + + +@json_schema_type +class CerebrasImplConfig(BaseModel): + base_url: str = Field( + default=os.environ.get("CEREBRAS_BASE_URL", DEFAULT_BASE_URL), + description="Base URL for the Cerebras API", + ) + api_key: Optional[str] = Field( + default=os.environ.get("CEREBRAS_API_KEY"), + description="Cerebras API Key", + ) + + @classmethod + def sample_run_config(cls, **kwargs) -> Dict[str, Any]: + return { + "base_url": DEFAULT_BASE_URL, + "api_key": "${env.CEREBRAS_API_KEY}", + } diff --git a/llama_stack/providers/tests/inference/fixtures.py b/llama_stack/providers/tests/inference/fixtures.py index a427eef12..21e122149 100644 --- a/llama_stack/providers/tests/inference/fixtures.py +++ b/llama_stack/providers/tests/inference/fixtures.py @@ -17,6 +17,7 @@ from llama_stack.providers.inline.inference.meta_reference import ( ) from llama_stack.providers.remote.inference.bedrock import BedrockConfig +from llama_stack.providers.remote.inference.cerebras import CerebrasImplConfig from llama_stack.providers.remote.inference.fireworks import FireworksImplConfig from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig from llama_stack.providers.remote.inference.ollama import OllamaImplConfig @@ -64,6 +65,21 @@ def inference_meta_reference(inference_model) -> ProviderFixture: ) +@pytest.fixture(scope="session") +def inference_cerebras() -> ProviderFixture: + return ProviderFixture( + providers=[ + Provider( + provider_id="cerebras", + provider_type="remote::cerebras", + config=CerebrasImplConfig( + api_key=get_env_or_fail("CEREBRAS_API_KEY"), + ).model_dump(), + ) + ], + ) + + @pytest.fixture(scope="session") def inference_ollama(inference_model) -> ProviderFixture: inference_model = ( @@ -206,6 +222,7 @@ INFERENCE_FIXTURES = [ "vllm_remote", "remote", "bedrock", + "cerebras", "nvidia", "tgi", ] diff --git a/llama_stack/providers/tests/inference/test_text_inference.py b/llama_stack/providers/tests/inference/test_text_inference.py index 9e5c67375..aa2f0b413 100644 --- a/llama_stack/providers/tests/inference/test_text_inference.py +++ b/llama_stack/providers/tests/inference/test_text_inference.py @@ -94,6 +94,7 @@ class TestInference: "remote::tgi", "remote::together", "remote::fireworks", + "remote::cerebras", ): pytest.skip("Other inference providers don't support completion() yet") @@ -139,6 +140,7 @@ class TestInference: "remote::tgi", "remote::together", "remote::fireworks", + "remote::cerebras", ): pytest.skip( "Other inference providers don't support structured output in completions yet" diff --git a/llama_stack/templates/cerebras/__init__.py b/llama_stack/templates/cerebras/__init__.py new file mode 100644 index 000000000..9f9929b52 --- /dev/null +++ b/llama_stack/templates/cerebras/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from .cerebras import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/cerebras/build.yaml b/llama_stack/templates/cerebras/build.yaml new file mode 100644 index 000000000..a1fe93099 --- /dev/null +++ b/llama_stack/templates/cerebras/build.yaml @@ -0,0 +1,17 @@ +version: '2' +name: cerebras +distribution_spec: + description: Use Cerebras for running LLM inference + docker_image: null + providers: + inference: + - remote::cerebras + safety: + - inline::llama-guard + memory: + - inline::meta-reference + agents: + - inline::meta-reference + telemetry: + - inline::meta-reference +image_type: conda diff --git a/llama_stack/templates/cerebras/cerebras.py b/llama_stack/templates/cerebras/cerebras.py new file mode 100644 index 000000000..58e05adf8 --- /dev/null +++ b/llama_stack/templates/cerebras/cerebras.py @@ -0,0 +1,71 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from pathlib import Path + +from llama_models.sku_list import all_registered_models + +from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput +from llama_stack.providers.remote.inference.cerebras import CerebrasImplConfig +from llama_stack.providers.remote.inference.cerebras.cerebras import model_aliases + +from llama_stack.templates.template import DistributionTemplate, RunConfigSettings + + +def get_distribution_template() -> DistributionTemplate: + providers = { + "inference": ["remote::cerebras"], + "safety": ["inline::llama-guard"], + "memory": ["inline::meta-reference"], + "agents": ["inline::meta-reference"], + "telemetry": ["inline::meta-reference"], + } + + inference_provider = Provider( + provider_id="cerebras", + provider_type="remote::cerebras", + config=CerebrasImplConfig.sample_run_config(), + ) + + core_model_to_hf_repo = { + m.descriptor(): m.huggingface_repo for m in all_registered_models() + } + default_models = [ + ModelInput( + model_id=core_model_to_hf_repo[m.llama_model], + provider_model_id=m.provider_model_id, + ) + for m in model_aliases + ] + + return DistributionTemplate( + name="cerebras", + distro_type="self_hosted", + description="Use Cerebras for running LLM inference", + docker_image=None, + template_path=Path(__file__).parent / "doc_template.md", + providers=providers, + default_models=default_models, + run_configs={ + "run.yaml": RunConfigSettings( + provider_overrides={ + "inference": [inference_provider], + }, + default_models=default_models, + default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")], + ), + }, + run_config_env_vars={ + "LLAMASTACK_PORT": ( + "5001", + "Port for the Llama Stack distribution server", + ), + "CEREBRAS_API_KEY": ( + "", + "Cerebras API Key", + ), + }, + ) diff --git a/llama_stack/templates/cerebras/doc_template.md b/llama_stack/templates/cerebras/doc_template.md new file mode 100644 index 000000000..77fc6f478 --- /dev/null +++ b/llama_stack/templates/cerebras/doc_template.md @@ -0,0 +1,60 @@ +# Cerebras Distribution + +The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations. 
+ +{{ providers_table }} + +{% if run_config_env_vars %} +### Environment Variables + +The following environment variables can be configured: + +{% for var, (default_value, description) in run_config_env_vars.items() %} +- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) +{% endfor %} +{% endif %} + +{% if default_models %} +### Models + +The following models are available by default: + +{% for model in default_models %} +- `{{ model.model_id }} ({{ model.provider_model_id }})` +{% endfor %} +{% endif %} + + +### Prerequisite: API Keys + +Make sure you have access to a Cerebras API Key. You can get one by visiting [cloud.cerebras.ai](https://cloud.cerebras.ai/). + + +## Running Llama Stack with Cerebras + +You can do this via Conda (build code) or Docker which has a pre-built image. + +### Via Docker + +This method allows you to get started quickly without having to build the distribution code. + +```bash +LLAMA_STACK_PORT=5001 +docker run \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ./run.yaml:/root/my-run.yaml \ + llamastack/distribution-{{ name }} \ + --yaml-config /root/my-run.yaml \ + --port $LLAMA_STACK_PORT \ + --env CEREBRAS_API_KEY=$CEREBRAS_API_KEY +``` + +### Via Conda + +```bash +llama stack build --template cerebras --image-type conda +llama stack run ./run.yaml \ + --port 5001 \ + --env CEREBRAS_API_KEY=$CEREBRAS_API_KEY +``` diff --git a/llama_stack/templates/cerebras/run.yaml b/llama_stack/templates/cerebras/run.yaml new file mode 100644 index 000000000..0b41f5b76 --- /dev/null +++ b/llama_stack/templates/cerebras/run.yaml @@ -0,0 +1,63 @@ +version: '2' +image_name: cerebras +docker_image: null +conda_env: cerebras +apis: +- agents +- inference +- memory +- safety +- telemetry +providers: + inference: + - provider_id: cerebras + provider_type: remote::cerebras + config: + base_url: https://api.cerebras.ai + api_key: ${env.CEREBRAS_API_KEY} + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: {} + memory: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/cerebras}/faiss_store.db + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/cerebras}/agents_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} +metadata_store: + namespace: null + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/cerebras}/registry.db +models: +- metadata: {} + model_id: meta-llama/Llama-3.1-8B-Instruct + provider_id: null + provider_model_id: llama3.1-8b +- metadata: {} + model_id: meta-llama/Llama-3.1-70B-Instruct + provider_id: null + provider_model_id: llama3.1-70b +shields: +- params: null + shield_id: meta-llama/Llama-Guard-3-8B + provider_id: null + provider_shield_id: null +memory_banks: [] +datasets: [] +scoring_fns: [] +eval_tasks: [] From caf1dac1145193846c0c77a93af3c4669dc5575d Mon Sep 17 00:00:00 2001 From: Sixian Yi Date: Tue, 3 Dec 2024 21:18:30 -0800 Subject: [PATCH 17/69] unregister API for dataset (#507) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? 
1) Implement `unregister_dataset(dataset_id)` API in both llama stack routing table and providers: It removes {dataset_id -> Dataset} mapping from routing table and removes the dataset_id references in provider as well (ex. for huggingface, we use a KV store to store the dataset id => dataset. we delete it during unregistering as well) 2) expose the datasets/unregister_dataset api endpoint ## Test Plan **Unit test:** ` pytest llama_stack/providers/tests/datasetio/test_datasetio.py -m "huggingface" -v -s --tb=short --disable-warnings ` **Test on endpoint:** tested llama stack using an ollama distribution template: 1) start an ollama server 2) Start a llama stack server with the default ollama distribution config + dataset/datasetsio APIs + datasetio provider ``` ---- .../ollama-run.yaml ... apis: - agents - inference - memory - safety - telemetry - datasetio - datasets providers: datasetio: - provider_id: localfs provider_type: inline::localfs config: {} ... ``` saw that the new API showed up in startup script ``` Serving API datasets GET /alpha/datasets/get GET /alpha/datasets/list POST /alpha/datasets/register POST /alpha/datasets/unregister ``` 3) query `/alpha/datasets/unregister` through curl (since we have not implemented unregister api in llama stack client) ``` (base) sxyi@sxyi-mbp llama-stack % llama-stack-client datasets register --dataset-id sixian --url https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/chat.rst --schema {} (base) sxyi@sxyi-mbp llama-stack % llama-stack-client datasets list ┏━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┓ ┃ identifier ┃ provider_id ┃ metadata ┃ type ┃ ┡━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━┩ │ sixian │ localfs │ {} │ dataset │ └────────────┴─────────────┴──────────┴─────────┘ (base) sxyi@sxyi-mbp llama-stack % llama-stack-client datasets register --dataset-id sixian2 --url https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/chat.rst --schema {} (base) sxyi@sxyi-mbp llama-stack % llama-stack-client datasets list ┏━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┓ ┃ identifier ┃ provider_id ┃ metadata ┃ type ┃ ┡━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━┩ │ sixian │ localfs │ {} │ dataset │ │ sixian2 │ localfs │ {} │ dataset │ └────────────┴─────────────┴──────────┴─────────┘ (base) sxyi@sxyi-mbp llama-stack % curl http://localhost:5001/alpha/datasets/unregister \ -H "Content-Type: application/json" \ -d '{"dataset_id": "sixian"}' null% (base) sxyi@sxyi-mbp llama-stack % llama-stack-client datasets list ┏━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━┓ ┃ identifier ┃ provider_id ┃ metadata ┃ type ┃ ┡━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━┩ │ sixian2 │ localfs │ {} │ dataset │ └────────────┴─────────────┴──────────┴─────────┘ (base) sxyi@sxyi-mbp llama-stack % curl http://localhost:5001/alpha/datasets/unregister \ -H "Content-Type: application/json" \ -d '{"dataset_id": "sixian2"}' null% (base) sxyi@sxyi-mbp llama-stack % llama-stack-client datasets list ``` ## Sources ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. 
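For callers that prefer Python over raw curl, the same unregister call from the test plan above can be issued with `httpx` (the HTTP client already used by `DatasetsClient`). This is a minimal sketch, not part of the PR; the base URL, port, and dataset id are the ones assumed in the test plan.

```python
# Minimal sketch mirroring the curl call in the test plan above.
# Assumes a stack server on localhost:5001 and a registered dataset "sixian".
import httpx

response = httpx.post(
    "http://localhost:5001/alpha/datasets/unregister",
    json={"dataset_id": "sixian"},
    timeout=60,
)
response.raise_for_status()  # 200 OK means the dataset mapping was removed
```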
--- docs/resources/llama-stack-spec.html | 50 +++++++++++++++++++ docs/resources/llama-stack-spec.yaml | 33 ++++++++++++ llama_stack/apis/datasets/client.py | 15 ++++++ llama_stack/apis/datasets/datasets.py | 6 +++ .../distribution/routers/routing_tables.py | 8 +++ llama_stack/providers/datatypes.py | 2 + .../inline/datasetio/localfs/datasetio.py | 3 ++ .../datasetio/huggingface/huggingface.py | 5 ++ .../tests/datasetio/test_datasetio.py | 12 +++++ 9 files changed, 134 insertions(+) diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html index 090253804..4f220ea1e 100644 --- a/docs/resources/llama-stack-spec.html +++ b/docs/resources/llama-stack-spec.html @@ -2291,6 +2291,39 @@ "required": true } } + }, + "/alpha/datasets/unregister": { + "post": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "Datasets" + ], + "parameters": [ + { + "name": "X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/UnregisterDatasetRequest" + } + } + }, + "required": true + } + } } }, "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema", @@ -7917,6 +7950,18 @@ "required": [ "model_id" ] + }, + "UnregisterDatasetRequest": { + "type": "object", + "properties": { + "dataset_id": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "dataset_id" + ] } }, "responses": {} @@ -8529,6 +8574,10 @@ "name": "UnregisterModelRequest", "description": "" }, + { + "name": "UnregisterDatasetRequest", + "description": "" + }, { "name": "UnstructuredLogEvent", "description": "" @@ -8718,6 +8767,7 @@ "URL", "UnregisterMemoryBankRequest", "UnregisterModelRequest", + "UnregisterDatasetRequest", "UnstructuredLogEvent", "UserMessage", "VectorMemoryBank", diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml index 8ffd9fdef..6564ddf3f 100644 --- a/docs/resources/llama-stack-spec.yaml +++ b/docs/resources/llama-stack-spec.yaml @@ -3253,6 +3253,14 @@ components: required: - model_id type: object + UnregisterDatasetRequest: + additionalProperties: false + properties: + dataset_id: + type: string + required: + - dataset_id + type: object UnstructuredLogEvent: additionalProperties: false properties: @@ -3789,6 +3797,27 @@ paths: description: OK tags: - Datasets + /alpha/datasets/unregister: + post: + parameters: + - description: JSON-encoded provider data which will be made available to the + adapter servicing the API + in: header + name: X-LlamaStack-ProviderData + required: false + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/UnregisterDatasetRequest' + required: true + responses: + '200': + description: OK + tags: + - Datasets /alpha/eval-tasks/get: get: parameters: @@ -5242,6 +5271,9 @@ tags: - description: name: UnregisterModelRequest +- description: + name: UnregisterDatasetRequest - description: name: UnstructuredLogEvent @@ -5418,6 +5450,7 @@ x-tagGroups: - URL - UnregisterMemoryBankRequest - UnregisterModelRequest + - UnregisterDatasetRequest - UnstructuredLogEvent - UserMessage - VectorMemoryBank diff --git a/llama_stack/apis/datasets/client.py b/llama_stack/apis/datasets/client.py index 9e5891e74..c379a49fb 100644 --- a/llama_stack/apis/datasets/client.py +++ 
b/llama_stack/apis/datasets/client.py @@ -78,6 +78,21 @@ class DatasetsClient(Datasets): return [DatasetDefWithProvider(**x) for x in response.json()] + async def unregister_dataset( + self, + dataset_id: str, + ) -> None: + async with httpx.AsyncClient() as client: + response = await client.delete( + f"{self.base_url}/datasets/unregister", + params={ + "dataset_id": dataset_id, + }, + headers={"Content-Type": "application/json"}, + timeout=60, + ) + response.raise_for_status() + async def run_main(host: str, port: int): client = DatasetsClient(f"http://{host}:{port}") diff --git a/llama_stack/apis/datasets/datasets.py b/llama_stack/apis/datasets/datasets.py index 2ab958782..e1ac4af21 100644 --- a/llama_stack/apis/datasets/datasets.py +++ b/llama_stack/apis/datasets/datasets.py @@ -64,3 +64,9 @@ class Datasets(Protocol): @webmethod(route="/datasets/list", method="GET") async def list_datasets(self) -> List[Dataset]: ... + + @webmethod(route="/datasets/unregister", method="POST") + async def unregister_dataset( + self, + dataset_id: str, + ) -> None: ... diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 4df693b26..2fb5a5e1c 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -57,6 +57,8 @@ async def unregister_object_from_provider(obj: RoutableObject, p: Any) -> None: return await p.unregister_memory_bank(obj.identifier) elif api == Api.inference: return await p.unregister_model(obj.identifier) + elif api == Api.datasetio: + return await p.unregister_dataset(obj.identifier) else: raise ValueError(f"Unregister not supported for {api}") @@ -354,6 +356,12 @@ class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets): ) await self.register_object(dataset) + async def unregister_dataset(self, dataset_id: str) -> None: + dataset = await self.get_dataset(dataset_id) + if dataset is None: + raise ValueError(f"Dataset {dataset_id} not found") + await self.unregister_object(dataset) + class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions): async def list_scoring_functions(self) -> List[ScoringFn]: diff --git a/llama_stack/providers/datatypes.py b/llama_stack/providers/datatypes.py index 080204e45..8e89bcc72 100644 --- a/llama_stack/providers/datatypes.py +++ b/llama_stack/providers/datatypes.py @@ -63,6 +63,8 @@ class MemoryBanksProtocolPrivate(Protocol): class DatasetsProtocolPrivate(Protocol): async def register_dataset(self, dataset: Dataset) -> None: ... + async def unregister_dataset(self, dataset_id: str) -> None: ... + class ScoringFunctionsProtocolPrivate(Protocol): async def list_scoring_functions(self) -> List[ScoringFn]: ... 
diff --git a/llama_stack/providers/inline/datasetio/localfs/datasetio.py b/llama_stack/providers/inline/datasetio/localfs/datasetio.py index 4de1850ae..010610056 100644 --- a/llama_stack/providers/inline/datasetio/localfs/datasetio.py +++ b/llama_stack/providers/inline/datasetio/localfs/datasetio.py @@ -97,6 +97,9 @@ class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate): dataset_impl=dataset_impl, ) + async def unregister_dataset(self, dataset_id: str) -> None: + del self.dataset_infos[dataset_id] + async def get_rows_paginated( self, dataset_id: str, diff --git a/llama_stack/providers/remote/datasetio/huggingface/huggingface.py b/llama_stack/providers/remote/datasetio/huggingface/huggingface.py index c2e4506bf..cdd5d9cd3 100644 --- a/llama_stack/providers/remote/datasetio/huggingface/huggingface.py +++ b/llama_stack/providers/remote/datasetio/huggingface/huggingface.py @@ -64,6 +64,11 @@ class HuggingfaceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate): ) self.dataset_infos[dataset_def.identifier] = dataset_def + async def unregister_dataset(self, dataset_id: str) -> None: + key = f"{DATASETS_PREFIX}{dataset_id}" + await self.kvstore.delete(key=key) + del self.dataset_infos[dataset_id] + async def get_rows_paginated( self, dataset_id: str, diff --git a/llama_stack/providers/tests/datasetio/test_datasetio.py b/llama_stack/providers/tests/datasetio/test_datasetio.py index dd2cbd019..7d88b6115 100644 --- a/llama_stack/providers/tests/datasetio/test_datasetio.py +++ b/llama_stack/providers/tests/datasetio/test_datasetio.py @@ -81,6 +81,18 @@ class TestDatasetIO: assert len(response) == 1 assert response[0].identifier == "test_dataset" + with pytest.raises(Exception) as exc_info: + # unregister a dataset that does not exist + await datasets_impl.unregister_dataset("test_dataset2") + + await datasets_impl.unregister_dataset("test_dataset") + response = await datasets_impl.list_datasets() + assert isinstance(response, list) + assert len(response) == 0 + + with pytest.raises(Exception) as exc_info: + await datasets_impl.unregister_dataset("test_dataset") + @pytest.mark.asyncio async def test_get_rows_paginated(self, datasetio_stack): datasetio_impl, datasets_impl = datasetio_stack From 16769256b7d1f7ffadc09480eb2c8e1367fc2c8b Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 4 Dec 2024 09:47:09 -0800 Subject: [PATCH 18/69] [llama stack ui] add native eval & inspect distro & playground pages (#541) # What does this PR do? New Pages Added: - (1) Inspect Distro - (2) Evaluations: - (a) native evaluations (including generation) - (b) application evaluations (no generation, scoring only) - (3) Playground: - (a) chat - (b) RAG ## Test Plan ``` streamlit run app.py ``` #### Playground https://github.com/user-attachments/assets/6ca617e8-32ca-49b2-9774-185020ff5204 #### Inspect https://github.com/user-attachments/assets/01d52b2d-92af-4e3a-b623-a9b8ba22ba99 #### Evaluations (Generation + Scoring) https://github.com/user-attachments/assets/345845c7-2a2b-4095-960a-9ae40f6a93cf #### Evaluations (Scoring) https://github.com/user-attachments/assets/6cc1659f-eba4-49ca-a0a5-7c243557b4f5 ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. 
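To make the test plan reproducible end to end: the Streamlit app talks to a running stack through the shared client in `modules/api.py` (see the diff below), which reads `LLAMA_STACK_ENDPOINT` (default `http://localhost:5000`). A possible launch sequence follows; the dependency install step is an assumption rather than something this PR documents.

```bash
# Start a Llama Stack server first (see the Prerequisite section added to the UI README).
cd llama_stack/distribution/ui
pip install -r requirements.txt                     # assumed install step; requirements.txt lives in this directory
export LLAMA_STACK_ENDPOINT=http://localhost:5000   # default used by modules/api.py
export OPENAI_API_KEY=<your-key>                    # optional; only needed by scoring providers that expect an OpenAI key
streamlit run app.py
```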
--- llama_stack/distribution/ui/README.md | 6 + llama_stack/distribution/ui/app.py | 196 +++---------- .../distribution/ui/modules/__init__.py | 5 + llama_stack/distribution/ui/modules/api.py | 13 +- llama_stack/distribution/ui/modules/utils.py | 11 + llama_stack/distribution/ui/page/__init__.py | 5 + .../ui/page/distribution/datasets.py | 19 ++ .../ui/page/distribution/eval_tasks.py | 22 ++ .../ui/page/distribution/memory_banks.py | 23 ++ .../ui/page/distribution/models.py | 19 ++ .../ui/page/distribution/providers.py | 20 ++ .../ui/page/distribution/resources.py | 52 ++++ .../ui/page/distribution/scoring_functions.py | 22 ++ .../ui/page/distribution/shields.py | 20 ++ .../ui/page/evaluations/__init__.py | 5 + .../ui/page/evaluations/app_eval.py | 148 ++++++++++ .../ui/page/evaluations/native_eval.py | 257 ++++++++++++++++++ .../ui/page/playground/__init__.py | 5 + .../distribution/ui/page/playground/chat.py | 123 +++++++++ .../distribution/ui/page/playground/rag.py | 188 +++++++++++++ llama_stack/distribution/ui/requirements.txt | 1 + .../scoring_fn/fn_defs/llm_as_judge_base.py | 6 +- 22 files changed, 1000 insertions(+), 166 deletions(-) create mode 100644 llama_stack/distribution/ui/modules/__init__.py create mode 100644 llama_stack/distribution/ui/page/__init__.py create mode 100644 llama_stack/distribution/ui/page/distribution/datasets.py create mode 100644 llama_stack/distribution/ui/page/distribution/eval_tasks.py create mode 100644 llama_stack/distribution/ui/page/distribution/memory_banks.py create mode 100644 llama_stack/distribution/ui/page/distribution/models.py create mode 100644 llama_stack/distribution/ui/page/distribution/providers.py create mode 100644 llama_stack/distribution/ui/page/distribution/resources.py create mode 100644 llama_stack/distribution/ui/page/distribution/scoring_functions.py create mode 100644 llama_stack/distribution/ui/page/distribution/shields.py create mode 100644 llama_stack/distribution/ui/page/evaluations/__init__.py create mode 100644 llama_stack/distribution/ui/page/evaluations/app_eval.py create mode 100644 llama_stack/distribution/ui/page/evaluations/native_eval.py create mode 100644 llama_stack/distribution/ui/page/playground/__init__.py create mode 100644 llama_stack/distribution/ui/page/playground/chat.py create mode 100644 llama_stack/distribution/ui/page/playground/rag.py diff --git a/llama_stack/distribution/ui/README.md b/llama_stack/distribution/ui/README.md index a91883067..2cc352c52 100644 --- a/llama_stack/distribution/ui/README.md +++ b/llama_stack/distribution/ui/README.md @@ -2,6 +2,12 @@ [!NOTE] This is a work in progress. +## Prerequisite +- Start up Llama Stack Server +``` +llama stack run +``` + ## Running Streamlit App ``` diff --git a/llama_stack/distribution/ui/app.py b/llama_stack/distribution/ui/app.py index 763b126a7..87a80e235 100644 --- a/llama_stack/distribution/ui/app.py +++ b/llama_stack/distribution/ui/app.py @@ -3,170 +3,54 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
- -import json - -import pandas as pd - import streamlit as st -from modules.api import LlamaStackEvaluation - -from modules.utils import process_dataset - -EVALUATION_API = LlamaStackEvaluation() - def main(): - # Add collapsible sidebar - with st.sidebar: - # Add collapse button - if "sidebar_state" not in st.session_state: - st.session_state.sidebar_state = True - - if st.session_state.sidebar_state: - st.title("Navigation") - page = st.radio( - "Select a Page", - ["Application Evaluation"], - index=0, - ) - else: - page = "Application Evaluation" # Default page when sidebar is collapsed - - # Main content area - st.title("🦙 Llama Stack Evaluations") - - if page == "Application Evaluation": - application_evaluation_page() - - -def application_evaluation_page(): - # File uploader - uploaded_file = st.file_uploader("Upload Dataset", type=["csv", "xlsx", "xls"]) - - if uploaded_file is None: - st.error("No file uploaded") - return - - # Process uploaded file - df = process_dataset(uploaded_file) - if df is None: - st.error("Error processing file") - return - - # Display dataset information - st.success("Dataset loaded successfully!") - - # Display dataframe preview - st.subheader("Dataset Preview") - st.dataframe(df) - - # Select Scoring Functions to Run Evaluation On - st.subheader("Select Scoring Functions") - scoring_functions = EVALUATION_API.list_scoring_functions() - scoring_functions = {sf.identifier: sf for sf in scoring_functions} - scoring_functions_names = list(scoring_functions.keys()) - selected_scoring_functions = st.multiselect( - "Choose one or more scoring functions", - options=scoring_functions_names, - help="Choose one or more scoring functions.", + # Evaluation pages + application_evaluation_page = st.Page( + "page/evaluations/app_eval.py", + title="Evaluations (Scoring)", + icon="📊", + default=False, + ) + native_evaluation_page = st.Page( + "page/evaluations/native_eval.py", + title="Evaluations (Generation + Scoring)", + icon="📊", + default=False, ) - available_models = EVALUATION_API.list_models() - available_models = [m.identifier for m in available_models] + # Playground pages + chat_page = st.Page( + "page/playground/chat.py", title="Chat", icon="💬", default=True + ) + rag_page = st.Page("page/playground/rag.py", title="RAG", icon="💬", default=False) - scoring_params = {} - if selected_scoring_functions: - st.write("Selected:") - for scoring_fn_id in selected_scoring_functions: - scoring_fn = scoring_functions[scoring_fn_id] - st.write(f"- **{scoring_fn_id}**: {scoring_fn.description}") - new_params = None - if scoring_fn.params: - new_params = {} - for param_name, param_value in scoring_fn.params.to_dict().items(): - if param_name == "type": - new_params[param_name] = param_value - continue + # Distribution pages + resources_page = st.Page( + "page/distribution/resources.py", title="Resources", icon="🔍", default=False + ) + provider_page = st.Page( + "page/distribution/providers.py", + title="API Providers", + icon="🔍", + default=False, + ) - if param_name == "judge_model": - value = st.selectbox( - f"Select **{param_name}** for {scoring_fn_id}", - options=available_models, - index=0, - key=f"{scoring_fn_id}_{param_name}", - ) - new_params[param_name] = value - else: - value = st.text_area( - f"Enter value for **{param_name}** in {scoring_fn_id} in valid JSON format", - value=json.dumps(param_value, indent=2), - height=80, - ) - try: - new_params[param_name] = json.loads(value) - except json.JSONDecodeError: - st.error( - f"Invalid JSON for **{param_name}** in 
{scoring_fn_id}" - ) - - st.json(new_params) - scoring_params[scoring_fn_id] = new_params - - # Add run evaluation button & slider - total_rows = len(df) - num_rows = st.slider("Number of rows to evaluate", 1, total_rows, total_rows) - - if st.button("Run Evaluation"): - progress_text = "Running evaluation..." - progress_bar = st.progress(0, text=progress_text) - rows = df.to_dict(orient="records") - if num_rows < total_rows: - rows = rows[:num_rows] - - # Create separate containers for progress text and results - progress_text_container = st.empty() - results_container = st.empty() - output_res = {} - for i, r in enumerate(rows): - # Update progress - progress = i / len(rows) - progress_bar.progress(progress, text=progress_text) - - # Run evaluation for current row - score_res = EVALUATION_API.run_scoring( - r, - scoring_function_ids=selected_scoring_functions, - scoring_params=scoring_params, - ) - - for k in r.keys(): - if k not in output_res: - output_res[k] = [] - output_res[k].append(r[k]) - - for fn_id in selected_scoring_functions: - if fn_id not in output_res: - output_res[fn_id] = [] - output_res[fn_id].append(score_res.results[fn_id].score_rows[0]) - - # Display current row results using separate containers - progress_text_container.write( - f"Expand to see current processed result ({i+1}/{len(rows)})" - ) - results_container.json( - score_res.to_json(), - expanded=2, - ) - - progress_bar.progress(1.0, text="Evaluation complete!") - - # Display results in dataframe - if output_res: - output_df = pd.DataFrame(output_res) - st.subheader("Evaluation Results") - st.dataframe(output_df) + pg = st.navigation( + { + "Playground": [ + chat_page, + rag_page, + application_evaluation_page, + native_evaluation_page, + ], + "Inspect": [provider_page, resources_page], + }, + expanded=False, + ) + pg.run() if __name__ == "__main__": diff --git a/llama_stack/distribution/ui/modules/__init__.py b/llama_stack/distribution/ui/modules/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/llama_stack/distribution/ui/modules/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
diff --git a/llama_stack/distribution/ui/modules/api.py b/llama_stack/distribution/ui/modules/api.py index a8d8bf37d..d3852caee 100644 --- a/llama_stack/distribution/ui/modules/api.py +++ b/llama_stack/distribution/ui/modules/api.py @@ -11,7 +11,7 @@ from typing import Optional from llama_stack_client import LlamaStackClient -class LlamaStackEvaluation: +class LlamaStackApi: def __init__(self): self.client = LlamaStackClient( base_url=os.environ.get("LLAMA_STACK_ENDPOINT", "http://localhost:5000"), @@ -22,14 +22,6 @@ class LlamaStackEvaluation: }, ) - def list_scoring_functions(self): - """List all available scoring functions""" - return self.client.scoring_functions.list() - - def list_models(self): - """List all available judge models""" - return self.client.models.list() - def run_scoring( self, row, scoring_function_ids: list[str], scoring_params: Optional[dict] ): @@ -39,3 +31,6 @@ class LlamaStackEvaluation: return self.client.scoring.score( input_rows=[row], scoring_functions=scoring_params ) + + +llama_stack_api = LlamaStackApi() diff --git a/llama_stack/distribution/ui/modules/utils.py b/llama_stack/distribution/ui/modules/utils.py index f8da2e54e..67cce98fa 100644 --- a/llama_stack/distribution/ui/modules/utils.py +++ b/llama_stack/distribution/ui/modules/utils.py @@ -4,6 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +import base64 import os import pandas as pd @@ -29,3 +30,13 @@ def process_dataset(file): except Exception as e: st.error(f"Error processing file: {str(e)}") return None + + +def data_url_from_file(file) -> str: + file_content = file.getvalue() + base64_content = base64.b64encode(file_content).decode("utf-8") + mime_type = file.type + + data_url = f"data:{mime_type};base64,{base64_content}" + + return data_url diff --git a/llama_stack/distribution/ui/page/__init__.py b/llama_stack/distribution/ui/page/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/llama_stack/distribution/ui/page/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/llama_stack/distribution/ui/page/distribution/datasets.py b/llama_stack/distribution/ui/page/distribution/datasets.py new file mode 100644 index 000000000..44e314cde --- /dev/null +++ b/llama_stack/distribution/ui/page/distribution/datasets.py @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import streamlit as st +from modules.api import llama_stack_api + + +def datasets(): + st.header("Datasets") + + datasets_info = { + d.identifier: d.to_dict() for d in llama_stack_api.client.datasets.list() + } + + selected_dataset = st.selectbox("Select a dataset", list(datasets_info.keys())) + st.json(datasets_info[selected_dataset], expanded=True) diff --git a/llama_stack/distribution/ui/page/distribution/eval_tasks.py b/llama_stack/distribution/ui/page/distribution/eval_tasks.py new file mode 100644 index 000000000..4957fb178 --- /dev/null +++ b/llama_stack/distribution/ui/page/distribution/eval_tasks.py @@ -0,0 +1,22 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import streamlit as st +from modules.api import llama_stack_api + + +def eval_tasks(): + # Eval Tasks Section + st.header("Eval Tasks") + + eval_tasks_info = { + d.identifier: d.to_dict() for d in llama_stack_api.client.eval_tasks.list() + } + + selected_eval_task = st.selectbox( + "Select an eval task", list(eval_tasks_info.keys()), key="eval_task_inspect" + ) + st.json(eval_tasks_info[selected_eval_task], expanded=True) diff --git a/llama_stack/distribution/ui/page/distribution/memory_banks.py b/llama_stack/distribution/ui/page/distribution/memory_banks.py new file mode 100644 index 000000000..f28010bf2 --- /dev/null +++ b/llama_stack/distribution/ui/page/distribution/memory_banks.py @@ -0,0 +1,23 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import streamlit as st +from modules.api import llama_stack_api + + +def memory_banks(): + st.header("Memory Banks") + memory_banks_info = { + m.identifier: m.to_dict() for m in llama_stack_api.client.memory_banks.list() + } + + if len(memory_banks_info) > 0: + selected_memory_bank = st.selectbox( + "Select a memory bank", list(memory_banks_info.keys()) + ) + st.json(memory_banks_info[selected_memory_bank]) + else: + st.info("No memory banks found") diff --git a/llama_stack/distribution/ui/page/distribution/models.py b/llama_stack/distribution/ui/page/distribution/models.py new file mode 100644 index 000000000..70b166f2e --- /dev/null +++ b/llama_stack/distribution/ui/page/distribution/models.py @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import streamlit as st +from modules.api import llama_stack_api + + +def models(): + # Models Section + st.header("Models") + models_info = { + m.identifier: m.to_dict() for m in llama_stack_api.client.models.list() + } + + selected_model = st.selectbox("Select a model", list(models_info.keys())) + st.json(models_info[selected_model]) diff --git a/llama_stack/distribution/ui/page/distribution/providers.py b/llama_stack/distribution/ui/page/distribution/providers.py new file mode 100644 index 000000000..69f6bd771 --- /dev/null +++ b/llama_stack/distribution/ui/page/distribution/providers.py @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import streamlit as st +from modules.api import llama_stack_api + + +def providers(): + st.header("🔍 API Providers") + apis_providers_info = llama_stack_api.client.providers.list() + # selected_api = st.selectbox("Select an API", list(apis_providers_info.keys())) + for api in apis_providers_info.keys(): + st.markdown(f"###### {api}") + st.dataframe([p.to_dict() for p in apis_providers_info[api]], width=500) + + +providers() diff --git a/llama_stack/distribution/ui/page/distribution/resources.py b/llama_stack/distribution/ui/page/distribution/resources.py new file mode 100644 index 000000000..6b3ea0e3a --- /dev/null +++ b/llama_stack/distribution/ui/page/distribution/resources.py @@ -0,0 +1,52 @@ +# Copyright (c) Meta Platforms, Inc. 
and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from page.distribution.datasets import datasets +from page.distribution.eval_tasks import eval_tasks +from page.distribution.memory_banks import memory_banks +from page.distribution.models import models +from page.distribution.scoring_functions import scoring_functions +from page.distribution.shields import shields + +from streamlit_option_menu import option_menu + + +def resources_page(): + options = [ + "Models", + "Memory Banks", + "Shields", + "Scoring Functions", + "Datasets", + "Eval Tasks", + ] + icons = ["magic", "memory", "shield", "file-bar-graph", "database", "list-task"] + selected_resource = option_menu( + None, + options, + icons=icons, + orientation="horizontal", + styles={ + "nav-link": { + "font-size": "12px", + }, + }, + ) + if selected_resource == "Eval Tasks": + eval_tasks() + elif selected_resource == "Memory Banks": + memory_banks() + elif selected_resource == "Datasets": + datasets() + elif selected_resource == "Models": + models() + elif selected_resource == "Scoring Functions": + scoring_functions() + elif selected_resource == "Shields": + shields() + + +resources_page() diff --git a/llama_stack/distribution/ui/page/distribution/scoring_functions.py b/llama_stack/distribution/ui/page/distribution/scoring_functions.py new file mode 100644 index 000000000..581ae0db7 --- /dev/null +++ b/llama_stack/distribution/ui/page/distribution/scoring_functions.py @@ -0,0 +1,22 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import streamlit as st +from modules.api import llama_stack_api + + +def scoring_functions(): + st.header("Scoring Functions") + + scoring_functions_info = { + s.identifier: s.to_dict() + for s in llama_stack_api.client.scoring_functions.list() + } + + selected_scoring_function = st.selectbox( + "Select a scoring function", list(scoring_functions_info.keys()) + ) + st.json(scoring_functions_info[selected_scoring_function], expanded=True) diff --git a/llama_stack/distribution/ui/page/distribution/shields.py b/llama_stack/distribution/ui/page/distribution/shields.py new file mode 100644 index 000000000..18bbfc008 --- /dev/null +++ b/llama_stack/distribution/ui/page/distribution/shields.py @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import streamlit as st +from modules.api import llama_stack_api + + +def shields(): + # Shields Section + st.header("Shields") + + shields_info = { + s.identifier: s.to_dict() for s in llama_stack_api.client.shields.list() + } + + selected_shield = st.selectbox("Select a shield", list(shields_info.keys())) + st.json(shields_info[selected_shield]) diff --git a/llama_stack/distribution/ui/page/evaluations/__init__.py b/llama_stack/distribution/ui/page/evaluations/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/llama_stack/distribution/ui/page/evaluations/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
diff --git a/llama_stack/distribution/ui/page/evaluations/app_eval.py b/llama_stack/distribution/ui/page/evaluations/app_eval.py new file mode 100644 index 000000000..5ec47ed45 --- /dev/null +++ b/llama_stack/distribution/ui/page/evaluations/app_eval.py @@ -0,0 +1,148 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import json + +import pandas as pd +import streamlit as st + +from modules.api import llama_stack_api +from modules.utils import process_dataset + + +def application_evaluation_page(): + + st.set_page_config(page_title="Evaluations (Scoring)", page_icon="🦙") + st.title("📊 Evaluations (Scoring)") + + # File uploader + uploaded_file = st.file_uploader("Upload Dataset", type=["csv", "xlsx", "xls"]) + + if uploaded_file is None: + st.error("No file uploaded") + return + + # Process uploaded file + df = process_dataset(uploaded_file) + if df is None: + st.error("Error processing file") + return + + # Display dataset information + st.success("Dataset loaded successfully!") + + # Display dataframe preview + st.subheader("Dataset Preview") + st.dataframe(df) + + # Select Scoring Functions to Run Evaluation On + st.subheader("Select Scoring Functions") + scoring_functions = llama_stack_api.client.scoring_functions.list() + scoring_functions = {sf.identifier: sf for sf in scoring_functions} + scoring_functions_names = list(scoring_functions.keys()) + selected_scoring_functions = st.multiselect( + "Choose one or more scoring functions", + options=scoring_functions_names, + help="Choose one or more scoring functions.", + ) + + available_models = llama_stack_api.client.models.list() + available_models = [m.identifier for m in available_models] + + scoring_params = {} + if selected_scoring_functions: + st.write("Selected:") + for scoring_fn_id in selected_scoring_functions: + scoring_fn = scoring_functions[scoring_fn_id] + st.write(f"- **{scoring_fn_id}**: {scoring_fn.description}") + new_params = None + if scoring_fn.params: + new_params = {} + for param_name, param_value in scoring_fn.params.to_dict().items(): + if param_name == "type": + new_params[param_name] = param_value + continue + + if param_name == "judge_model": + value = st.selectbox( + f"Select **{param_name}** for {scoring_fn_id}", + options=available_models, + index=0, + key=f"{scoring_fn_id}_{param_name}", + ) + new_params[param_name] = value + else: + value = st.text_area( + f"Enter value for **{param_name}** in {scoring_fn_id} in valid JSON format", + value=json.dumps(param_value, indent=2), + height=80, + ) + try: + new_params[param_name] = json.loads(value) + except json.JSONDecodeError: + st.error( + f"Invalid JSON for **{param_name}** in {scoring_fn_id}" + ) + + st.json(new_params) + scoring_params[scoring_fn_id] = new_params + + # Add run evaluation button & slider + total_rows = len(df) + num_rows = st.slider("Number of rows to evaluate", 1, total_rows, total_rows) + + if st.button("Run Evaluation"): + progress_text = "Running evaluation..." 
+ progress_bar = st.progress(0, text=progress_text) + rows = df.to_dict(orient="records") + if num_rows < total_rows: + rows = rows[:num_rows] + + # Create separate containers for progress text and results + progress_text_container = st.empty() + results_container = st.empty() + output_res = {} + for i, r in enumerate(rows): + # Update progress + progress = i / len(rows) + progress_bar.progress(progress, text=progress_text) + + # Run evaluation for current row + score_res = llama_stack_api.run_scoring( + r, + scoring_function_ids=selected_scoring_functions, + scoring_params=scoring_params, + ) + + for k in r.keys(): + if k not in output_res: + output_res[k] = [] + output_res[k].append(r[k]) + + for fn_id in selected_scoring_functions: + if fn_id not in output_res: + output_res[fn_id] = [] + output_res[fn_id].append(score_res.results[fn_id].score_rows[0]) + + # Display current row results using separate containers + progress_text_container.write( + f"Expand to see current processed result ({i+1}/{len(rows)})" + ) + results_container.json( + score_res.to_json(), + expanded=2, + ) + + progress_bar.progress(1.0, text="Evaluation complete!") + + # Display results in dataframe + if output_res: + output_df = pd.DataFrame(output_res) + st.subheader("Evaluation Results") + st.dataframe(output_df) + + +application_evaluation_page() diff --git a/llama_stack/distribution/ui/page/evaluations/native_eval.py b/llama_stack/distribution/ui/page/evaluations/native_eval.py new file mode 100644 index 000000000..b8cc8bfa6 --- /dev/null +++ b/llama_stack/distribution/ui/page/evaluations/native_eval.py @@ -0,0 +1,257 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import json + +import pandas as pd + +import streamlit as st + +from modules.api import llama_stack_api + + +def select_eval_task_1(): + # Select Eval Tasks + st.subheader("1. Choose An Eval Task") + eval_tasks = llama_stack_api.client.eval_tasks.list() + eval_tasks = {et.identifier: et for et in eval_tasks} + eval_tasks_names = list(eval_tasks.keys()) + selected_eval_task = st.selectbox( + "Choose an eval task.", + options=eval_tasks_names, + help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.", + ) + with st.expander("View Eval Task"): + st.json(eval_tasks[selected_eval_task], expanded=True) + + st.session_state["selected_eval_task"] = selected_eval_task + st.session_state["eval_tasks"] = eval_tasks + if st.button("Confirm", key="confirm_1"): + st.session_state["selected_eval_task_1_next"] = True + + +def define_eval_candidate_2(): + if not st.session_state.get("selected_eval_task_1_next", None): + return + + st.subheader("2. Define Eval Candidate") + st.info( + """ + Define the configurations for the evaluation candidate model or agent used for generation. + Select "model" if you want to run generation with inference API, or "agent" if you want to run generation with agent API through specifying AgentConfig. 
+ """ + ) + with st.expander("Define Eval Candidate", expanded=True): + # Define Eval Candidate + candidate_type = st.radio("Candidate Type", ["model", "agent"]) + + available_models = llama_stack_api.client.models.list() + available_models = [model.identifier for model in available_models] + selected_model = st.selectbox( + "Choose a model", + available_models, + index=0, + ) + + # Sampling Parameters + st.markdown("##### Sampling Parameters") + strategy = st.selectbox( + "Strategy", + ["greedy", "top_p", "top_k"], + index=0, + ) + temperature = st.slider( + "Temperature", + min_value=0.0, + max_value=1.0, + value=0.0, + step=0.1, + help="Controls the randomness of the response. Higher values make the output more creative and unexpected, lower values make it more conservative and predictable", + ) + top_p = st.slider( + "Top P", + min_value=0.0, + max_value=1.0, + value=0.95, + step=0.1, + ) + max_tokens = st.slider( + "Max Tokens", + min_value=0, + max_value=4096, + value=512, + step=1, + help="The maximum number of tokens to generate", + ) + repetition_penalty = st.slider( + "Repetition Penalty", + min_value=1.0, + max_value=2.0, + value=1.0, + step=0.1, + help="Controls the likelihood for generating the same word or phrase multiple times in the same sentence or paragraph. 1 implies no penalty, 2 will strongly discourage model to repeat words or phrases.", + ) + if candidate_type == "model": + eval_candidate = { + "type": "model", + "model": selected_model, + "sampling_params": { + "strategy": strategy, + "temperature": temperature, + "top_p": top_p, + "max_tokens": max_tokens, + "repetition_penalty": repetition_penalty, + }, + } + elif candidate_type == "agent": + system_prompt = st.text_area( + "System Prompt", + value="You are a helpful AI assistant.", + help="Initial instructions given to the AI to set its behavior and context", + ) + tools_json = st.text_area( + "Tools Configuration (JSON)", + value=json.dumps( + [ + { + "type": "brave_search", + "engine": "brave", + "api_key": "ENTER_BRAVE_API_KEY_HERE", + } + ] + ), + help="Enter tool configurations in JSON format. Each tool should have a name, description, and parameters.", + height=200, + ) + try: + tools = json.loads(tools_json) + except json.JSONDecodeError: + st.error("Invalid JSON format for tools configuration") + tools = [] + eval_candidate = { + "type": "agent", + "config": { + "model": selected_model, + "instructions": system_prompt, + "tools": tools, + "tool_choice": "auto", + "tool_prompt_format": "json", + "input_shields": [], + "output_shields": [], + "enable_session_persistence": False, + }, + } + st.session_state["eval_candidate"] = eval_candidate + + if st.button("Confirm", key="confirm_2"): + st.session_state["selected_eval_candidate_2_next"] = True + + +def run_evaluation_3(): + if not st.session_state.get("selected_eval_candidate_2_next", None): + return + + st.subheader("3. Run Evaluation") + # Add info box to explain configurations being used + st.info( + """ + Review the configurations that will be used for this evaluation run, make any necessary changes, and then click the "Run Evaluation" button. 
+ """ + ) + selected_eval_task = st.session_state["selected_eval_task"] + eval_tasks = st.session_state["eval_tasks"] + eval_candidate = st.session_state["eval_candidate"] + + dataset_id = eval_tasks[selected_eval_task].dataset_id + rows = llama_stack_api.client.datasetio.get_rows_paginated( + dataset_id=dataset_id, + rows_in_page=-1, + ) + total_rows = len(rows.rows) + # Add number of examples control + num_rows = st.number_input( + "Number of Examples to Evaluate", + min_value=1, + max_value=total_rows, + value=5, + help="Number of examples from the dataset to evaluate. ", + ) + + eval_task_config = { + "type": "benchmark", + "eval_candidate": eval_candidate, + "scoring_params": {}, + } + + with st.expander("View Evaluation Task", expanded=True): + st.json(eval_tasks[selected_eval_task], expanded=True) + with st.expander("View Evaluation Task Configuration", expanded=True): + st.json(eval_task_config, expanded=True) + + # Add run button and handle evaluation + if st.button("Run Evaluation"): + + progress_text = "Running evaluation..." + progress_bar = st.progress(0, text=progress_text) + rows = rows.rows + if num_rows < total_rows: + rows = rows[:num_rows] + + # Create separate containers for progress text and results + progress_text_container = st.empty() + results_container = st.empty() + output_res = {} + for i, r in enumerate(rows): + # Update progress + progress = i / len(rows) + progress_bar.progress(progress, text=progress_text) + # Run evaluation for current row + eval_res = llama_stack_api.client.eval.evaluate_rows( + task_id=selected_eval_task, + input_rows=[r], + scoring_functions=eval_tasks[selected_eval_task].scoring_functions, + task_config=eval_task_config, + ) + + for k in r.keys(): + if k not in output_res: + output_res[k] = [] + output_res[k].append(r[k]) + + for k in eval_res.generations[0].keys(): + if k not in output_res: + output_res[k] = [] + output_res[k].append(eval_res.generations[0][k]) + + for scoring_fn in eval_tasks[selected_eval_task].scoring_functions: + if scoring_fn not in output_res: + output_res[scoring_fn] = [] + output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0]) + + progress_text_container.write( + f"Expand to see current processed result ({i+1}/{len(rows)})" + ) + results_container.json(eval_res, expanded=2) + + progress_bar.progress(1.0, text="Evaluation complete!") + # Display results in dataframe + if output_res: + output_df = pd.DataFrame(output_res) + st.subheader("Evaluation Results") + st.dataframe(output_df) + + +def native_evaluation_page(): + + st.set_page_config(page_title="Evaluations (Generation + Scoring)", page_icon="🦙") + st.title("📊 Evaluations (Generation + Scoring)") + + select_eval_task_1() + define_eval_candidate_2() + run_evaluation_3() + + +native_evaluation_page() diff --git a/llama_stack/distribution/ui/page/playground/__init__.py b/llama_stack/distribution/ui/page/playground/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/llama_stack/distribution/ui/page/playground/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
diff --git a/llama_stack/distribution/ui/page/playground/chat.py b/llama_stack/distribution/ui/page/playground/chat.py new file mode 100644 index 000000000..157922d3b --- /dev/null +++ b/llama_stack/distribution/ui/page/playground/chat.py @@ -0,0 +1,123 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import streamlit as st +from modules.api import llama_stack_api + +# Sidebar configurations +with st.sidebar: + st.header("Configuration") + available_models = llama_stack_api.client.models.list() + available_models = [model.identifier for model in available_models] + selected_model = st.selectbox( + "Choose a model", + available_models, + index=0, + ) + + temperature = st.slider( + "Temperature", + min_value=0.0, + max_value=1.0, + value=0.0, + step=0.1, + help="Controls the randomness of the response. Higher values make the output more creative and unexpected, lower values make it more conservative and predictable", + ) + + top_p = st.slider( + "Top P", + min_value=0.0, + max_value=1.0, + value=0.95, + step=0.1, + ) + + max_tokens = st.slider( + "Max Tokens", + min_value=0, + max_value=4096, + value=512, + step=1, + help="The maximum number of tokens to generate", + ) + + repetition_penalty = st.slider( + "Repetition Penalty", + min_value=1.0, + max_value=2.0, + value=1.0, + step=0.1, + help="Controls the likelihood for generating the same word or phrase multiple times in the same sentence or paragraph. 1 implies no penalty, 2 will strongly discourage model to repeat words or phrases.", + ) + + stream = st.checkbox("Stream", value=True) + system_prompt = st.text_area( + "System Prompt", + value="You are a helpful AI assistant.", + help="Initial instructions given to the AI to set its behavior and context", + ) + + # Add clear chat button to sidebar + if st.button("Clear Chat", use_container_width=True): + st.session_state.messages = [] + st.rerun() + + +# Main chat interface +st.title("🦙 Chat") + + +# Initialize chat history +if "messages" not in st.session_state: + st.session_state.messages = [] + +# Display chat messages +for message in st.session_state.messages: + with st.chat_message(message["role"]): + st.markdown(message["content"]) + +# Chat input +if prompt := st.chat_input("Example: What is Llama Stack?"): + # Add user message to chat history + st.session_state.messages.append({"role": "user", "content": prompt}) + + # Display user message + with st.chat_message("user"): + st.markdown(prompt) + + # Display assistant response + with st.chat_message("assistant"): + message_placeholder = st.empty() + full_response = "" + + response = llama_stack_api.client.inference.chat_completion( + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": prompt}, + ], + model_id=selected_model, + stream=stream, + sampling_params={ + "temperature": temperature, + "top_p": top_p, + "max_tokens": max_tokens, + "repetition_penalty": repetition_penalty, + }, + ) + + if stream: + for chunk in response: + if chunk.event.event_type == "progress": + full_response += chunk.event.delta + message_placeholder.markdown(full_response + "▌") + message_placeholder.markdown(full_response) + else: + full_response = response + message_placeholder.markdown(full_response.completion_message.content) + + st.session_state.messages.append( + {"role": "assistant", "content": full_response} + ) diff --git 
a/llama_stack/distribution/ui/page/playground/rag.py b/llama_stack/distribution/ui/page/playground/rag.py new file mode 100644 index 000000000..ffcaf1afd --- /dev/null +++ b/llama_stack/distribution/ui/page/playground/rag.py @@ -0,0 +1,188 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import streamlit as st +from llama_stack_client.lib.agents.agent import Agent +from llama_stack_client.lib.agents.event_logger import EventLogger +from llama_stack_client.types.agent_create_params import AgentConfig +from llama_stack_client.types.memory_insert_params import Document + +from modules.api import llama_stack_api +from modules.utils import data_url_from_file + + +def rag_chat_page(): + st.title("🦙 RAG") + + with st.sidebar: + # File/Directory Upload Section + st.subheader("Upload Documents") + uploaded_files = st.file_uploader( + "Upload file(s) or directory", + accept_multiple_files=True, + type=["txt", "pdf", "doc", "docx"], # Add more file types as needed + ) + # Process uploaded files + if uploaded_files: + st.success(f"Successfully uploaded {len(uploaded_files)} files") + # Add memory bank name input field + memory_bank_name = st.text_input( + "Memory Bank Name", + value="rag_bank", + help="Enter a unique identifier for this memory bank", + ) + if st.button("Create Memory Bank"): + documents = [ + Document( + document_id=uploaded_file.name, + content=data_url_from_file(uploaded_file), + ) + for i, uploaded_file in enumerate(uploaded_files) + ] + + providers = llama_stack_api.client.providers.list() + llama_stack_api.client.memory_banks.register( + memory_bank_id=memory_bank_name, # Use the user-provided name + params={ + "embedding_model": "all-MiniLM-L6-v2", + "chunk_size_in_tokens": 512, + "overlap_size_in_tokens": 64, + }, + provider_id=providers["memory"][0].provider_id, + ) + + # insert documents using the custom bank name + llama_stack_api.client.memory.insert( + bank_id=memory_bank_name, # Use the user-provided name + documents=documents, + ) + st.success("Memory bank created successfully!") + + st.subheader("Configure Agent") + # select memory banks + memory_banks = llama_stack_api.client.memory_banks.list() + memory_banks = [bank.identifier for bank in memory_banks] + selected_memory_banks = st.multiselect( + "Select Memory Banks", + memory_banks, + ) + memory_bank_configs = [ + {"bank_id": bank_id, "type": "vector"} for bank_id in selected_memory_banks + ] + + available_models = llama_stack_api.client.models.list() + available_models = [model.identifier for model in available_models] + selected_model = st.selectbox( + "Choose a model", + available_models, + index=0, + ) + system_prompt = st.text_area( + "System Prompt", + value="You are a helpful assistant. ", + help="Initial instructions given to the AI to set its behavior and context", + ) + temperature = st.slider( + "Temperature", + min_value=0.0, + max_value=1.0, + value=0.0, + step=0.1, + help="Controls the randomness of the response. 
Higher values make the output more creative and unexpected, lower values make it more conservative and predictable", + ) + + top_p = st.slider( + "Top P", + min_value=0.0, + max_value=1.0, + value=0.95, + step=0.1, + ) + + # Add clear chat button to sidebar + if st.button("Clear Chat", use_container_width=True): + st.session_state.messages = [] + st.rerun() + + # Chat Interface + if "messages" not in st.session_state: + st.session_state.messages = [] + + # Display chat history + for message in st.session_state.messages: + with st.chat_message(message["role"]): + st.markdown(message["content"]) + + selected_model = llama_stack_api.client.models.list()[0].identifier + + agent_config = AgentConfig( + model=selected_model, + instructions=system_prompt, + sampling_params={ + "strategy": "greedy", + "temperature": temperature, + "top_p": top_p, + }, + tools=[ + { + "type": "memory", + "memory_bank_configs": memory_bank_configs, + "query_generator_config": {"type": "default", "sep": " "}, + "max_tokens_in_context": 4096, + "max_chunks": 10, + } + ], + tool_choice="auto", + tool_prompt_format="json", + input_shields=[], + output_shields=[], + enable_session_persistence=False, + ) + + agent = Agent(llama_stack_api.client, agent_config) + session_id = agent.create_session("rag-session") + + # Chat input + if prompt := st.chat_input("Ask a question about your documents"): + # Add user message to chat history + st.session_state.messages.append({"role": "user", "content": prompt}) + + # Display user message + with st.chat_message("user"): + st.markdown(prompt) + + response = agent.create_turn( + messages=[ + { + "role": "user", + "content": prompt, + } + ], + session_id=session_id, + ) + + # Display assistant response + with st.chat_message("assistant"): + retrieval_message_placeholder = st.empty() + message_placeholder = st.empty() + full_response = "" + retrieval_response = "" + for log in EventLogger().log(response): + log.print() + if log.role == "memory_retrieval": + retrieval_response += log.content.replace("====", "").strip() + retrieval_message_placeholder.info(retrieval_response) + else: + full_response += log.content + message_placeholder.markdown(full_response + "▌") + message_placeholder.markdown(full_response) + + st.session_state.messages.append( + {"role": "assistant", "content": full_response} + ) + + +rag_chat_page() diff --git a/llama_stack/distribution/ui/requirements.txt b/llama_stack/distribution/ui/requirements.txt index c03959444..39f2b3d27 100644 --- a/llama_stack/distribution/ui/requirements.txt +++ b/llama_stack/distribution/ui/requirements.txt @@ -1,3 +1,4 @@ streamlit pandas llama-stack-client>=0.0.55 +streamlit-option-menu diff --git a/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py b/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py index b00b9a7db..0b18bac01 100644 --- a/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py +++ b/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py @@ -5,7 +5,7 @@ # the root directory of this source tree. 
from llama_stack.apis.common.type_system import NumberType -from llama_stack.apis.scoring_functions import ScoringFn +from llama_stack.apis.scoring_functions import LLMAsJudgeScoringFnParams, ScoringFn llm_as_judge_base = ScoringFn( @@ -14,4 +14,8 @@ llm_as_judge_base = ScoringFn( return_type=NumberType(), provider_id="llm-as-judge", provider_resource_id="llm-as-judge-base", + params=LLMAsJudgeScoringFnParams( + judge_model="meta-llama/Llama-3.1-405B-Instruct", + prompt_template="Enter custom LLM as Judge Prompt Template", + ), ) From fcd64495195a53d78ebd7ec45b93e3b3d1143a57 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Wed, 4 Dec 2024 11:22:45 -0800 Subject: [PATCH 19/69] Telemetry API redesign (#525) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Change the Telemetry API to be able to support different use cases like returning traces for the UI and ability to export for Evals. Other changes: * Add a new trace_protocol decorator to decorate all our API methods so that any call to them will automatically get traced across all impls. * There is some issue with the decorator pattern of span creation when using async generators, where there are multiple yields with in the same context. I think its much more explicit by using the explicit context manager pattern using with. I moved the span creations in agent instance to be using with * Inject session id at the turn level, which should quickly give us all traces across turns for a given session Addresses #509 ## Test Plan ``` llama stack run /Users/dineshyv/.llama/distributions/llamastack-together/together-run.yaml PYTHONPATH=. python -m examples.agents.rag_with_memory_bank localhost 5000 curl -X POST 'http://localhost:5000/alpha/telemetry/query-traces' \ -H 'Content-Type: application/json' \ -d '{ "attribute_filters": [ { "key": "session_id", "op": "eq", "value": "dd667b87-ca4b-4d30-9265-5a0de318fc65" }], "limit": 100, "offset": 0, "order_by": ["start_time"] }' | jq . [ { "trace_id": "6902f54b83b4b48be18a6f422b13e16f", "root_span_id": "5f37b85543afc15a", "start_time": "2024-12-04T08:08:30.501587", "end_time": "2024-12-04T08:08:36.026463" }, { "trace_id": "92227dac84c0615ed741be393813fb5f", "root_span_id": "af7c5bb46665c2c8", "start_time": "2024-12-04T08:08:36.031170", "end_time": "2024-12-04T08:08:41.693301" }, { "trace_id": "7d578a6edac62f204ab479fba82f77b6", "root_span_id": "1d935e3362676896", "start_time": "2024-12-04T08:08:41.695204", "end_time": "2024-12-04T08:08:47.228016" }, { "trace_id": "dbd767d76991bc816f9f078907dc9ff2", "root_span_id": "f5a7ee76683b9602", "start_time": "2024-12-04T08:08:47.234578", "end_time": "2024-12-04T08:08:53.189412" } ] curl -X POST 'http://localhost:5000/alpha/telemetry/get-span-tree' \ -H 'Content-Type: application/json' \ -d '{ "span_id" : "6cceb4b48a156913", "max_depth": 2, "attributes_to_return": ["input"] }' | jq . % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 875 100 790 100 85 18462 1986 --:--:-- --:--:-- --:--:-- 20833 { "span_id": "6cceb4b48a156913", "trace_id": "dafa796f6aaf925f511c04cd7c67fdda", "parent_span_id": "892a66d726c7f990", "name": "retrieve_rag_context", "start_time": "2024-12-04T09:28:21.781995", "end_time": "2024-12-04T09:28:21.913352", "attributes": { "input": [ "{\"role\":\"system\",\"content\":\"You are a helpful assistant\"}", "{\"role\":\"user\",\"content\":\"What are the top 5 topics that were explained in the documentation? 
Only list succinct bullet points.\",\"context\":null}" ] }, "children": [ { "span_id": "1a2df181854064a8", "trace_id": "dafa796f6aaf925f511c04cd7c67fdda", "parent_span_id": "6cceb4b48a156913", "name": "MemoryRouter.query_documents", "start_time": "2024-12-04T09:28:21.787620", "end_time": "2024-12-04T09:28:21.906512", "attributes": { "input": null }, "children": [], "status": "ok" } ], "status": "ok" } ``` Screenshot 2024-12-04 at 9 42 56 AM --- llama_stack/apis/agents/agents.py | 2 + llama_stack/apis/datasetio/datasetio.py | 5 + llama_stack/apis/inference/inference.py | 3 + llama_stack/apis/memory/memory.py | 2 + llama_stack/apis/memory_banks/memory_banks.py | 2 + llama_stack/apis/models/models.py | 2 + llama_stack/apis/safety/safety.py | 3 + llama_stack/apis/shields/shields.py | 2 + llama_stack/apis/telemetry/telemetry.py | 66 ++++- llama_stack/distribution/routers/routers.py | 6 + llama_stack/distribution/server/server.py | 8 +- llama_stack/distribution/tracing.py | 128 +++++++++ .../agents/meta_reference/agent_instance.py | 227 +++++++++------- .../inline/datasetio/localfs/datasetio.py | 43 ++- .../meta_reference/telemetry/__init__.py | 15 -- .../inline/meta_reference/telemetry/config.py | 21 -- .../meta_reference/telemetry/console.py | 25 +- .../{remote => inline}/telemetry/__init__.py | 0 .../telemetry/meta_reference/__init__.py | 18 ++ .../inline/telemetry/meta_reference/config.py | 45 ++++ .../meta_reference/console_span_processor.py | 95 +++++++ .../meta_reference/sqlite_span_processor.py | 242 +++++++++++++++++ .../telemetry/meta_reference/telemetry.py | 247 ++++++++++++++++++ .../telemetry/sample/__init__.py | 0 .../telemetry/sample/config.py | 0 .../telemetry/sample/sample.py | 0 llama_stack/providers/registry/telemetry.py | 23 +- .../datasetio/huggingface/huggingface.py | 21 +- .../telemetry/opentelemetry/__init__.py | 15 -- .../remote/telemetry/opentelemetry/config.py | 27 -- .../telemetry/opentelemetry/opentelemetry.py | 115 +++++--- .../providers/utils/telemetry/sqlite.py | 177 +++++++++++++ .../utils/telemetry/sqlite_trace_store.py | 180 +++++++++++++ .../providers/utils/telemetry/tracing.py | 31 ++- 34 files changed, 1551 insertions(+), 245 deletions(-) create mode 100644 llama_stack/distribution/tracing.py delete mode 100644 llama_stack/providers/inline/meta_reference/telemetry/__init__.py delete mode 100644 llama_stack/providers/inline/meta_reference/telemetry/config.py rename llama_stack/providers/{remote => inline}/telemetry/__init__.py (100%) create mode 100644 llama_stack/providers/inline/telemetry/meta_reference/__init__.py create mode 100644 llama_stack/providers/inline/telemetry/meta_reference/config.py create mode 100644 llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py create mode 100644 llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py create mode 100644 llama_stack/providers/inline/telemetry/meta_reference/telemetry.py rename llama_stack/providers/{remote => inline}/telemetry/sample/__init__.py (100%) rename llama_stack/providers/{remote => inline}/telemetry/sample/config.py (100%) rename llama_stack/providers/{remote => inline}/telemetry/sample/sample.py (100%) delete mode 100644 llama_stack/providers/remote/telemetry/opentelemetry/__init__.py delete mode 100644 llama_stack/providers/remote/telemetry/opentelemetry/config.py create mode 100644 llama_stack/providers/utils/telemetry/sqlite.py create mode 100644 llama_stack/providers/utils/telemetry/sqlite_trace_store.py diff --git 
a/llama_stack/apis/agents/agents.py b/llama_stack/apis/agents/agents.py index 25de35497..d2243c96f 100644 --- a/llama_stack/apis/agents/agents.py +++ b/llama_stack/apis/agents/agents.py @@ -23,6 +23,7 @@ from llama_models.schema_utils import json_schema_type, webmethod from pydantic import BaseModel, ConfigDict, Field from typing_extensions import Annotated +from llama_stack.distribution.tracing import trace_protocol from llama_models.llama3.api.datatypes import * # noqa: F403 from llama_stack.apis.common.deployment_types import * # noqa: F403 from llama_stack.apis.inference import * # noqa: F403 @@ -418,6 +419,7 @@ class AgentStepResponse(BaseModel): @runtime_checkable +@trace_protocol class Agents(Protocol): @webmethod(route="/agents/create") async def create_agent( diff --git a/llama_stack/apis/datasetio/datasetio.py b/llama_stack/apis/datasetio/datasetio.py index c5052877a..22acc3211 100644 --- a/llama_stack/apis/datasetio/datasetio.py +++ b/llama_stack/apis/datasetio/datasetio.py @@ -37,3 +37,8 @@ class DatasetIO(Protocol): page_token: Optional[str] = None, filter_condition: Optional[str] = None, ) -> PaginatedRowsResult: ... + + @webmethod(route="/datasetio/append-rows", method="POST") + async def append_rows( + self, dataset_id: str, rows: List[Dict[str, Any]] + ) -> None: ... diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py index 5aadd97c7..85b29a147 100644 --- a/llama_stack/apis/inference/inference.py +++ b/llama_stack/apis/inference/inference.py @@ -21,6 +21,8 @@ from llama_models.schema_utils import json_schema_type, webmethod from pydantic import BaseModel, Field from typing_extensions import Annotated +from llama_stack.distribution.tracing import trace_protocol + from llama_models.llama3.api.datatypes import * # noqa: F403 from llama_stack.apis.models import * # noqa: F403 @@ -220,6 +222,7 @@ class ModelStore(Protocol): @runtime_checkable +@trace_protocol class Inference(Protocol): model_store: ModelStore diff --git a/llama_stack/apis/memory/memory.py b/llama_stack/apis/memory/memory.py index 48b6e2241..b75df8a1a 100644 --- a/llama_stack/apis/memory/memory.py +++ b/llama_stack/apis/memory/memory.py @@ -16,6 +16,7 @@ from pydantic import BaseModel, Field from llama_models.llama3.api.datatypes import * # noqa: F403 from llama_stack.apis.memory_banks import * # noqa: F403 +from llama_stack.distribution.tracing import trace_protocol @json_schema_type @@ -43,6 +44,7 @@ class MemoryBankStore(Protocol): @runtime_checkable +@trace_protocol class Memory(Protocol): memory_bank_store: MemoryBankStore diff --git a/llama_stack/apis/memory_banks/memory_banks.py b/llama_stack/apis/memory_banks/memory_banks.py index 1b16af330..0b8b2563f 100644 --- a/llama_stack/apis/memory_banks/memory_banks.py +++ b/llama_stack/apis/memory_banks/memory_banks.py @@ -20,6 +20,7 @@ from llama_models.schema_utils import json_schema_type, webmethod from pydantic import BaseModel, Field from llama_stack.apis.resource import Resource, ResourceType +from llama_stack.distribution.tracing import trace_protocol @json_schema_type @@ -129,6 +130,7 @@ class MemoryBankInput(BaseModel): @runtime_checkable +@trace_protocol class MemoryBanks(Protocol): @webmethod(route="/memory-banks/list", method="GET") async def list_memory_banks(self) -> List[MemoryBank]: ... 
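Besides the `trace_protocol` annotations, this hunk adds `append_rows` to the `DatasetIO` protocol; the router change further down forwards it to whichever provider owns the dataset. A quick way to exercise the new route over plain HTTP, in the same spirit as the curl-based test plans in this series (the base URL, the `/alpha` prefix, the dataset id, and the row fields are assumptions about a locally running stack, not part of this patch):

```python
# Hedged sketch: POST to the new datasetio append-rows route directly.
# Endpoint prefix and payload contents are assumptions for illustration only.
import requests

resp = requests.post(
    "http://localhost:5000/alpha/datasetio/append-rows",
    json={
        "dataset_id": "my_dataset",  # hypothetical registered dataset id
        "rows": [
            {"input_query": "What is Llama Stack?", "expected_answer": "..."}
        ],
    },
    timeout=30,
)
resp.raise_for_status()
```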
diff --git a/llama_stack/apis/models/models.py b/llama_stack/apis/models/models.py index cbd6265e2..2c0f1ee21 100644 --- a/llama_stack/apis/models/models.py +++ b/llama_stack/apis/models/models.py @@ -10,6 +10,7 @@ from llama_models.schema_utils import json_schema_type, webmethod from pydantic import BaseModel, ConfigDict, Field from llama_stack.apis.resource import Resource, ResourceType +from llama_stack.distribution.tracing import trace_protocol class CommonModelFields(BaseModel): @@ -43,6 +44,7 @@ class ModelInput(CommonModelFields): @runtime_checkable +@trace_protocol class Models(Protocol): @webmethod(route="/models/list", method="GET") async def list_models(self) -> List[Model]: ... diff --git a/llama_stack/apis/safety/safety.py b/llama_stack/apis/safety/safety.py index 724f8dc96..41058f107 100644 --- a/llama_stack/apis/safety/safety.py +++ b/llama_stack/apis/safety/safety.py @@ -10,6 +10,8 @@ from typing import Any, Dict, List, Protocol, runtime_checkable from llama_models.schema_utils import json_schema_type, webmethod from pydantic import BaseModel +from llama_stack.distribution.tracing import trace_protocol + from llama_models.llama3.api.datatypes import * # noqa: F403 from llama_stack.apis.shields import * # noqa: F403 @@ -43,6 +45,7 @@ class ShieldStore(Protocol): @runtime_checkable +@trace_protocol class Safety(Protocol): shield_store: ShieldStore diff --git a/llama_stack/apis/shields/shields.py b/llama_stack/apis/shields/shields.py index 5ee444f68..b28605727 100644 --- a/llama_stack/apis/shields/shields.py +++ b/llama_stack/apis/shields/shields.py @@ -10,6 +10,7 @@ from llama_models.schema_utils import json_schema_type, webmethod from pydantic import BaseModel from llama_stack.apis.resource import Resource, ResourceType +from llama_stack.distribution.tracing import trace_protocol class CommonShieldFields(BaseModel): @@ -38,6 +39,7 @@ class ShieldInput(CommonShieldFields): @runtime_checkable +@trace_protocol class Shields(Protocol): @webmethod(route="/shields/list", method="GET") async def list_shields(self) -> List[Shield]: ... 
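These protocol-level `@trace_protocol` annotations mean concrete implementations get per-call spans without touching provider code: the decorator's `__init_subclass__` hook wraps every public method of a subclass in a span named `ClassName.method`, attaching the serialized arguments and output as attributes. A structural sketch of the pattern with a made-up protocol (not part of llama-stack), assuming the new tracing module from this series is importable and telemetry has been set up:

```python
# Illustrative only: WeatherAPI and LocalWeatherImpl are hypothetical.
from typing import Protocol, runtime_checkable

from llama_stack.distribution.tracing import trace_protocol


@runtime_checkable
@trace_protocol
class WeatherAPI(Protocol):
    async def get_forecast(self, city: str) -> str: ...


class LocalWeatherImpl(WeatherAPI):
    # Because WeatherAPI is decorated, __init_subclass__ wraps this method, so
    # each call runs inside a span named "LocalWeatherImpl.get_forecast" with
    # the serialized arguments recorded as span attributes.
    async def get_forecast(self, city: str) -> str:
        return f"Sunny in {city}"
```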
diff --git a/llama_stack/apis/telemetry/telemetry.py b/llama_stack/apis/telemetry/telemetry.py index 31f64733b..2ff783c46 100644 --- a/llama_stack/apis/telemetry/telemetry.py +++ b/llama_stack/apis/telemetry/telemetry.py @@ -6,12 +6,24 @@ from datetime import datetime from enum import Enum -from typing import Any, Dict, Literal, Optional, Protocol, runtime_checkable, Union +from typing import ( + Any, + Dict, + List, + Literal, + Optional, + Protocol, + runtime_checkable, + Union, +) from llama_models.schema_utils import json_schema_type, webmethod from pydantic import BaseModel, Field from typing_extensions import Annotated +# Add this constant near the top of the file, after the imports +DEFAULT_TTL_DAYS = 7 + @json_schema_type class SpanStatus(Enum): @@ -29,6 +41,11 @@ class Span(BaseModel): end_time: Optional[datetime] = None attributes: Optional[Dict[str, Any]] = Field(default_factory=dict) + def set_attribute(self, key: str, value: Any): + if self.attributes is None: + self.attributes = {} + self.attributes[key] = value + @json_schema_type class Trace(BaseModel): @@ -123,10 +140,49 @@ Event = Annotated[ ] +@json_schema_type +class EvalTrace(BaseModel): + session_id: str + step: str + input: str + output: str + expected_output: str + + +@json_schema_type +class SpanWithChildren(Span): + children: List["SpanWithChildren"] = Field(default_factory=list) + status: Optional[SpanStatus] = None + + +@json_schema_type +class QueryCondition(BaseModel): + key: str + op: Literal["eq", "ne", "gt", "lt"] + value: Any + + @runtime_checkable class Telemetry(Protocol): - @webmethod(route="/telemetry/log-event") - async def log_event(self, event: Event) -> None: ... - @webmethod(route="/telemetry/get-trace", method="GET") - async def get_trace(self, trace_id: str) -> Trace: ... + @webmethod(route="/telemetry/log-event") + async def log_event( + self, event: Event, ttl_seconds: int = DEFAULT_TTL_DAYS * 86400 + ) -> None: ... + + @webmethod(route="/telemetry/query-traces", method="POST") + async def query_traces( + self, + attribute_filters: Optional[List[QueryCondition]] = None, + limit: Optional[int] = 100, + offset: Optional[int] = 0, + order_by: Optional[List[str]] = None, + ) -> List[Trace]: ... + + @webmethod(route="/telemetry/get-span-tree", method="POST") + async def get_span_tree( + self, + span_id: str, + attributes_to_return: Optional[List[str]] = None, + max_depth: Optional[int] = None, + ) -> SpanWithChildren: ... 
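The reworked `Telemetry` protocol replaces the single `get_trace` lookup with filterable trace queries plus span-tree retrieval. A hedged Python equivalent of the curl calls from the test plan above (the endpoint URL, session id, and span id are taken from that test plan and assume a locally running stack):

```python
# Sketch of the new query surface, mirroring the test-plan curl commands.
import requests

BASE = "http://localhost:5000/alpha/telemetry"

# Find traces for a given session, ordered by start time.
traces = requests.post(
    f"{BASE}/query-traces",
    json={
        "attribute_filters": [
            {"key": "session_id", "op": "eq",
             "value": "dd667b87-ca4b-4d30-9265-5a0de318fc65"}
        ],
        "limit": 100,
        "offset": 0,
        "order_by": ["start_time"],
    },
    timeout=30,
).json()

# Fetch a span subtree, keeping only the "input" attribute on each span.
tree = requests.post(
    f"{BASE}/get-span-tree",
    json={
        "span_id": "6cceb4b48a156913",
        "max_depth": 2,
        "attributes_to_return": ["input"],
    },
    timeout=30,
).json()
```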
diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index 5a62b6d64..5b75a525b 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -222,6 +222,12 @@ class DatasetIORouter(DatasetIO): filter_condition=filter_condition, ) + async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: + return await self.routing_table.get_provider_impl(dataset_id).append_rows( + dataset_id=dataset_id, + rows=rows, + ) + class ScoringRouter(Scoring): def __init__( diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index 8116e2b39..4ae1854df 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -43,9 +43,9 @@ from llama_stack.distribution.stack import ( replace_env_vars, validate_env_pair, ) -from llama_stack.providers.inline.meta_reference.telemetry.console import ( - ConsoleConfig, - ConsoleTelemetryImpl, +from llama_stack.providers.inline.telemetry.meta_reference import ( + TelemetryAdapter, + TelemetryConfig, ) from .endpoints import get_all_api_endpoints @@ -290,7 +290,7 @@ def main(): if Api.telemetry in impls: setup_logger(impls[Api.telemetry]) else: - setup_logger(ConsoleTelemetryImpl(ConsoleConfig())) + setup_logger(TelemetryAdapter(TelemetryConfig())) all_endpoints = get_all_api_endpoints() diff --git a/llama_stack/distribution/tracing.py b/llama_stack/distribution/tracing.py new file mode 100644 index 000000000..ea663ec89 --- /dev/null +++ b/llama_stack/distribution/tracing.py @@ -0,0 +1,128 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import asyncio +import inspect +import json +from functools import wraps +from typing import Any, AsyncGenerator, Callable, Type, TypeVar + +from pydantic import BaseModel + +from llama_stack.providers.utils.telemetry import tracing + +T = TypeVar("T") + + +def serialize_value(value: Any) -> str: + """Helper function to serialize values to string representation.""" + try: + if isinstance(value, BaseModel): + return value.model_dump_json() + elif isinstance(value, list) and value and isinstance(value[0], BaseModel): + return json.dumps([item.model_dump_json() for item in value]) + elif hasattr(value, "to_dict"): + return json.dumps(value.to_dict()) + elif isinstance(value, (dict, list, int, float, str, bool)): + return json.dumps(value) + else: + return str(value) + except Exception: + return str(value) + + +def trace_protocol(cls: Type[T]) -> Type[T]: + """ + A class decorator that automatically traces all methods in a protocol/base class + and its inheriting classes. 
+ """ + + def trace_method(method: Callable) -> Callable: + is_async = asyncio.iscoroutinefunction(method) + is_async_gen = inspect.isasyncgenfunction(method) + + def create_span_context(self: Any, *args: Any, **kwargs: Any) -> tuple: + class_name = self.__class__.__name__ + method_name = method.__name__ + + span_type = ( + "async_generator" if is_async_gen else "async" if is_async else "sync" + ) + span_attributes = { + "class": class_name, + "method": method_name, + "type": span_type, + "args": serialize_value(args), + } + + return class_name, method_name, span_attributes + + @wraps(method) + async def async_gen_wrapper( + self: Any, *args: Any, **kwargs: Any + ) -> AsyncGenerator: + class_name, method_name, span_attributes = create_span_context( + self, *args, **kwargs + ) + + with tracing.span(f"{class_name}.{method_name}", span_attributes) as span: + try: + count = 0 + async for item in method(self, *args, **kwargs): + yield item + count += 1 + finally: + span.set_attribute("chunk_count", count) + + @wraps(method) + async def async_wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: + class_name, method_name, span_attributes = create_span_context( + self, *args, **kwargs + ) + + with tracing.span(f"{class_name}.{method_name}", span_attributes) as span: + try: + result = await method(self, *args, **kwargs) + span.set_attribute("output", serialize_value(result)) + return result + except Exception as e: + span.set_attribute("error", str(e)) + raise + + @wraps(method) + def sync_wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: + class_name, method_name, span_attributes = create_span_context( + self, *args, **kwargs + ) + + with tracing.span(f"{class_name}.{method_name}", span_attributes) as span: + try: + result = method(self, *args, **kwargs) + span.set_attribute("output", serialize_value(result)) + return result + except Exception as e: + raise + + if is_async_gen: + return async_gen_wrapper + elif is_async: + return async_wrapper + else: + return sync_wrapper + + original_init_subclass = getattr(cls, "__init_subclass__", None) + + def __init_subclass__(cls_child, **kwargs): # noqa: N807 + if original_init_subclass: + original_init_subclass(**kwargs) + + for name, method in vars(cls_child).items(): + if inspect.isfunction(method) and not name.startswith("_"): + setattr(cls_child, name, trace_method(method)) # noqa: B010 + + cls.__init_subclass__ = classmethod(__init_subclass__) + + return cls diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 8f800ad6f..7df5d3bd4 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -144,87 +144,91 @@ class ChatAgent(ShieldRunnerMixin): async def create_session(self, name: str) -> str: return await self.storage.create_session(name) - @tracing.span("create_and_execute_turn") async def create_and_execute_turn( self, request: AgentTurnCreateRequest ) -> AsyncGenerator: - assert request.stream is True, "Non-streaming not supported" + with tracing.span("create_and_execute_turn") as span: + span.set_attribute("session_id", request.session_id) + span.set_attribute("agent_id", self.agent_id) + span.set_attribute("request", request.model_dump_json()) + assert request.stream is True, "Non-streaming not supported" - session_info = await self.storage.get_session_info(request.session_id) - if session_info is None: - raise ValueError(f"Session 
{request.session_id} not found") + session_info = await self.storage.get_session_info(request.session_id) + if session_info is None: + raise ValueError(f"Session {request.session_id} not found") - turns = await self.storage.get_session_turns(request.session_id) + turns = await self.storage.get_session_turns(request.session_id) - messages = [] - if self.agent_config.instructions != "": - messages.append(SystemMessage(content=self.agent_config.instructions)) + messages = [] + if self.agent_config.instructions != "": + messages.append(SystemMessage(content=self.agent_config.instructions)) - for i, turn in enumerate(turns): - messages.extend(self.turn_to_messages(turn)) + for i, turn in enumerate(turns): + messages.extend(self.turn_to_messages(turn)) - messages.extend(request.messages) + messages.extend(request.messages) - turn_id = str(uuid.uuid4()) - start_time = datetime.now() - yield AgentTurnResponseStreamChunk( - event=AgentTurnResponseEvent( - payload=AgentTurnResponseTurnStartPayload( - turn_id=turn_id, + turn_id = str(uuid.uuid4()) + span.set_attribute("turn_id", turn_id) + start_time = datetime.now() + yield AgentTurnResponseStreamChunk( + event=AgentTurnResponseEvent( + payload=AgentTurnResponseTurnStartPayload( + turn_id=turn_id, + ) ) ) - ) - steps = [] - output_message = None - async for chunk in self.run( - session_id=request.session_id, - turn_id=turn_id, - input_messages=messages, - attachments=request.attachments or [], - sampling_params=self.agent_config.sampling_params, - stream=request.stream, - ): - if isinstance(chunk, CompletionMessage): - log.info( - f"{chunk.role.capitalize()}: {chunk.content}", - ) - output_message = chunk - continue - - assert isinstance( - chunk, AgentTurnResponseStreamChunk - ), f"Unexpected type {type(chunk)}" - event = chunk.event - if ( - event.payload.event_type - == AgentTurnResponseEventType.step_complete.value + steps = [] + output_message = None + async for chunk in self.run( + session_id=request.session_id, + turn_id=turn_id, + input_messages=messages, + attachments=request.attachments or [], + sampling_params=self.agent_config.sampling_params, + stream=request.stream, ): - steps.append(event.payload.step_details) + if isinstance(chunk, CompletionMessage): + log.info( + f"{chunk.role.capitalize()}: {chunk.content}", + ) + output_message = chunk + continue - yield chunk + assert isinstance( + chunk, AgentTurnResponseStreamChunk + ), f"Unexpected type {type(chunk)}" + event = chunk.event + if ( + event.payload.event_type + == AgentTurnResponseEventType.step_complete.value + ): + steps.append(event.payload.step_details) - assert output_message is not None + yield chunk - turn = Turn( - turn_id=turn_id, - session_id=request.session_id, - input_messages=request.messages, - output_message=output_message, - started_at=start_time, - completed_at=datetime.now(), - steps=steps, - ) - await self.storage.add_turn_to_session(request.session_id, turn) + assert output_message is not None - chunk = AgentTurnResponseStreamChunk( - event=AgentTurnResponseEvent( - payload=AgentTurnResponseTurnCompletePayload( - turn=turn, + turn = Turn( + turn_id=turn_id, + session_id=request.session_id, + input_messages=request.messages, + output_message=output_message, + started_at=start_time, + completed_at=datetime.now(), + steps=steps, + ) + await self.storage.add_turn_to_session(request.session_id, turn) + + chunk = AgentTurnResponseStreamChunk( + event=AgentTurnResponseEvent( + payload=AgentTurnResponseTurnCompletePayload( + turn=turn, + ) ) ) - ) - yield chunk + 
yield chunk async def run( self, @@ -273,7 +277,6 @@ class ChatAgent(ShieldRunnerMixin): yield final_response - @tracing.span("run_shields") async def run_multiple_shields_wrapper( self, turn_id: str, @@ -281,23 +284,47 @@ class ChatAgent(ShieldRunnerMixin): shields: List[str], touchpoint: str, ) -> AsyncGenerator: - if len(shields) == 0: - return + with tracing.span("run_shields") as span: + span.set_attribute("turn_id", turn_id) + span.set_attribute("input", [m.model_dump_json() for m in messages]) + if len(shields) == 0: + span.set_attribute("output", "no shields") + return - step_id = str(uuid.uuid4()) - try: - yield AgentTurnResponseStreamChunk( - event=AgentTurnResponseEvent( - payload=AgentTurnResponseStepStartPayload( - step_type=StepType.shield_call.value, - step_id=step_id, - metadata=dict(touchpoint=touchpoint), + step_id = str(uuid.uuid4()) + try: + yield AgentTurnResponseStreamChunk( + event=AgentTurnResponseEvent( + payload=AgentTurnResponseStepStartPayload( + step_type=StepType.shield_call.value, + step_id=step_id, + metadata=dict(touchpoint=touchpoint), + ) ) ) - ) - await self.run_multiple_shields(messages, shields) + await self.run_multiple_shields(messages, shields) + + except SafetyException as e: + yield AgentTurnResponseStreamChunk( + event=AgentTurnResponseEvent( + payload=AgentTurnResponseStepCompletePayload( + step_type=StepType.shield_call.value, + step_details=ShieldCallStep( + step_id=step_id, + turn_id=turn_id, + violation=e.violation, + ), + ) + ) + ) + span.set_attribute("output", e.violation.model_dump_json()) + + yield CompletionMessage( + content=str(e), + stop_reason=StopReason.end_of_turn, + ) + yield False - except SafetyException as e: yield AgentTurnResponseStreamChunk( event=AgentTurnResponseEvent( payload=AgentTurnResponseStepCompletePayload( @@ -305,30 +332,12 @@ class ChatAgent(ShieldRunnerMixin): step_details=ShieldCallStep( step_id=step_id, turn_id=turn_id, - violation=e.violation, + violation=None, ), ) ) ) - - yield CompletionMessage( - content=str(e), - stop_reason=StopReason.end_of_turn, - ) - yield False - - yield AgentTurnResponseStreamChunk( - event=AgentTurnResponseEvent( - payload=AgentTurnResponseStepCompletePayload( - step_type=StepType.shield_call.value, - step_details=ShieldCallStep( - step_id=step_id, - turn_id=turn_id, - violation=None, - ), - ) - ) - ) + span.set_attribute("output", "no violations") async def _run( self, @@ -356,10 +365,15 @@ class ChatAgent(ShieldRunnerMixin): # TODO: find older context from the session and either replace it # or append with a sliding window. 
this is really a very simplistic implementation - with tracing.span("retrieve_rag_context"): + with tracing.span("retrieve_rag_context") as span: rag_context, bank_ids = await self._retrieve_context( session_id, input_messages, attachments ) + span.set_attribute( + "input", [m.model_dump_json() for m in input_messages] + ) + span.set_attribute("output", rag_context) + span.set_attribute("bank_ids", bank_ids) step_id = str(uuid.uuid4()) yield AgentTurnResponseStreamChunk( @@ -416,7 +430,7 @@ class ChatAgent(ShieldRunnerMixin): content = "" stop_reason = None - with tracing.span("inference"): + with tracing.span("inference") as span: async for chunk in await self.inference_api.chat_completion( self.agent_config.model, input_messages, @@ -436,7 +450,6 @@ class ChatAgent(ShieldRunnerMixin): if isinstance(delta, ToolCallDelta): if delta.parse_status == ToolCallParseStatus.success: tool_calls.append(delta.content) - if stream: yield AgentTurnResponseStreamChunk( event=AgentTurnResponseEvent( @@ -466,6 +479,13 @@ class ChatAgent(ShieldRunnerMixin): if event.stop_reason is not None: stop_reason = event.stop_reason + span.set_attribute("stop_reason", stop_reason) + span.set_attribute( + "input", [m.model_dump_json() for m in input_messages] + ) + span.set_attribute( + "output", f"content: {content} tool_calls: {tool_calls}" + ) stop_reason = stop_reason or StopReason.out_of_tokens @@ -549,7 +569,13 @@ class ChatAgent(ShieldRunnerMixin): ) ) - with tracing.span("tool_execution"): + with tracing.span( + "tool_execution", + { + "tool_name": tool_call.tool_name, + "input": message.model_dump_json(), + }, + ) as span: result_messages = await execute_tool_call_maybe( self.tools_dict, [message], @@ -558,6 +584,7 @@ class ChatAgent(ShieldRunnerMixin): len(result_messages) == 1 ), "Currently not supporting multiple messages" result_message = result_messages[0] + span.set_attribute("output", result_message.model_dump_json()) yield AgentTurnResponseStreamChunk( event=AgentTurnResponseEvent( diff --git a/llama_stack/providers/inline/datasetio/localfs/datasetio.py b/llama_stack/providers/inline/datasetio/localfs/datasetio.py index 010610056..736e5d8b9 100644 --- a/llama_stack/providers/inline/datasetio/localfs/datasetio.py +++ b/llama_stack/providers/inline/datasetio/localfs/datasetio.py @@ -3,14 +3,17 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from typing import Optional +from typing import Any, Dict, List, Optional import pandas from llama_models.llama3.api.datatypes import * # noqa: F403 from llama_stack.apis.datasetio import * # noqa: F403 +import base64 +import os from abc import ABC, abstractmethod from dataclasses import dataclass +from urllib.parse import urlparse from llama_stack.providers.datatypes import DatasetsProtocolPrivate from llama_stack.providers.utils.datasetio.url_utils import get_dataframe_from_url @@ -131,3 +134,41 @@ class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate): total_count=len(rows), next_page_token=str(end), ) + + async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: + dataset_info = self.dataset_infos.get(dataset_id) + if dataset_info is None: + raise ValueError(f"Dataset with id {dataset_id} not found") + + dataset_impl = dataset_info.dataset_impl + dataset_impl.load() + + new_rows_df = pandas.DataFrame(rows) + new_rows_df = dataset_impl._validate_dataset_schema(new_rows_df) + dataset_impl.df = pandas.concat( + [dataset_impl.df, new_rows_df], ignore_index=True + ) + + url = str(dataset_info.dataset_def.url) + parsed_url = urlparse(url) + + if parsed_url.scheme == "file" or not parsed_url.scheme: + file_path = parsed_url.path + os.makedirs(os.path.dirname(file_path), exist_ok=True) + dataset_impl.df.to_csv(file_path, index=False) + elif parsed_url.scheme == "data": + # For data URLs, we need to update the base64-encoded content + if not parsed_url.path.startswith("text/csv;base64,"): + raise ValueError("Data URL must be a base64-encoded CSV") + + csv_buffer = dataset_impl.df.to_csv(index=False) + base64_content = base64.b64encode(csv_buffer.encode("utf-8")).decode( + "utf-8" + ) + dataset_info.dataset_def.url = URL( + uri=f"data:text/csv;base64,{base64_content}" + ) + else: + raise ValueError( + f"Unsupported URL scheme: {parsed_url.scheme}. Only file:// and data: URLs are supported for writing." + ) diff --git a/llama_stack/providers/inline/meta_reference/telemetry/__init__.py b/llama_stack/providers/inline/meta_reference/telemetry/__init__.py deleted file mode 100644 index 4a0c2f6ee..000000000 --- a/llama_stack/providers/inline/meta_reference/telemetry/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from .config import ConsoleConfig - - -async def get_provider_impl(config: ConsoleConfig, _deps): - from .console import ConsoleTelemetryImpl - - impl = ConsoleTelemetryImpl(config) - await impl.initialize() - return impl diff --git a/llama_stack/providers/inline/meta_reference/telemetry/config.py b/llama_stack/providers/inline/meta_reference/telemetry/config.py deleted file mode 100644 index a1db1d4d8..000000000 --- a/llama_stack/providers/inline/meta_reference/telemetry/config.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -from enum import Enum - -from llama_models.schema_utils import json_schema_type - -from pydantic import BaseModel - - -class LogFormat(Enum): - TEXT = "text" - JSON = "json" - - -@json_schema_type -class ConsoleConfig(BaseModel): - log_format: LogFormat = LogFormat.TEXT diff --git a/llama_stack/providers/inline/meta_reference/telemetry/console.py b/llama_stack/providers/inline/meta_reference/telemetry/console.py index d8ef49481..838aaa4e1 100644 --- a/llama_stack/providers/inline/meta_reference/telemetry/console.py +++ b/llama_stack/providers/inline/meta_reference/telemetry/console.py @@ -5,7 +5,7 @@ # the root directory of this source tree. import json -from typing import Optional +from typing import List, Optional from .config import LogFormat @@ -49,8 +49,27 @@ class ConsoleTelemetryImpl(Telemetry): if formatted: print(formatted) - async def get_trace(self, trace_id: str) -> Trace: - raise NotImplementedError() + async def query_traces( + self, + attribute_conditions: Optional[List[QueryCondition]] = None, + attribute_keys_to_return: Optional[List[str]] = None, + limit: Optional[int] = 100, + offset: Optional[int] = 0, + order_by: Optional[List[str]] = None, + ) -> List[Trace]: + raise NotImplementedError("Console telemetry does not support trace querying") + + async def get_spans( + self, + span_id: str, + attribute_conditions: Optional[List[QueryCondition]] = None, + attribute_keys_to_return: Optional[List[str]] = None, + max_depth: Optional[int] = None, + limit: Optional[int] = 100, + offset: Optional[int] = 0, + order_by: Optional[List[str]] = None, + ) -> SpanWithChildren: + raise NotImplementedError("Console telemetry does not support span querying") COLORS = { diff --git a/llama_stack/providers/remote/telemetry/__init__.py b/llama_stack/providers/inline/telemetry/__init__.py similarity index 100% rename from llama_stack/providers/remote/telemetry/__init__.py rename to llama_stack/providers/inline/telemetry/__init__.py diff --git a/llama_stack/providers/inline/telemetry/meta_reference/__init__.py b/llama_stack/providers/inline/telemetry/meta_reference/__init__.py new file mode 100644 index 000000000..6213d5536 --- /dev/null +++ b/llama_stack/providers/inline/telemetry/meta_reference/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from typing import Any, Dict + +from .config import TelemetryConfig, TelemetrySink +from .telemetry import TelemetryAdapter + +__all__ = ["TelemetryConfig", "TelemetryAdapter", "TelemetrySink"] + + +async def get_provider_impl(config: TelemetryConfig, deps: Dict[str, Any]): + impl = TelemetryAdapter(config) + await impl.initialize() + return impl diff --git a/llama_stack/providers/inline/telemetry/meta_reference/config.py b/llama_stack/providers/inline/telemetry/meta_reference/config.py new file mode 100644 index 000000000..0230d24d2 --- /dev/null +++ b/llama_stack/providers/inline/telemetry/meta_reference/config.py @@ -0,0 +1,45 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from enum import Enum +from typing import Any, Dict, List + +from pydantic import BaseModel, Field + +from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR + + +class TelemetrySink(str, Enum): + JAEGER = "jaeger" + SQLITE = "sqlite" + CONSOLE = "console" + + +class TelemetryConfig(BaseModel): + otel_endpoint: str = Field( + default="http://localhost:4318/v1/traces", + description="The OpenTelemetry collector endpoint URL", + ) + service_name: str = Field( + default="llama-stack", + description="The service name to use for telemetry", + ) + sinks: List[TelemetrySink] = Field( + default=[TelemetrySink.CONSOLE, TelemetrySink.SQLITE], + description="List of telemetry sinks to enable (possible values: jaeger, sqlite, console)", + ) + sqlite_db_path: str = Field( + default=(RUNTIME_BASE_DIR / "trace_store.db").as_posix(), + description="The path to the SQLite database to use for storing traces", + ) + + @classmethod + def sample_run_config(cls, **kwargs) -> Dict[str, Any]: + return { + "service_name": "${env.OTEL_SERVICE_NAME:llama-stack}", + "sinks": "${env.TELEMETRY_SINKS:['console', 'sqlite']}", + "sqlite_db_path": "${env.SQLITE_DB_PATH:${runtime.base_dir}/trace_store.db}", + } diff --git a/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py b/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py new file mode 100644 index 000000000..8d6f779e6 --- /dev/null +++ b/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py @@ -0,0 +1,95 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from datetime import datetime + +from opentelemetry.sdk.trace import ReadableSpan +from opentelemetry.sdk.trace.export import SpanProcessor + +# Colors for console output +COLORS = { + "reset": "\033[0m", + "bold": "\033[1m", + "dim": "\033[2m", + "red": "\033[31m", + "green": "\033[32m", + "yellow": "\033[33m", + "blue": "\033[34m", + "magenta": "\033[35m", + "cyan": "\033[36m", + "white": "\033[37m", +} + + +class ConsoleSpanProcessor(SpanProcessor): + """A SpanProcessor that prints spans to the console with color formatting.""" + + def on_start(self, span: ReadableSpan, parent_context=None) -> None: + """Called when a span starts.""" + timestamp = datetime.utcfromtimestamp(span.start_time / 1e9).strftime( + "%H:%M:%S.%f" + )[:-3] + + print( + f"{COLORS['dim']}{timestamp}{COLORS['reset']} " + f"{COLORS['magenta']}[START]{COLORS['reset']} " + f"{COLORS['cyan']}{span.name}{COLORS['reset']}" + ) + + def on_end(self, span: ReadableSpan) -> None: + """Called when a span ends.""" + timestamp = datetime.utcfromtimestamp(span.end_time / 1e9).strftime( + "%H:%M:%S.%f" + )[:-3] + + # Build the span context string + span_context = ( + f"{COLORS['dim']}{timestamp}{COLORS['reset']} " + f"{COLORS['magenta']}[END]{COLORS['reset']} " + f"{COLORS['cyan']}{span.name}{COLORS['reset']} " + ) + + # Add status if not OK + if span.status.status_code != 0: # UNSET or ERROR + status_color = ( + COLORS["red"] if span.status.status_code == 2 else COLORS["yellow"] + ) + span_context += ( + f" {status_color}[{span.status.status_code}]{COLORS['reset']}" + ) + + # Add duration + duration_ms = (span.end_time - span.start_time) / 1e6 + span_context += f" {COLORS['dim']}({duration_ms:.2f}ms){COLORS['reset']}" + + # Print the main span line + print(span_context) + + # Print attributes indented 
+ if span.attributes: + for key, value in span.attributes.items(): + print(f" {COLORS['dim']}{key}: {value}{COLORS['reset']}") + + # Print events indented + for event in span.events: + event_time = datetime.utcfromtimestamp(event.timestamp / 1e9).strftime( + "%H:%M:%S.%f" + )[:-3] + print( + f" {COLORS['dim']}{event_time}{COLORS['reset']} " + f"{COLORS['cyan']}[EVENT]{COLORS['reset']} {event.name}" + ) + if event.attributes: + for key, value in event.attributes.items(): + print(f" {COLORS['dim']}{key}: {value}{COLORS['reset']}") + + def shutdown(self) -> None: + """Shutdown the processor.""" + pass + + def force_flush(self, timeout_millis: float = None) -> bool: + """Force flush any pending spans.""" + return True diff --git a/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py b/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py new file mode 100644 index 000000000..553dd5000 --- /dev/null +++ b/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py @@ -0,0 +1,242 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import json +import os +import sqlite3 +import threading +from datetime import datetime, timedelta +from typing import Dict + +from opentelemetry.sdk.trace import SpanProcessor +from opentelemetry.trace import Span + + +class SQLiteSpanProcessor(SpanProcessor): + def __init__(self, conn_string, ttl_days=30): + """Initialize the SQLite span processor with a connection string.""" + self.conn_string = conn_string + self.ttl_days = ttl_days + self.cleanup_task = None + self._thread_local = threading.local() + self._connections: Dict[int, sqlite3.Connection] = {} + self._lock = threading.Lock() + self.setup_database() + + def _get_connection(self) -> sqlite3.Connection: + """Get a thread-specific database connection.""" + thread_id = threading.get_ident() + with self._lock: + if thread_id not in self._connections: + conn = sqlite3.connect(self.conn_string) + self._connections[thread_id] = conn + return self._connections[thread_id] + + def setup_database(self): + """Create the necessary tables if they don't exist.""" + # Create directory if it doesn't exist + os.makedirs(os.path.dirname(self.conn_string), exist_ok=True) + + conn = self._get_connection() + cursor = conn.cursor() + + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS traces ( + trace_id TEXT PRIMARY KEY, + service_name TEXT, + root_span_id TEXT, + start_time TIMESTAMP, + end_time TIMESTAMP, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ) + """ + ) + + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS spans ( + span_id TEXT PRIMARY KEY, + trace_id TEXT REFERENCES traces(trace_id), + parent_span_id TEXT, + name TEXT, + start_time TIMESTAMP, + end_time TIMESTAMP, + attributes TEXT, + status TEXT, + kind TEXT + ) + """ + ) + + cursor.execute( + """ + CREATE TABLE IF NOT EXISTS span_events ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + span_id TEXT REFERENCES spans(span_id), + name TEXT, + timestamp TIMESTAMP, + attributes TEXT + ) + """ + ) + + cursor.execute( + """ + CREATE INDEX IF NOT EXISTS idx_traces_created_at + ON traces(created_at) + """ + ) + + conn.commit() + cursor.close() + + # Start periodic cleanup in a separate thread + self.cleanup_task = threading.Thread(target=self._periodic_cleanup, daemon=True) + self.cleanup_task.start() + + def _cleanup_old_data(self): + """Delete 
records older than TTL.""" + try: + conn = self._get_connection() + cutoff_date = (datetime.now() - timedelta(days=self.ttl_days)).isoformat() + cursor = conn.cursor() + + # Delete old span events + cursor.execute( + """ + DELETE FROM span_events + WHERE span_id IN ( + SELECT span_id FROM spans + WHERE trace_id IN ( + SELECT trace_id FROM traces + WHERE created_at < ? + ) + ) + """, + (cutoff_date,), + ) + + # Delete old spans + cursor.execute( + """ + DELETE FROM spans + WHERE trace_id IN ( + SELECT trace_id FROM traces + WHERE created_at < ? + ) + """, + (cutoff_date,), + ) + + # Delete old traces + cursor.execute("DELETE FROM traces WHERE created_at < ?", (cutoff_date,)) + + conn.commit() + cursor.close() + except Exception as e: + print(f"Error during cleanup: {e}") + + def _periodic_cleanup(self): + """Run cleanup periodically.""" + import time + + while True: + time.sleep(3600) # Sleep for 1 hour + self._cleanup_old_data() + + def on_start(self, span: Span, parent_context=None): + """Called when a span starts.""" + pass + + def on_end(self, span: Span): + """Called when a span ends. Export the span data to SQLite.""" + try: + conn = self._get_connection() + cursor = conn.cursor() + + trace_id = format(span.get_span_context().trace_id, "032x") + span_id = format(span.get_span_context().span_id, "016x") + service_name = span.resource.attributes.get("service.name", "unknown") + + parent_span_id = None + parent_context = span.parent + if parent_context: + parent_span_id = format(parent_context.span_id, "016x") + + # Insert into traces + cursor.execute( + """ + INSERT INTO traces ( + trace_id, service_name, root_span_id, start_time, end_time + ) VALUES (?, ?, ?, ?, ?) + ON CONFLICT(trace_id) DO UPDATE SET + root_span_id = COALESCE(root_span_id, excluded.root_span_id), + start_time = MIN(excluded.start_time, start_time), + end_time = MAX(excluded.end_time, end_time) + """, + ( + trace_id, + service_name, + (span_id if not parent_span_id else None), + datetime.fromtimestamp(span.start_time / 1e9).isoformat(), + datetime.fromtimestamp(span.end_time / 1e9).isoformat(), + ), + ) + + # Insert into spans + cursor.execute( + """ + INSERT INTO spans ( + span_id, trace_id, parent_span_id, name, + start_time, end_time, attributes, status, + kind + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) + """, + ( + span_id, + trace_id, + parent_span_id, + span.name, + datetime.fromtimestamp(span.start_time / 1e9).isoformat(), + datetime.fromtimestamp(span.end_time / 1e9).isoformat(), + json.dumps(dict(span.attributes)), + span.status.status_code.name, + span.kind.name, + ), + ) + + for event in span.events: + cursor.execute( + """ + INSERT INTO span_events ( + span_id, name, timestamp, attributes + ) VALUES (?, ?, ?, ?) 
+ """, + ( + span_id, + event.name, + datetime.fromtimestamp(event.timestamp / 1e9).isoformat(), + json.dumps(dict(event.attributes)), + ), + ) + + conn.commit() + cursor.close() + except Exception as e: + print(f"Error exporting span to SQLite: {e}") + + def shutdown(self): + """Cleanup any resources.""" + with self._lock: + for conn in self._connections.values(): + if conn: + conn.close() + self._connections.clear() + + def force_flush(self, timeout_millis=30000): + """Force export of spans.""" + pass diff --git a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py new file mode 100644 index 000000000..6540a667f --- /dev/null +++ b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py @@ -0,0 +1,247 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import threading +from typing import List, Optional + +from opentelemetry import metrics, trace +from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter +from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter +from opentelemetry.sdk.metrics import MeterProvider +from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader +from opentelemetry.sdk.resources import Resource +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.semconv.resource import ResourceAttributes + +from llama_stack.providers.inline.telemetry.meta_reference.console_span_processor import ( + ConsoleSpanProcessor, +) + +from llama_stack.providers.inline.telemetry.meta_reference.sqlite_span_processor import ( + SQLiteSpanProcessor, +) +from llama_stack.providers.utils.telemetry.sqlite_trace_store import SQLiteTraceStore + +from llama_stack.apis.telemetry import * # noqa: F403 + +from .config import TelemetryConfig, TelemetrySink + +_GLOBAL_STORAGE = { + "active_spans": {}, + "counters": {}, + "gauges": {}, + "up_down_counters": {}, +} +_global_lock = threading.Lock() + + +def string_to_trace_id(s: str) -> int: + # Convert the string to bytes and then to an integer + return int.from_bytes(s.encode(), byteorder="big", signed=False) + + +def string_to_span_id(s: str) -> int: + # Use only the first 8 bytes (64 bits) for span ID + return int.from_bytes(s.encode()[:8], byteorder="big", signed=False) + + +def is_tracing_enabled(tracer): + with tracer.start_as_current_span("check_tracing") as span: + return span.is_recording() + + +class TelemetryAdapter(Telemetry): + def __init__(self, config: TelemetryConfig) -> None: + self.config = config + + resource = Resource.create( + { + ResourceAttributes.SERVICE_NAME: self.config.service_name, + } + ) + + provider = TracerProvider(resource=resource) + trace.set_tracer_provider(provider) + if TelemetrySink.JAEGER in self.config.sinks: + otlp_exporter = OTLPSpanExporter( + endpoint=self.config.otel_endpoint, + ) + span_processor = BatchSpanProcessor(otlp_exporter) + trace.get_tracer_provider().add_span_processor(span_processor) + metric_reader = PeriodicExportingMetricReader( + OTLPMetricExporter( + endpoint=self.config.otel_endpoint, + ) + ) + metric_provider = MeterProvider( + resource=resource, metric_readers=[metric_reader] + ) + metrics.set_meter_provider(metric_provider) + self.meter = metrics.get_meter(__name__) + if 
TelemetrySink.SQLITE in self.config.sinks: + trace.get_tracer_provider().add_span_processor( + SQLiteSpanProcessor(self.config.sqlite_db_path) + ) + self.trace_store = SQLiteTraceStore(self.config.sqlite_db_path) + if TelemetrySink.CONSOLE in self.config.sinks: + trace.get_tracer_provider().add_span_processor(ConsoleSpanProcessor()) + self._lock = _global_lock + + async def initialize(self) -> None: + pass + + async def shutdown(self) -> None: + trace.get_tracer_provider().force_flush() + trace.get_tracer_provider().shutdown() + metrics.get_meter_provider().shutdown() + + async def log_event(self, event: Event, ttl_seconds: int = 604800) -> None: + if isinstance(event, UnstructuredLogEvent): + self._log_unstructured(event, ttl_seconds) + elif isinstance(event, MetricEvent): + self._log_metric(event) + elif isinstance(event, StructuredLogEvent): + self._log_structured(event, ttl_seconds) + else: + raise ValueError(f"Unknown event type: {event}") + + def _log_unstructured(self, event: UnstructuredLogEvent, ttl_seconds: int) -> None: + with self._lock: + # Use global storage instead of instance storage + span_id = string_to_span_id(event.span_id) + span = _GLOBAL_STORAGE["active_spans"].get(span_id) + + if span: + timestamp_ns = int(event.timestamp.timestamp() * 1e9) + span.add_event( + name=event.type, + attributes={ + "message": event.message, + "severity": event.severity.value, + "__ttl__": ttl_seconds, + **event.attributes, + }, + timestamp=timestamp_ns, + ) + else: + print( + f"Warning: No active span found for span_id {span_id}. Dropping event: {event}" + ) + + def _get_or_create_counter(self, name: str, unit: str) -> metrics.Counter: + if name not in _GLOBAL_STORAGE["counters"]: + _GLOBAL_STORAGE["counters"][name] = self.meter.create_counter( + name=name, + unit=unit, + description=f"Counter for {name}", + ) + return _GLOBAL_STORAGE["counters"][name] + + def _get_or_create_gauge(self, name: str, unit: str) -> metrics.ObservableGauge: + if name not in _GLOBAL_STORAGE["gauges"]: + _GLOBAL_STORAGE["gauges"][name] = self.meter.create_gauge( + name=name, + unit=unit, + description=f"Gauge for {name}", + ) + return _GLOBAL_STORAGE["gauges"][name] + + def _log_metric(self, event: MetricEvent) -> None: + if isinstance(event.value, int): + counter = self._get_or_create_counter(event.metric, event.unit) + counter.add(event.value, attributes=event.attributes) + elif isinstance(event.value, float): + up_down_counter = self._get_or_create_up_down_counter( + event.metric, event.unit + ) + up_down_counter.add(event.value, attributes=event.attributes) + + def _get_or_create_up_down_counter( + self, name: str, unit: str + ) -> metrics.UpDownCounter: + if name not in _GLOBAL_STORAGE["up_down_counters"]: + _GLOBAL_STORAGE["up_down_counters"][name] = ( + self.meter.create_up_down_counter( + name=name, + unit=unit, + description=f"UpDownCounter for {name}", + ) + ) + return _GLOBAL_STORAGE["up_down_counters"][name] + + def _log_structured(self, event: StructuredLogEvent, ttl_seconds: int) -> None: + with self._lock: + span_id = string_to_span_id(event.span_id) + trace_id = string_to_trace_id(event.trace_id) + tracer = trace.get_tracer(__name__) + if event.attributes is None: + event.attributes = {} + event.attributes["__ttl__"] = ttl_seconds + + if isinstance(event.payload, SpanStartPayload): + # Check if span already exists to prevent duplicates + if span_id in _GLOBAL_STORAGE["active_spans"]: + return + + parent_span = None + if event.payload.parent_span_id: + parent_span_id = 
string_to_span_id(event.payload.parent_span_id) + parent_span = _GLOBAL_STORAGE["active_spans"].get(parent_span_id) + + context = trace.Context(trace_id=trace_id) + if parent_span: + context = trace.set_span_in_context(parent_span, context) + + span = tracer.start_span( + name=event.payload.name, + context=context, + attributes=event.attributes or {}, + ) + _GLOBAL_STORAGE["active_spans"][span_id] = span + + elif isinstance(event.payload, SpanEndPayload): + span = _GLOBAL_STORAGE["active_spans"].get(span_id) + if span: + if event.attributes: + span.set_attributes(event.attributes) + + status = ( + trace.Status(status_code=trace.StatusCode.OK) + if event.payload.status == SpanStatus.OK + else trace.Status(status_code=trace.StatusCode.ERROR) + ) + span.set_status(status) + span.end() + _GLOBAL_STORAGE["active_spans"].pop(span_id, None) + else: + raise ValueError(f"Unknown structured log event: {event}") + + async def query_traces( + self, + attribute_filters: Optional[List[QueryCondition]] = None, + limit: Optional[int] = 100, + offset: Optional[int] = 0, + order_by: Optional[List[str]] = None, + ) -> List[Trace]: + return await self.trace_store.query_traces( + attribute_filters=attribute_filters, + limit=limit, + offset=offset, + order_by=order_by, + ) + + async def get_span_tree( + self, + span_id: str, + attributes_to_return: Optional[List[str]] = None, + max_depth: Optional[int] = None, + ) -> SpanWithChildren: + return await self.trace_store.get_materialized_span( + span_id=span_id, + attributes_to_return=attributes_to_return, + max_depth=max_depth, + ) diff --git a/llama_stack/providers/remote/telemetry/sample/__init__.py b/llama_stack/providers/inline/telemetry/sample/__init__.py similarity index 100% rename from llama_stack/providers/remote/telemetry/sample/__init__.py rename to llama_stack/providers/inline/telemetry/sample/__init__.py diff --git a/llama_stack/providers/remote/telemetry/sample/config.py b/llama_stack/providers/inline/telemetry/sample/config.py similarity index 100% rename from llama_stack/providers/remote/telemetry/sample/config.py rename to llama_stack/providers/inline/telemetry/sample/config.py diff --git a/llama_stack/providers/remote/telemetry/sample/sample.py b/llama_stack/providers/inline/telemetry/sample/sample.py similarity index 100% rename from llama_stack/providers/remote/telemetry/sample/sample.py rename to llama_stack/providers/inline/telemetry/sample/sample.py diff --git a/llama_stack/providers/registry/telemetry.py b/llama_stack/providers/registry/telemetry.py index ac537e076..a53ad5b94 100644 --- a/llama_stack/providers/registry/telemetry.py +++ b/llama_stack/providers/registry/telemetry.py @@ -14,9 +14,12 @@ def available_providers() -> List[ProviderSpec]: InlineProviderSpec( api=Api.telemetry, provider_type="inline::meta-reference", - pip_packages=[], - module="llama_stack.providers.inline.meta_reference.telemetry", - config_class="llama_stack.providers.inline.meta_reference.telemetry.ConsoleConfig", + pip_packages=[ + "opentelemetry-sdk", + "opentelemetry-exporter-otlp-proto-http", + ], + module="llama_stack.providers.inline.telemetry.meta_reference", + config_class="llama_stack.providers.inline.telemetry.meta_reference.config.TelemetryConfig", ), remote_provider_spec( api=Api.telemetry, @@ -27,18 +30,4 @@ def available_providers() -> List[ProviderSpec]: config_class="llama_stack.providers.remote.telemetry.sample.SampleConfig", ), ), - remote_provider_spec( - api=Api.telemetry, - adapter=AdapterSpec( - adapter_type="opentelemetry-jaeger", - 
pip_packages=[ - "opentelemetry-api", - "opentelemetry-sdk", - "opentelemetry-exporter-jaeger", - "opentelemetry-semantic-conventions", - ], - module="llama_stack.providers.remote.telemetry.opentelemetry", - config_class="llama_stack.providers.remote.telemetry.opentelemetry.OpenTelemetryConfig", - ), - ), ] diff --git a/llama_stack/providers/remote/datasetio/huggingface/huggingface.py b/llama_stack/providers/remote/datasetio/huggingface/huggingface.py index cdd5d9cd3..db52270a7 100644 --- a/llama_stack/providers/remote/datasetio/huggingface/huggingface.py +++ b/llama_stack/providers/remote/datasetio/huggingface/huggingface.py @@ -3,7 +3,7 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Optional +from typing import Any, Dict, List, Optional from llama_stack.apis.datasetio import * # noqa: F403 @@ -100,3 +100,22 @@ class HuggingfaceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate): total_count=len(rows), next_page_token=str(end), ) + + async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: + dataset_def = self.dataset_infos[dataset_id] + loaded_dataset = load_hf_dataset(dataset_def) + + # Convert rows to HF Dataset format + new_dataset = hf_datasets.Dataset.from_list(rows) + + # Concatenate the new rows with existing dataset + updated_dataset = hf_datasets.concatenate_datasets( + [loaded_dataset, new_dataset] + ) + + if dataset_def.metadata.get("path", None): + updated_dataset.push_to_hub(dataset_def.metadata["path"]) + else: + raise NotImplementedError( + "Uploading to URL-based datasets is not supported yet" + ) diff --git a/llama_stack/providers/remote/telemetry/opentelemetry/__init__.py b/llama_stack/providers/remote/telemetry/opentelemetry/__init__.py deleted file mode 100644 index 0842afe2d..000000000 --- a/llama_stack/providers/remote/telemetry/opentelemetry/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from .config import OpenTelemetryConfig - - -async def get_adapter_impl(config: OpenTelemetryConfig, _deps): - from .opentelemetry import OpenTelemetryAdapter - - impl = OpenTelemetryAdapter(config) - await impl.initialize() - return impl diff --git a/llama_stack/providers/remote/telemetry/opentelemetry/config.py b/llama_stack/providers/remote/telemetry/opentelemetry/config.py deleted file mode 100644 index 5e9dff1a1..000000000 --- a/llama_stack/providers/remote/telemetry/opentelemetry/config.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -from typing import Any, Dict - -from pydantic import BaseModel, Field - - -class OpenTelemetryConfig(BaseModel): - otel_endpoint: str = Field( - default="http://localhost:4318/v1/traces", - description="The OpenTelemetry collector endpoint URL", - ) - service_name: str = Field( - default="llama-stack", - description="The service name to use for telemetry", - ) - - @classmethod - def sample_run_config(cls, **kwargs) -> Dict[str, Any]: - return { - "otel_endpoint": "${env.OTEL_ENDPOINT:http://localhost:4318/v1/traces}", - "service_name": "${env.OTEL_SERVICE_NAME:llama-stack}", - } diff --git a/llama_stack/providers/remote/telemetry/opentelemetry/opentelemetry.py b/llama_stack/providers/remote/telemetry/opentelemetry/opentelemetry.py index c9830fd9d..04eb71ce0 100644 --- a/llama_stack/providers/remote/telemetry/opentelemetry/opentelemetry.py +++ b/llama_stack/providers/remote/telemetry/opentelemetry/opentelemetry.py @@ -5,6 +5,16 @@ # the root directory of this source tree. import threading +from typing import List, Optional + +from llama_stack.distribution.datatypes import Api +from llama_stack.providers.remote.telemetry.opentelemetry.console_span_processor import ( + ConsoleSpanProcessor, +) +from llama_stack.providers.remote.telemetry.opentelemetry.sqlite_span_processor import ( + SQLiteSpanProcessor, +) +from llama_stack.providers.utils.telemetry.sqlite_trace_store import SQLiteTraceStore from opentelemetry import metrics, trace from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter @@ -19,7 +29,7 @@ from opentelemetry.semconv.resource import ResourceAttributes from llama_stack.apis.telemetry import * # noqa: F403 -from .config import OpenTelemetryConfig +from .config import OpenTelemetryConfig, TelemetrySink _GLOBAL_STORAGE = { "active_spans": {}, @@ -46,8 +56,9 @@ def is_tracing_enabled(tracer): class OpenTelemetryAdapter(Telemetry): - def __init__(self, config: OpenTelemetryConfig): + def __init__(self, config: OpenTelemetryConfig, deps) -> None: self.config = config + self.datasetio = deps[Api.datasetio] resource = Resource.create( { @@ -57,22 +68,29 @@ class OpenTelemetryAdapter(Telemetry): provider = TracerProvider(resource=resource) trace.set_tracer_provider(provider) - otlp_exporter = OTLPSpanExporter( - endpoint=self.config.otel_endpoint, - ) - span_processor = BatchSpanProcessor(otlp_exporter) - trace.get_tracer_provider().add_span_processor(span_processor) - # Set up metrics - metric_reader = PeriodicExportingMetricReader( - OTLPMetricExporter( + if TelemetrySink.JAEGER in self.config.sinks: + otlp_exporter = OTLPSpanExporter( endpoint=self.config.otel_endpoint, ) - ) - metric_provider = MeterProvider( - resource=resource, metric_readers=[metric_reader] - ) - metrics.set_meter_provider(metric_provider) - self.meter = metrics.get_meter(__name__) + span_processor = BatchSpanProcessor(otlp_exporter) + trace.get_tracer_provider().add_span_processor(span_processor) + metric_reader = PeriodicExportingMetricReader( + OTLPMetricExporter( + endpoint=self.config.otel_endpoint, + ) + ) + metric_provider = MeterProvider( + resource=resource, metric_readers=[metric_reader] + ) + metrics.set_meter_provider(metric_provider) + self.meter = metrics.get_meter(__name__) + if TelemetrySink.SQLITE in self.config.sinks: + trace.get_tracer_provider().add_span_processor( + SQLiteSpanProcessor(self.config.sqlite_db_path) + ) + self.trace_store = SQLiteTraceStore(self.config.sqlite_db_path) + if TelemetrySink.CONSOLE in self.config.sinks: + 
trace.get_tracer_provider().add_span_processor(ConsoleSpanProcessor()) self._lock = _global_lock async def initialize(self) -> None: @@ -83,15 +101,17 @@ class OpenTelemetryAdapter(Telemetry): trace.get_tracer_provider().shutdown() metrics.get_meter_provider().shutdown() - async def log_event(self, event: Event) -> None: + async def log_event(self, event: Event, ttl_seconds: int = 604800) -> None: if isinstance(event, UnstructuredLogEvent): - self._log_unstructured(event) + self._log_unstructured(event, ttl_seconds) elif isinstance(event, MetricEvent): self._log_metric(event) elif isinstance(event, StructuredLogEvent): - self._log_structured(event) + self._log_structured(event, ttl_seconds) + else: + raise ValueError(f"Unknown event type: {event}") - def _log_unstructured(self, event: UnstructuredLogEvent) -> None: + def _log_unstructured(self, event: UnstructuredLogEvent, ttl_seconds: int) -> None: with self._lock: # Use global storage instead of instance storage span_id = string_to_span_id(event.span_id) @@ -104,6 +124,7 @@ class OpenTelemetryAdapter(Telemetry): attributes={ "message": event.message, "severity": event.severity.value, + "__ttl__": ttl_seconds, **event.attributes, }, timestamp=timestamp_ns, @@ -154,11 +175,14 @@ class OpenTelemetryAdapter(Telemetry): ) return _GLOBAL_STORAGE["up_down_counters"][name] - def _log_structured(self, event: StructuredLogEvent) -> None: + def _log_structured(self, event: StructuredLogEvent, ttl_seconds: int) -> None: with self._lock: span_id = string_to_span_id(event.span_id) trace_id = string_to_trace_id(event.trace_id) tracer = trace.get_tracer(__name__) + if event.attributes is None: + event.attributes = {} + event.attributes["__ttl__"] = ttl_seconds if isinstance(event.payload, SpanStartPayload): # Check if span already exists to prevent duplicates @@ -170,7 +194,6 @@ class OpenTelemetryAdapter(Telemetry): parent_span_id = string_to_span_id(event.payload.parent_span_id) parent_span = _GLOBAL_STORAGE["active_spans"].get(parent_span_id) - # Create a new trace context with the trace_id context = trace.Context(trace_id=trace_id) if parent_span: context = trace.set_span_in_context(parent_span, context) @@ -179,14 +202,9 @@ class OpenTelemetryAdapter(Telemetry): name=event.payload.name, context=context, attributes=event.attributes or {}, - start_time=int(event.timestamp.timestamp() * 1e9), ) _GLOBAL_STORAGE["active_spans"][span_id] = span - # Set as current span using context manager - with trace.use_span(span, end_on_exit=False): - pass # Let the span continue beyond this block - elif isinstance(event.payload, SpanEndPayload): span = _GLOBAL_STORAGE["active_spans"].get(span_id) if span: @@ -199,10 +217,43 @@ class OpenTelemetryAdapter(Telemetry): else trace.Status(status_code=trace.StatusCode.ERROR) ) span.set_status(status) - span.end(end_time=int(event.timestamp.timestamp() * 1e9)) - - # Remove from active spans + span.end() _GLOBAL_STORAGE["active_spans"].pop(span_id, None) + else: + raise ValueError(f"Unknown structured log event: {event}") - async def get_trace(self, trace_id: str) -> Trace: - raise NotImplementedError("Trace retrieval not implemented yet") + async def query_traces( + self, + attribute_conditions: Optional[List[QueryCondition]] = None, + attribute_keys_to_return: Optional[List[str]] = None, + limit: Optional[int] = 100, + offset: Optional[int] = 0, + order_by: Optional[List[str]] = None, + ) -> List[Trace]: + return await self.trace_store.query_traces( + attribute_conditions=attribute_conditions, + 
attribute_keys_to_return=attribute_keys_to_return, + limit=limit, + offset=offset, + order_by=order_by, + ) + + async def get_spans( + self, + span_id: str, + attribute_conditions: Optional[List[QueryCondition]] = None, + attribute_keys_to_return: Optional[List[str]] = None, + max_depth: Optional[int] = None, + limit: Optional[int] = 100, + offset: Optional[int] = 0, + order_by: Optional[List[str]] = None, + ) -> SpanWithChildren: + return await self.trace_store.get_spans( + span_id=span_id, + attribute_conditions=attribute_conditions, + attribute_keys_to_return=attribute_keys_to_return, + max_depth=max_depth, + limit=limit, + offset=offset, + order_by=order_by, + ) diff --git a/llama_stack/providers/utils/telemetry/sqlite.py b/llama_stack/providers/utils/telemetry/sqlite.py new file mode 100644 index 000000000..e7161fffa --- /dev/null +++ b/llama_stack/providers/utils/telemetry/sqlite.py @@ -0,0 +1,177 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import json +from datetime import datetime +from typing import List, Optional + +import aiosqlite + +from llama_stack.apis.telemetry import ( + QueryCondition, + SpanWithChildren, + Trace, + TraceStore, +) + + +class SQLiteTraceStore(TraceStore): + def __init__(self, conn_string: str): + self.conn_string = conn_string + + async def query_traces( + self, + attribute_filters: Optional[List[QueryCondition]] = None, + attributes_to_return: Optional[List[str]] = None, + limit: Optional[int] = 100, + offset: Optional[int] = 0, + order_by: Optional[List[str]] = None, + ) -> List[Trace]: + print(attribute_filters, attributes_to_return, limit, offset, order_by) + + def build_attribute_select() -> str: + if not attributes_to_return: + return "" + return "".join( + f", json_extract(s.attributes, '$.{key}') as attr_{key}" + for key in attributes_to_return + ) + + def build_where_clause() -> tuple[str, list]: + if not attribute_filters: + return "", [] + + conditions = [ + f"json_extract(s.attributes, '$.{condition.key}') {condition.op} ?" 
+ for condition in attribute_filters + ] + params = [condition.value for condition in attribute_filters] + where_clause = " WHERE " + " AND ".join(conditions) + return where_clause, params + + def build_order_clause() -> str: + if not order_by: + return "" + + order_clauses = [] + for field in order_by: + desc = field.startswith("-") + clean_field = field[1:] if desc else field + order_clauses.append(f"t.{clean_field} {'DESC' if desc else 'ASC'}") + return " ORDER BY " + ", ".join(order_clauses) + + # Build the main query + base_query = """ + WITH matching_traces AS ( + SELECT DISTINCT t.trace_id + FROM traces t + JOIN spans s ON t.trace_id = s.trace_id + {where_clause} + ), + filtered_traces AS ( + SELECT t.trace_id, t.root_span_id, t.start_time, t.end_time + {attribute_select} + FROM matching_traces mt + JOIN traces t ON mt.trace_id = t.trace_id + LEFT JOIN spans s ON t.trace_id = s.trace_id + {order_clause} + ) + SELECT DISTINCT trace_id, root_span_id, start_time, end_time + FROM filtered_traces + LIMIT {limit} OFFSET {offset} + """ + + where_clause, params = build_where_clause() + query = base_query.format( + attribute_select=build_attribute_select(), + where_clause=where_clause, + order_clause=build_order_clause(), + limit=limit, + offset=offset, + ) + + # Execute query and return results + async with aiosqlite.connect(self.conn_string) as conn: + conn.row_factory = aiosqlite.Row + async with conn.execute(query, params) as cursor: + rows = await cursor.fetchall() + return [ + Trace( + trace_id=row["trace_id"], + root_span_id=row["root_span_id"], + start_time=datetime.fromisoformat(row["start_time"]), + end_time=datetime.fromisoformat(row["end_time"]), + ) + for row in rows + ] + + async def get_materialized_span( + self, + span_id: str, + attributes_to_return: Optional[List[str]] = None, + max_depth: Optional[int] = None, + ) -> SpanWithChildren: + # Build the attributes selection + attributes_select = "s.attributes" + if attributes_to_return: + json_object = ", ".join( + f"'{key}', json_extract(s.attributes, '$.{key}')" + for key in attributes_to_return + ) + attributes_select = f"json_object({json_object})" + + # SQLite CTE query with filtered attributes + query = f""" + WITH RECURSIVE span_tree AS ( + SELECT s.*, 1 as depth, {attributes_select} as filtered_attributes + FROM spans s + WHERE s.span_id = ? + + UNION ALL + + SELECT s.*, st.depth + 1, {attributes_select} as filtered_attributes + FROM spans s + JOIN span_tree st ON s.parent_span_id = st.span_id + WHERE (? IS NULL OR st.depth < ?) 
+ ) + SELECT * + FROM span_tree + ORDER BY depth, start_time + """ + + async with aiosqlite.connect(self.conn_string) as conn: + conn.row_factory = aiosqlite.Row + async with conn.execute(query, (span_id, max_depth, max_depth)) as cursor: + rows = await cursor.fetchall() + + if not rows: + raise ValueError(f"Span {span_id} not found") + + # Build span tree + spans_by_id = {} + root_span = None + + for row in rows: + span = SpanWithChildren( + span_id=row["span_id"], + trace_id=row["trace_id"], + parent_span_id=row["parent_span_id"], + name=row["name"], + start_time=datetime.fromisoformat(row["start_time"]), + end_time=datetime.fromisoformat(row["end_time"]), + attributes=json.loads(row["filtered_attributes"]), + status=row["status"].lower(), + children=[], + ) + + spans_by_id[span.span_id] = span + + if span.span_id == span_id: + root_span = span + elif span.parent_span_id in spans_by_id: + spans_by_id[span.parent_span_id].children.append(span) + + return root_span diff --git a/llama_stack/providers/utils/telemetry/sqlite_trace_store.py b/llama_stack/providers/utils/telemetry/sqlite_trace_store.py new file mode 100644 index 000000000..ed1343e0b --- /dev/null +++ b/llama_stack/providers/utils/telemetry/sqlite_trace_store.py @@ -0,0 +1,180 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import json +from datetime import datetime +from typing import List, Optional, Protocol + +import aiosqlite + +from llama_stack.apis.telemetry import QueryCondition, SpanWithChildren, Trace + + +class TraceStore(Protocol): + + async def query_traces( + self, + attribute_filters: Optional[List[QueryCondition]] = None, + limit: Optional[int] = 100, + offset: Optional[int] = 0, + order_by: Optional[List[str]] = None, + ) -> List[Trace]: ... + + async def get_materialized_span( + self, + span_id: str, + attributes_to_return: Optional[List[str]] = None, + max_depth: Optional[int] = None, + ) -> SpanWithChildren: ... + + +class SQLiteTraceStore(TraceStore): + def __init__(self, conn_string: str): + self.conn_string = conn_string + + async def query_traces( + self, + attribute_filters: Optional[List[QueryCondition]] = None, + limit: Optional[int] = 100, + offset: Optional[int] = 0, + order_by: Optional[List[str]] = None, + ) -> List[Trace]: + + def build_where_clause() -> tuple[str, list]: + if not attribute_filters: + return "", [] + + ops_map = {"eq": "=", "ne": "!=", "gt": ">", "lt": "<"} + + conditions = [ + f"json_extract(s.attributes, '$.{condition.key}') {ops_map[condition.op]} ?" 
+ for condition in attribute_filters + ] + params = [condition.value for condition in attribute_filters] + where_clause = " WHERE " + " AND ".join(conditions) + return where_clause, params + + def build_order_clause() -> str: + if not order_by: + return "" + + order_clauses = [] + for field in order_by: + desc = field.startswith("-") + clean_field = field[1:] if desc else field + order_clauses.append(f"t.{clean_field} {'DESC' if desc else 'ASC'}") + return " ORDER BY " + ", ".join(order_clauses) + + # Build the main query + base_query = """ + WITH matching_traces AS ( + SELECT DISTINCT t.trace_id + FROM traces t + JOIN spans s ON t.trace_id = s.trace_id + {where_clause} + ), + filtered_traces AS ( + SELECT t.trace_id, t.root_span_id, t.start_time, t.end_time + FROM matching_traces mt + JOIN traces t ON mt.trace_id = t.trace_id + LEFT JOIN spans s ON t.trace_id = s.trace_id + {order_clause} + ) + SELECT DISTINCT trace_id, root_span_id, start_time, end_time + FROM filtered_traces + LIMIT {limit} OFFSET {offset} + """ + + where_clause, params = build_where_clause() + query = base_query.format( + where_clause=where_clause, + order_clause=build_order_clause(), + limit=limit, + offset=offset, + ) + + # Execute query and return results + async with aiosqlite.connect(self.conn_string) as conn: + conn.row_factory = aiosqlite.Row + async with conn.execute(query, params) as cursor: + rows = await cursor.fetchall() + return [ + Trace( + trace_id=row["trace_id"], + root_span_id=row["root_span_id"], + start_time=datetime.fromisoformat(row["start_time"]), + end_time=datetime.fromisoformat(row["end_time"]), + ) + for row in rows + ] + + async def get_materialized_span( + self, + span_id: str, + attributes_to_return: Optional[List[str]] = None, + max_depth: Optional[int] = None, + ) -> SpanWithChildren: + # Build the attributes selection + attributes_select = "s.attributes" + if attributes_to_return: + json_object = ", ".join( + f"'{key}', json_extract(s.attributes, '$.{key}')" + for key in attributes_to_return + ) + attributes_select = f"json_object({json_object})" + + # SQLite CTE query with filtered attributes + query = f""" + WITH RECURSIVE span_tree AS ( + SELECT s.*, 1 as depth, {attributes_select} as filtered_attributes + FROM spans s + WHERE s.span_id = ? + + UNION ALL + + SELECT s.*, st.depth + 1, {attributes_select} as filtered_attributes + FROM spans s + JOIN span_tree st ON s.parent_span_id = st.span_id + WHERE (? IS NULL OR st.depth < ?) 
+ ) + SELECT * + FROM span_tree + ORDER BY depth, start_time + """ + + async with aiosqlite.connect(self.conn_string) as conn: + conn.row_factory = aiosqlite.Row + async with conn.execute(query, (span_id, max_depth, max_depth)) as cursor: + rows = await cursor.fetchall() + + if not rows: + raise ValueError(f"Span {span_id} not found") + + # Build span tree + spans_by_id = {} + root_span = None + + for row in rows: + span = SpanWithChildren( + span_id=row["span_id"], + trace_id=row["trace_id"], + parent_span_id=row["parent_span_id"], + name=row["name"], + start_time=datetime.fromisoformat(row["start_time"]), + end_time=datetime.fromisoformat(row["end_time"]), + attributes=json.loads(row["filtered_attributes"]), + status=row["status"].lower(), + children=[], + ) + + spans_by_id[span.span_id] = span + + if span.span_id == span_id: + root_span = span + elif span.parent_span_id in spans_by_id: + spans_by_id[span.parent_span_id].children.append(span) + + return root_span diff --git a/llama_stack/providers/utils/telemetry/tracing.py b/llama_stack/providers/utils/telemetry/tracing.py index b53dc0df9..54558afdc 100644 --- a/llama_stack/providers/utils/telemetry/tracing.py +++ b/llama_stack/providers/utils/telemetry/tracing.py @@ -69,7 +69,7 @@ class TraceContext: self.logger = logger self.trace_id = trace_id - def push_span(self, name: str, attributes: Dict[str, Any] = None): + def push_span(self, name: str, attributes: Dict[str, Any] = None) -> Span: current_span = self.get_current_span() span = Span( span_id=generate_short_uuid(), @@ -94,6 +94,7 @@ class TraceContext: ) self.spans.append(span) + return span def pop_span(self, status: SpanStatus = SpanStatus.OK): span = self.spans.pop() @@ -203,12 +204,13 @@ class SpanContextManager: def __init__(self, name: str, attributes: Dict[str, Any] = None): self.name = name self.attributes = attributes + self.span = None def __enter__(self): global CURRENT_TRACE_CONTEXT context = CURRENT_TRACE_CONTEXT if context: - context.push_span(self.name, self.attributes) + self.span = context.push_span(self.name, self.attributes) return self def __exit__(self, exc_type, exc_value, traceback): @@ -217,11 +219,24 @@ class SpanContextManager: if context: context.pop_span() + def set_attribute(self, key: str, value: Any): + if self.span: + if self.span.attributes is None: + self.span.attributes = {} + self.span.attributes[key] = value + async def __aenter__(self): - return self.__enter__() + global CURRENT_TRACE_CONTEXT + context = CURRENT_TRACE_CONTEXT + if context: + self.span = context.push_span(self.name, self.attributes) + return self async def __aexit__(self, exc_type, exc_value, traceback): - self.__exit__(exc_type, exc_value, traceback) + global CURRENT_TRACE_CONTEXT + context = CURRENT_TRACE_CONTEXT + if context: + context.pop_span() def __call__(self, func: Callable): @wraps(func) @@ -246,3 +261,11 @@ class SpanContextManager: def span(name: str, attributes: Dict[str, Any] = None): return SpanContextManager(name, attributes) + + +def get_current_span() -> Optional[Span]: + global CURRENT_TRACE_CONTEXT + context = CURRENT_TRACE_CONTEXT + if context: + return context.get_current_span() + return None From 144abd2e716eb4706e40c0fed9aa93741934ffc9 Mon Sep 17 00:00:00 2001 From: Chacksu Date: Wed, 4 Dec 2024 18:42:55 -0500 Subject: [PATCH 20/69] Introduce GitHub Actions Workflow for Llama Stack Tests (#523) # What does this PR do? Initial implementation of GitHub Actions workflow for automated testing of Llama Stack. 
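As a rough sketch of how a run could be kicked off once this lands (assuming the GitHub CLI is installed and authenticated, and that the self-hosted GPU runner referenced below is actually available), a manual dispatch might look like:

```
# Hypothetical manual trigger; the input names mirror the workflow_dispatch inputs
# defined in the YAML added by this PR, and the values shown are its declared defaults.
gh workflow run gha_workflow_llama_stack_tests.yml \
  --ref main \
  -f runner=llama-stack-gha-runner-gpu \
  -f provider_id=meta_reference \
  -f model_id=llama_3b \
  -f debug=true
```
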
## Key Features - Automatically runs tests on pull requests and manual dispatch - Provides support for GPU required model tests - Reports test results and uploads summaries --- .../gha_workflow_llama_stack_tests.yml | 355 ++++++++++++++++++ 1 file changed, 355 insertions(+) create mode 100644 .github/workflows/gha_workflow_llama_stack_tests.yml diff --git a/.github/workflows/gha_workflow_llama_stack_tests.yml b/.github/workflows/gha_workflow_llama_stack_tests.yml new file mode 100644 index 000000000..89e5edf71 --- /dev/null +++ b/.github/workflows/gha_workflow_llama_stack_tests.yml @@ -0,0 +1,355 @@ +name: "Run Llama-stack Tests" + +on: + #### Temporarily disable PR runs until tests run as intended within mainline. + #TODO Add this back. + #pull_request_target: + # types: ["opened"] + # branches: + # - 'main' + # paths: + # - 'llama_stack/**/*.py' + # - 'tests/**/*.py' + + workflow_dispatch: + inputs: + runner: + description: 'GHA Runner Scale Set label to run workflow on.' + required: true + default: "llama-stack-gha-runner-gpu" + + checkout_reference: + description: "The branch, tag, or SHA to checkout" + required: true + default: "main" + + debug: + description: 'Run debugging steps?' + required: false + default: "true" + + sleep_time: + description: '[DEBUG] sleep time for debugging' + required: true + default: "0" + + provider_id: + description: 'ID of your provider' + required: true + default: "meta_reference" + + model_id: + description: 'Shorthand name for target model ID (llama_3b or llama_8b)' + required: true + default: "llama_3b" + + model_override_3b: + description: 'Specify shorthand model for ' + required: false + default: "Llama3.2-3B-Instruct" + + model_override_8b: + description: 'Specify shorthand model for ' + required: false + default: "Llama3.1-8B-Instruct" + +env: + # ID used for each test's provider config + PROVIDER_ID: "${{ inputs.provider_id || 'meta_reference' }}" + + # Path to model checkpoints within EFS volume + MODEL_CHECKPOINT_DIR: "/data/llama" + + # Path to directory to run tests from + TESTS_PATH: "${{ github.workspace }}/llama_stack/providers/tests" + + # Keep track of a list of model IDs that are valid to use within pytest fixture marks + AVAILABLE_MODEL_IDs: "llama_3b llama_8b" + + # Shorthand name for model ID, used in pytest fixture marks + MODEL_ID: "${{ inputs.model_id || 'llama_3b' }}" + + # Override the `llama_3b` / `llama_8b' models, else use the default. 
+ LLAMA_3B_OVERRIDE: "${{ inputs.model_override_3b || 'Llama3.2-3B-Instruct' }}" + LLAMA_8B_OVERRIDE: "${{ inputs.model_override_8b || 'Llama3.1-8B-Instruct' }}" + + # Defines which directories in TESTS_PATH to exclude from the test loop + EXCLUDED_DIRS: "__pycache__" + + # Defines the output xml reports generated after a test is run + REPORTS_GEN: "" + +jobs: + execute_workflow: + name: Execute workload on Self-Hosted GPU k8s runner + permissions: + pull-requests: write + defaults: + run: + shell: bash + runs-on: ${{ inputs.runner != '' && inputs.runner || 'llama-stack-gha-runner-gpu' }} + if: always() + steps: + + ############################## + #### INITIAL DEBUG CHECKS #### + ############################## + - name: "[DEBUG] Check content of the EFS mount" + id: debug_efs_volume + continue-on-error: true + if: inputs.debug == 'true' + run: | + echo "========= Content of the EFS mount =============" + ls -la ${{ env.MODEL_CHECKPOINT_DIR }} + + - name: "[DEBUG] Get runner container OS information" + id: debug_os_info + if: ${{ inputs.debug == 'true' }} + run: | + cat /etc/os-release + + - name: "[DEBUG] Print environment variables" + id: debug_env_vars + if: ${{ inputs.debug == 'true' }} + run: | + echo "PROVIDER_ID = ${PROVIDER_ID}" + echo "MODEL_CHECKPOINT_DIR = ${MODEL_CHECKPOINT_DIR}" + echo "AVAILABLE_MODEL_IDs = ${AVAILABLE_MODEL_IDs}" + echo "MODEL_ID = ${MODEL_ID}" + echo "LLAMA_3B_OVERRIDE = ${LLAMA_3B_OVERRIDE}" + echo "LLAMA_8B_OVERRIDE = ${LLAMA_8B_OVERRIDE}" + echo "EXCLUDED_DIRS = ${EXCLUDED_DIRS}" + echo "REPORTS_GEN = ${REPORTS_GEN}" + + ############################ + #### MODEL INPUT CHECKS #### + ############################ + + - name: "Check if env.model_id is valid" + id: check_model_id + run: | + if [[ " ${AVAILABLE_MODEL_IDs[@]} " =~ " ${MODEL_ID} " ]]; then + echo "Model ID '${MODEL_ID}' is valid." + else + echo "Model ID '${MODEL_ID}' is invalid. Terminating workflow." + exit 1 + fi + + ####################### + #### CODE CHECKOUT #### + ####################### + - name: "Checkout 'meta-llama/llama-stack' repository" + id: checkout_repo + uses: actions/checkout@v4 + with: + ref: ${{ inputs.branch }} + + - name: "[DEBUG] Content of the repository after checkout" + id: debug_content_after_checkout + if: ${{ inputs.debug == 'true' }} + run: | + ls -la ${GITHUB_WORKSPACE} + + ########################################################## + #### OPTIONAL SLEEP DEBUG #### + # # + # Use to "exec" into the test k8s POD and run tests # + # manually to identify what dependencies are being used. 
# + # # + ########################################################## + - name: "[DEBUG] sleep" + id: debug_sleep + if: ${{ inputs.debug == 'true' && inputs.sleep_time != '' }} + run: | + sleep ${{ inputs.sleep_time }} + + ############################ + #### UPDATE SYSTEM PATH #### + ############################ + - name: "Update path: execute" + id: path_update_exec + run: | + # .local/bin is needed for certain libraries installed below to be recognized + # when calling their executable to install sub-dependencies + mkdir -p ${HOME}/.local/bin + echo "${HOME}/.local/bin" >> "$GITHUB_PATH" + + ##################################### + #### UPDATE CHECKPOINT DIRECTORY #### + ##################################### + - name: "Update checkpoint directory" + id: checkpoint_update + run: | + echo "Checkpoint directory: ${MODEL_CHECKPOINT_DIR}/$LLAMA_3B_OVERRIDE" + if [ "${MODEL_ID}" = "llama_3b" ] && [ -d "${MODEL_CHECKPOINT_DIR}/${LLAMA_3B_OVERRIDE}" ]; then + echo "MODEL_CHECKPOINT_DIR=${MODEL_CHECKPOINT_DIR}/${LLAMA_3B_OVERRIDE}" >> "$GITHUB_ENV" + elif [ "${MODEL_ID}" = "llama_8b" ] && [ -d "${MODEL_CHECKPOINT_DIR}/${LLAMA_8B_OVERRIDE}" ]; then + echo "MODEL_CHECKPOINT_DIR=${MODEL_CHECKPOINT_DIR}/${LLAMA_8B_OVERRIDE}" >> "$GITHUB_ENV" + else + echo "MODEL_ID & LLAMA_*B_OVERRIDE are not a valid pairing. Terminating workflow." + exit 1 + fi + + - name: "[DEBUG] Checkpoint update check" + id: debug_checkpoint_update + if: ${{ inputs.debug == 'true' }} + run: | + echo "MODEL_CHECKPOINT_DIR (after update) = ${MODEL_CHECKPOINT_DIR}" + + ################################## + #### DEPENDENCY INSTALLATIONS #### + ################################## + - name: "Installing 'apt' required packages" + id: install_apt + run: | + echo "[STEP] Installing 'apt' required packages" + sudo apt update -y + sudo apt install -y python3 python3-pip npm wget + + - name: "Installing packages with 'curl'" + id: install_curl + run: | + curl -fsSL https://ollama.com/install.sh | sh + + - name: "Installing packages with 'wget'" + id: install_wget + run: | + wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh + chmod +x Miniconda3-latest-Linux-x86_64.sh + ./Miniconda3-latest-Linux-x86_64.sh -b install -c pytorch -c nvidia faiss-gpu=1.9.0 + # Add miniconda3 bin to system path + echo "${HOME}/miniconda3/bin" >> "$GITHUB_PATH" + + - name: "Installing packages with 'npm'" + id: install_npm_generic + run: | + sudo npm install -g junit-merge + + - name: "Installing pip dependencies" + id: install_pip_generic + run: | + echo "[STEP] Installing 'llama-stack' models" + pip install -U pip setuptools + pip install -r requirements.txt + pip install -e . + pip install -U \ + torch torchvision \ + pytest pytest_asyncio \ + fairscale lm-format-enforcer \ + zmq chardet pypdf \ + pandas sentence_transformers together \ + aiosqlite + - name: "Installing packages with conda" + id: install_conda_generic + run: | + conda install -q -c pytorch -c nvidia faiss-gpu=1.9.0 + + ############################################################# + #### TESTING TO BE DONE FOR BOTH PRS AND MANUAL DISPATCH #### + ############################################################# + - name: "Run Tests: Loop" + id: run_tests_loop + working-directory: "${{ github.workspace }}" + run: | + pattern="" + for dir in llama_stack/providers/tests/*; do + if [ -d "$dir" ]; then + dir_name=$(basename "$dir") + if [[ ! 
" $EXCLUDED_DIRS " =~ " $dir_name " ]]; then + for file in "$dir"/test_*.py; do + test_name=$(basename "$file") + new_file="result-${dir_name}-${test_name}.xml" + if torchrun $(which pytest) -s -v ${TESTS_PATH}/${dir_name}/${test_name} -m "${PROVIDER_ID} and ${MODEL_ID}" \ + --junitxml="${{ github.workspace }}/${new_file}"; then + echo "Ran test: ${test_name}" + else + echo "Did NOT run test: ${test_name}" + fi + pattern+="${new_file} " + done + fi + fi + done + echo "REPORTS_GEN=$pattern" >> "$GITHUB_ENV" + + - name: "Test Summary: Merge" + id: test_summary_merge + working-directory: "${{ github.workspace }}" + run: | + echo "Merging the following test result files: ${REPORTS_GEN}" + # Defaults to merging them into 'merged-test-results.xml' + junit-merge ${{ env.REPORTS_GEN }} + + ############################################ + #### AUTOMATIC TESTING ON PULL REQUESTS #### + ############################################ + + #### Run tests #### + + - name: "PR - Run Tests" + id: pr_run_tests + working-directory: "${{ github.workspace }}" + if: github.event_name == 'pull_request_target' + run: | + echo "[STEP] Running PyTest tests at 'GITHUB_WORKSPACE' path: ${GITHUB_WORKSPACE} | path: ${{ github.workspace }}" + # (Optional) Add more tests here. + + # Merge test results with 'merged-test-results.xml' from above. + # junit-merge merged-test-results.xml + + #### Create test summary #### + + - name: "PR - Test Summary" + id: pr_test_summary_create + if: github.event_name == 'pull_request_target' + uses: test-summary/action@v2 + with: + paths: "${{ github.workspace }}/merged-test-results.xml" + output: test-summary.md + + - name: "PR - Upload Test Summary" + id: pr_test_summary_upload + if: github.event_name == 'pull_request_target' + uses: actions/upload-artifact@v3 + with: + name: test-summary + path: test-summary.md + + #### Update PR request #### + + - name: "PR - Update comment" + id: pr_update_comment + if: github.event_name == 'pull_request_target' + uses: thollander/actions-comment-pull-request@v2 + with: + filePath: test-summary.md + + ######################## + #### MANUAL TESTING #### + ######################## + + #### Run tests #### + + - name: "Manual - Run Tests: Prep" + id: manual_run_tests + working-directory: "${{ github.workspace }}" + if: github.event_name == 'workflow_dispatch' + run: | + echo "[STEP] Running PyTest tests at 'GITHUB_WORKSPACE' path: ${{ github.workspace }}" + + #TODO Use this when collection errors are resolved + # pytest -s -v -m "${PROVIDER_ID} and ${MODEL_ID}" --junitxml="${{ github.workspace }}/merged-test-results.xml" + + # (Optional) Add more tests here. + + # Merge test results with 'merged-test-results.xml' from above. + # junit-merge merged-test-results.xml + + #### Create test summary #### + + - name: "Manual - Test Summary" + id: manual_test_summary + if: always() && github.event_name == 'workflow_dispatch' + uses: test-summary/action@v2 + with: + paths: "${{ github.workspace }}/merged-test-results.xml" From 999b9781f71616241408ca3711ca4d8bf2a5f6e1 Mon Sep 17 00:00:00 2001 From: Jeff Tang Date: Thu, 5 Dec 2024 08:39:13 -0800 Subject: [PATCH 21/69] specify the client version that works for current together server (#566) # What does this PR do? Fix the error when using the newer (v0.0.55-57) llama stack client library with Together's stack service. In short, provide a summary of what this PR does and why. Usually, the relevant context should be present in a linked issue. 
- [ ] Addresses issue (#issue) ## Test Plan Please describe: - tests you ran to verify your changes with result summaries. - provide instructions so it can be reproduced. ## Sources Please link relevant resources if necessary. ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. --- .../Tool_Calling101_Using_Together's_Llama_Stack_Server.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/zero_to_hero_guide/Tool_Calling101_Using_Together's_Llama_Stack_Server.ipynb b/docs/zero_to_hero_guide/Tool_Calling101_Using_Together's_Llama_Stack_Server.ipynb index e9bff5f33..8e3949e94 100644 --- a/docs/zero_to_hero_guide/Tool_Calling101_Using_Together's_Llama_Stack_Server.ipynb +++ b/docs/zero_to_hero_guide/Tool_Calling101_Using_Together's_Llama_Stack_Server.ipynb @@ -71,7 +71,7 @@ } ], "source": [ - "!pip install llama-stack-client" + "!pip install llama-stack-client==0.0.50" ] }, { From a2d9a983de87c5f04a0f2f4416bbc225fbca7803 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Thu, 5 Dec 2024 09:57:16 -0800 Subject: [PATCH 22/69] remove unused telemetry related code (#570) remove unused tracing code which was added back by mistake. --- .../telemetry/opentelemetry/opentelemetry.py | 259 ------------------ .../providers/utils/telemetry/sqlite.py | 177 ------------ 2 files changed, 436 deletions(-) delete mode 100644 llama_stack/providers/remote/telemetry/opentelemetry/opentelemetry.py delete mode 100644 llama_stack/providers/utils/telemetry/sqlite.py diff --git a/llama_stack/providers/remote/telemetry/opentelemetry/opentelemetry.py b/llama_stack/providers/remote/telemetry/opentelemetry/opentelemetry.py deleted file mode 100644 index 04eb71ce0..000000000 --- a/llama_stack/providers/remote/telemetry/opentelemetry/opentelemetry.py +++ /dev/null @@ -1,259 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -import threading -from typing import List, Optional - -from llama_stack.distribution.datatypes import Api -from llama_stack.providers.remote.telemetry.opentelemetry.console_span_processor import ( - ConsoleSpanProcessor, -) -from llama_stack.providers.remote.telemetry.opentelemetry.sqlite_span_processor import ( - SQLiteSpanProcessor, -) -from llama_stack.providers.utils.telemetry.sqlite_trace_store import SQLiteTraceStore - -from opentelemetry import metrics, trace -from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter -from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter -from opentelemetry.sdk.metrics import MeterProvider -from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader -from opentelemetry.sdk.resources import Resource -from opentelemetry.sdk.trace import TracerProvider -from opentelemetry.sdk.trace.export import BatchSpanProcessor -from opentelemetry.semconv.resource import ResourceAttributes - - -from llama_stack.apis.telemetry import * # noqa: F403 - -from .config import OpenTelemetryConfig, TelemetrySink - -_GLOBAL_STORAGE = { - "active_spans": {}, - "counters": {}, - "gauges": {}, - "up_down_counters": {}, -} -_global_lock = threading.Lock() - - -def string_to_trace_id(s: str) -> int: - # Convert the string to bytes and then to an integer - return int.from_bytes(s.encode(), byteorder="big", signed=False) - - -def string_to_span_id(s: str) -> int: - # Use only the first 8 bytes (64 bits) for span ID - return int.from_bytes(s.encode()[:8], byteorder="big", signed=False) - - -def is_tracing_enabled(tracer): - with tracer.start_as_current_span("check_tracing") as span: - return span.is_recording() - - -class OpenTelemetryAdapter(Telemetry): - def __init__(self, config: OpenTelemetryConfig, deps) -> None: - self.config = config - self.datasetio = deps[Api.datasetio] - - resource = Resource.create( - { - ResourceAttributes.SERVICE_NAME: self.config.service_name, - } - ) - - provider = TracerProvider(resource=resource) - trace.set_tracer_provider(provider) - if TelemetrySink.JAEGER in self.config.sinks: - otlp_exporter = OTLPSpanExporter( - endpoint=self.config.otel_endpoint, - ) - span_processor = BatchSpanProcessor(otlp_exporter) - trace.get_tracer_provider().add_span_processor(span_processor) - metric_reader = PeriodicExportingMetricReader( - OTLPMetricExporter( - endpoint=self.config.otel_endpoint, - ) - ) - metric_provider = MeterProvider( - resource=resource, metric_readers=[metric_reader] - ) - metrics.set_meter_provider(metric_provider) - self.meter = metrics.get_meter(__name__) - if TelemetrySink.SQLITE in self.config.sinks: - trace.get_tracer_provider().add_span_processor( - SQLiteSpanProcessor(self.config.sqlite_db_path) - ) - self.trace_store = SQLiteTraceStore(self.config.sqlite_db_path) - if TelemetrySink.CONSOLE in self.config.sinks: - trace.get_tracer_provider().add_span_processor(ConsoleSpanProcessor()) - self._lock = _global_lock - - async def initialize(self) -> None: - pass - - async def shutdown(self) -> None: - trace.get_tracer_provider().force_flush() - trace.get_tracer_provider().shutdown() - metrics.get_meter_provider().shutdown() - - async def log_event(self, event: Event, ttl_seconds: int = 604800) -> None: - if isinstance(event, UnstructuredLogEvent): - self._log_unstructured(event, ttl_seconds) - elif isinstance(event, MetricEvent): - self._log_metric(event) - elif isinstance(event, StructuredLogEvent): - self._log_structured(event, ttl_seconds) - else: - raise 
ValueError(f"Unknown event type: {event}") - - def _log_unstructured(self, event: UnstructuredLogEvent, ttl_seconds: int) -> None: - with self._lock: - # Use global storage instead of instance storage - span_id = string_to_span_id(event.span_id) - span = _GLOBAL_STORAGE["active_spans"].get(span_id) - - if span: - timestamp_ns = int(event.timestamp.timestamp() * 1e9) - span.add_event( - name=event.type, - attributes={ - "message": event.message, - "severity": event.severity.value, - "__ttl__": ttl_seconds, - **event.attributes, - }, - timestamp=timestamp_ns, - ) - else: - print( - f"Warning: No active span found for span_id {span_id}. Dropping event: {event}" - ) - - def _get_or_create_counter(self, name: str, unit: str) -> metrics.Counter: - if name not in _GLOBAL_STORAGE["counters"]: - _GLOBAL_STORAGE["counters"][name] = self.meter.create_counter( - name=name, - unit=unit, - description=f"Counter for {name}", - ) - return _GLOBAL_STORAGE["counters"][name] - - def _get_or_create_gauge(self, name: str, unit: str) -> metrics.ObservableGauge: - if name not in _GLOBAL_STORAGE["gauges"]: - _GLOBAL_STORAGE["gauges"][name] = self.meter.create_gauge( - name=name, - unit=unit, - description=f"Gauge for {name}", - ) - return _GLOBAL_STORAGE["gauges"][name] - - def _log_metric(self, event: MetricEvent) -> None: - if isinstance(event.value, int): - counter = self._get_or_create_counter(event.metric, event.unit) - counter.add(event.value, attributes=event.attributes) - elif isinstance(event.value, float): - up_down_counter = self._get_or_create_up_down_counter( - event.metric, event.unit - ) - up_down_counter.add(event.value, attributes=event.attributes) - - def _get_or_create_up_down_counter( - self, name: str, unit: str - ) -> metrics.UpDownCounter: - if name not in _GLOBAL_STORAGE["up_down_counters"]: - _GLOBAL_STORAGE["up_down_counters"][name] = ( - self.meter.create_up_down_counter( - name=name, - unit=unit, - description=f"UpDownCounter for {name}", - ) - ) - return _GLOBAL_STORAGE["up_down_counters"][name] - - def _log_structured(self, event: StructuredLogEvent, ttl_seconds: int) -> None: - with self._lock: - span_id = string_to_span_id(event.span_id) - trace_id = string_to_trace_id(event.trace_id) - tracer = trace.get_tracer(__name__) - if event.attributes is None: - event.attributes = {} - event.attributes["__ttl__"] = ttl_seconds - - if isinstance(event.payload, SpanStartPayload): - # Check if span already exists to prevent duplicates - if span_id in _GLOBAL_STORAGE["active_spans"]: - return - - parent_span = None - if event.payload.parent_span_id: - parent_span_id = string_to_span_id(event.payload.parent_span_id) - parent_span = _GLOBAL_STORAGE["active_spans"].get(parent_span_id) - - context = trace.Context(trace_id=trace_id) - if parent_span: - context = trace.set_span_in_context(parent_span, context) - - span = tracer.start_span( - name=event.payload.name, - context=context, - attributes=event.attributes or {}, - ) - _GLOBAL_STORAGE["active_spans"][span_id] = span - - elif isinstance(event.payload, SpanEndPayload): - span = _GLOBAL_STORAGE["active_spans"].get(span_id) - if span: - if event.attributes: - span.set_attributes(event.attributes) - - status = ( - trace.Status(status_code=trace.StatusCode.OK) - if event.payload.status == SpanStatus.OK - else trace.Status(status_code=trace.StatusCode.ERROR) - ) - span.set_status(status) - span.end() - _GLOBAL_STORAGE["active_spans"].pop(span_id, None) - else: - raise ValueError(f"Unknown structured log event: {event}") - - async def query_traces( 
- self, - attribute_conditions: Optional[List[QueryCondition]] = None, - attribute_keys_to_return: Optional[List[str]] = None, - limit: Optional[int] = 100, - offset: Optional[int] = 0, - order_by: Optional[List[str]] = None, - ) -> List[Trace]: - return await self.trace_store.query_traces( - attribute_conditions=attribute_conditions, - attribute_keys_to_return=attribute_keys_to_return, - limit=limit, - offset=offset, - order_by=order_by, - ) - - async def get_spans( - self, - span_id: str, - attribute_conditions: Optional[List[QueryCondition]] = None, - attribute_keys_to_return: Optional[List[str]] = None, - max_depth: Optional[int] = None, - limit: Optional[int] = 100, - offset: Optional[int] = 0, - order_by: Optional[List[str]] = None, - ) -> SpanWithChildren: - return await self.trace_store.get_spans( - span_id=span_id, - attribute_conditions=attribute_conditions, - attribute_keys_to_return=attribute_keys_to_return, - max_depth=max_depth, - limit=limit, - offset=offset, - order_by=order_by, - ) diff --git a/llama_stack/providers/utils/telemetry/sqlite.py b/llama_stack/providers/utils/telemetry/sqlite.py deleted file mode 100644 index e7161fffa..000000000 --- a/llama_stack/providers/utils/telemetry/sqlite.py +++ /dev/null @@ -1,177 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -import json -from datetime import datetime -from typing import List, Optional - -import aiosqlite - -from llama_stack.apis.telemetry import ( - QueryCondition, - SpanWithChildren, - Trace, - TraceStore, -) - - -class SQLiteTraceStore(TraceStore): - def __init__(self, conn_string: str): - self.conn_string = conn_string - - async def query_traces( - self, - attribute_filters: Optional[List[QueryCondition]] = None, - attributes_to_return: Optional[List[str]] = None, - limit: Optional[int] = 100, - offset: Optional[int] = 0, - order_by: Optional[List[str]] = None, - ) -> List[Trace]: - print(attribute_filters, attributes_to_return, limit, offset, order_by) - - def build_attribute_select() -> str: - if not attributes_to_return: - return "" - return "".join( - f", json_extract(s.attributes, '$.{key}') as attr_{key}" - for key in attributes_to_return - ) - - def build_where_clause() -> tuple[str, list]: - if not attribute_filters: - return "", [] - - conditions = [ - f"json_extract(s.attributes, '$.{condition.key}') {condition.op} ?" 
- for condition in attribute_filters - ] - params = [condition.value for condition in attribute_filters] - where_clause = " WHERE " + " AND ".join(conditions) - return where_clause, params - - def build_order_clause() -> str: - if not order_by: - return "" - - order_clauses = [] - for field in order_by: - desc = field.startswith("-") - clean_field = field[1:] if desc else field - order_clauses.append(f"t.{clean_field} {'DESC' if desc else 'ASC'}") - return " ORDER BY " + ", ".join(order_clauses) - - # Build the main query - base_query = """ - WITH matching_traces AS ( - SELECT DISTINCT t.trace_id - FROM traces t - JOIN spans s ON t.trace_id = s.trace_id - {where_clause} - ), - filtered_traces AS ( - SELECT t.trace_id, t.root_span_id, t.start_time, t.end_time - {attribute_select} - FROM matching_traces mt - JOIN traces t ON mt.trace_id = t.trace_id - LEFT JOIN spans s ON t.trace_id = s.trace_id - {order_clause} - ) - SELECT DISTINCT trace_id, root_span_id, start_time, end_time - FROM filtered_traces - LIMIT {limit} OFFSET {offset} - """ - - where_clause, params = build_where_clause() - query = base_query.format( - attribute_select=build_attribute_select(), - where_clause=where_clause, - order_clause=build_order_clause(), - limit=limit, - offset=offset, - ) - - # Execute query and return results - async with aiosqlite.connect(self.conn_string) as conn: - conn.row_factory = aiosqlite.Row - async with conn.execute(query, params) as cursor: - rows = await cursor.fetchall() - return [ - Trace( - trace_id=row["trace_id"], - root_span_id=row["root_span_id"], - start_time=datetime.fromisoformat(row["start_time"]), - end_time=datetime.fromisoformat(row["end_time"]), - ) - for row in rows - ] - - async def get_materialized_span( - self, - span_id: str, - attributes_to_return: Optional[List[str]] = None, - max_depth: Optional[int] = None, - ) -> SpanWithChildren: - # Build the attributes selection - attributes_select = "s.attributes" - if attributes_to_return: - json_object = ", ".join( - f"'{key}', json_extract(s.attributes, '$.{key}')" - for key in attributes_to_return - ) - attributes_select = f"json_object({json_object})" - - # SQLite CTE query with filtered attributes - query = f""" - WITH RECURSIVE span_tree AS ( - SELECT s.*, 1 as depth, {attributes_select} as filtered_attributes - FROM spans s - WHERE s.span_id = ? - - UNION ALL - - SELECT s.*, st.depth + 1, {attributes_select} as filtered_attributes - FROM spans s - JOIN span_tree st ON s.parent_span_id = st.span_id - WHERE (? IS NULL OR st.depth < ?) 
- ) - SELECT * - FROM span_tree - ORDER BY depth, start_time - """ - - async with aiosqlite.connect(self.conn_string) as conn: - conn.row_factory = aiosqlite.Row - async with conn.execute(query, (span_id, max_depth, max_depth)) as cursor: - rows = await cursor.fetchall() - - if not rows: - raise ValueError(f"Span {span_id} not found") - - # Build span tree - spans_by_id = {} - root_span = None - - for row in rows: - span = SpanWithChildren( - span_id=row["span_id"], - trace_id=row["trace_id"], - parent_span_id=row["parent_span_id"], - name=row["name"], - start_time=datetime.fromisoformat(row["start_time"]), - end_time=datetime.fromisoformat(row["end_time"]), - attributes=json.loads(row["filtered_attributes"]), - status=row["status"].lower(), - children=[], - ) - - spans_by_id[span.span_id] = span - - if span.span_id == span_id: - root_span = span - elif span.parent_span_id in spans_by_id: - spans_by_id[span.parent_span_id].children.append(span) - - return root_span From 703a20c3bc2bd1ddab1afa5f68c69c201ceedbda Mon Sep 17 00:00:00 2001 From: dltn <6599399+dltn@users.noreply.github.com> Date: Thu, 5 Dec 2024 13:21:33 -0800 Subject: [PATCH 23/69] cprint in print_pip_install_help --- llama_stack/distribution/build.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/llama_stack/distribution/build.py b/llama_stack/distribution/build.py index fb4b6a161..526815038 100644 --- a/llama_stack/distribution/build.py +++ b/llama_stack/distribution/build.py @@ -9,9 +9,9 @@ from enum import Enum from typing import List import pkg_resources -from pydantic import BaseModel - from llama_stack.distribution.utils.exec import run_with_pty +from pydantic import BaseModel +from termcolor import cprint from llama_stack.distribution.datatypes import * # noqa: F403 from pathlib import Path @@ -90,11 +90,12 @@ def get_provider_dependencies( def print_pip_install_help(providers: Dict[str, List[Provider]]): normal_deps, special_deps = get_provider_dependencies(providers) - print( - f"Please install needed dependencies using the following commands:\n\n\tpip install {' '.join(normal_deps)}" + cprint( + f"Please install needed dependencies using the following commands:\n\n\tpip install {' '.join(normal_deps)}", + "yellow", ) for special_dep in special_deps: - log.info(f"\tpip install {special_dep}") + cprint(f"\tpip install {special_dep}", "yellow") print() From 6eb5f2a865f40ae9e9ac46a4f7b486c28dfb5d7e Mon Sep 17 00:00:00 2001 From: Dalton Flanagan <6599399+dltn@users.noreply.github.com> Date: Thu, 5 Dec 2024 16:36:26 -0500 Subject: [PATCH 24/69] precommit --- llama_stack/distribution/build.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llama_stack/distribution/build.py b/llama_stack/distribution/build.py index 526815038..9d0ad9af4 100644 --- a/llama_stack/distribution/build.py +++ b/llama_stack/distribution/build.py @@ -9,10 +9,11 @@ from enum import Enum from typing import List import pkg_resources -from llama_stack.distribution.utils.exec import run_with_pty from pydantic import BaseModel from termcolor import cprint +from llama_stack.distribution.utils.exec import run_with_pty + from llama_stack.distribution.datatypes import * # noqa: F403 from pathlib import Path From a4daf4d3ecc3d53ec14725634f2be16a8948ce56 Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Thu, 5 Dec 2024 17:13:49 -0500 Subject: [PATCH 25/69] Fix up safety client for versioned API (#573) When running: python -m llama_stack.apis.safety.client localhost 5000 The API server was logging: INFO: ::1:57176 
- "POST /safety/run_shield HTTP/1.1" 404 Not Found This patch uses the versioned API, uses the updated safety endpoint, and updates the model name to what's being served. The above python command now demonstrates a passing and failing example. --- llama_stack/apis/safety/client.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llama_stack/apis/safety/client.py b/llama_stack/apis/safety/client.py index d7d4bc981..a9396c70c 100644 --- a/llama_stack/apis/safety/client.py +++ b/llama_stack/apis/safety/client.py @@ -17,6 +17,8 @@ from llama_models.llama3.api.datatypes import * # noqa: F403 from pydantic import BaseModel from termcolor import cprint +from llama_stack.apis.version import LLAMA_STACK_API_VERSION + from llama_stack.distribution.datatypes import RemoteProviderConfig from llama_stack.apis.safety import * # noqa: F403 @@ -45,7 +47,7 @@ class SafetyClient(Safety): ) -> RunShieldResponse: async with httpx.AsyncClient() as client: response = await client.post( - f"{self.base_url}/safety/run_shield", + f"{self.base_url}/{LLAMA_STACK_API_VERSION}/safety/run-shield", json=dict( shield_id=shield_id, messages=[encodable_dict(m) for m in messages], @@ -91,7 +93,7 @@ async def run_main(host: str, port: int, image_path: str = None): ]: cprint(f"User>{message.content}", "green") response = await client.run_shield( - shield_id="llama_guard", + shield_id="meta-llama/Llama-Guard-3-1B", messages=[message], ) print(response) From 7301403ce38ae3c3309199602f7cd3472a9238b8 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 5 Dec 2024 16:29:32 -0800 Subject: [PATCH 26/69] Add eval/scoring/datasetio API providers to distribution templates & UI developer guide (#564) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? - add /eval, /scoring, /datasetio API providers to distribution templates - regenerate build.yaml / run.yaml files - fix `template.py` to take in list of providers instead of only first one - override memory provider as faiss default for all distro (as only 1 memory provider is needed to start basic flow, chromadb/pgvector need additional setup step). ``` python llama_stack/scripts/distro_codegen.py ``` - updated README to start UI via conda builds. 
## Test Plan ``` python llama_stack/scripts/distro_codegen.py ``` - Use newly generated `run.yaml` to start server ``` llama stack run ./llama_stack/templates/together/run.yaml ``` image #### Registration ``` ❯ llama-stack-client datasets register \ --dataset-id "mmlu" \ --provider-id "huggingface" \ --url "https://huggingface.co/datasets/llamastack/evals" \ --metadata '{"path": "llamastack/evals", "name": "evals__mmlu__details", "split": "train"}' \ --schema '{"input_query": {"type": "string"}, "expected_answer": {"type": "string", "chat_completion_input": {"type": "string"}}}' ❯ llama-stack-client datasets list ┏━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┓ ┃ identifier ┃ provider_id ┃ metadata ┃ type ┃ ┡━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━┩ │ mmlu │ huggingface │ {'path': 'llamastack/evals', 'name': │ dataset │ │ │ │ 'evals__mmlu__details', 'split': │ │ │ │ │ 'train'} │ │ └────────────┴─────────────┴─────────────────────────────────────────┴─────────┘ ``` ``` ❯ llama-stack-client datasets register \ --dataset-id "simpleqa" \ --provider-id "huggingface" \ --url "https://huggingface.co/datasets/llamastack/evals" \ --metadata '{"path": "llamastack/evals", "name": "evals__simpleqa", "split": "train"}' \ --schema '{"input_query": {"type": "string"}, "expected_answer": {"type": "string", "chat_completion_input": {"type": "string"}}}' ❯ llama-stack-client datasets list ┏━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┓ ┃ identifier ┃ provider_id ┃ metadata ┃ type ┃ ┡━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━┩ │ mmlu │ huggingface │ {'path': 'llamastack/evals', 'name': 'evals__mmlu__details', │ dataset │ │ │ │ 'split': 'train'} │ │ │ simpleqa │ huggingface │ {'path': 'llamastack/evals', 'name': 'evals__simpleqa', │ dataset │ │ │ │ 'split': 'train'} │ │ └────────────┴─────────────┴───────────────────────────────────────────────────────────────┴─────────┘ ``` ``` ❯ llama-stack-client eval_tasks register \ > --eval-task-id meta-reference-mmlu \ > --provider-id meta-reference \ > --dataset-id mmlu \ > --scoring-functions basic::regex_parser_multiple_choice_answer ❯ llama-stack-client eval_tasks register \ --eval-task-id meta-reference-simpleqa \ --provider-id meta-reference \ --dataset-id simpleqa \ --scoring-functions llm-as-judge::405b-simpleqa ❯ llama-stack-client eval_tasks list ┏━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓ ┃ dataset_id ┃ identifier ┃ metadata ┃ provider_id ┃ provider_resour… ┃ scoring_functio… ┃ type ┃ ┡━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩ │ mmlu │ meta-reference-… │ {} │ meta-reference │ meta-reference-… │ ['basic::regex_… │ eval_task │ │ simpleqa │ meta-reference-… │ {} │ meta-reference │ meta-reference-… │ ['llm-as-judge:… │ eval_task │ └────────────┴──────────────────┴──────────┴────────────────┴──────────────────┴──────────────────┴───────────┘ ``` #### Test with UI ``` streamlit run app.py ``` ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. 
- [ ] Wrote necessary unit or integration tests. --- distributions/dependencies.json | 290 ++++++++++-------- .../self_hosted_distro/bedrock.md | 6 +- .../self_hosted_distro/fireworks.md | 3 + .../self_hosted_distro/meta-reference-gpu.md | 3 + .../meta-reference-quantized-gpu.md | 3 + .../self_hosted_distro/ollama.md | 5 +- .../distributions/self_hosted_distro/tgi.md | 3 + .../self_hosted_distro/together.md | 3 + llama_stack/distribution/ui/README.md | 41 ++- llama_stack/templates/bedrock/bedrock.py | 20 +- llama_stack/templates/bedrock/build.yaml | 9 + llama_stack/templates/bedrock/run.yaml | 24 ++ llama_stack/templates/fireworks/build.yaml | 9 + llama_stack/templates/fireworks/fireworks.py | 14 +- llama_stack/templates/fireworks/run.yaml | 24 ++ llama_stack/templates/hf-endpoint/build.yaml | 9 + .../templates/hf-endpoint/hf_endpoint.py | 17 +- .../hf-endpoint/run-with-safety.yaml | 24 ++ llama_stack/templates/hf-endpoint/run.yaml | 24 ++ .../templates/hf-serverless/build.yaml | 9 + .../templates/hf-serverless/hf_serverless.py | 16 +- .../hf-serverless/run-with-safety.yaml | 24 ++ llama_stack/templates/hf-serverless/run.yaml | 24 ++ .../templates/meta-reference-gpu/build.yaml | 9 + .../meta-reference-gpu/meta_reference.py | 15 +- .../meta-reference-gpu/run-with-safety.yaml | 24 ++ .../templates/meta-reference-gpu/run.yaml | 24 ++ .../meta-reference-quantized-gpu/build.yaml | 9 + .../meta_reference.py | 14 +- .../meta-reference-quantized-gpu/run.yaml | 24 ++ llama_stack/templates/ollama/build.yaml | 9 + llama_stack/templates/ollama/doc_template.md | 6 +- llama_stack/templates/ollama/ollama.py | 17 +- .../templates/ollama/run-with-safety.yaml | 24 ++ llama_stack/templates/ollama/run.yaml | 24 ++ llama_stack/templates/remote-vllm/vllm.py | 12 +- llama_stack/templates/template.py | 55 ++-- llama_stack/templates/tgi/build.yaml | 9 + .../templates/tgi/run-with-safety.yaml | 24 ++ llama_stack/templates/tgi/run.yaml | 24 ++ llama_stack/templates/tgi/tgi.py | 15 +- llama_stack/templates/together/build.yaml | 9 + llama_stack/templates/together/run.yaml | 24 ++ llama_stack/templates/together/together.py | 14 +- llama_stack/templates/vllm-gpu/build.yaml | 9 + llama_stack/templates/vllm-gpu/run.yaml | 24 ++ llama_stack/templates/vllm-gpu/vllm.py | 14 +- 47 files changed, 841 insertions(+), 195 deletions(-) diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 80468cc73..4e66a85da 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -1,10 +1,12 @@ { - "tgi": [ + "hf-serverless": [ "aiohttp", "aiosqlite", + "autoevals", "blobfile", "chardet", "chromadb-client", + "datasets", "faiss-cpu", "fastapi", "fire", @@ -13,6 +15,7 @@ "matplotlib", "nltk", "numpy", + "openai", "pandas", "pillow", "psycopg2-binary", @@ -27,6 +30,66 @@ "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], + "together": [ + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "chromadb-client", + "datasets", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "nltk", + "numpy", + "openai", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "together", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], + "vllm-gpu": [ + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "chromadb-client", + "datasets", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + 
"matplotlib", + "nltk", + "numpy", + "openai", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "vllm", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], "remote-vllm": [ "aiosqlite", "blobfile", @@ -54,18 +117,22 @@ "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], - "vllm-gpu": [ + "fireworks": [ "aiosqlite", + "autoevals", "blobfile", "chardet", "chromadb-client", + "datasets", "faiss-cpu", "fastapi", "fire", + "fireworks-ai", "httpx", "matplotlib", "nltk", "numpy", + "openai", "pandas", "pillow", "psycopg2-binary", @@ -77,82 +144,17 @@ "tqdm", "transformers", "uvicorn", - "vllm", "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], - "meta-reference-quantized-gpu": [ - "accelerate", - "aiosqlite", - "blobfile", - "chardet", - "chromadb-client", - "fairscale", - "faiss-cpu", - "fastapi", - "fbgemm-gpu", - "fire", - "httpx", - "lm-format-enforcer", - "matplotlib", - "nltk", - "numpy", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "scikit-learn", - "scipy", - "sentencepiece", - "torch", - "torchao==0.5.0", - "torchvision", - "tqdm", - "transformers", - "uvicorn", - "zmq", - "sentence-transformers --no-deps", - "torch --index-url https://download.pytorch.org/whl/cpu" - ], - "meta-reference-gpu": [ - "accelerate", - "aiosqlite", - "blobfile", - "chardet", - "chromadb-client", - "fairscale", - "faiss-cpu", - "fastapi", - "fire", - "httpx", - "lm-format-enforcer", - "matplotlib", - "nltk", - "numpy", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "scikit-learn", - "scipy", - "sentencepiece", - "torch", - "torchvision", - "tqdm", - "transformers", - "uvicorn", - "zmq", - "sentence-transformers --no-deps", - "torch --index-url https://download.pytorch.org/whl/cpu" - ], - "hf-serverless": [ + "tgi": [ "aiohttp", "aiosqlite", + "autoevals", "blobfile", "chardet", "chromadb-client", + "datasets", "faiss-cpu", "fastapi", "fire", @@ -161,61 +163,7 @@ "matplotlib", "nltk", "numpy", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "scikit-learn", - "scipy", - "sentencepiece", - "tqdm", - "transformers", - "uvicorn", - "sentence-transformers --no-deps", - "torch --index-url https://download.pytorch.org/whl/cpu" - ], - "together": [ - "aiosqlite", - "blobfile", - "chardet", - "chromadb-client", - "faiss-cpu", - "fastapi", - "fire", - "httpx", - "matplotlib", - "nltk", - "numpy", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "scikit-learn", - "scipy", - "sentencepiece", - "together", - "tqdm", - "transformers", - "uvicorn", - "sentence-transformers --no-deps", - "torch --index-url https://download.pytorch.org/whl/cpu" - ], - "ollama": [ - "aiohttp", - "aiosqlite", - "blobfile", - "chardet", - "chromadb-client", - "faiss-cpu", - "fastapi", - "fire", - "httpx", - "matplotlib", - "nltk", - "numpy", - "ollama", + "openai", "pandas", "pillow", "psycopg2-binary", @@ -232,10 +180,12 @@ ], "bedrock": [ "aiosqlite", + "autoevals", "blobfile", "boto3", "chardet", "chromadb-client", + "datasets", "faiss-cpu", "fastapi", "fire", @@ -243,6 +193,7 @@ "matplotlib", "nltk", "numpy", + "openai", "pandas", "pillow", "psycopg2-binary", @@ -257,20 +208,24 @@ "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], - "hf-endpoint": [ - "aiohttp", + 
"meta-reference-gpu": [ + "accelerate", "aiosqlite", + "autoevals", "blobfile", "chardet", "chromadb-client", + "datasets", + "fairscale", "faiss-cpu", "fastapi", "fire", "httpx", - "huggingface_hub", + "lm-format-enforcer", "matplotlib", "nltk", "numpy", + "openai", "pandas", "pillow", "psycopg2-binary", @@ -279,25 +234,34 @@ "scikit-learn", "scipy", "sentencepiece", + "torch", + "torchvision", "tqdm", "transformers", "uvicorn", + "zmq", "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], - "fireworks": [ + "meta-reference-quantized-gpu": [ + "accelerate", "aiosqlite", + "autoevals", "blobfile", "chardet", "chromadb-client", + "datasets", + "fairscale", "faiss-cpu", "fastapi", + "fbgemm-gpu", "fire", - "fireworks-ai", "httpx", + "lm-format-enforcer", "matplotlib", "nltk", "numpy", + "openai", "pandas", "pillow", "psycopg2-binary", @@ -306,9 +270,13 @@ "scikit-learn", "scipy", "sentencepiece", + "torch", + "torchao==0.5.0", + "torchvision", "tqdm", "transformers", "uvicorn", + "zmq", "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], @@ -337,5 +305,67 @@ "uvicorn", "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" + ], + "ollama": [ + "aiohttp", + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "chromadb-client", + "datasets", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "nltk", + "numpy", + "ollama", + "openai", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], + "hf-endpoint": [ + "aiohttp", + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "chromadb-client", + "datasets", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "huggingface_hub", + "matplotlib", + "nltk", + "numpy", + "openai", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" ] } diff --git a/docs/source/distributions/self_hosted_distro/bedrock.md b/docs/source/distributions/self_hosted_distro/bedrock.md index e0a5d80d0..ae03c89da 100644 --- a/docs/source/distributions/self_hosted_distro/bedrock.md +++ b/docs/source/distributions/self_hosted_distro/bedrock.md @@ -1,6 +1,3 @@ ---- -orphan: true ---- # Bedrock Distribution ```{toctree} @@ -15,9 +12,12 @@ The `llamastack/distribution-bedrock` distribution consists of the following pro | API | Provider(s) | |-----|-------------| | agents | `inline::meta-reference` | +| datasetio | `remote::huggingface`, `inline::localfs` | +| eval | `inline::meta-reference` | | inference | `remote::bedrock` | | memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | safety | `remote::bedrock` | +| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | diff --git a/docs/source/distributions/self_hosted_distro/fireworks.md b/docs/source/distributions/self_hosted_distro/fireworks.md index e54302c2e..06a12cb1d 100644 --- a/docs/source/distributions/self_hosted_distro/fireworks.md +++ b/docs/source/distributions/self_hosted_distro/fireworks.md @@ -15,9 +15,12 @@ The `llamastack/distribution-fireworks` distribution consists of the following p | API | Provider(s) 
| |-----|-------------| | agents | `inline::meta-reference` | +| datasetio | `remote::huggingface`, `inline::localfs` | +| eval | `inline::meta-reference` | | inference | `remote::fireworks` | | memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | safety | `inline::llama-guard` | +| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md index f9717894f..73d6befd4 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md @@ -15,9 +15,12 @@ The `llamastack/distribution-meta-reference-gpu` distribution consists of the fo | API | Provider(s) | |-----|-------------| | agents | `inline::meta-reference` | +| datasetio | `remote::huggingface`, `inline::localfs` | +| eval | `inline::meta-reference` | | inference | `inline::meta-reference` | | memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | safety | `inline::llama-guard` | +| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md index 3ca161d07..fab9c6cd8 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md @@ -15,9 +15,12 @@ The `llamastack/distribution-meta-reference-quantized-gpu` distribution consists | API | Provider(s) | |-----|-------------| | agents | `inline::meta-reference` | +| datasetio | `remote::huggingface`, `inline::localfs` | +| eval | `inline::meta-reference` | | inference | `inline::meta-reference-quantized` | | memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | safety | `inline::llama-guard` | +| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | diff --git a/docs/source/distributions/self_hosted_distro/ollama.md b/docs/source/distributions/self_hosted_distro/ollama.md index 9f81d9329..c915a7ac3 100644 --- a/docs/source/distributions/self_hosted_distro/ollama.md +++ b/docs/source/distributions/self_hosted_distro/ollama.md @@ -15,9 +15,12 @@ The `llamastack/distribution-ollama` distribution consists of the following prov | API | Provider(s) | |-----|-------------| | agents | `inline::meta-reference` | +| datasetio | `remote::huggingface`, `inline::localfs` | +| eval | `inline::meta-reference` | | inference | `remote::ollama` | | memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | safety | `inline::llama-guard` | +| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | @@ -119,7 +122,7 @@ llama stack run ./run-with-safety.yaml \ ### (Optional) Update Model Serving Configuration ```{note} -Please check the [model_aliases](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L45) variable for supported Ollama models. +Please check the [model_aliases](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L45) for the supported Ollama models. 
``` To serve a new model with `ollama` diff --git a/docs/source/distributions/self_hosted_distro/tgi.md b/docs/source/distributions/self_hosted_distro/tgi.md index 59485226e..84b91da38 100644 --- a/docs/source/distributions/self_hosted_distro/tgi.md +++ b/docs/source/distributions/self_hosted_distro/tgi.md @@ -16,9 +16,12 @@ The `llamastack/distribution-tgi` distribution consists of the following provide | API | Provider(s) | |-----|-------------| | agents | `inline::meta-reference` | +| datasetio | `remote::huggingface`, `inline::localfs` | +| eval | `inline::meta-reference` | | inference | `remote::tgi` | | memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | safety | `inline::llama-guard` | +| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | diff --git a/docs/source/distributions/self_hosted_distro/together.md b/docs/source/distributions/self_hosted_distro/together.md index 5cfc9e805..c458fdb5f 100644 --- a/docs/source/distributions/self_hosted_distro/together.md +++ b/docs/source/distributions/self_hosted_distro/together.md @@ -15,9 +15,12 @@ The `llamastack/distribution-together` distribution consists of the following pr | API | Provider(s) | |-----|-------------| | agents | `inline::meta-reference` | +| datasetio | `remote::huggingface`, `inline::localfs` | +| eval | `inline::meta-reference` | | inference | `remote::together` | | memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | safety | `inline::llama-guard` | +| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | diff --git a/llama_stack/distribution/ui/README.md b/llama_stack/distribution/ui/README.md index 2cc352c52..c0a2597af 100644 --- a/llama_stack/distribution/ui/README.md +++ b/llama_stack/distribution/ui/README.md @@ -1,16 +1,41 @@ -# LLama Stack UI +# (Experimental) LLama Stack UI -[!NOTE] This is a work in progress. +## Docker Setup -## Prerequisite -- Start up Llama Stack Server -``` -llama stack run -``` +:warning: This is a work in progress. -## Running Streamlit App +## Developer Setup + +1. Start up Llama Stack API server. More details [here](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html). ``` +llama stack build --template together --image-type conda + +llama stack run together +``` + +2. (Optional) Register datasets and eval tasks as resources. If you want to run pre-configured evaluation flows (e.g. Evaluations (Generation + Scoring) Page). + +```bash +$ llama-stack-client datasets register \ +--dataset-id "mmlu" \ +--provider-id "huggingface" \ +--url "https://huggingface.co/datasets/llamastack/evals" \ +--metadata '{"path": "llamastack/evals", "name": "evals__mmlu__details", "split": "train"}' \ +--schema '{"input_query": {"type": "string"}, "expected_answer": {"type": "string", "chat_completion_input": {"type": "string"}}}' +``` + +```bash +$ llama-stack-client eval_tasks register \ +--eval-task-id meta-reference-mmlu \ +--provider-id meta-reference \ +--dataset-id mmlu \ +--scoring-functions basic::regex_parser_multiple_choice_answer +``` + +3. 
Start Streamlit UI + +```bash cd llama_stack/distribution/ui pip install -r requirements.txt streamlit run app.py diff --git a/llama_stack/templates/bedrock/bedrock.py b/llama_stack/templates/bedrock/bedrock.py index cf3c342fe..c52b56612 100644 --- a/llama_stack/templates/bedrock/bedrock.py +++ b/llama_stack/templates/bedrock/bedrock.py @@ -6,6 +6,9 @@ from pathlib import Path +from llama_stack.distribution.datatypes import Provider + +from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -16,10 +19,19 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["remote::bedrock"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], + "eval": ["inline::meta-reference"], + "datasetio": ["remote::huggingface", "inline::localfs"], + "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], } + name = "bedrock" + memory_provider = Provider( + provider_id="faiss", + provider_type="inline::faiss", + config=FaissImplConfig.sample_run_config(f"distributions/{name}"), + ) return DistributionTemplate( - name="bedrock", + name=name, distro_type="self_hosted", description="Use AWS Bedrock for running LLM inference and safety", docker_image=None, @@ -27,7 +39,11 @@ def get_distribution_template() -> DistributionTemplate: providers=providers, default_models=[], run_configs={ - "run.yaml": RunConfigSettings(), + "run.yaml": RunConfigSettings( + provider_overrides={ + "memory": [memory_provider], + }, + ), }, run_config_env_vars={ "LLAMASTACK_PORT": ( diff --git a/llama_stack/templates/bedrock/build.yaml b/llama_stack/templates/bedrock/build.yaml index c73db3eae..cd36c320e 100644 --- a/llama_stack/templates/bedrock/build.yaml +++ b/llama_stack/templates/bedrock/build.yaml @@ -16,4 +16,13 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference + eval: + - inline::meta-reference + datasetio: + - remote::huggingface + - inline::localfs + scoring: + - inline::basic + - inline::llm-as-judge + - inline::braintrust image_type: conda diff --git a/llama_stack/templates/bedrock/run.yaml b/llama_stack/templates/bedrock/run.yaml index 1f632a1f2..77d4f2248 100644 --- a/llama_stack/templates/bedrock/run.yaml +++ b/llama_stack/templates/bedrock/run.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: bedrock apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -37,6 +40,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/fireworks/build.yaml b/llama_stack/templates/fireworks/build.yaml index c16e3f5d6..30ea347ae 100644 --- a/llama_stack/templates/fireworks/build.yaml +++ b/llama_stack/templates/fireworks/build.yaml @@ -16,4 +16,13 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference + eval: + - inline::meta-reference + datasetio: + - 
remote::huggingface + - inline::localfs + scoring: + - inline::basic + - inline::llm-as-judge + - inline::braintrust image_type: conda diff --git a/llama_stack/templates/fireworks/fireworks.py b/llama_stack/templates/fireworks/fireworks.py index 5f744cae0..64387e4b7 100644 --- a/llama_stack/templates/fireworks/fireworks.py +++ b/llama_stack/templates/fireworks/fireworks.py @@ -9,6 +9,7 @@ from pathlib import Path from llama_models.sku_list import all_registered_models from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput +from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig from llama_stack.providers.remote.inference.fireworks import FireworksImplConfig from llama_stack.providers.remote.inference.fireworks.fireworks import MODEL_ALIASES @@ -22,13 +23,23 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], + "eval": ["inline::meta-reference"], + "datasetio": ["remote::huggingface", "inline::localfs"], + "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], } + name = "fireworks" + inference_provider = Provider( provider_id="fireworks", provider_type="remote::fireworks", config=FireworksImplConfig.sample_run_config(), ) + memory_provider = Provider( + provider_id="faiss", + provider_type="inline::faiss", + config=FaissImplConfig.sample_run_config(f"distributions/{name}"), + ) core_model_to_hf_repo = { m.descriptor(): m.huggingface_repo for m in all_registered_models() @@ -42,7 +53,7 @@ def get_distribution_template() -> DistributionTemplate: ] return DistributionTemplate( - name="fireworks", + name=name, distro_type="self_hosted", description="Use Fireworks.AI for running LLM inference", docker_image=None, @@ -53,6 +64,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider], + "memory": [memory_provider], }, default_models=default_models, default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")], diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index 6add39c3a..9296be28f 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: fireworks apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -39,6 +42,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/hf-endpoint/build.yaml b/llama_stack/templates/hf-endpoint/build.yaml index 798cb3961..523cf5d83 100644 --- a/llama_stack/templates/hf-endpoint/build.yaml +++ b/llama_stack/templates/hf-endpoint/build.yaml @@ -16,4 +16,13 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference + eval: + - inline::meta-reference + datasetio: + 
- remote::huggingface + - inline::localfs + scoring: + - inline::basic + - inline::llm-as-judge + - inline::braintrust image_type: conda diff --git a/llama_stack/templates/hf-endpoint/hf_endpoint.py b/llama_stack/templates/hf-endpoint/hf_endpoint.py index af00114ba..297fdae51 100644 --- a/llama_stack/templates/hf-endpoint/hf_endpoint.py +++ b/llama_stack/templates/hf-endpoint/hf_endpoint.py @@ -5,6 +5,7 @@ # the root directory of this source tree. from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput +from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig from llama_stack.providers.remote.inference.tgi import InferenceEndpointImplConfig from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -16,13 +17,21 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], + "eval": ["inline::meta-reference"], + "datasetio": ["remote::huggingface", "inline::localfs"], + "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], } - + name = "hf-endpoint" inference_provider = Provider( provider_id="hf-endpoint", provider_type="remote::hf::endpoint", config=InferenceEndpointImplConfig.sample_run_config(), ) + memory_provider = Provider( + provider_id="faiss", + provider_type="inline::faiss", + config=FaissImplConfig.sample_run_config(f"distributions/{name}"), + ) inference_model = ModelInput( model_id="${env.INFERENCE_MODEL}", @@ -34,7 +43,7 @@ def get_distribution_template() -> DistributionTemplate: ) return DistributionTemplate( - name="hf-endpoint", + name=name, distro_type="self_hosted", description="Use (an external) Hugging Face Inference Endpoint for running LLM inference", docker_image=None, @@ -45,6 +54,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider], + "memory": [memory_provider], }, default_models=[inference_model], ), @@ -59,7 +69,8 @@ def get_distribution_template() -> DistributionTemplate: endpoint_name="${env.SAFETY_INFERENCE_ENDPOINT_NAME}", ), ), - ] + ], + "memory": [memory_provider], }, default_models=[ inference_model, diff --git a/llama_stack/templates/hf-endpoint/run-with-safety.yaml b/llama_stack/templates/hf-endpoint/run-with-safety.yaml index d518f29b8..bd625ffc5 100644 --- a/llama_stack/templates/hf-endpoint/run-with-safety.yaml +++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: hf-endpoint apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -44,6 +47,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/hf-endpoint/run.yaml b/llama_stack/templates/hf-endpoint/run.yaml index ff4e90606..bf0697bba 100644 --- a/llama_stack/templates/hf-endpoint/run.yaml 
+++ b/llama_stack/templates/hf-endpoint/run.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: hf-endpoint apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -39,6 +42,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/hf-serverless/build.yaml b/llama_stack/templates/hf-serverless/build.yaml index 3c03a98c1..af7eb60fe 100644 --- a/llama_stack/templates/hf-serverless/build.yaml +++ b/llama_stack/templates/hf-serverless/build.yaml @@ -16,4 +16,13 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference + eval: + - inline::meta-reference + datasetio: + - remote::huggingface + - inline::localfs + scoring: + - inline::basic + - inline::llm-as-judge + - inline::braintrust image_type: conda diff --git a/llama_stack/templates/hf-serverless/hf_serverless.py b/llama_stack/templates/hf-serverless/hf_serverless.py index 5434de986..835495bb9 100644 --- a/llama_stack/templates/hf-serverless/hf_serverless.py +++ b/llama_stack/templates/hf-serverless/hf_serverless.py @@ -5,6 +5,7 @@ # the root directory of this source tree. from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput +from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig from llama_stack.providers.remote.inference.tgi import InferenceAPIImplConfig from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -16,13 +17,22 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], + "eval": ["inline::meta-reference"], + "datasetio": ["remote::huggingface", "inline::localfs"], + "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], } + name = "hf-serverless" inference_provider = Provider( provider_id="hf-serverless", provider_type="remote::hf::serverless", config=InferenceAPIImplConfig.sample_run_config(), ) + memory_provider = Provider( + provider_id="faiss", + provider_type="inline::faiss", + config=FaissImplConfig.sample_run_config(f"distributions/{name}"), + ) inference_model = ModelInput( model_id="${env.INFERENCE_MODEL}", @@ -34,7 +44,7 @@ def get_distribution_template() -> DistributionTemplate: ) return DistributionTemplate( - name="hf-serverless", + name=name, distro_type="self_hosted", description="Use (an external) Hugging Face Inference Endpoint for running LLM inference", docker_image=None, @@ -45,6 +55,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider], + "memory": [memory_provider], }, default_models=[inference_model], ), @@ -59,7 +70,8 @@ def get_distribution_template() -> DistributionTemplate: repo="${env.SAFETY_MODEL}", ), ), - ] + ], + "memory": [memory_provider], }, default_models=[ inference_model, diff --git 
a/llama_stack/templates/hf-serverless/run-with-safety.yaml b/llama_stack/templates/hf-serverless/run-with-safety.yaml index e7591bbf0..f5ead14d4 100644 --- a/llama_stack/templates/hf-serverless/run-with-safety.yaml +++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: hf-serverless apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -44,6 +47,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/hf-serverless/run.yaml b/llama_stack/templates/hf-serverless/run.yaml index d7ec02f6a..13e2d7789 100644 --- a/llama_stack/templates/hf-serverless/run.yaml +++ b/llama_stack/templates/hf-serverless/run.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: hf-serverless apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -39,6 +42,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/meta-reference-gpu/build.yaml b/llama_stack/templates/meta-reference-gpu/build.yaml index ef075d098..300b75b14 100644 --- a/llama_stack/templates/meta-reference-gpu/build.yaml +++ b/llama_stack/templates/meta-reference-gpu/build.yaml @@ -16,4 +16,13 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference + eval: + - inline::meta-reference + datasetio: + - remote::huggingface + - inline::localfs + scoring: + - inline::basic + - inline::llm-as-judge + - inline::braintrust image_type: conda diff --git a/llama_stack/templates/meta-reference-gpu/meta_reference.py b/llama_stack/templates/meta-reference-gpu/meta_reference.py index f254bc920..0aff9f39c 100644 --- a/llama_stack/templates/meta-reference-gpu/meta_reference.py +++ b/llama_stack/templates/meta-reference-gpu/meta_reference.py @@ -10,6 +10,7 @@ from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput from llama_stack.providers.inline.inference.meta_reference import ( MetaReferenceInferenceConfig, ) +from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -20,8 +21,11 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": 
["inline::meta-reference"], + "eval": ["inline::meta-reference"], + "datasetio": ["remote::huggingface", "inline::localfs"], + "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], } - + name = "meta-reference-gpu" inference_provider = Provider( provider_id="meta-reference-inference", provider_type="inline::meta-reference", @@ -30,6 +34,11 @@ def get_distribution_template() -> DistributionTemplate: checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:null}", ), ) + memory_provider = Provider( + provider_id="faiss", + provider_type="inline::faiss", + config=FaissImplConfig.sample_run_config(f"distributions/{name}"), + ) inference_model = ModelInput( model_id="${env.INFERENCE_MODEL}", @@ -41,7 +50,7 @@ def get_distribution_template() -> DistributionTemplate: ) return DistributionTemplate( - name="meta-reference-gpu", + name=name, distro_type="self_hosted", description="Use Meta Reference for running LLM inference", template_path=Path(__file__).parent / "doc_template.md", @@ -51,6 +60,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider], + "memory": [memory_provider], }, default_models=[inference_model], ), @@ -67,6 +77,7 @@ def get_distribution_template() -> DistributionTemplate: ), ), ], + "memory": [memory_provider], }, default_models=[ inference_model, diff --git a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml index f82e0c938..d0fa05e96 100644 --- a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml +++ b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: meta-reference-gpu apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -46,6 +49,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/meta-reference-gpu/run.yaml b/llama_stack/templates/meta-reference-gpu/run.yaml index b125169a3..3675f4a58 100644 --- a/llama_stack/templates/meta-reference-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-gpu/run.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: meta-reference-gpu apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -40,6 +43,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: 
{} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/meta-reference-quantized-gpu/build.yaml b/llama_stack/templates/meta-reference-quantized-gpu/build.yaml index 961864dac..9d866de18 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/build.yaml +++ b/llama_stack/templates/meta-reference-quantized-gpu/build.yaml @@ -16,4 +16,13 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference + eval: + - inline::meta-reference + datasetio: + - remote::huggingface + - inline::localfs + scoring: + - inline::basic + - inline::llm-as-judge + - inline::braintrust image_type: conda diff --git a/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py b/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py index 1ff5d31d6..1d611ae5f 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py +++ b/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py @@ -10,6 +10,7 @@ from llama_stack.distribution.datatypes import ModelInput, Provider from llama_stack.providers.inline.inference.meta_reference import ( MetaReferenceQuantizedInferenceConfig, ) +from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -20,8 +21,11 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], + "eval": ["inline::meta-reference"], + "datasetio": ["remote::huggingface", "inline::localfs"], + "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], } - + name = "meta-reference-quantized-gpu" inference_provider = Provider( provider_id="meta-reference-inference", provider_type="inline::meta-reference-quantized", @@ -30,13 +34,18 @@ def get_distribution_template() -> DistributionTemplate: checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:null}", ), ) + memory_provider = Provider( + provider_id="faiss", + provider_type="inline::faiss", + config=FaissImplConfig.sample_run_config(f"distributions/{name}"), + ) inference_model = ModelInput( model_id="${env.INFERENCE_MODEL}", provider_id="meta-reference-inference", ) return DistributionTemplate( - name="meta-reference-quantized-gpu", + name=name, distro_type="self_hosted", description="Use Meta Reference with fp8, int4 quantization for running LLM inference", template_path=Path(__file__).parent / "doc_template.md", @@ -46,6 +55,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider], + "memory": [memory_provider], }, default_models=[inference_model], ), diff --git a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml index e1104b623..081af0f59 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: meta-reference-quantized-gpu apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -42,6 +45,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: 
localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/ollama/build.yaml b/llama_stack/templates/ollama/build.yaml index 106449309..a021e4993 100644 --- a/llama_stack/templates/ollama/build.yaml +++ b/llama_stack/templates/ollama/build.yaml @@ -16,4 +16,13 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference + eval: + - inline::meta-reference + datasetio: + - remote::huggingface + - inline::localfs + scoring: + - inline::basic + - inline::llm-as-judge + - inline::braintrust image_type: conda diff --git a/llama_stack/templates/ollama/doc_template.md b/llama_stack/templates/ollama/doc_template.md index cfefce33d..a75583592 100644 --- a/llama_stack/templates/ollama/doc_template.md +++ b/llama_stack/templates/ollama/doc_template.md @@ -114,9 +114,9 @@ llama stack run ./run-with-safety.yaml \ ### (Optional) Update Model Serving Configuration -> [!NOTE] -> Please check the [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers.remote/inference/ollama/ollama.py) for the supported Ollama models. - +```{note} +Please check the [model_aliases](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L45) for the supported Ollama models. +``` To serve a new model with `ollama` ```bash diff --git a/llama_stack/templates/ollama/ollama.py b/llama_stack/templates/ollama/ollama.py index b30c75bb5..c24dfa6e9 100644 --- a/llama_stack/templates/ollama/ollama.py +++ b/llama_stack/templates/ollama/ollama.py @@ -7,6 +7,7 @@ from pathlib import Path from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput +from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig from llama_stack.providers.remote.inference.ollama import OllamaImplConfig from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -18,13 +19,21 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], + "eval": ["inline::meta-reference"], + "datasetio": ["remote::huggingface", "inline::localfs"], + "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], } - + name = "ollama" inference_provider = Provider( provider_id="ollama", provider_type="remote::ollama", config=OllamaImplConfig.sample_run_config(), ) + memory_provider = Provider( + provider_id="faiss", + provider_type="inline::faiss", + config=FaissImplConfig.sample_run_config(f"distributions/{name}"), + ) inference_model = ModelInput( model_id="${env.INFERENCE_MODEL}", @@ -36,7 +45,7 @@ def get_distribution_template() -> DistributionTemplate: ) return DistributionTemplate( - name="ollama", + name=name, distro_type="self_hosted", description="Use (an external) Ollama server for running LLM inference", docker_image=None, @@ -47,6 +56,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider], + "memory": [memory_provider], }, default_models=[inference_model], ), @@ -54,7 +64,8 @@ def get_distribution_template() -> DistributionTemplate: provider_overrides={ "inference": [ 
inference_provider, - ] + ], + "memory": [memory_provider], }, default_models=[ inference_model, diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml index 6c86677b3..dc282f996 100644 --- a/llama_stack/templates/ollama/run-with-safety.yaml +++ b/llama_stack/templates/ollama/run-with-safety.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: ollama apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -38,6 +41,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml index b2d6f2c18..ab8e12839 100644 --- a/llama_stack/templates/ollama/run.yaml +++ b/llama_stack/templates/ollama/run.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: ollama apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -38,6 +41,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/remote-vllm/vllm.py b/llama_stack/templates/remote-vllm/vllm.py index c3858f7e5..f5ccfcf16 100644 --- a/llama_stack/templates/remote-vllm/vllm.py +++ b/llama_stack/templates/remote-vllm/vllm.py @@ -7,6 +7,7 @@ from pathlib import Path from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput +from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig from llama_stack.providers.remote.inference.vllm import VLLMInferenceAdapterConfig from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -19,7 +20,7 @@ def get_distribution_template() -> DistributionTemplate: "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], } - + name = "remote-vllm" inference_provider = Provider( provider_id="vllm-inference", provider_type="remote::vllm", @@ -27,6 +28,11 @@ def get_distribution_template() -> DistributionTemplate: url="${env.VLLM_URL}", ), ) + memory_provider = Provider( + provider_id="faiss", + provider_type="inline::faiss", + config=FaissImplConfig.sample_run_config(f"distributions/{name}"), + ) inference_model = ModelInput( model_id="${env.INFERENCE_MODEL}", @@ -38,7 +44,7 @@ def get_distribution_template() -> DistributionTemplate: ) return DistributionTemplate( - name="remote-vllm", + name=name, 
distro_type="self_hosted", description="Use (an external) vLLM server for running LLM inference", template_path=Path(__file__).parent / "doc_template.md", @@ -48,6 +54,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider], + "memory": [memory_provider], }, default_models=[inference_model], ), @@ -63,6 +70,7 @@ def get_distribution_template() -> DistributionTemplate: ), ), ], + "memory": [memory_provider], }, default_models=[ inference_model, diff --git a/llama_stack/templates/template.py b/llama_stack/templates/template.py index bf74b95d1..e82be6394 100644 --- a/llama_stack/templates/template.py +++ b/llama_stack/templates/template.py @@ -44,36 +44,37 @@ class RunConfigSettings(BaseModel): provider_configs[api_str] = api_providers continue - provider_type = provider_types[0] - provider_id = provider_type.split("::")[-1] + provider_configs[api_str] = [] + for provider_type in provider_types: + provider_id = provider_type.split("::")[-1] - api = Api(api_str) - if provider_type not in provider_registry[api]: - raise ValueError( - f"Unknown provider type: {provider_type} for API: {api_str}" + api = Api(api_str) + if provider_type not in provider_registry[api]: + raise ValueError( + f"Unknown provider type: {provider_type} for API: {api_str}" + ) + + config_class = provider_registry[api][provider_type].config_class + assert ( + config_class is not None + ), f"No config class for provider type: {provider_type} for API: {api_str}" + + config_class = instantiate_class_type(config_class) + if hasattr(config_class, "sample_run_config"): + config = config_class.sample_run_config( + __distro_dir__=f"distributions/{name}" + ) + else: + config = {} + + provider_configs[api_str].append( + Provider( + provider_id=provider_id, + provider_type=provider_type, + config=config, + ) ) - config_class = provider_registry[api][provider_type].config_class - assert ( - config_class is not None - ), f"No config class for provider type: {provider_type} for API: {api_str}" - - config_class = instantiate_class_type(config_class) - if hasattr(config_class, "sample_run_config"): - config = config_class.sample_run_config( - __distro_dir__=f"distributions/{name}" - ) - else: - config = {} - - provider_configs[api_str] = [ - Provider( - provider_id=provider_id, - provider_type=provider_type, - config=config, - ) - ] - # Get unique set of APIs from providers apis = list(sorted(providers.keys())) diff --git a/llama_stack/templates/tgi/build.yaml b/llama_stack/templates/tgi/build.yaml index 0f7602e2f..d90b505df 100644 --- a/llama_stack/templates/tgi/build.yaml +++ b/llama_stack/templates/tgi/build.yaml @@ -16,4 +16,13 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference + eval: + - inline::meta-reference + datasetio: + - remote::huggingface + - inline::localfs + scoring: + - inline::basic + - inline::llm-as-judge + - inline::braintrust image_type: conda diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml index ebf082cd6..2ee82ddc3 100644 --- a/llama_stack/templates/tgi/run-with-safety.yaml +++ b/llama_stack/templates/tgi/run-with-safety.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: tgi apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -42,6 +45,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + 
provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml index 352afabb5..c45e114ee 100644 --- a/llama_stack/templates/tgi/run.yaml +++ b/llama_stack/templates/tgi/run.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: tgi apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -38,6 +41,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/tgi/tgi.py b/llama_stack/templates/tgi/tgi.py index caa341df3..83818a598 100644 --- a/llama_stack/templates/tgi/tgi.py +++ b/llama_stack/templates/tgi/tgi.py @@ -7,6 +7,7 @@ from pathlib import Path from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput +from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig from llama_stack.providers.remote.inference.tgi import TGIImplConfig from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -18,8 +19,11 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], + "eval": ["inline::meta-reference"], + "datasetio": ["remote::huggingface", "inline::localfs"], + "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], } - + name = "tgi" inference_provider = Provider( provider_id="tgi-inference", provider_type="remote::tgi", @@ -27,6 +31,11 @@ def get_distribution_template() -> DistributionTemplate: url="${env.TGI_URL}", ), ) + memory_provider = Provider( + provider_id="faiss", + provider_type="inline::faiss", + config=FaissImplConfig.sample_run_config(f"distributions/{name}"), + ) inference_model = ModelInput( model_id="${env.INFERENCE_MODEL}", @@ -38,7 +47,7 @@ def get_distribution_template() -> DistributionTemplate: ) return DistributionTemplate( - name="tgi", + name=name, distro_type="self_hosted", description="Use (an external) TGI server for running LLM inference", docker_image=None, @@ -49,6 +58,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider], + "memory": [memory_provider], }, default_models=[inference_model], ), @@ -64,6 +74,7 @@ def get_distribution_template() -> DistributionTemplate: ), ), ], + "memory": [memory_provider], }, default_models=[ inference_model, diff --git a/llama_stack/templates/together/build.yaml 
b/llama_stack/templates/together/build.yaml index a4402ba93..6930b7692 100644 --- a/llama_stack/templates/together/build.yaml +++ b/llama_stack/templates/together/build.yaml @@ -16,4 +16,13 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference + eval: + - inline::meta-reference + datasetio: + - remote::huggingface + - inline::localfs + scoring: + - inline::basic + - inline::llm-as-judge + - inline::braintrust image_type: conda diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml index 855ba0626..a9f96a099 100644 --- a/llama_stack/templates/together/run.yaml +++ b/llama_stack/templates/together/run.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: together apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -39,6 +42,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/together/together.py b/llama_stack/templates/together/together.py index 16265b04f..6656cfe44 100644 --- a/llama_stack/templates/together/together.py +++ b/llama_stack/templates/together/together.py @@ -9,6 +9,7 @@ from pathlib import Path from llama_models.sku_list import all_registered_models from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput +from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig from llama_stack.providers.remote.inference.together import TogetherImplConfig from llama_stack.providers.remote.inference.together.together import MODEL_ALIASES @@ -22,13 +23,21 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], + "eval": ["inline::meta-reference"], + "datasetio": ["remote::huggingface", "inline::localfs"], + "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], } - + name = "together" inference_provider = Provider( provider_id="together", provider_type="remote::together", config=TogetherImplConfig.sample_run_config(), ) + memory_provider = Provider( + provider_id="faiss", + provider_type="inline::faiss", + config=FaissImplConfig.sample_run_config(f"distributions/{name}"), + ) core_model_to_hf_repo = { m.descriptor(): m.huggingface_repo for m in all_registered_models() @@ -42,7 +51,7 @@ def get_distribution_template() -> DistributionTemplate: ] return DistributionTemplate( - name="together", + name=name, distro_type="self_hosted", description="Use Together.AI for running LLM inference", docker_image=None, @@ -53,6 +62,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider], + "memory": [memory_provider], }, default_models=default_models, default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")], diff --git a/llama_stack/templates/vllm-gpu/build.yaml 
b/llama_stack/templates/vllm-gpu/build.yaml index 6792a855f..4289296ec 100644 --- a/llama_stack/templates/vllm-gpu/build.yaml +++ b/llama_stack/templates/vllm-gpu/build.yaml @@ -16,4 +16,13 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference + eval: + - inline::meta-reference + datasetio: + - remote::huggingface + - inline::localfs + scoring: + - inline::basic + - inline::llm-as-judge + - inline::braintrust image_type: conda diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml index a140ad403..ea188777f 100644 --- a/llama_stack/templates/vllm-gpu/run.yaml +++ b/llama_stack/templates/vllm-gpu/run.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: vllm-gpu apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -42,6 +45,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/vllm-gpu/vllm.py b/llama_stack/templates/vllm-gpu/vllm.py index 78fcf4f57..10b448b5c 100644 --- a/llama_stack/templates/vllm-gpu/vllm.py +++ b/llama_stack/templates/vllm-gpu/vllm.py @@ -6,6 +6,7 @@ from llama_stack.distribution.datatypes import ModelInput, Provider from llama_stack.providers.inline.inference.vllm import VLLMConfig +from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -16,13 +17,21 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], + "eval": ["inline::meta-reference"], + "datasetio": ["remote::huggingface", "inline::localfs"], + "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], } - + name = "vllm-gpu" inference_provider = Provider( provider_id="vllm", provider_type="inline::vllm", config=VLLMConfig.sample_run_config(), ) + memory_provider = Provider( + provider_id="faiss", + provider_type="inline::faiss", + config=FaissImplConfig.sample_run_config(f"distributions/{name}"), + ) inference_model = ModelInput( model_id="${env.INFERENCE_MODEL}", @@ -30,7 +39,7 @@ def get_distribution_template() -> DistributionTemplate: ) return DistributionTemplate( - name="vllm-gpu", + name=name, distro_type="self_hosted", description="Use a built-in vLLM engine for running LLM inference", docker_image=None, @@ -41,6 +50,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider], + "memory": [memory_provider], }, default_models=[inference_model], ), From 66440e2c203e7d73a0aca7249c06ceed33cfc05b Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Thu, 5 Dec 2024 17:44:14 -0800 Subject: [PATCH 27/69] Add missing init file --- llama_stack/providers/inline/eval/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 
llama_stack/providers/inline/eval/__init__.py diff --git a/llama_stack/providers/inline/eval/__init__.py b/llama_stack/providers/inline/eval/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/llama_stack/providers/inline/eval/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. From cdfc98cf08ce12cadf101020b3916fde2ffd268f Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Thu, 5 Dec 2024 20:54:28 -0800 Subject: [PATCH 28/69] add a warning at least for when `bwrap` is not available for code execution --- .../providers/inline/agents/meta_reference/agents.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/llama_stack/providers/inline/agents/meta_reference/agents.py b/llama_stack/providers/inline/agents/meta_reference/agents.py index f33aadde3..0b0bb6e27 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agents.py +++ b/llama_stack/providers/inline/agents/meta_reference/agents.py @@ -6,9 +6,12 @@ import json import logging +import shutil import uuid from typing import AsyncGenerator +from termcolor import colored + from llama_stack.apis.inference import Inference from llama_stack.apis.memory import Memory from llama_stack.apis.memory_banks import MemoryBanks @@ -44,6 +47,15 @@ class MetaReferenceAgentsImpl(Agents): async def initialize(self) -> None: self.persistence_store = await kvstore_impl(self.config.persistence_store) + # check if "bwrap" is available + if not shutil.which("bwrap"): + print( + colored( + "Warning: `bwrap` is not available. Code interpreter tool will not work correctly.", + "yellow", + ) + ) + async def create_agent( self, agent_config: AgentConfig, From c23363d56117648861e18224b0de68cc9c3d39d0 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Thu, 5 Dec 2024 21:07:30 -0800 Subject: [PATCH 29/69] Add ability to query and export spans to dataset (#574) This PR adds two new methods to the telemetry API: 1) Gives the ability to query spans directly instead of first querying traces and then using that to get spans 2) Another method save_spans_to_dataset, which builds on the query spans to save it on dataset. This give the ability to saves spans that are part of an agent session to a dataset. The unique aspect of this API is that we dont require each provider of telemetry to implement this method. Hence, its implemented in the protocol class itself. This required the protocol check to be slightly modified. 
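For illustration, a rough sketch of how the two new methods can be invoked (inside an async function) on an already-initialized telemetry provider, i.e. any implementation that mixes in `TelemetryDatasetMixin`. The `QueryCondition(key=..., op="eq", value=...)` construction is an assumption based on the key/op/value filter shape used in the HTTP examples; verify the actual field types before copying.

```python
from llama_stack.apis.telemetry import QueryCondition

# `telemetry` is assumed to be an already-initialized telemetry provider instance;
# the exact QueryCondition field types (e.g. whether `op` is a string or an enum)
# should be checked against llama_stack.apis.telemetry.
filters = [QueryCondition(key="session_id", op="eq", value="<session-id>")]

# Query spans directly, without first fetching traces
spans = await telemetry.query_spans(
    attribute_filters=filters,
    attributes_to_return=["input", "output"],
)

# Save the matching span attributes to a dataset for later evaluation
await telemetry.save_spans_to_dataset(
    attribute_filters=filters,
    attributes_to_save=["input", "output"],
    dataset_id="my_dataset",
    max_depth=10,
)
```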
--- llama_stack/apis/telemetry/telemetry.py | 17 ++++ .../inline/eval/meta_reference/config.py | 3 +- .../inline/eval/meta_reference/eval.py | 3 +- .../telemetry/meta_reference/__init__.py | 2 +- .../telemetry/meta_reference/telemetry.py | 16 ++-- llama_stack/providers/registry/telemetry.py | 1 + .../providers/utils/telemetry/__init__.py | 3 + .../utils/telemetry/dataset_mixin.py | 87 +++++++++++++++++++ .../utils/telemetry/sqlite_trace_store.py | 4 +- 9 files changed, 126 insertions(+), 10 deletions(-) create mode 100644 llama_stack/providers/utils/telemetry/dataset_mixin.py diff --git a/llama_stack/apis/telemetry/telemetry.py b/llama_stack/apis/telemetry/telemetry.py index 2ff783c46..fd60d99a7 100644 --- a/llama_stack/apis/telemetry/telemetry.py +++ b/llama_stack/apis/telemetry/telemetry.py @@ -186,3 +186,20 @@ class Telemetry(Protocol): attributes_to_return: Optional[List[str]] = None, max_depth: Optional[int] = None, ) -> SpanWithChildren: ... + + @webmethod(route="/telemetry/query-spans", method="POST") + async def query_spans( + self, + attribute_filters: List[QueryCondition], + attributes_to_return: List[str], + max_depth: Optional[int] = None, + ) -> List[Span]: ... + + @webmethod(route="/telemetry/save-spans-to-dataset", method="POST") + async def save_spans_to_dataset( + self, + attribute_filters: List[QueryCondition], + attributes_to_save: List[str], + dataset_id: str, + max_depth: Optional[int] = None, + ) -> None: ... diff --git a/llama_stack/providers/inline/eval/meta_reference/config.py b/llama_stack/providers/inline/eval/meta_reference/config.py index 8538d32ad..95b780cca 100644 --- a/llama_stack/providers/inline/eval/meta_reference/config.py +++ b/llama_stack/providers/inline/eval/meta_reference/config.py @@ -3,12 +3,13 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from pydantic import BaseModel + from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR from llama_stack.providers.utils.kvstore.config import ( KVStoreConfig, SqliteKVStoreConfig, ) -from pydantic import BaseModel class MetaReferenceEvalConfig(BaseModel): diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index c6cacfcc3..453215e41 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -4,7 +4,9 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
from enum import Enum +from typing import Any, Dict, List, Optional from llama_models.llama3.api.datatypes import * # noqa: F403 +from tqdm import tqdm from .....apis.common.job_types import Job from .....apis.eval.eval import Eval, EvalTaskConfig, EvaluateResponse, JobStatus @@ -17,7 +19,6 @@ from llama_stack.apis.inference import Inference from llama_stack.apis.scoring import Scoring from llama_stack.providers.datatypes import EvalTasksProtocolPrivate from llama_stack.providers.utils.kvstore import kvstore_impl -from tqdm import tqdm from .config import MetaReferenceEvalConfig diff --git a/llama_stack/providers/inline/telemetry/meta_reference/__init__.py b/llama_stack/providers/inline/telemetry/meta_reference/__init__.py index 6213d5536..38871a7e4 100644 --- a/llama_stack/providers/inline/telemetry/meta_reference/__init__.py +++ b/llama_stack/providers/inline/telemetry/meta_reference/__init__.py @@ -13,6 +13,6 @@ __all__ = ["TelemetryConfig", "TelemetryAdapter", "TelemetrySink"] async def get_provider_impl(config: TelemetryConfig, deps: Dict[str, Any]): - impl = TelemetryAdapter(config) + impl = TelemetryAdapter(config, deps) await impl.initialize() return impl diff --git a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py index 6540a667f..0bcc48afb 100644 --- a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py +++ b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py @@ -5,7 +5,7 @@ # the root directory of this source tree. import threading -from typing import List, Optional +from typing import Any, Dict, List, Optional from opentelemetry import metrics, trace from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter @@ -24,10 +24,15 @@ from llama_stack.providers.inline.telemetry.meta_reference.console_span_processo from llama_stack.providers.inline.telemetry.meta_reference.sqlite_span_processor import ( SQLiteSpanProcessor, ) -from llama_stack.providers.utils.telemetry.sqlite_trace_store import SQLiteTraceStore +from llama_stack.providers.utils.telemetry import ( + SQLiteTraceStore, + TelemetryDatasetMixin, +) from llama_stack.apis.telemetry import * # noqa: F403 +from llama_stack.distribution.datatypes import Api + from .config import TelemetryConfig, TelemetrySink _GLOBAL_STORAGE = { @@ -54,9 +59,10 @@ def is_tracing_enabled(tracer): return span.is_recording() -class TelemetryAdapter(Telemetry): - def __init__(self, config: TelemetryConfig) -> None: +class TelemetryAdapter(TelemetryDatasetMixin, Telemetry): + def __init__(self, config: TelemetryConfig, deps: Dict[str, Any]) -> None: self.config = config + self.datasetio_api = deps[Api.datasetio] resource = Resource.create( { @@ -240,7 +246,7 @@ class TelemetryAdapter(Telemetry): attributes_to_return: Optional[List[str]] = None, max_depth: Optional[int] = None, ) -> SpanWithChildren: - return await self.trace_store.get_materialized_span( + return await self.trace_store.get_span_tree( span_id=span_id, attributes_to_return=attributes_to_return, max_depth=max_depth, diff --git a/llama_stack/providers/registry/telemetry.py b/llama_stack/providers/registry/telemetry.py index a53ad5b94..d367bf894 100644 --- a/llama_stack/providers/registry/telemetry.py +++ b/llama_stack/providers/registry/telemetry.py @@ -18,6 +18,7 @@ def available_providers() -> List[ProviderSpec]: "opentelemetry-sdk", "opentelemetry-exporter-otlp-proto-http", ], + api_dependencies=[Api.datasetio], 
module="llama_stack.providers.inline.telemetry.meta_reference", config_class="llama_stack.providers.inline.telemetry.meta_reference.config.TelemetryConfig", ), diff --git a/llama_stack/providers/utils/telemetry/__init__.py b/llama_stack/providers/utils/telemetry/__init__.py index 756f351d8..2d95a5dc5 100644 --- a/llama_stack/providers/utils/telemetry/__init__.py +++ b/llama_stack/providers/utils/telemetry/__init__.py @@ -3,3 +3,6 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. + +from .dataset_mixin import TelemetryDatasetMixin # noqa: F401 +from .sqlite_trace_store import SQLiteTraceStore, TraceStore # noqa: F401 diff --git a/llama_stack/providers/utils/telemetry/dataset_mixin.py b/llama_stack/providers/utils/telemetry/dataset_mixin.py new file mode 100644 index 000000000..7a59801f4 --- /dev/null +++ b/llama_stack/providers/utils/telemetry/dataset_mixin.py @@ -0,0 +1,87 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from typing import List, Optional + +from llama_stack.apis.datasetio import DatasetIO +from llama_stack.apis.telemetry import QueryCondition, Span, SpanWithChildren + + +class TelemetryDatasetMixin: + """Mixin class that provides dataset-related functionality for telemetry providers.""" + + datasetio_api: DatasetIO + + async def save_spans_to_dataset( + self, + attribute_filters: List[QueryCondition], + attributes_to_save: List[str], + dataset_id: str, + max_depth: Optional[int] = None, + ) -> None: + spans = await self.query_spans( + attribute_filters=attribute_filters, + attributes_to_return=attributes_to_save, + max_depth=max_depth, + ) + + rows = [ + { + "trace_id": span.trace_id, + "span_id": span.span_id, + "parent_span_id": span.parent_span_id, + "name": span.name, + "start_time": span.start_time, + "end_time": span.end_time, + **{attr: span.attributes.get(attr) for attr in attributes_to_save}, + } + for span in spans + ] + + await self.datasetio_api.append_rows(dataset_id=dataset_id, rows=rows) + + async def query_spans( + self, + attribute_filters: List[QueryCondition], + attributes_to_return: List[str], + max_depth: Optional[int] = None, + ) -> List[Span]: + traces = await self.query_traces(attribute_filters=attribute_filters) + spans = [] + + for trace in traces: + span_tree = await self.get_span_tree( + span_id=trace.root_span_id, + attributes_to_return=attributes_to_return, + max_depth=max_depth, + ) + + def extract_spans(span: SpanWithChildren) -> List[Span]: + result = [] + if span.attributes and all( + attr in span.attributes and span.attributes[attr] is not None + for attr in attributes_to_return + ): + result.append( + Span( + trace_id=trace.root_span_id, + span_id=span.span_id, + parent_span_id=span.parent_span_id, + name=span.name, + start_time=span.start_time, + end_time=span.end_time, + attributes=span.attributes, + ) + ) + + for child in span.children: + result.extend(extract_spans(child)) + + return result + + spans.extend(extract_spans(span_tree)) + + return spans diff --git a/llama_stack/providers/utils/telemetry/sqlite_trace_store.py b/llama_stack/providers/utils/telemetry/sqlite_trace_store.py index ed1343e0b..031b6fc73 100644 --- a/llama_stack/providers/utils/telemetry/sqlite_trace_store.py +++ b/llama_stack/providers/utils/telemetry/sqlite_trace_store.py @@ -23,7 +23,7 @@ class TraceStore(Protocol): order_by: 
Optional[List[str]] = None, ) -> List[Trace]: ... - async def get_materialized_span( + async def get_span_tree( self, span_id: str, attributes_to_return: Optional[List[str]] = None, @@ -111,7 +111,7 @@ class SQLiteTraceStore(TraceStore): for row in rows ] - async def get_materialized_span( + async def get_span_tree( self, span_id: str, attributes_to_return: Optional[List[str]] = None, From 392be5f6dcee21c3c9ff107d55e8254f377c139e Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Thu, 5 Dec 2024 21:40:21 -0800 Subject: [PATCH 30/69] Reduce log volume a bit, needs more work --- .../inline/telemetry/meta_reference/console_span_processor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py b/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py index 8d6f779e6..0a2989bd3 100644 --- a/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py +++ b/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py @@ -71,6 +71,9 @@ class ConsoleSpanProcessor(SpanProcessor): # Print attributes indented if span.attributes: for key, value in span.attributes.items(): + # Skip internal attributes; also rename these internal attributes to have underscores + if key in ("class", "method", "type", "__root__", "__ttl__"): + continue print(f" {COLORS['dim']}{key}: {value}{COLORS['reset']}") # Print events indented From 66d8f4ffd126bff668434b314892a99fe854a034 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Thu, 5 Dec 2024 21:51:47 -0800 Subject: [PATCH 31/69] Move the telemetry util import to be more lazy --- llama_stack/distribution/tracing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_stack/distribution/tracing.py b/llama_stack/distribution/tracing.py index ea663ec89..ff4fe2483 100644 --- a/llama_stack/distribution/tracing.py +++ b/llama_stack/distribution/tracing.py @@ -12,8 +12,6 @@ from typing import Any, AsyncGenerator, Callable, Type, TypeVar from pydantic import BaseModel -from llama_stack.providers.utils.telemetry import tracing - T = TypeVar("T") @@ -41,6 +39,8 @@ def trace_protocol(cls: Type[T]) -> Type[T]: """ def trace_method(method: Callable) -> Callable: + from llama_stack.providers.utils.telemetry import tracing + is_async = asyncio.iscoroutinefunction(method) is_async_gen = inspect.isasyncgenfunction(method) From 2c5c73f7caa3027d022f1fe95b6bc85507ec9c45 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Fri, 6 Dec 2024 08:36:00 -0800 Subject: [PATCH 32/69] Bump version to 0.0.58 --- requirements.txt | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8698495b1..fa7b70fd9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,8 +2,8 @@ blobfile fire httpx huggingface-hub -llama-models>=0.0.57 -llama-stack-client>=0.0.57 +llama-models>=0.0.58 +llama-stack-client>=0.0.58 prompt-toolkit python-dotenv pydantic>=2 diff --git a/setup.py b/setup.py index 3d68021dd..ff6770b81 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ def read_requirements(): setup( name="llama_stack", - version="0.0.57", + version="0.0.58", author="Meta Llama", author_email="llama-oss@meta.com", description="Llama Stack", From 27a27152cd13008c2e376e18d78b353e1ae97c06 Mon Sep 17 00:00:00 2001 From: Adrian Cole <64215+codefromthecrypt@users.noreply.github.com> Date: Sat, 7 Dec 2024 02:16:42 +0800 Subject: [PATCH 33/69] Renames otel config from jaeger to otel 
(#569) # What does this PR do? #525 introduced a telemetry configuration named jaeger, but what it really is pointing to is an OTLP HTTP endpoint which is supported by most servers in the ecosystem, including raw opentelemetry collectors, several APMs, and even https://github.com/ymtdzzz/otel-tui I chose to rename this to "otel" as it will bring in more people to the ecosystem vs feeling it only works with jaeger. Later, we can use the [standard ENV](https://opentelemetry.io/docs/specs/otel/protocol/exporter/) to configure this if we like so that you can override things with variables people might expect. Note: I also added to the README that you have to install conda. Depending on experience level of the user, and especially with miniforge vs other ways, I felt this helps. ## Test Plan I would like to test this, but actually got a little lost. The previous PRs referenced yaml which doesn't seem published anywhere. It would be nice to have a pre-canned setup that uses ollama and turns on otel, but would also appreciate a hand on instructions meanwhile. ## Sources https://github.com/meta-llama/llama-stack/pull/525 ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [x] Ran pre-commit to handle lint / formatting issues. - [x] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. --------- Signed-off-by: Adrian Cole --- README.md | 3 ++- .../providers/inline/telemetry/meta_reference/config.py | 4 ++-- .../providers/inline/telemetry/meta_reference/telemetry.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 0dfb1306d..2e7585583 100644 --- a/README.md +++ b/README.md @@ -113,7 +113,8 @@ You have two ways to install this repository: ``` 2. **Install from source**: - If you prefer to install from the source code, follow these steps: + If you prefer to install from the source code, make sure you have [conda installed](https://docs.conda.io/projects/conda/en/stable). 
+ Then, follow these steps: ```bash mkdir -p ~/local cd ~/local diff --git a/llama_stack/providers/inline/telemetry/meta_reference/config.py b/llama_stack/providers/inline/telemetry/meta_reference/config.py index 0230d24d2..4aaa368d1 100644 --- a/llama_stack/providers/inline/telemetry/meta_reference/config.py +++ b/llama_stack/providers/inline/telemetry/meta_reference/config.py @@ -13,7 +13,7 @@ from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR class TelemetrySink(str, Enum): - JAEGER = "jaeger" + OTEL = "otel" SQLITE = "sqlite" CONSOLE = "console" @@ -29,7 +29,7 @@ class TelemetryConfig(BaseModel): ) sinks: List[TelemetrySink] = Field( default=[TelemetrySink.CONSOLE, TelemetrySink.SQLITE], - description="List of telemetry sinks to enable (possible values: jaeger, sqlite, console)", + description="List of telemetry sinks to enable (possible values: otel, sqlite, console)", ) sqlite_db_path: str = Field( default=(RUNTIME_BASE_DIR / "trace_store.db").as_posix(), diff --git a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py index 0bcc48afb..095591f9a 100644 --- a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py +++ b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py @@ -72,7 +72,7 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry): provider = TracerProvider(resource=resource) trace.set_tracer_provider(provider) - if TelemetrySink.JAEGER in self.config.sinks: + if TelemetrySink.OTEL in self.config.sinks: otlp_exporter = OTLPSpanExporter( endpoint=self.config.otel_endpoint, ) From cb9e9048e748794054e1cee6f35c5f6e70dd7991 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Fri, 6 Dec 2024 10:17:11 -0800 Subject: [PATCH 34/69] add telemetry docs (#572) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an experimental section and telemetry doc ![Screenshot 2024-12-05 at 10 22 51 AM](https://github.com/user-attachments/assets/b8b7a982-b800-4069-a4d0-481fc300b336) --------- Co-authored-by: Adrian Cole <64215+codefromthecrypt@users.noreply.github.com> --- docs/source/building_applications/index.md | 9 +- .../source/building_applications/telemetry.md | 243 ++++++++++++++++++ 2 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 docs/source/building_applications/telemetry.md diff --git a/docs/source/building_applications/index.md b/docs/source/building_applications/index.md index 6d2f9e3ac..1c333c4a7 100644 --- a/docs/source/building_applications/index.md +++ b/docs/source/building_applications/index.md @@ -11,5 +11,12 @@ - memory / RAG; pre-ingesting content or attaching content in a turn - how does tool calling work - can you do evaluation? - +``` +For details on how to use the telemetry system to debug your applications, export traces to a dataset, and run evaluations, see the [Telemetry](telemetry) section. + +```{toctree} +:hidden: +:maxdepth: 3 + +telemetry ``` diff --git a/docs/source/building_applications/telemetry.md b/docs/source/building_applications/telemetry.md new file mode 100644 index 000000000..fd4446ed2 --- /dev/null +++ b/docs/source/building_applications/telemetry.md @@ -0,0 +1,243 @@ +# Telemetry +```{note} +The telemetry system is currently experimental and subject to change. We welcome feedback and contributions to help improve it. +``` + + + +The Llama Stack telemetry system provides comprehensive tracing, metrics, and logging capabilities. 
It supports multiple sink types including OpenTelemetry, SQLite, and Console output. + +## Key Concepts + +### Events +The telemetry system supports three main types of events: + +- **Unstructured Log Events**: Free-form log messages with severity levels +```python +unstructured_log_event = UnstructuredLogEvent( + message="This is a log message", + severity=LogSeverity.INFO +) +``` +- **Metric Events**: Numerical measurements with units +```python +metric_event = MetricEvent( + metric="my_metric", + value=10, + unit="count" +) +``` +- **Structured Log Events**: System events like span start/end. Extensible to add more structured log types. +```python +structured_log_event = SpanStartPayload( + name="my_span", + parent_span_id="parent_span_id" +) +``` + +### Spans and Traces +- **Spans**: Represent operations with timing and hierarchical relationships +- **Traces**: Collection of related spans forming a complete request flow + +### Sinks +- **OpenTelemetry**: Send events to an OpenTelemetry Collector. This is useful for visualizing traces in a service like Jaeger. +- **SQLite**: Store events in a local SQLite database. This is needed if you want to query the events later through the Llama Stack API. +- **Console**: Print events to the console. + +## APIs + +The telemetry API is designed to be flexible for different user flows like debugging/visualization in UI, monitoring, and saving traces to datasets. +The telemetry system exposes the following HTTP endpoints: + +### Log Event +```http +POST /telemetry/log-event +``` +Logs a telemetry event (unstructured log, metric, or structured log) with optional TTL. + +### Query Traces +```http +POST /telemetry/query-traces +``` +Retrieves traces based on filters with pagination support. Parameters: +- `attribute_filters`: List of conditions to filter traces +- `limit`: Maximum number of traces to return (default: 100) +- `offset`: Number of traces to skip (default: 0) +- `order_by`: List of fields to sort by + +### Get Span Tree +```http +POST /telemetry/get-span-tree +``` +Retrieves a hierarchical view of spans starting from a specific span. Parameters: +- `span_id`: ID of the root span to retrieve +- `attributes_to_return`: Optional list of specific attributes to include +- `max_depth`: Optional maximum depth of the span tree to return + +### Query Spans +```http +POST /telemetry/query-spans +``` +Retrieves spans matching specified filters and returns selected attributes. Parameters: +- `attribute_filters`: List of conditions to filter traces +- `attributes_to_return`: List of specific attributes to include in results +- `max_depth`: Optional maximum depth of spans to traverse (default: no limit) + +Returns a flattened list of spans with requested attributes. + +### Save Spans to Dataset +This is useful for saving traces to a dataset for running evaluations. For example, you can save the input/output of each span that is part of an agent session/turn to a dataset and then run an eval task on it. See example in [Example: Save Spans to Dataset](#example-save-spans-to-dataset). +```http +POST /telemetry/save-spans-to-dataset +``` +Queries spans and saves their attributes to a dataset. 
Parameters: +- `attribute_filters`: List of conditions to filter traces +- `attributes_to_save`: List of span attributes to save to the dataset +- `dataset_id`: ID of the dataset to save to +- `max_depth`: Optional maximum depth of spans to traverse (default: no limit) + +## Providers + +### Meta-Reference Provider +Currently, only the meta-reference provider is implemented. It can be configured to send events to three sink types: +1) OpenTelemetry Collector +2) SQLite +3) Console + +## Configuration + +Here's an example that sends telemetry signals to all three sink types. Your configuration might use only one. +```yaml + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + sinks: ['console', 'sqlite', 'otel'] + otel_endpoint: "http://localhost:4318/v1/traces" + sqlite_db_path: "/path/to/telemetry.db" +``` + +## Jaeger to visualize traces + +The `otel` sink works with any service compatible with the OpenTelemetry collector. Let's use Jaeger to visualize this data. + +Start a Jaeger instance with the OTLP HTTP endpoint at 4318 and the Jaeger UI at 16686 using the following command: + +```bash +$ docker run --rm \ + --name jaeger jaegertracing/jaeger:2.0.0 \ + -p 16686:16686 -p 4318:4318 \ + --set receivers.otlp.protocols.http.endpoint=0.0.0.0:4318 +``` + +Once the Jaeger instance is running, you can visualize traces by navigating to http://localhost:16686. + +## Querying Traces Stored in SQLIte + +The `sqlite` sink allows you to query traces without an external system. Here are some example queries: + +Querying Traces for a agent session +The client SDK is not updated to support the new telemetry API. It will be updated soon. You can manually query traces using the following curl command: + +``` bash + curl -X POST 'http://localhost:5000/alpha/telemetry/query-traces' \ +-H 'Content-Type: application/json' \ +-d '{ + "attribute_filters": [ + { + "key": "session_id", + "op": "eq", + "value": "dd667b87-ca4b-4d30-9265-5a0de318fc65" }], + "limit": 100, + "offset": 0, + "order_by": ["start_time"] + + [ + { + "trace_id": "6902f54b83b4b48be18a6f422b13e16f", + "root_span_id": "5f37b85543afc15a", + "start_time": "2024-12-04T08:08:30.501587", + "end_time": "2024-12-04T08:08:36.026463" + }, + ........ +] +}' + +``` + +Querying spans for a specifc root span id + +``` bash +curl -X POST 'http://localhost:5000/alpha/telemetry/get-span-tree' \ +-H 'Content-Type: application/json' \ +-d '{ "span_id" : "6cceb4b48a156913", "max_depth": 2 }' + +{ + "span_id": "6cceb4b48a156913", + "trace_id": "dafa796f6aaf925f511c04cd7c67fdda", + "parent_span_id": "892a66d726c7f990", + "name": "retrieve_rag_context", + "start_time": "2024-12-04T09:28:21.781995", + "end_time": "2024-12-04T09:28:21.913352", + "attributes": { + "input": [ + "{\"role\":\"system\",\"content\":\"You are a helpful assistant\"}", + "{\"role\":\"user\",\"content\":\"What are the top 5 topics that were explained in the documentation? Only list succinct bullet points.\",\"context\":null}" + ] + }, + "children": [ + { + "span_id": "1a2df181854064a8", + "trace_id": "dafa796f6aaf925f511c04cd7c67fdda", + "parent_span_id": "6cceb4b48a156913", + "name": "MemoryRouter.query_documents", + "start_time": "2024-12-04T09:28:21.787620", + "end_time": "2024-12-04T09:28:21.906512", + "attributes": { + "input": null + }, + "children": [], + "status": "ok" + } + ], + "status": "ok" +} + +``` + +## Example: Save Spans to Dataset +Save all spans for a specific agent session to a dataset. 
+``` bash +curl -X POST 'http://localhost:5000/alpha/telemetry/save-spans-to-dataset' \ +-H 'Content-Type: application/json' \ +-d '{ + "attribute_filters": [ + { + "key": "session_id", + "op": "eq", + "value": "dd667b87-ca4b-4d30-9265-5a0de318fc65" + } + ], + "attributes_to_save": ["input", "output"], + "dataset_id": "my_dataset", + "max_depth": 10 +}' +``` + +Save all spans for a specific agent turn to a dataset. +```bash +curl -X POST 'http://localhost:5000/alpha/telemetry/save-spans-to-dataset' \ +-H 'Content-Type: application/json' \ +-d '{ + "attribute_filters": [ + { + "key": "turn_id", + "op": "eq", + "value": "123e4567-e89b-12d3-a456-426614174000" + } + ], + "attributes_to_save": ["input", "output"], + "dataset_id": "my_dataset", + "max_depth": 10 +}' +``` From 084ec337afc3f6d52c7a2d7b9c8dd54e3a12c107 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Fri, 6 Dec 2024 09:35:33 -0800 Subject: [PATCH 35/69] Small cleanup of console logs --- llama_stack/distribution/server/server.py | 2 +- llama_stack/distribution/tracing.py | 11 ++++++----- .../meta_reference/console_span_processor.py | 11 +++++++++-- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index 4ae1854df..43e9c0706 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -217,7 +217,7 @@ class TracingMiddleware: async def __call__(self, scope, receive, send): path = scope["path"] - await start_trace(path, {"location": "server"}) + await start_trace(path, {"__location__": "server"}) try: return await self.app(scope, receive, send) finally: diff --git a/llama_stack/distribution/tracing.py b/llama_stack/distribution/tracing.py index ff4fe2483..3fcce08e9 100644 --- a/llama_stack/distribution/tracing.py +++ b/llama_stack/distribution/tracing.py @@ -52,10 +52,11 @@ def trace_protocol(cls: Type[T]) -> Type[T]: "async_generator" if is_async_gen else "async" if is_async else "sync" ) span_attributes = { - "class": class_name, - "method": method_name, - "type": span_type, - "args": serialize_value(args), + "__autotraced__": True, + "__class__": class_name, + "__method__": method_name, + "__type__": span_type, + "__args__": serialize_value(args), } return class_name, method_name, span_attributes @@ -103,7 +104,7 @@ def trace_protocol(cls: Type[T]) -> Type[T]: result = method(self, *args, **kwargs) span.set_attribute("output", serialize_value(result)) return result - except Exception as e: + except Exception as _e: raise if is_async_gen: diff --git a/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py b/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py index 0a2989bd3..6c4d7e8d4 100644 --- a/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py +++ b/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py @@ -29,6 +29,9 @@ class ConsoleSpanProcessor(SpanProcessor): def on_start(self, span: ReadableSpan, parent_context=None) -> None: """Called when a span starts.""" + if span.attributes and span.attributes.get("__autotraced__"): + return + timestamp = datetime.utcfromtimestamp(span.start_time / 1e9).strftime( "%H:%M:%S.%f" )[:-3] @@ -41,6 +44,9 @@ class ConsoleSpanProcessor(SpanProcessor): def on_end(self, span: ReadableSpan) -> None: """Called when a span ends.""" + if span.attributes and span.attributes.get("__autotraced__"): + return + timestamp = 
datetime.utcfromtimestamp(span.end_time / 1e9).strftime( "%H:%M:%S.%f" )[:-3] @@ -71,8 +77,7 @@ class ConsoleSpanProcessor(SpanProcessor): # Print attributes indented if span.attributes: for key, value in span.attributes.items(): - # Skip internal attributes; also rename these internal attributes to have underscores - if key in ("class", "method", "type", "__root__", "__ttl__"): + if key.startswith("__"): continue print(f" {COLORS['dim']}{key}: {value}{COLORS['reset']}") @@ -87,6 +92,8 @@ class ConsoleSpanProcessor(SpanProcessor): ) if event.attributes: for key, value in event.attributes.items(): + if key.startswith("__"): + continue print(f" {COLORS['dim']}{key}: {value}{COLORS['reset']}") def shutdown(self) -> None: From c543bc0745e3ec33b5f9d98cfad728d82415aec2 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Fri, 6 Dec 2024 11:46:16 -0800 Subject: [PATCH 36/69] Console span processor improvements (#577) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Makes the console span processor output spans in less prominent way and highlight the logs based on severity. ![Screenshot 2024-12-06 at 11 26 46 AM](https://github.com/user-attachments/assets/c3a1b051-85db-4b71-b7a5-7bab5a26f072) --- llama_stack/apis/agents/agents.py | 2 +- llama_stack/apis/inference/inference.py | 2 +- llama_stack/apis/memory/memory.py | 2 +- llama_stack/apis/memory_banks/memory_banks.py | 2 +- llama_stack/apis/models/models.py | 2 +- llama_stack/apis/safety/safety.py | 2 +- llama_stack/apis/shields/shields.py | 2 +- .../providers/inline/memory/faiss/faiss.py | 2 - .../meta_reference/console_span_processor.py | 62 +++++++++++-------- .../utils/telemetry/trace_protocol.py} | 0 10 files changed, 44 insertions(+), 34 deletions(-) rename llama_stack/{distribution/tracing.py => providers/utils/telemetry/trace_protocol.py} (100%) diff --git a/llama_stack/apis/agents/agents.py b/llama_stack/apis/agents/agents.py index d2243c96f..6e41df4f6 100644 --- a/llama_stack/apis/agents/agents.py +++ b/llama_stack/apis/agents/agents.py @@ -23,7 +23,7 @@ from llama_models.schema_utils import json_schema_type, webmethod from pydantic import BaseModel, ConfigDict, Field from typing_extensions import Annotated -from llama_stack.distribution.tracing import trace_protocol +from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol from llama_models.llama3.api.datatypes import * # noqa: F403 from llama_stack.apis.common.deployment_types import * # noqa: F403 from llama_stack.apis.inference import * # noqa: F403 diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py index 85b29a147..233cd1b50 100644 --- a/llama_stack/apis/inference/inference.py +++ b/llama_stack/apis/inference/inference.py @@ -21,7 +21,7 @@ from llama_models.schema_utils import json_schema_type, webmethod from pydantic import BaseModel, Field from typing_extensions import Annotated -from llama_stack.distribution.tracing import trace_protocol +from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol from llama_models.llama3.api.datatypes import * # noqa: F403 from llama_stack.apis.models import * # noqa: F403 diff --git a/llama_stack/apis/memory/memory.py b/llama_stack/apis/memory/memory.py index b75df8a1a..2f3a94956 100644 --- a/llama_stack/apis/memory/memory.py +++ b/llama_stack/apis/memory/memory.py @@ -16,7 +16,7 @@ from pydantic import BaseModel, Field from llama_models.llama3.api.datatypes import * # noqa: F403 from llama_stack.apis.memory_banks 
import * # noqa: F403 -from llama_stack.distribution.tracing import trace_protocol +from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol @json_schema_type diff --git a/llama_stack/apis/memory_banks/memory_banks.py b/llama_stack/apis/memory_banks/memory_banks.py index 0b8b2563f..a17e8e48d 100644 --- a/llama_stack/apis/memory_banks/memory_banks.py +++ b/llama_stack/apis/memory_banks/memory_banks.py @@ -20,7 +20,7 @@ from llama_models.schema_utils import json_schema_type, webmethod from pydantic import BaseModel, Field from llama_stack.apis.resource import Resource, ResourceType -from llama_stack.distribution.tracing import trace_protocol +from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol @json_schema_type diff --git a/llama_stack/apis/models/models.py b/llama_stack/apis/models/models.py index 2c0f1ee21..cb9cb1117 100644 --- a/llama_stack/apis/models/models.py +++ b/llama_stack/apis/models/models.py @@ -10,7 +10,7 @@ from llama_models.schema_utils import json_schema_type, webmethod from pydantic import BaseModel, ConfigDict, Field from llama_stack.apis.resource import Resource, ResourceType -from llama_stack.distribution.tracing import trace_protocol +from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol class CommonModelFields(BaseModel): diff --git a/llama_stack/apis/safety/safety.py b/llama_stack/apis/safety/safety.py index 41058f107..26ae45ae7 100644 --- a/llama_stack/apis/safety/safety.py +++ b/llama_stack/apis/safety/safety.py @@ -10,7 +10,7 @@ from typing import Any, Dict, List, Protocol, runtime_checkable from llama_models.schema_utils import json_schema_type, webmethod from pydantic import BaseModel -from llama_stack.distribution.tracing import trace_protocol +from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol from llama_models.llama3.api.datatypes import * # noqa: F403 from llama_stack.apis.shields import * # noqa: F403 diff --git a/llama_stack/apis/shields/shields.py b/llama_stack/apis/shields/shields.py index b28605727..8d4d5f9fd 100644 --- a/llama_stack/apis/shields/shields.py +++ b/llama_stack/apis/shields/shields.py @@ -10,7 +10,7 @@ from llama_models.schema_utils import json_schema_type, webmethod from pydantic import BaseModel from llama_stack.apis.resource import Resource, ResourceType -from llama_stack.distribution.tracing import trace_protocol +from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol class CommonShieldFields(BaseModel): diff --git a/llama_stack/providers/inline/memory/faiss/faiss.py b/llama_stack/providers/inline/memory/faiss/faiss.py index dfefefeb8..78de13120 100644 --- a/llama_stack/providers/inline/memory/faiss/faiss.py +++ b/llama_stack/providers/inline/memory/faiss/faiss.py @@ -27,7 +27,6 @@ from llama_stack.providers.utils.memory.vector_store import ( BankWithIndex, EmbeddingIndex, ) -from llama_stack.providers.utils.telemetry import tracing from .config import FaissImplConfig @@ -95,7 +94,6 @@ class FaissIndex(EmbeddingIndex): await self.kvstore.delete(f"faiss_index:v1::{self.bank_id}") - @tracing.span(name="add_chunks") async def add_chunks(self, chunks: List[Chunk], embeddings: NDArray): indexlen = len(self.id_by_index) for i, chunk in enumerate(chunks): diff --git a/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py b/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py index 6c4d7e8d4..2f00b21b8 100644 --- 
a/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py +++ b/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py @@ -4,10 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +import json from datetime import datetime from opentelemetry.sdk.trace import ReadableSpan from opentelemetry.sdk.trace.export import SpanProcessor +from opentelemetry.trace.status import StatusCode # Colors for console output COLORS = { @@ -25,10 +27,11 @@ COLORS = { class ConsoleSpanProcessor(SpanProcessor): - """A SpanProcessor that prints spans to the console with color formatting.""" + + def __init__(self, print_attributes: bool = False): + self.print_attributes = print_attributes def on_start(self, span: ReadableSpan, parent_context=None) -> None: - """Called when a span starts.""" if span.attributes and span.attributes.get("__autotraced__"): return @@ -39,11 +42,10 @@ class ConsoleSpanProcessor(SpanProcessor): print( f"{COLORS['dim']}{timestamp}{COLORS['reset']} " f"{COLORS['magenta']}[START]{COLORS['reset']} " - f"{COLORS['cyan']}{span.name}{COLORS['reset']}" + f"{COLORS['dim']}{span.name}{COLORS['reset']}" ) def on_end(self, span: ReadableSpan) -> None: - """Called when a span ends.""" if span.attributes and span.attributes.get("__autotraced__"): return @@ -51,50 +53,60 @@ class ConsoleSpanProcessor(SpanProcessor): "%H:%M:%S.%f" )[:-3] - # Build the span context string span_context = ( f"{COLORS['dim']}{timestamp}{COLORS['reset']} " f"{COLORS['magenta']}[END]{COLORS['reset']} " - f"{COLORS['cyan']}{span.name}{COLORS['reset']} " + f"{COLORS['dim']}{span.name}{COLORS['reset']}" ) - # Add status if not OK - if span.status.status_code != 0: # UNSET or ERROR - status_color = ( - COLORS["red"] if span.status.status_code == 2 else COLORS["yellow"] - ) - span_context += ( - f" {status_color}[{span.status.status_code}]{COLORS['reset']}" - ) + if span.status.status_code == StatusCode.ERROR: + span_context += f"{COLORS['reset']} {COLORS['red']}[ERROR]{COLORS['reset']}" + elif span.status.status_code != StatusCode.UNSET: + span_context += f"{COLORS['reset']} [{span.status.status_code}]" - # Add duration duration_ms = (span.end_time - span.start_time) / 1e6 - span_context += f" {COLORS['dim']}({duration_ms:.2f}ms){COLORS['reset']}" + span_context += f"{COLORS['reset']} ({duration_ms:.2f}ms)" - # Print the main span line print(span_context) - # Print attributes indented - if span.attributes: + if self.print_attributes and span.attributes: for key, value in span.attributes.items(): if key.startswith("__"): continue - print(f" {COLORS['dim']}{key}: {value}{COLORS['reset']}") + str_value = str(value) + if len(str_value) > 1000: + str_value = str_value[:997] + "..." 
+ print(f" {COLORS['dim']}{key}: {str_value}{COLORS['reset']}") - # Print events indented for event in span.events: event_time = datetime.utcfromtimestamp(event.timestamp / 1e9).strftime( "%H:%M:%S.%f" )[:-3] + + severity = event.attributes.get("severity", "info") + message = event.attributes.get("message", event.name) + if isinstance(message, (dict, list)): + message = json.dumps(message, indent=2) + + severity_colors = { + "error": f"{COLORS['bold']}{COLORS['red']}", + "warn": f"{COLORS['bold']}{COLORS['yellow']}", + "info": COLORS["white"], + "debug": COLORS["dim"], + } + msg_color = severity_colors.get(severity, COLORS["white"]) + print( - f" {COLORS['dim']}{event_time}{COLORS['reset']} " - f"{COLORS['cyan']}[EVENT]{COLORS['reset']} {event.name}" + f" {event_time} " + f"{msg_color}[{severity.upper()}] " + f"{message}{COLORS['reset']}" ) + if event.attributes: for key, value in event.attributes.items(): - if key.startswith("__"): + if key.startswith("__") or key in ["message", "severity"]: continue - print(f" {COLORS['dim']}{key}: {value}{COLORS['reset']}") + print(f" {COLORS['dim']}{key}: {value}{COLORS['reset']}") def shutdown(self) -> None: """Shutdown the processor.""" diff --git a/llama_stack/distribution/tracing.py b/llama_stack/providers/utils/telemetry/trace_protocol.py similarity index 100% rename from llama_stack/distribution/tracing.py rename to llama_stack/providers/utils/telemetry/trace_protocol.py From 0cb996c18d9358e9fe285b345983d4fe1fe87ade Mon Sep 17 00:00:00 2001 From: Aidan Do Date: Sat, 7 Dec 2024 07:03:31 +1100 Subject: [PATCH 37/69] doc: quickstart guide errors (#575) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Addresses a few errors I got when running the quick start guide: https://llama-stack.readthedocs.io/en/latest/getting_started/index.html. We should keep this up to date to maintain engagement with the community. I've annotated the PR below. Could you PTAL 🙏 ? ## Before submitting - [x] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). --- docs/source/getting_started/index.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/getting_started/index.md b/docs/source/getting_started/index.md index e6365208f..bae31e8c4 100644 --- a/docs/source/getting_started/index.md +++ b/docs/source/getting_started/index.md @@ -62,7 +62,7 @@ llama-stack-client --endpoint http://localhost:$LLAMA_STACK_PORT models list You can test basic Llama inference completion using the CLI too. ```bash llama-stack-client --endpoint http://localhost:$LLAMA_STACK_PORT \ - inference chat_completion \ + inference chat-completion \ --message "hello, what model are you?" ``` @@ -118,6 +118,7 @@ async def run_main(): model=os.environ["INFERENCE_MODEL"], instructions="You are a helpful assistant", tools=[{"type": "memory"}], # enable Memory aka RAG + enable_session_persistence=True, ) agent = Agent(client, agent_config) @@ -139,7 +140,7 @@ async def run_main(): attachments=attachments, session_id=session_id, ) - async for log in EventLogger().log(response): + for log in EventLogger().log(response): log.print() From 09fbf2d7861749e5d27ac881ac84ce5f79a102a6 Mon Sep 17 00:00:00 2001 From: Riandy Date: Sat, 7 Dec 2024 04:03:59 +0800 Subject: [PATCH 38/69] Add kotlin docs (#568) # What does this PR do? In short, provide a summary of what this PR does and why. Usually, the relevant context should be present in a linked issue. 
Docs update for Kotlin SDK release ## Test Plan Please describe: - tests you ran to verify your changes with result summaries. - provide instructions so it can be reproduced. ## Sources Please link relevant resources if necessary. ## Before submitting - [x] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. --- .../ondevice_distro/android_sdk.md | 246 ++++++++++++++++++ docs/source/index.md | 1 + 2 files changed, 247 insertions(+) create mode 100644 docs/source/distributions/ondevice_distro/android_sdk.md diff --git a/docs/source/distributions/ondevice_distro/android_sdk.md b/docs/source/distributions/ondevice_distro/android_sdk.md new file mode 100644 index 000000000..5a4e67e7e --- /dev/null +++ b/docs/source/distributions/ondevice_distro/android_sdk.md @@ -0,0 +1,246 @@ +# Llama Stack Client Kotlin API Library + +We are excited to share a guide for a Kotlin Library that brings front the benefits of Llama Stack to your Android device. This library is a set of SDKs that provide a simple and effective way to integrate AI capabilities into your Android app whether it is local (on-device) or remote inference. + +Features: +- Local Inferencing: Run Llama models purely on-device with real-time processing. We currently utilize ExecuTorch as the local inference distributor and may support others in the future. + - [ExecuTorch](https://github.com/pytorch/executorch/tree/main) is a complete end-to-end solution within the PyTorch framework for inferencing capabilities on-device with high portability and seamless performance. +- Remote Inferencing: Perform inferencing tasks remotely with Llama models hosted on a remote connection (or serverless localhost). +- Simple Integration: With easy-to-use APIs, a developer can quickly integrate Llama Stack in their Android app. The difference with local vs remote inferencing is also minimal. + +Latest release notes: TODO Add Release Notes + +## Android Demo App +Check out our demo app to see how to integrate Llama Stack into your Android app: + - TODO: Link to Demo App + +The key files in the app are `LlamaStackLocalInference.kt`, `LlamaStackRemoteInference.kts`, and `MainActivity.java`. With encompassed business logic, the app shows how to use Llama Stack for both the environments. + +## Quick Start + +### Add Dependencies +#### Kotlin Library +Add the following dependency in your `build.gradle.kts` file: +``` +dependencies { + implementation("com.llama.llamastack:llama-stack-client-kotlin:0.0.54") +} +``` +This will download jar files in your gradle cache in a directory like `~/.gradle/caches/modules-2/files-2.1/com.llama.llamastack/` + +If you plan on doing remote inferencing this is sufficient to get started. + +#### Dependency for Local + +> [!IMPORTANT] +> For local inferencing, it is required to include the ExecuTorch library into your app. + +Include the ExecuTorch library by: +1. Download the `download-prebuilt-et-lib.sh` script file from [Github](https://github.com/meta-llama/llama-stack-client-kotlin/blob/release/0.0.54/llama-stack-client-kotlin-client-local/download-prebuilt-et-lib.sh) to your local machine. +2. Move the script to the top level of your Android app where the app directory resides: +

+*(image: the download-prebuilt-et-lib.sh script placed at the top level of the Android app, next to the app directory)*
+ +3. Run `sh download-prebuilt-et-lib.sh` to create an `app/libs` directory and download the `executorch.aar` in that path. This generates an ExecuTorch library for the XNNPACK delegate. +4. Add the `executorch.aar` dependency in your `build.gradle.kts` file: +``` +dependencies { + ... + implementation(files("libs/executorch.aar")) + ... +} +``` + +## Llama Stack APIs in Your Android App +Breaking down the demo app, this section will show the core pieces that are used to initialize and run inference with Llama Stack using the Kotlin library. + +### Setup Remote Inferencing +Start a Llama Stack server on localhost. Here is an example of how you can do this using the firework.ai distribution: +``` +conda create -n stack-fireworks python=3.10 +conda activate stack-fireworks +pip install llama-stack=0.0.54 +llama stack build --template fireworks --image-type conda +export FIREWORKS_API_KEY= +llama stack run /Users//.llama/distributions/llamastack-fireworks/fireworks-run.yaml --port=5050 +``` + +Other inference providers: [Table](https://llama-stack.readthedocs.io/en/latest/index.html#supported-llama-stack-implementations) + +TODO: Link to Demo App on how to set this remote localhost in the Settings. + +### Initialize the Client +A client serves as the primary interface for interacting with a specific inference type and its associated parameters. Only after client is initialized then you can configure and start inferences. + + + + + + + + + + +
Local Inference | Remote Inference
+
+client = LlamaStackClientLocalClient
+                    .builder()
+                    .modelPath(modelPath)
+                    .tokenizerPath(tokenizerPath)
+                    .temperature(temperature)
+                    .build()
+
+
+ +```// remoteURL is a string like "http://localhost:5050" +client = LlamaStackClientOkHttpClient + .builder() + .baseUrl(remoteURL) + .build() +``` +
+ + +### Run Inference +With the Kotlin Library managing all the major operational logic, there are minimal to no changes when running simple chat inference for local or remote: + +``` +val result = client!!.inference().chatCompletion( + InferenceChatCompletionParams.builder() + .modelId(modelName) + .putAdditionalQueryParam("seq_len", sequenceLength.toString()) + .messages(listOfMessages) + .build() + ) + +// response contains string with response from model +var response = result.asChatCompletionResponse().completionMessage().content().string(); +``` + +### Setup Tool Calling + +TODO: Link to Android demo app readme for more details + + +## Advanced Users + +The purpose of this section is to share more details with users that would like to dive deeper into the Llama Stack Kotlin Library. Whether you’re interested in contributing to the open source library, debugging or just want to learn more, this section is for you! + +### Prerequisite + +You must complete the following steps: +1. Clone the repo +2. Port the appropriate ExecuTorch libraries over into your Llama Stack Kotlin library environment. +``` +cd llama-stack-client-kotlin-client-local +sh download-prebuilt-et-lib.sh --unzip +``` + +Now you will notice that the `jni/` , `libs/`, and `AndroidManifest.xml` files from the `executorch.aar` file are present in the local module. This way the local client module will be able to realize the ExecuTorch SDK. + +### Building for Development/Debugging +If you’d like to contribute to the Kotlin library via development, debug, or add play around with the library with various print statements, run the following command in your terminal under the llama-stack-client-kotlin directory. + +``` +sh build-libs.sh +``` + +Output: .jar files located in the build-jars directory + +Copy the .jar files over to the lib directory in your Android app. At the same time make sure to remove the llama-stack-client-kotlin dependency within your build.gradle.kts file in your app (or if you are using the demo app) to avoid having multiple llama stack client dependencies. + +### Additional Options for Local Inferencing +Currently we provide additional properties support with local inferencing. In order to get the tokens/sec metric for each inference call, add the following code in your Android app after you run your chatCompletion inference function. The Reference app has this implementation as well: +``` +var tps = (result.asChatCompletionResponse()._additionalProperties()["tps"] as JsonNumber).value as Float +``` +We will be adding more properties in the future. + +### Additional Options for Remote Inferencing + +#### Network options + +##### Retries + +Requests that experience certain errors are automatically retried 2 times by default, with a short exponential backoff. Connection errors (for example, due to a network connectivity problem), 408 Request Timeout, 409 Conflict, 429 Rate Limit, and >=500 Internal errors will all be retried by default. +You can provide a `maxRetries` on the client builder to configure this: + +```kotlin +val client = LlamaStackClientOkHttpClient.builder() + .fromEnv() + .maxRetries(4) + .build() +``` + +##### Timeouts + +Requests time out after 1 minute by default. You can configure this on the client builder: + +```kotlin +val client = LlamaStackClientOkHttpClient.builder() + .fromEnv() + .timeout(Duration.ofSeconds(30)) + .build() +``` + +##### Proxies + +Requests can be routed through a proxy. 
You can configure this on the client builder: + +```kotlin +val client = LlamaStackClientOkHttpClient.builder() + .fromEnv() + .proxy(new Proxy( + Type.HTTP, + new InetSocketAddress("proxy.com", 8080) + )) + .build() +``` + +##### Environments + +Requests are made to the production environment by default. You can connect to other environments, like `sandbox`, via the client builder: + +```kotlin +val client = LlamaStackClientOkHttpClient.builder() + .fromEnv() + .sandbox() + .build() +``` + +### Error Handling +This library throws exceptions in a single hierarchy for easy handling: + +- **`LlamaStackClientException`** - Base exception for all exceptions + + - **`LlamaStackClientServiceException`** - HTTP errors with a well-formed response body we were able to parse. The exception message and the `.debuggingRequestId()` will be set by the server. + + | 400 | BadRequestException | + | ------ | ----------------------------- | + | 401 | AuthenticationException | + | 403 | PermissionDeniedException | + | 404 | NotFoundException | + | 422 | UnprocessableEntityException | + | 429 | RateLimitException | + | 5xx | InternalServerException | + | others | UnexpectedStatusCodeException | + + - **`LlamaStackClientIoException`** - I/O networking errors + - **`LlamaStackClientInvalidDataException`** - any other exceptions on the client side, e.g.: + - We failed to serialize the request body + - We failed to parse the response body (has access to response code and body) + + + +## Known Issues +1. Streaming response is a work-in-progress for local and remote inference +2. Due to #1, agents are not supported at the time. LS agents only work in streaming mode +3. Changing to another model is a work in progress for local and remote platforms + +## Thanks +- We'd like to extend our thanks to the ExecuTorch team for providing their support as we integrated ExecuTorch as one of the local inference distributors for Llama Stack. Checkout [ExecuTorch Github repo](https://github.com/pytorch/executorch/tree/main) for more information about Executorch. + +--- + +The API interface is generated using the OpenAPI standard with [Stainless](https://www.stainlessapi.com/). diff --git a/docs/source/index.md b/docs/source/index.md index abfaf51b4..adfa8c8ab 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -54,6 +54,7 @@ Llama Stack already has a number of "adapters" available for some popular Infere | Chroma | Single Node | | | Y | | | | Postgres | Single Node | | | Y | | | | PyTorch ExecuTorch | On-device iOS | Y | Y | | | +| PyTorch ExecuTorch | On-device Android | | Y | | | ## Dive In From e4a2948684f2589f3e59003ce0580a21360c929e Mon Sep 17 00:00:00 2001 From: Riandy Date: Sat, 7 Dec 2024 04:53:28 +0800 Subject: [PATCH 39/69] Update android_sdk.md (#578) Fix images URL and replacing todo. Previous commit missed that # What does this PR do? In short, provide a summary of what this PR does and why. Usually, the relevant context should be present in a linked issue. - [ ] Addresses issue (#issue) ## Test Plan Please describe: - tests you ran to verify your changes with result summaries. - provide instructions so it can be reproduced. ## Sources Please link relevant resources if necessary. ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? 
- [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. --- docs/source/distributions/index.md | 2 +- .../ondevice_distro/android_sdk.md | 35 ++++++++++--------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/docs/source/distributions/index.md b/docs/source/distributions/index.md index b61e9b28f..d361cad2f 100644 --- a/docs/source/distributions/index.md +++ b/docs/source/distributions/index.md @@ -35,6 +35,6 @@ If so, we suggest: - **Do you want to run Llama Stack inference on your iOS / Android device** If so, we suggest: - [iOS SDK](ondevice_distro/ios_sdk) - - Android (coming soon) + - [Android](ondevice_distro/android_sdk) You can also build your own [custom distribution](building_distro). diff --git a/docs/source/distributions/ondevice_distro/android_sdk.md b/docs/source/distributions/ondevice_distro/android_sdk.md index 5a4e67e7e..4fe7fc265 100644 --- a/docs/source/distributions/ondevice_distro/android_sdk.md +++ b/docs/source/distributions/ondevice_distro/android_sdk.md @@ -8,11 +8,10 @@ Features: - Remote Inferencing: Perform inferencing tasks remotely with Llama models hosted on a remote connection (or serverless localhost). - Simple Integration: With easy-to-use APIs, a developer can quickly integrate Llama Stack in their Android app. The difference with local vs remote inferencing is also minimal. -Latest release notes: TODO Add Release Notes +Latest Release Notes: [v0.0.54](https://github.com/meta-llama/llama-stack-client-kotlin/releases/tag/v0.0.54) ## Android Demo App -Check out our demo app to see how to integrate Llama Stack into your Android app: - - TODO: Link to Demo App +Check out our demo app to see how to integrate Llama Stack into your Android app: [Android Demo App](https://github.com/meta-llama/llama-stack-apps/tree/main/examples/android_app) The key files in the app are `LlamaStackLocalInference.kt`, `LlamaStackRemoteInference.kts`, and `MainActivity.java`. With encompassed business logic, the app shows how to use Llama Stack for both the environments. @@ -32,17 +31,16 @@ If you plan on doing remote inferencing this is sufficient to get started. #### Dependency for Local -> [!IMPORTANT] -> For local inferencing, it is required to include the ExecuTorch library into your app. +For local inferencing, it is required to include the ExecuTorch library into your app. Include the ExecuTorch library by: -1. Download the `download-prebuilt-et-lib.sh` script file from [Github](https://github.com/meta-llama/llama-stack-client-kotlin/blob/release/0.0.54/llama-stack-client-kotlin-client-local/download-prebuilt-et-lib.sh) to your local machine. +1. Download the `download-prebuilt-et-lib.sh` script file from the [llama-stack-client-kotlin-client-local](https://github.com/meta-llama/llama-stack-client-kotlin/blob/release/0.0.54/llama-stack-client-kotlin-client-local/download-prebuilt-et-lib.sh) directory to your local machine. 2. Move the script to the top level of your Android app where the app directory resides:

-3. Run `sh download-prebuilt-et-lib.sh` to create an `app/libs` directory and download the `executorch.aar` in that path. This generates an ExecuTorch library for the XNNPACK delegate. +3. Run `sh download-prebuilt-et-lib.sh` to create an `app/libs` directory and download the `executorch.aar` in that path. This generates an ExecuTorch library for the XNNPACK delegate with commit: [0a12e33](https://github.com/pytorch/executorch/commit/0a12e33d22a3d44d1aa2af5f0d0673d45b962553). 4. Add the `executorch.aar` dependency in your `build.gradle.kts` file: ``` dependencies { @@ -68,7 +66,7 @@ llama stack run /Users//.llama/distributions/llamastack-fireworks Other inference providers: [Table](https://llama-stack.readthedocs.io/en/latest/index.html#supported-llama-stack-implementations) -TODO: Link to Demo App on how to set this remote localhost in the Settings. +How to set remote localhost in Demo App: [Settings](https://github.com/meta-llama/llama-stack-apps/tree/main/examples/android_app#settings) ### Initialize the Client A client serves as the primary interface for interacting with a specific inference type and its associated parameters. Only after client is initialized then you can configure and start inferences. @@ -80,18 +78,20 @@ A client serves as the primary interface for interacting with a specific inferen -
+
+```
 client = LlamaStackClientLocalClient
                     .builder()
                     .modelPath(modelPath)
                     .tokenizerPath(tokenizerPath)
                     .temperature(temperature)
                     .build()
-
+``` -```// remoteURL is a string like "http://localhost:5050" +``` +// remoteURL is a string like "http://localhost:5050" client = LlamaStackClientOkHttpClient .builder() .baseUrl(remoteURL) @@ -120,8 +120,7 @@ var response = result.asChatCompletionResponse().completionMessage().content().s ### Setup Tool Calling -TODO: Link to Android demo app readme for more details - +Android demo app for more details: [Tool Calling](https://github.com/meta-llama/llama-stack-apps/tree/main/examples/android_app#tool-calling) ## Advanced Users @@ -130,7 +129,7 @@ The purpose of this section is to share more details with users that would like ### Prerequisite You must complete the following steps: -1. Clone the repo +1. Clone the repo (`git clone https://github.com/meta-llama/llama-stack-client-kotlin.git -b release/0.0.54`) 2. Port the appropriate ExecuTorch libraries over into your Llama Stack Kotlin library environment. ``` cd llama-stack-client-kotlin-client-local @@ -231,15 +230,17 @@ This library throws exceptions in a single hierarchy for easy handling: - We failed to serialize the request body - We failed to parse the response body (has access to response code and body) - +## Reporting Issues +If you encountered any bugs or issues following this guide please file a bug/issue on our [Github issue tracker](https://github.com/meta-llama/llama-stack-client-kotlin/issues). ## Known Issues +We're aware of the following issues and are working to resolve them: 1. Streaming response is a work-in-progress for local and remote inference 2. Due to #1, agents are not supported at the time. LS agents only work in streaming mode 3. Changing to another model is a work in progress for local and remote platforms ## Thanks -- We'd like to extend our thanks to the ExecuTorch team for providing their support as we integrated ExecuTorch as one of the local inference distributors for Llama Stack. Checkout [ExecuTorch Github repo](https://github.com/pytorch/executorch/tree/main) for more information about Executorch. +We'd like to extend our thanks to the ExecuTorch team for providing their support as we integrated ExecuTorch as one of the local inference distributors for Llama Stack. Checkout [ExecuTorch Github repo](https://github.com/pytorch/executorch/tree/main) for more information. --- From b3cb8eaa3867750dcf217a1887418c22f728c751 Mon Sep 17 00:00:00 2001 From: Riandy Date: Sat, 7 Dec 2024 06:45:29 +0800 Subject: [PATCH 40/69] Bump kotlin docs to 0.0.54.1 (#579) # What does this PR do? In short, provide a summary of what this PR does and why. Usually, the relevant context should be present in a linked issue. Updating the kotlin docs to refer to version 0.0.54.1 of the SDK instead of 0.0.54 because we discovered a bug in 0.0.54 where local module as a dependencies are not included automatically. See https://github.com/meta-llama/llama-stack-client-kotlin/commit/593ed21d5f91934b2486a93de4c19b1b38ae4708 ## Test Plan Please describe: - tests you ran to verify your changes with result summaries. - provide instructions so it can be reproduced. docs changes. Changes are tested on the llama stack apps side separately and verified to be working ## Sources Please link relevant resources if necessary. ## Before submitting - [x] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [x] Ran pre-commit to handle lint / formatting issues. - [x] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? 
- [x] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. --- .../distributions/ondevice_distro/android_sdk.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/distributions/ondevice_distro/android_sdk.md b/docs/source/distributions/ondevice_distro/android_sdk.md index 4fe7fc265..47af8967b 100644 --- a/docs/source/distributions/ondevice_distro/android_sdk.md +++ b/docs/source/distributions/ondevice_distro/android_sdk.md @@ -8,7 +8,7 @@ Features: - Remote Inferencing: Perform inferencing tasks remotely with Llama models hosted on a remote connection (or serverless localhost). - Simple Integration: With easy-to-use APIs, a developer can quickly integrate Llama Stack in their Android app. The difference with local vs remote inferencing is also minimal. -Latest Release Notes: [v0.0.54](https://github.com/meta-llama/llama-stack-client-kotlin/releases/tag/v0.0.54) +Latest Release Notes: [v0.0.54.1](https://github.com/meta-llama/llama-stack-client-kotlin/releases/tag/v0.0.54.1) ## Android Demo App Check out our demo app to see how to integrate Llama Stack into your Android app: [Android Demo App](https://github.com/meta-llama/llama-stack-apps/tree/main/examples/android_app) @@ -22,7 +22,7 @@ The key files in the app are `LlamaStackLocalInference.kt`, `LlamaStackRemoteInf Add the following dependency in your `build.gradle.kts` file: ``` dependencies { - implementation("com.llama.llamastack:llama-stack-client-kotlin:0.0.54") + implementation("com.llama.llamastack:llama-stack-client-kotlin:0.0.54.1") } ``` This will download jar files in your gradle cache in a directory like `~/.gradle/caches/modules-2/files-2.1/com.llama.llamastack/` @@ -34,10 +34,10 @@ If you plan on doing remote inferencing this is sufficient to get started. For local inferencing, it is required to include the ExecuTorch library into your app. Include the ExecuTorch library by: -1. Download the `download-prebuilt-et-lib.sh` script file from the [llama-stack-client-kotlin-client-local](https://github.com/meta-llama/llama-stack-client-kotlin/blob/release/0.0.54/llama-stack-client-kotlin-client-local/download-prebuilt-et-lib.sh) directory to your local machine. +1. Download the `download-prebuilt-et-lib.sh` script file from the [llama-stack-client-kotlin-client-local](https://github.com/meta-llama/llama-stack-client-kotlin/blob/release/0.0.54.1/llama-stack-client-kotlin-client-local/download-prebuilt-et-lib.sh) directory to your local machine. 2. Move the script to the top level of your Android app where the app directory resides:

3. Run `sh download-prebuilt-et-lib.sh` to create an `app/libs` directory and download the `executorch.aar` in that path. This generates an ExecuTorch library for the XNNPACK delegate with commit: [0a12e33](https://github.com/pytorch/executorch/commit/0a12e33d22a3d44d1aa2af5f0d0673d45b962553). @@ -129,7 +129,7 @@ The purpose of this section is to share more details with users that would like ### Prerequisite You must complete the following steps: -1. Clone the repo (`git clone https://github.com/meta-llama/llama-stack-client-kotlin.git -b release/0.0.54`) +1. Clone the repo (`git clone https://github.com/meta-llama/llama-stack-client-kotlin.git -b release/0.0.54.1`) 2. Port the appropriate ExecuTorch libraries over into your Llama Stack Kotlin library environment. ``` cd llama-stack-client-kotlin-client-local From 14f973a64f4f6bee011d94910eea67d75375998f Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Sat, 7 Dec 2024 14:59:36 -0800 Subject: [PATCH 41/69] Make LlamaStackLibraryClient work correctly (#581) This PR does a few things: - it moves "direct client" to llama-stack repo instead of being in the llama-stack-client-python repo - renames it to `LlamaStackLibraryClient` - actually makes synchronous generators work - makes streaming and non-streaming work properly In many ways, this PR makes things finally "work" ## Test Plan See a `library_client_test.py` I added. This isn't really quite a test yet but it demonstrates that this mode now works. Here's the invocation and the response: ``` INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct python llama_stack/distribution/tests/library_client_test.py ollama ``` ![image](https://github.com/user-attachments/assets/17d4e116-4457-4755-a14e-d9a668801fe0) --- llama_stack/distribution/build.py | 6 +- llama_stack/distribution/library_client.py | 272 ++++++++++++++++++ .../distribution/tests/library_client_test.py | 103 +++++++ .../remote/inference/ollama/ollama.py | 1 - 4 files changed, 378 insertions(+), 4 deletions(-) create mode 100644 llama_stack/distribution/library_client.py create mode 100644 llama_stack/distribution/tests/library_client_test.py diff --git a/llama_stack/distribution/build.py b/llama_stack/distribution/build.py index 9d0ad9af4..3349a7d50 100644 --- a/llama_stack/distribution/build.py +++ b/llama_stack/distribution/build.py @@ -46,7 +46,7 @@ class ApiInput(BaseModel): def get_provider_dependencies( - config_providers: Dict[str, List[Provider]] + config_providers: Dict[str, List[Provider]], ) -> tuple[list[str], list[str]]: """Get normal and special dependencies from provider configuration.""" all_providers = get_provider_registry() @@ -92,11 +92,11 @@ def print_pip_install_help(providers: Dict[str, List[Provider]]): normal_deps, special_deps = get_provider_dependencies(providers) cprint( - f"Please install needed dependencies using the following commands:\n\n\tpip install {' '.join(normal_deps)}", + f"Please install needed dependencies using the following commands:\n\npip install {' '.join(normal_deps)}", "yellow", ) for special_dep in special_deps: - cprint(f"\tpip install {special_dep}", "yellow") + cprint(f"pip install {special_dep}", "yellow") print() diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py new file mode 100644 index 000000000..4de06ae08 --- /dev/null +++ b/llama_stack/distribution/library_client.py @@ -0,0 +1,272 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import asyncio +import inspect +import queue +import threading +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path +from typing import Any, Generator, get_args, get_origin, Optional, TypeVar + +import yaml +from llama_stack_client import AsyncLlamaStackClient, LlamaStackClient, NOT_GIVEN +from pydantic import TypeAdapter +from rich.console import Console + +from termcolor import cprint + +from llama_stack.distribution.build import print_pip_install_help +from llama_stack.distribution.configure import parse_and_maybe_upgrade_config +from llama_stack.distribution.resolver import ProviderRegistry +from llama_stack.distribution.server.endpoints import get_all_api_endpoints +from llama_stack.distribution.stack import ( + construct_stack, + get_stack_run_config_from_template, + replace_env_vars, +) + +T = TypeVar("T") + + +def stream_across_asyncio_run_boundary( + async_gen_maker, + pool_executor: ThreadPoolExecutor, +) -> Generator[T, None, None]: + result_queue = queue.Queue() + stop_event = threading.Event() + + async def consumer(): + # make sure we make the generator in the event loop context + gen = await async_gen_maker() + try: + async for item in gen: + result_queue.put(item) + except Exception as e: + print(f"Error in generator {e}") + result_queue.put(e) + except asyncio.CancelledError: + return + finally: + result_queue.put(StopIteration) + stop_event.set() + + def run_async(): + # Run our own loop to avoid double async generator cleanup which is done + # by asyncio.run() + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + task = loop.create_task(consumer()) + loop.run_until_complete(task) + finally: + # Handle pending tasks like a generator's athrow() + pending = asyncio.all_tasks(loop) + if pending: + loop.run_until_complete( + asyncio.gather(*pending, return_exceptions=True) + ) + loop.close() + + future = pool_executor.submit(run_async) + + try: + # yield results as they come in + while not stop_event.is_set() or not result_queue.empty(): + try: + item = result_queue.get(timeout=0.1) + if item is StopIteration: + break + if isinstance(item, Exception): + raise item + yield item + except queue.Empty: + continue + finally: + future.result() + + +class LlamaStackAsLibraryClient(LlamaStackClient): + def __init__( + self, + config_path_or_template_name: str, + custom_provider_registry: Optional[ProviderRegistry] = None, + ): + super().__init__() + self.async_client = AsyncLlamaStackAsLibraryClient( + config_path_or_template_name, custom_provider_registry + ) + self.pool_executor = ThreadPoolExecutor(max_workers=4) + + def initialize(self): + asyncio.run(self.async_client.initialize()) + + def get(self, *args, **kwargs): + if kwargs.get("stream"): + return stream_across_asyncio_run_boundary( + lambda: self.async_client.get(*args, **kwargs), + self.pool_executor, + ) + else: + return asyncio.run(self.async_client.get(*args, **kwargs)) + + def post(self, *args, **kwargs): + if kwargs.get("stream"): + return stream_across_asyncio_run_boundary( + lambda: self.async_client.post(*args, **kwargs), + self.pool_executor, + ) + else: + return asyncio.run(self.async_client.post(*args, **kwargs)) + + +class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): + def __init__( + self, + config_path_or_template_name: str, + custom_provider_registry: Optional[ProviderRegistry] = None, + ): + super().__init__() + + if 
config_path_or_template_name.endswith(".yaml"): + config_path = Path(config_path_or_template_name) + if not config_path.exists(): + raise ValueError(f"Config file {config_path} does not exist") + config_dict = replace_env_vars(yaml.safe_load(config_path.read_text())) + config = parse_and_maybe_upgrade_config(config_dict) + else: + # template + config = get_stack_run_config_from_template(config_path_or_template_name) + + self.config_path_or_template_name = config_path_or_template_name + self.config = config + self.custom_provider_registry = custom_provider_registry + + async def initialize(self): + try: + self.impls = await construct_stack( + self.config, self.custom_provider_registry + ) + except ModuleNotFoundError as e: + cprint( + "Using llama-stack as a library requires installing dependencies depending on the template (providers) you choose.\n", + "yellow", + ) + print_pip_install_help(self.config.providers) + raise e + + console = Console() + console.print(f"Using config [blue]{self.config_path_or_template_name}[/blue]:") + console.print(yaml.dump(self.config.model_dump(), indent=2)) + + endpoints = get_all_api_endpoints() + endpoint_impls = {} + for api, api_endpoints in endpoints.items(): + for endpoint in api_endpoints: + impl = self.impls[api] + func = getattr(impl, endpoint.name) + endpoint_impls[endpoint.route] = func + + self.endpoint_impls = endpoint_impls + + async def get( + self, + path: str, + *, + stream=False, + **kwargs, + ): + if not self.endpoint_impls: + raise ValueError("Client not initialized") + + if stream: + return self._call_streaming(path, "GET") + else: + return await self._call_non_streaming(path, "GET") + + async def post( + self, + path: str, + *, + body: dict = None, + stream=False, + **kwargs, + ): + if not self.endpoint_impls: + raise ValueError("Client not initialized") + + if stream: + return self._call_streaming(path, "POST", body) + else: + return await self._call_non_streaming(path, "POST", body) + + async def _call_non_streaming(self, path: str, method: str, body: dict = None): + func = self.endpoint_impls.get(path) + if not func: + raise ValueError(f"No endpoint found for {path}") + + body = self._convert_body(path, body) + return await func(**body) + + async def _call_streaming(self, path: str, method: str, body: dict = None): + func = self.endpoint_impls.get(path) + if not func: + raise ValueError(f"No endpoint found for {path}") + + body = self._convert_body(path, body) + async for chunk in await func(**body): + yield chunk + + def _convert_body(self, path: str, body: Optional[dict] = None) -> dict: + if not body: + return {} + + func = self.endpoint_impls[path] + sig = inspect.signature(func) + + # Strip NOT_GIVENs to use the defaults in signature + body = {k: v for k, v in body.items() if v is not NOT_GIVEN} + + # Convert parameters to Pydantic models where needed + converted_body = {} + for param_name, param in sig.parameters.items(): + if param_name in body: + value = body.get(param_name) + converted_body[param_name] = self._convert_param( + param.annotation, value + ) + return converted_body + + def _convert_param(self, annotation: Any, value: Any) -> Any: + if isinstance(annotation, type) and annotation in {str, int, float, bool}: + return value + + origin = get_origin(annotation) + if origin is list: + item_type = get_args(annotation)[0] + try: + return [self._convert_param(item_type, item) for item in value] + except Exception: + print(f"Error converting list {value}") + return value + + elif origin is dict: + key_type, val_type = 
get_args(annotation) + try: + return {k: self._convert_param(val_type, v) for k, v in value.items()} + except Exception: + print(f"Error converting dict {value}") + return value + + try: + # Handle Pydantic models and discriminated unions + return TypeAdapter(annotation).validate_python(value) + except Exception as e: + cprint( + f"Warning: direct client failed to convert parameter {value} into {annotation}: {e}", + "yellow", + ) + return value diff --git a/llama_stack/distribution/tests/library_client_test.py b/llama_stack/distribution/tests/library_client_test.py new file mode 100644 index 000000000..8381f5470 --- /dev/null +++ b/llama_stack/distribution/tests/library_client_test.py @@ -0,0 +1,103 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import argparse +import os + +from llama_stack.distribution.library_client import LlamaStackAsLibraryClient +from llama_stack_client.lib.agents.agent import Agent +from llama_stack_client.lib.agents.event_logger import EventLogger as AgentEventLogger +from llama_stack_client.lib.inference.event_logger import EventLogger +from llama_stack_client.types import UserMessage +from llama_stack_client.types.agent_create_params import AgentConfig + + +def main(config_path: str): + client = LlamaStackAsLibraryClient(config_path) + client.initialize() + + models = client.models.list() + print("\nModels:") + for model in models: + print(model) + + if not models: + print("No models found, skipping chat completion test") + return + + model_id = models[0].identifier + response = client.inference.chat_completion( + messages=[UserMessage(content="What is the capital of France?", role="user")], + model_id=model_id, + stream=False, + ) + print("\nChat completion response (non-stream):") + print(response) + + response = client.inference.chat_completion( + messages=[UserMessage(content="What is the capital of France?", role="user")], + model_id=model_id, + stream=True, + ) + + print("\nChat completion response (stream):") + for log in EventLogger().log(response): + log.print() + + print("\nAgent test:") + agent_config = AgentConfig( + model=model_id, + instructions="You are a helpful assistant", + sampling_params={ + "strategy": "greedy", + "temperature": 1.0, + "top_p": 0.9, + }, + tools=( + [ + { + "type": "brave_search", + "engine": "brave", + "api_key": os.getenv("BRAVE_SEARCH_API_KEY"), + } + ] + if os.getenv("BRAVE_SEARCH_API_KEY") + else [] + ), + tool_choice="auto", + tool_prompt_format="json", + input_shields=[], + output_shields=[], + enable_session_persistence=False, + ) + agent = Agent(client, agent_config) + user_prompts = [ + "Hello", + "Which players played in the winning team of the NBA western conference semifinals of 2024, please use tools", + ] + + session_id = agent.create_session("test-session") + + for prompt in user_prompts: + response = agent.create_turn( + messages=[ + { + "role": "user", + "content": prompt, + } + ], + session_id=session_id, + ) + + for log in AgentEventLogger().log(response): + log.print() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("config_path", help="Path to the config YAML file") + args = parser.parse_args() + main(args.config_path) diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py index f89629afc..d6fa20835 100644 --- 
a/llama_stack/providers/remote/inference/ollama/ollama.py +++ b/llama_stack/providers/remote/inference/ollama/ollama.py @@ -269,7 +269,6 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate): r = await self.client.chat(**params) else: r = await self.client.generate(**params) - assert isinstance(r, dict) if "message" in r: choice = OpenAICompatCompletionChoice( From a29013112fba5ce009a4942f5d52f540ddd8d767 Mon Sep 17 00:00:00 2001 From: Henry Tu Date: Sun, 8 Dec 2024 01:42:07 -0500 Subject: [PATCH 42/69] Update integration type for Cerebras to hosted (#583) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? I think I misunderstood the meaning of “single node” when describing the type of the Cerebras integration. It should be hosted instead of single node as the inference is done via API call. cc: @ashwinb @raghotham - [X] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2e7585583..f60069e45 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ Additionally, we have designed every element of the Stack such that APIs as well | **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | | Meta Reference | Single Node | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | -| Cerebras | Single Node | | :heavy_check_mark: | | | | +| Cerebras | Hosted | | :heavy_check_mark: | | | | | Fireworks | Hosted | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | | | AWS Bedrock | Hosted | | :heavy_check_mark: | | :heavy_check_mark: | | | Together | Hosted | :heavy_check_mark: | :heavy_check_mark: | | :heavy_check_mark: | | From 1274fa4c0d633ccd907438b747fa5f931db1247b Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Sun, 8 Dec 2024 14:56:03 -0800 Subject: [PATCH 43/69] Add documentations for building applications and with some content for agentic loop --- docs/requirements.txt | 1 + docs/source/building_applications/index.md | 416 ++++++++++++++++++++- docs/source/conf.py | 3 + docs/source/distributions/configuration.md | 2 + docs/source/getting_started/index.md | 18 +- 5 files changed, 424 insertions(+), 16 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index c182f41c4..d455cf6b5 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -9,3 +9,4 @@ sphinx-tabs sphinx-design sphinxcontrib-openapi sphinxcontrib-redoc +sphinxcontrib-mermaid diff --git a/docs/source/building_applications/index.md b/docs/source/building_applications/index.md index 1c333c4a7..6e2062204 100644 --- a/docs/source/building_applications/index.md +++ b/docs/source/building_applications/index.md @@ -1,17 +1,413 @@ -# Building Applications +# Building AI Applications -```{admonition} Work in Progress -:class: warning +Llama Stack provides all the building blocks needed to create sophisticated AI applications. This guide will walk you through how to use these components effectively. 
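All of the examples below assume a `client` already pointed at a running stack server. A minimal setup sketch (the port mirrors the rest of these docs; reading it from an environment variable is an added convenience, not part of the original page):

```python
# pip install llama-stack-client
import os

from llama_stack_client import LlamaStackClient

# Connect to a server started with `llama stack run` (or the docker image).
client = LlamaStackClient(
    base_url=f"http://localhost:{os.environ.get('LLAMA_STACK_PORT', '5001')}"
)
```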
-## What can you do with the Stack? +## Basic Inference -- Agents - - what is a turn? session? - - inference - - memory / RAG; pre-ingesting content or attaching content in a turn - - how does tool calling work - - can you do evaluation? +The foundation of any AI application is the ability to interact with LLM models. Llama Stack provides a simple interface for both completion and chat-based inference: + +```python +from llama_stack_client import LlamaStackClient + +client = LlamaStackClient(base_url="http://localhost:5001") + +# List available models +models = client.models.list() + +# Simple chat completion +response = client.inference.chat_completion( + model_id="Llama3.2-3B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Write a haiku about coding"} + ] +) +print(response.completion_message.content) ``` + +## Adding Memory & RAG + +Memory enables your applications to reference and recall information from previous interactions or external documents. Llama Stack's memory system is built around the concept of Memory Banks: + +1. **Vector Memory Banks**: For semantic search and retrieval +2. **Key-Value Memory Banks**: For structured data storage +3. **Keyword Memory Banks**: For basic text search +4. **Graph Memory Banks**: For relationship-based retrieval + +Here's how to set up a vector memory bank for RAG: + +```python +# Register a memory bank +bank_id = "my_documents" +response = client.memory_banks.register( + memory_bank_id=bank_id, + params={ + "memory_bank_type": "vector", + "embedding_model": "all-MiniLM-L6-v2", + "chunk_size_in_tokens": 512 + } +) + +# Insert documents +documents = [ + { + "document_id": "doc1", + "content": "Your document text here", + "mime_type": "text/plain" + } +] +client.memory.insert(bank_id, documents) + +# Query documents +results = client.memory.query( + bank_id=bank_id, + query="What do you know about...", +) +``` + +## Implementing Safety Guardrails + +Safety is a critical component of any AI application. Llama Stack provides a Shield system that can be applied at multiple touchpoints: + +```python +# Register a safety shield +shield_id = "content_safety" +client.shields.register( + shield_id=shield_id, + provider_shield_id="llama-guard-basic" +) + +# Run content through shield +response = client.safety.run_shield( + shield_id=shield_id, + messages=[{"role": "user", "content": "User message here"}] +) + +if response.violation: + print(f"Safety violation detected: {response.violation.user_message}") +``` + +## Building Agents + +Agents are the heart of complex AI applications. They combine inference, memory, safety, and tool usage into coherent workflows. At its core, an agent follows a sophisticated execution loop that enables multi-step reasoning, tool usage, and safety checks. + +### The Agent Execution Loop + +Each agent turn follows these key steps: + +1. **Initial Safety Check**: The user's input is first screened through configured safety shields + +2. **Context Retrieval**: + - If RAG is enabled, the agent queries relevant documents from memory banks + - For new documents, they are first inserted into the memory bank + - Retrieved context is augmented to the user's prompt + +3. 
**Inference Loop**: The agent enters its main execution loop: + - The LLM receives the augmented prompt (with context and/or previous tool outputs) + - The LLM generates a response, potentially with tool calls + - If tool calls are present: + - Tool inputs are safety-checked + - Tools are executed (e.g., web search, code execution) + - Tool responses are fed back to the LLM for synthesis + - The loop continues until: + - The LLM provides a final response without tool calls + - Maximum iterations are reached + - Token limit is exceeded + +4. **Final Safety Check**: The agent's final response is screened through safety shields + +```{mermaid} +sequenceDiagram + participant U as User + participant E as Executor + participant M as Memory Bank + participant L as LLM + participant T as Tools + participant S as Safety Shield + + Note over U,S: Agent Turn Start + U->>S: 1. Submit Prompt + activate S + S->>E: Input Safety Check + deactivate S + + E->>M: 2.1 Query Context + M-->>E: 2.2 Retrieved Documents + + loop Inference Loop + E->>L: 3.1 Augment with Context + L-->>E: 3.2 Response (with/without tool calls) + + alt Has Tool Calls + E->>S: Check Tool Input + S->>T: 4.1 Execute Tool + T-->>E: 4.2 Tool Response + E->>L: 5.1 Tool Response + L-->>E: 5.2 Synthesized Response + end + + opt Stop Conditions + Note over E: Break if: + Note over E: - No tool calls + Note over E: - Max iterations reached + Note over E: - Token limit exceeded + end + end + + E->>S: Output Safety Check + S->>U: 6. Final Response +``` + +Each step in this process can be monitored and controlled through configurations. Here's an example that demonstrates monitoring the agent's execution: + +```python +from llama_stack_client.lib.agents.event_logger import EventLogger + +agent_config = AgentConfig( + model="Llama3.2-3B-Instruct", + instructions="You are a helpful assistant", + # Enable both RAG and tool usage + tools=[ + { + "type": "memory", + "memory_bank_configs": [{ + "type": "vector", + "bank_id": "my_docs" + }], + "max_tokens_in_context": 4096 + }, + { + "type": "code_interpreter", + "enable_inline_code_execution": True + } + ], + # Configure safety + input_shields=["content_safety"], + output_shields=["content_safety"], + # Control the inference loop + max_infer_iters=5, + sampling_params={ + "temperature": 0.7, + "max_tokens": 2048 + } +) + +agent = Agent(client, agent_config) +session_id = agent.create_session("monitored_session") + +# Stream the agent's execution steps +response = agent.create_turn( + messages=[{"role": "user", "content": "Analyze this code and run it"}], + attachments=[{ + "content": "https://raw.githubusercontent.com/example/code.py", + "mime_type": "text/plain" + }], + session_id=session_id +) + +# Monitor each step of execution +for log in EventLogger().log(response): + if log.event.step_type == "memory_retrieval": + print("Retrieved context:", log.event.retrieved_context) + elif log.event.step_type == "inference": + print("LLM output:", log.event.model_response) + elif log.event.step_type == "tool_execution": + print("Tool call:", log.event.tool_call) + print("Tool response:", log.event.tool_response) + elif log.event.step_type == "shield_call": + if log.event.violation: + print("Safety violation:", log.event.violation) +``` + +This example shows how an agent can: Llama Stack provides a high-level agent framework: + +```python +from llama_stack_client.lib.agents.agent import Agent +from llama_stack_client.types.agent_create_params import AgentConfig + +# Configure an agent +agent_config = 
AgentConfig( + model="Llama3.2-3B-Instruct", + instructions="You are a helpful assistant", + tools=[ + { + "type": "memory", + "memory_bank_configs": [], + "query_generator_config": { + "type": "default", + "sep": " " + } + } + ], + input_shields=["content_safety"], + output_shields=["content_safety"], + enable_session_persistence=True +) + +# Create an agent +agent = Agent(client, agent_config) +session_id = agent.create_session("my_session") + +# Run agent turns +response = agent.create_turn( + messages=[{"role": "user", "content": "Your question here"}], + session_id=session_id +) +``` + +### Adding Tools to Agents + +Agents can be enhanced with various tools: + +1. **Search**: Web search capabilities through providers like Brave +2. **Code Interpreter**: Execute code snippets +3. **RAG**: Memory and document retrieval +4. **Function Calling**: Custom function execution +5. **WolframAlpha**: Mathematical computations +6. **Photogen**: Image generation + +Example of configuring an agent with tools: + +```python +agent_config = AgentConfig( + model="Llama3.2-3B-Instruct", + tools=[ + { + "type": "brave_search", + "api_key": "YOUR_API_KEY", + "engine": "brave" + }, + { + "type": "code_interpreter", + "enable_inline_code_execution": True + } + ], + tool_choice="auto", + tool_prompt_format="json" +) +``` + +## Building RAG-Enhanced Agents + +One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example: + +```python +from llama_stack_client.types import Attachment + +# Create attachments from documents +attachments = [ + Attachment( + content="https://raw.githubusercontent.com/example/doc.rst", + mime_type="text/plain" + ) +] + +# Configure agent with memory +agent_config = AgentConfig( + model="Llama3.2-3B-Instruct", + instructions="You are a helpful assistant", + tools=[{ + "type": "memory", + "memory_bank_configs": [], + "query_generator_config": {"type": "default", "sep": " "}, + "max_tokens_in_context": 4096, + "max_chunks": 10 + }], + enable_session_persistence=True +) + +agent = Agent(client, agent_config) +session_id = agent.create_session("rag_session") + +# Initial document ingestion +response = agent.create_turn( + messages=[{ + "role": "user", + "content": "I am providing some documents for reference." + }], + attachments=attachments, + session_id=session_id +) + +# Query with RAG +response = agent.create_turn( + messages=[{ + "role": "user", + "content": "What are the key topics in the documents?" + }], + session_id=session_id +) +``` + +## Testing & Evaluation + +Llama Stack provides built-in tools for evaluating your applications: + +1. **Benchmarking**: Test against standard datasets +2. **Application Evaluation**: Score your application's outputs +3. **Custom Metrics**: Define your own evaluation criteria + +Here's how to set up basic evaluation: + +```python +# Create an evaluation task +response = client.eval_tasks.register( + eval_task_id="my_eval", + dataset_id="my_dataset", + scoring_functions=["accuracy", "relevance"] +) + +# Run evaluation +job = client.eval.run_eval( + task_id="my_eval", + task_config={ + "type": "app", + "eval_candidate": { + "type": "agent", + "config": agent_config + } + } +) + +# Get results +result = client.eval.job_result( + task_id="my_eval", + job_id=job.job_id +) +``` + +## Debugging & Monitoring + +Llama Stack includes comprehensive telemetry for debugging and monitoring your applications: + +1. **Tracing**: Track request flows across components +2. **Metrics**: Measure performance and usage +3. 
**Logging**: Debug issues and track behavior + +The telemetry system supports multiple output formats: + +- OpenTelemetry for visualization in tools like Jaeger +- SQLite for local storage and querying +- Console output for development + +Example of querying traces: + +```python +# Query traces for a session +traces = client.telemetry.query_traces( + attribute_filters=[{ + "key": "session_id", + "op": "eq", + "value": session_id + }] +) + +# Get detailed span information +span_tree = client.telemetry.get_span_tree( + span_id=traces[0].root_span_id +) +``` + For details on how to use the telemetry system to debug your applications, export traces to a dataset, and run evaluations, see the [Telemetry](telemetry) section. ```{toctree} diff --git a/docs/source/conf.py b/docs/source/conf.py index b657cddff..2a9e3d17c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -28,6 +28,7 @@ extensions = [ "sphinx_tabs.tabs", "sphinx_design", "sphinxcontrib.redoc", + "sphinxcontrib.mermaid", ] myst_enable_extensions = ["colon_fence"] @@ -47,6 +48,7 @@ exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] myst_enable_extensions = [ "amsmath", "attrs_inline", + "attrs_block", "colon_fence", "deflist", "dollarmath", @@ -65,6 +67,7 @@ myst_substitutions = { "docker_hub": "https://hub.docker.com/repository/docker/llamastack", } + # Copy button settings copybutton_prompt_text = "$ " # for bash prompts copybutton_prompt_is_regexp = True diff --git a/docs/source/distributions/configuration.md b/docs/source/distributions/configuration.md index abf7d16ed..6fee67936 100644 --- a/docs/source/distributions/configuration.md +++ b/docs/source/distributions/configuration.md @@ -81,6 +81,8 @@ A few things to note: - The configuration dictionary is provider-specific. Notice that configuration can reference environment variables (with default values), which are expanded at runtime. When you run a stack server (via docker or via `llama stack run`), you can specify `--env OLLAMA_URL=http://my-server:11434` to override the default value. ## Resources +``` + Finally, let's look at the `models` section: ```yaml models: diff --git a/docs/source/getting_started/index.md b/docs/source/getting_started/index.md index bae31e8c4..c6227db99 100644 --- a/docs/source/getting_started/index.md +++ b/docs/source/getting_started/index.md @@ -19,16 +19,17 @@ export LLAMA_STACK_PORT=5001 ollama run $OLLAMA_INFERENCE_MODEL --keepalive 60m ``` -By default, Ollama keeps the model loaded in memory for 5 minutes which can be too short. We set the `--keepalive` flag to 60 minutes to enspagents/agenure the model remains loaded for sometime. +By default, Ollama keeps the model loaded in memory for 5 minutes which can be too short. We set the `--keepalive` flag to 60 minutes to ensure the model remains loaded for sometime. ### 2. Start the Llama Stack server Llama Stack is based on a client-server architecture. It consists of a server which can be configured very flexibly so you can mix-and-match various providers for its individual API components -- beyond Inference, these include Memory, Agents, Telemetry, Evals and so forth. +To get started quickly, we provide various Docker images for the server component that work with different inference providers out of the box. For this guide, we will use `llamastack/distribution-ollama` as the Docker image. 
+ ```bash -docker run \ - -it \ +docker run -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ -v ~/.llama:/root/.llama \ llamastack/distribution-ollama \ @@ -42,8 +43,7 @@ Configuration for this is available at `distributions/ollama/run.yaml`. ### 3. Use the Llama Stack client SDK -You can interact with the Llama Stack server using the `llama-stack-client` CLI or via the Python SDK. - +You can interact with the Llama Stack server using various client SDKs. We will use the Python SDK which you can install using: ```bash pip install llama-stack-client ``` @@ -123,7 +123,6 @@ async def run_main(): agent = Agent(client, agent_config) session_id = agent.create_session("test-session") - print(f"Created session_id={session_id} for Agent({agent.agent_id})") user_prompts = [ ( "I am attaching documentation for Torchtune. Help me answer questions I will ask next.", @@ -154,3 +153,10 @@ if __name__ == "__main__": - Learn how to [Build Llama Stacks](../distributions/index.md) - See [References](../references/index.md) for more details about the llama CLI and Python SDK - For example applications and more detailed tutorials, visit our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repository. + + +## Thinking out aloud here in terms of what to write in the docs + +- how to get a llama stack server running +- what are all the different client sdks +- what are the components of building agents From 69a2d7b2648bee58f8629aa3d18ddf28274ec22a Mon Sep 17 00:00:00 2001 From: Jeff Tang Date: Sun, 8 Dec 2024 15:00:41 -0800 Subject: [PATCH 44/69] Use customtool's get_tool_definition to remove duplication (#584) # What does this PR do? Current examples would cause a lot of unnecessary painful duplication when a bunch of custom tools are expected while dealing with a real use case. Also added pip install -U httpx==0.27.2 to avoid a [httpx proxies error](https://github.com/meta-llama/llama-stack-apps/issues/131) when running in an env with 0.28 or higher of httpx installed by default. In short, provide a summary of what this PR does and why. Usually, the relevant context should be present in a linked issue. - [ ] Addresses issue (#issue) ## Test Plan Please describe: - tests you ran to verify your changes with result summaries. - provide instructions so it can be reproduced. ## Sources Please link relevant resources if necessary. ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. 
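
## Usage sketch

For readers following along outside the notebooks, the de-duplicated pattern looks roughly like the sketch below. The tool-definition dict mirrors what the notebooks previously spelled out inline; the `WebSearchTool` here is a simplified stand-in for the custom tool class used in llama-stack-apps (the real one derives the same definition from the client's custom-tool base class and also implements an async `run` method, omitted here), and the model name and endpoint are placeholders.

```python
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.agents.agent import Agent
from llama_stack_client.types.agent_create_params import AgentConfig


class WebSearchTool:
    """Simplified stand-in for the llama-stack-apps custom web-search tool."""

    def __init__(self, api_key: str):
        self.api_key = api_key

    def get_name(self) -> str:
        return "web_search"

    def get_tool_definition(self) -> dict:
        # Single source of truth for the function schema -- no more
        # hand-copied dicts scattered across notebook cells.
        return {
            "function_name": "web_search",
            "description": "Search the web for a given query",
            "parameters": {
                "query": {
                    "param_type": "str",
                    "description": "The query to search for",
                    "required": True,
                }
            },
            "type": "function_call",
        }

    # A real tool would also implement the execution method (async run)
    # so the agent can dispatch calls back to it.


client = LlamaStackClient(base_url="http://localhost:5000")  # placeholder endpoint
web_search_tool = WebSearchTool(api_key="YOUR_BRAVE_SEARCH_API_KEY")

agent_config = AgentConfig(
    model="Llama3.1-8B-Instruct",  # placeholder; use whatever MODEL_NAME you run
    instructions="You are a helpful assistant",
    # Reuse the tool's own definition instead of re-declaring it inline.
    tools=[web_search_tool.get_tool_definition()],
    tool_choice="auto",
    tool_prompt_format="python_list",
    enable_session_persistence=False,
)

# Pass the tool instance as well, so the agent can route calls to it.
agent = Agent(client, agent_config, [web_search_tool])
```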
--- .../04_Tool_Calling101.ipynb | 21 ++++---------- ..._Using_Together's_Llama_Stack_Server.ipynb | 28 +++++-------------- 2 files changed, 12 insertions(+), 37 deletions(-) diff --git a/docs/zero_to_hero_guide/04_Tool_Calling101.ipynb b/docs/zero_to_hero_guide/04_Tool_Calling101.ipynb index 9719ad31e..4f0d2e887 100644 --- a/docs/zero_to_hero_guide/04_Tool_Calling101.ipynb +++ b/docs/zero_to_hero_guide/04_Tool_Calling101.ipynb @@ -286,6 +286,9 @@ " input_shields = [] if disable_safety else [\"llama_guard\"]\n", " output_shields = [] if disable_safety else [\"llama_guard\"]\n", "\n", + " # Initialize custom tool (ensure `WebSearchTool` is defined earlier in the notebook)\n", + " webSearchTool = WebSearchTool(api_key=BRAVE_SEARCH_API_KEY)\n", + " \n", " # Define the agent configuration, including the model and tool setup\n", " agent_config = AgentConfig(\n", " model=MODEL_NAME,\n", @@ -296,18 +299,7 @@ " \"top_p\": 0.9,\n", " },\n", " tools=[\n", - " {\n", - " \"function_name\": \"web_search\", # Name of the tool being integrated\n", - " \"description\": \"Search the web for a given query\",\n", - " \"parameters\": {\n", - " \"query\": {\n", - " \"param_type\": \"str\",\n", - " \"description\": \"The query to search for\",\n", - " \"required\": True,\n", - " }\n", - " },\n", - " \"type\": \"function_call\",\n", - " },\n", + " webSearchTool.get_tool_definition()\n", " ],\n", " tool_choice=\"auto\",\n", " tool_prompt_format=\"python_list\",\n", @@ -316,11 +308,8 @@ " enable_session_persistence=False,\n", " )\n", "\n", - " # Initialize custom tools (ensure `WebSearchTool` is defined earlier in the notebook)\n", - " custom_tools = [WebSearchTool(api_key=BRAVE_SEARCH_API_KEY)]\n", - "\n", " # Create an agent instance with the client and configuration\n", - " agent = Agent(client, agent_config, custom_tools)\n", + " agent = Agent(client, agent_config, [webSearchTool])\n", "\n", " # Create a session for interaction and print the session ID\n", " session_id = agent.create_session(\"test-session\")\n", diff --git a/docs/zero_to_hero_guide/Tool_Calling101_Using_Together's_Llama_Stack_Server.ipynb b/docs/zero_to_hero_guide/Tool_Calling101_Using_Together's_Llama_Stack_Server.ipynb index 8e3949e94..b21f3d64c 100644 --- a/docs/zero_to_hero_guide/Tool_Calling101_Using_Together's_Llama_Stack_Server.ipynb +++ b/docs/zero_to_hero_guide/Tool_Calling101_Using_Together's_Llama_Stack_Server.ipynb @@ -71,7 +71,8 @@ } ], "source": [ - "!pip install llama-stack-client==0.0.50" + "!pip install llama-stack-client==0.0.50\n", + "!pip install -U httpx==0.27.2 # https://github.com/meta-llama/llama-stack-apps/issues/131" ] }, { @@ -355,6 +356,9 @@ "async def create_weather_agent(client: LlamaStackClient) -> Agent:\n", " \"\"\"Create an agent with weather tool capability.\"\"\"\n", "\n", + " # Create the agent with the tool\n", + " weather_tool = WeatherTool()\n", + " \n", " agent_config = AgentConfig(\n", " model=LLAMA31_8B_INSTRUCT,\n", " #model=model_name,\n", @@ -369,23 +373,7 @@ " \"top_p\": 0.9,\n", " },\n", " tools=[\n", - " {\n", - " \"function_name\": \"get_weather\",\n", - " \"description\": \"Get weather information for a location\",\n", - " \"parameters\": {\n", - " \"location\": {\n", - " \"param_type\": \"str\",\n", - " \"description\": \"City or location name\",\n", - " \"required\": True,\n", - " },\n", - " \"date\": {\n", - " \"param_type\": \"str\",\n", - " \"description\": \"Optional date (YYYY-MM-DD)\",\n", - " \"required\": False,\n", - " },\n", - " },\n", - " \"type\": \"function_call\",\n", - " 
}\n", + " weather_tool.get_tool_definition()\n", " ],\n", " tool_choice=\"auto\",\n", " tool_prompt_format=\"json\",\n", @@ -394,8 +382,6 @@ " enable_session_persistence=True\n", " )\n", "\n", - " # Create the agent with the tool\n", - " weather_tool = WeatherTool()\n", " agent = Agent(\n", " client=client,\n", " agent_config=agent_config,\n", @@ -470,5 +456,5 @@ } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 4 } From 095125e4638895e80f1704bb1dcab7c0a9f96b41 Mon Sep 17 00:00:00 2001 From: Aidan Do Date: Mon, 9 Dec 2024 10:02:51 +1100 Subject: [PATCH 45/69] [#391] Add support for json structured output for vLLM (#528) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Addresses issue (#391) - Adds json structured output for vLLM - Enables structured output tests for vLLM > Give me a recipe for Spaghetti Bolognaise: ```json { "recipe_name": "Spaghetti Bolognaise", "preamble": "Ah, spaghetti bolognaise - the quintessential Italian dish that fills my kitchen with the aromas of childhood nostalgia. As a child, I would watch my nonna cook up a big pot of spaghetti bolognaise every Sunday, filling our small Italian household with the savory scent of simmering meat and tomatoes. The way the sauce would thicken and the spaghetti would al dente - it was love at first bite. And now, as a chef, I want to share that same love with you, so you can recreate these warm, comforting memories at home.", "ingredients": [ "500g minced beef", "1 medium onion, finely chopped", "2 cloves garlic, minced", "1 carrot, finely chopped", " celery, finely chopped", "1 (28 oz) can whole peeled tomatoes", "1 tbsp tomato paste", "1 tsp dried basil", "1 tsp dried oregano", "1 tsp salt", "1/2 tsp black pepper", "1/2 tsp sugar", "1 lb spaghetti", "Grated Parmesan cheese, for serving", "Extra virgin olive oil, for serving" ], "steps": [ "Heat a large pot over medium heat and add a generous drizzle of extra virgin olive oil.", "Add the chopped onion, garlic, carrot, and celery and cook until the vegetables are soft and translucent, about 5-7 minutes.", "Add the minced beef and cook until browned, breaking it up with a spoon as it cooks.", "Add the tomato paste and cook for 1-2 minutes, stirring constantly.", "Add the canned tomatoes, dried basil, dried oregano, salt, black pepper, and sugar. Stir well to combine.", "Bring the sauce to a simmer and let it cook for 20-30 minutes, stirring occasionally, until the sauce has thickened and the flavors have melded together.", "While the sauce cooks, bring a large pot of salted water to a boil and cook the spaghetti according to the package instructions until al dente. Reserve 1 cup of pasta water before draining the spaghetti.", "Add the reserved pasta water to the sauce and stir to combine.", "Combine the cooked spaghetti and sauce, tossing to coat the pasta evenly.", "Serve hot, topped with grated Parmesan cheese and a drizzle of extra virgin olive oil.", "Enjoy!" 
] } ``` Generated with Llama-3.2-3B-Instruct model - pretty good for a 3B parameter model 👍 ## Test Plan `pytest -v -s llama_stack/providers/tests/inference/test_text_inference.py -k llama_3b-vllm_remote` With the following setup: ```bash # Environment export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct export INFERENCE_PORT=8000 export VLLM_URL=http://localhost:8000/v1 # vLLM server sudo docker run --gpus all \ -v $STORAGE_DIR/.cache/huggingface:/root/.cache/huggingface \ --env "HUGGING_FACE_HUB_TOKEN=$(cat ~/.cache/huggingface/token)" \ -p 8000:$INFERENCE_PORT \ --ipc=host \ --net=host \ vllm/vllm-openai:v0.6.3.post1 \ --model $INFERENCE_MODEL # llama-stack server llama stack build --template remote-vllm --image-type conda && llama stack run distributions/remote-vllm/run.yaml \ --port 5001 \ --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct ``` Results: ``` llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_model_list[llama_3b-vllm_remote] PASSED llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_completion[llama_3b-vllm_remote] SKIPPED llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_completions_structured_output[llama_3b-vllm_remote] SKIPPED llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_chat_completion_non_streaming[llama_3b-vllm_remote] PASSED llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_structured_output[llama_3b-vllm_remote] PASSED llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_chat_completion_streaming[llama_3b-vllm_remote] PASSED llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_chat_completion_with_tool_calling[llama_3b-vllm_remote] PASSED llama_stack/providers/tests/inference/test_text_inference.py::TestInference::test_chat_completion_with_tool_calling_streaming[llama_3b-vllm_remote] PASSED ================================ 6 passed, 2 skipped, 120 deselected, 2 warnings in 13.26s ================================ ``` ## Sources - https://github.com/vllm-project/vllm/discussions/8300 - By default, vLLM uses https://github.com/dottxt-ai/outlines for structured outputs [[1](https://github.com/vllm-project/vllm/blob/32e7db25365415841ebc7c4215851743fbb1bad1/vllm/engine/arg_utils.py#L279-L280)] ## Before submitting [N/A] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case) - [x] Ran pre-commit to handle lint / formatting issues. - [x] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? [N/A?] Updated relevant documentation. Couldn't find any relevant documentation. Lmk if I've missed anything. - [x] Wrote necessary unit or integration tests. 
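
## Usage sketch

As an illustrative client-side sketch (not part of this diff): exact parameter names (`model` vs `model_id`) and the `response_format` payload shape have shifted between llama-stack-client releases, so treat the snippet as approximate. The key point is that a `json_schema` response format is translated by the vLLM adapter into vLLM's `guided_json` extra-body field.

```python
from pydantic import BaseModel

from llama_stack_client import LlamaStackClient


class Recipe(BaseModel):
    recipe_name: str
    preamble: str
    ingredients: list[str]
    steps: list[str]


client = LlamaStackClient(base_url="http://localhost:5001")

response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.2-3B-Instruct",
    messages=[
        {"role": "user", "content": "Give me a recipe for Spaghetti Bolognaise."},
    ],
    # Forwarded by the adapter to vLLM as {"guided_json": <schema>} via extra_body.
    response_format={
        "type": "json_schema",
        "json_schema": Recipe.model_json_schema(),
    },
)
print(response.completion_message.content)
```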
--- llama_stack/providers/remote/inference/vllm/vllm.py | 11 +++++++++++ .../providers/tests/inference/test_text_inference.py | 2 ++ 2 files changed, 13 insertions(+) diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py index 0f4034478..57f3db802 100644 --- a/llama_stack/providers/remote/inference/vllm/vllm.py +++ b/llama_stack/providers/remote/inference/vllm/vllm.py @@ -100,6 +100,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): tool_prompt_format=tool_prompt_format, stream=stream, logprobs=logprobs, + response_format=response_format, ) if stream: return self._stream_chat_completion(request, self.client) @@ -180,6 +181,16 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): self.formatter, ) + if fmt := request.response_format: + if fmt.type == ResponseFormatType.json_schema.value: + input_dict["extra_body"] = { + "guided_json": request.response_format.json_schema + } + elif fmt.type == ResponseFormatType.grammar.value: + raise NotImplementedError("Grammar response format not supported yet") + else: + raise ValueError(f"Unknown response format {fmt.type}") + return { "model": request.model, **input_dict, diff --git a/llama_stack/providers/tests/inference/test_text_inference.py b/llama_stack/providers/tests/inference/test_text_inference.py index aa2f0b413..b84761219 100644 --- a/llama_stack/providers/tests/inference/test_text_inference.py +++ b/llama_stack/providers/tests/inference/test_text_inference.py @@ -140,6 +140,7 @@ class TestInference: "remote::tgi", "remote::together", "remote::fireworks", + "remote::vllm", "remote::cerebras", ): pytest.skip( @@ -200,6 +201,7 @@ class TestInference: "remote::fireworks", "remote::tgi", "remote::together", + "remote::vllm", "remote::nvidia", ): pytest.skip("Other inference providers don't support structured output yet") From 397ee71c14b7ffc02f446acfaacecb76ae6ba6fa Mon Sep 17 00:00:00 2001 From: Yuri Shkuro Date: Sun, 8 Dec 2024 19:29:53 -0400 Subject: [PATCH 46/69] Fix Jaeger instructions (#580) # What does this PR do? - A follow-up for #572 - The command in the original PR did not run - Remove `--set` command unnecessary since Jaeger 2.1.0 ## Test Plan ``` $ docker run --rm --name jaeger \ -p 16686:16686 -p 4318:4318 \ jaegertracing/jaeger:2.1.0 2024/12/07 19:07:13 application version: git-commit=65cff3c30823ea20d3dc48bae39d5685ae307da5, git-version=v2.1.0, build-date=2024-12-06T21:17:15Z ... ``` ## Before submitting - [x] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [x] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [x] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. Signed-off-by: Yuri Shkuro --- docs/source/building_applications/telemetry.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/docs/source/building_applications/telemetry.md b/docs/source/building_applications/telemetry.md index fd4446ed2..6c8067035 100644 --- a/docs/source/building_applications/telemetry.md +++ b/docs/source/building_applications/telemetry.md @@ -40,7 +40,7 @@ structured_log_event = SpanStartPayload( - **Traces**: Collection of related spans forming a complete request flow ### Sinks -- **OpenTelemetry**: Send events to an OpenTelemetry Collector. This is useful for visualizing traces in a service like Jaeger. 
+- **OpenTelemetry**: Send events to an OpenTelemetry Collector. This is useful for visualizing traces in a tool like Jaeger. - **SQLite**: Store events in a local SQLite database. This is needed if you want to query the events later through the Llama Stack API. - **Console**: Print events to the console. @@ -124,13 +124,12 @@ The `otel` sink works with any service compatible with the OpenTelemetry collect Start a Jaeger instance with the OTLP HTTP endpoint at 4318 and the Jaeger UI at 16686 using the following command: ```bash -$ docker run --rm \ - --name jaeger jaegertracing/jaeger:2.0.0 \ - -p 16686:16686 -p 4318:4318 \ - --set receivers.otlp.protocols.http.endpoint=0.0.0.0:4318 +$ docker run --rm --name jaeger \ + -p 16686:16686 -p 4318:4318 \ + jaegertracing/jaeger:2.1.0 ``` -Once the Jaeger instance is running, you can visualize traces by navigating to http://localhost:16686. +Once the Jaeger instance is running, you can visualize traces by navigating to http://localhost:16686/. ## Querying Traces Stored in SQLIte From fe249f4577d14639ee595d726b5086ee122a2c70 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Sun, 8 Dec 2024 14:56:03 -0800 Subject: [PATCH 47/69] Add documentations for building applications and with some content for agentic loop --- docs/source/index.md | 54 +++++------------- docs/source/introduction/index.md | 95 +++++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+), 39 deletions(-) create mode 100644 docs/source/introduction/index.md diff --git a/docs/source/index.md b/docs/source/index.md index adfa8c8ab..ee7f00e0a 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -13,34 +13,27 @@ Our goal is to provide pre-packaged implementations which can be operated in a v The Stack APIs are rapidly improving but still a work-in-progress. We invite feedback as well as direct contributions. ``` -## Philosophy +## Quick Links -### Service-oriented design +- New to Llama Stack? Start with the [Introduction](introduction/index) to understand our motivation and vision. +- Ready to build? Check out the [Quick Start](getting_started/index) to get started. +- Need specific providers? Browse [Distributions](distributions/index) to see all the options available. +- Want to contribute? See the [Contributing](contributing/index) guide. -Unlike other frameworks, Llama Stack is built with a service-oriented, REST API-first approach. Such a design not only allows for seamless transitions from a local to remote deployments, but also forces the design to be more declarative. We believe this restriction can result in a much simpler, robust developer experience. This will necessarily trade-off against expressivity however if we get the APIs right, it can lead to a very powerful platform. +## Available SDKs -### Composability - -We expect the set of APIs we design to be composable. An Agent abstractly depends on { Inference, Memory, Safety } APIs but does not care about the actual implementation details. Safety itself may require model inference and hence can depend on the Inference API. - -### Turnkey one-stop solutions - -We expect to provide turnkey solutions for popular deployment scenarios. It should be easy to deploy a Llama Stack server on AWS or on a private data center. Either of these should allow a developer to get started with powerful agentic apps, model evaluations or fine-tuning services in a matter of minutes. They should all result in the same uniform observability and developer experience. 
- -### Focus on Llama models - -As a Meta initiated project, we have started by explicitly focusing on Meta's Llama series of models. Supporting the broad set of open models is no easy task and we want to start with models we understand best. - -### Supporting the Ecosystem - -There is a vibrant ecosystem of Providers which provide efficient inference or scalable vector stores or powerful observability solutions. We want to make sure it is easy for developers to pick and choose the best implementations for their use cases. We also want to make sure it is easy for new Providers to onboard and participate in the ecosystem. - -Additionally, we have designed every element of the Stack such that APIs as well as Resources (like Models) can be federated. +We have a number of client-side SDKs available for different languages. +| **Language** | **Client SDK** | **Package** | +| :----: | :----: | :----: | +| Python | [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python) | [![PyPI version](https://img.shields.io/pypi/v/llama_stack_client.svg)](https://pypi.org/project/llama_stack_client/) +| Swift | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift) | [![Swift Package Index](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fmeta-llama%2Fllama-stack-client-swift%2Fbadge%3Ftype%3Dswift-versions)](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift) +| Node | [llama-stack-client-node](https://github.com/meta-llama/llama-stack-client-node) | [![NPM version](https://img.shields.io/npm/v/llama-stack-client.svg)](https://npmjs.org/package/llama-stack-client) +| Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) | [![Maven version](https://img.shields.io/maven-central/v/com.llama.llamastack/llama-stack-client-kotlin)](https://central.sonatype.com/artifact/com.llama.llamastack/llama-stack-client-kotlin) ## Supported Llama Stack Implementations -Llama Stack already has a number of "adapters" available for some popular Inference and Memory (Vector Store) providers. For other APIs (particularly Safety and Agents), we provide *reference implementations* you can use to get started. We expect this list to grow over time. We are slowly onboarding more providers to the ecosystem as we get more confidence in the APIs. +A number of "adapters" are available for some popular Inference and Memory (Vector Store) providers. For other APIs (particularly Safety and Agents), we provide *reference implementations* you can use to get started. We expect this list to grow over time. We are slowly onboarding more providers to the ecosystem as we get more confidence in the APIs. | **API Provider** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** | | :----: | :----: | :----: | :----: | :----: | :----: | :----: | @@ -56,28 +49,11 @@ Llama Stack already has a number of "adapters" available for some popular Infere | PyTorch ExecuTorch | On-device iOS | Y | Y | | | | PyTorch ExecuTorch | On-device Android | | Y | | | -## Dive In - -- Look at [Quick Start](getting_started/index) section to get started with Llama Stack. -- Learn more about [Llama Stack Concepts](concepts/index) to understand how different components fit together. -- Check out [Zero to Hero](https://github.com/meta-llama/llama-stack/tree/main/docs/zero_to_hero_guide) guide to learn in details about how to build your first agent. 
-- See how you can use [Llama Stack Distributions](distributions/index) to get started with popular inference and other service providers. - -We also provide a number of Client side SDKs to make it easier to connect to Llama Stack server in your preferred language. - -| **Language** | **Client SDK** | **Package** | -| :----: | :----: | :----: | -| Python | [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python) | [![PyPI version](https://img.shields.io/pypi/v/llama_stack_client.svg)](https://pypi.org/project/llama_stack_client/) -| Swift | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift) | [![Swift Package Index](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fmeta-llama%2Fllama-stack-client-swift%2Fbadge%3Ftype%3Dswift-versions)](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift) -| Node | [llama-stack-client-node](https://github.com/meta-llama/llama-stack-client-node) | [![NPM version](https://img.shields.io/npm/v/llama-stack-client.svg)](https://npmjs.org/package/llama-stack-client) -| Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) | [![Maven version](https://img.shields.io/maven-central/v/com.llama.llamastack/llama-stack-client-kotlin)](https://central.sonatype.com/artifact/com.llama.llamastack/llama-stack-client-kotlin) - -You can find more example scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo. - ```{toctree} :hidden: :maxdepth: 3 +introduction/index getting_started/index concepts/index distributions/index diff --git a/docs/source/introduction/index.md b/docs/source/introduction/index.md new file mode 100644 index 000000000..9c2a70341 --- /dev/null +++ b/docs/source/introduction/index.md @@ -0,0 +1,95 @@ +# Why Llama Stack? + +Building production AI applications today requires solving multiple challenges: + +**Infrastructure Complexity** +- Running large language models efficiently requires specialized infrastructure. +- Different deployment scenarios (local development, cloud, edge) need different solutions. +- Moving from development to production often requires significant rework. + +**Essential Capabilities** +- Safety guardrails and content filtering are necessary in an enterprise setting. +- Just model inference is not enough - Knowledge retrieval and RAG capabilities are required. +- Nearly any application needs composable multi-step workflows. +- Finally, without monitoring, observability and evaluation, you end up operating in the dark. + +**Lack of Flexibility and Choice** +- Directly integrating with multiple providers creates tight coupling. +- Different providers have different APIs and abstractions. +- Changing providers requires significant code changes. + + +### The Vision: A Universal Stack + + +```{image} ../../_static/llama-stack.png +:alt: Llama Stack +:width: 400px +``` + +Llama Stack defines and standardizes the core building blocks needed to bring generative AI applications to market. These building blocks are presented as interoperable APIs with a broad set of Service Providers providing their implementations. + +#### Service-oriented Design +Unlike other frameworks, Llama Stack is built with a service-oriented, REST API-first approach. Such a design not only allows for seamless transitions from local to remote deployments but also forces the design to be more declarative. 
This restriction can result in a much simpler, robust developer experience. The same code works across different environments: + +- Local development with CPU-only setups +- Self-hosted with GPU acceleration +- Cloud-hosted on providers like AWS, Fireworks, Together +- On-device for iOS and Android + + +#### Composability +The APIs we design are composable. An Agent abstractly depends on { Inference, Memory, Safety } APIs but does not care about the actual implementation details. Safety itself may require model inference and hence can depend on the Inference API. + +#### Turnkey Solutions + +We provide turnkey solutions for popular deployment scenarios. It should be easy to deploy a Llama Stack server on AWS or in a private data center. Either of these should allow a developer to get started with powerful agentic apps, model evaluations, or fine-tuning services in minutes. + +We have built-in support for critical needs: + +- Safety guardrails and content filtering +- Comprehensive evaluation capabilities +- Full observability and monitoring +- Provider federation and fallback + +#### Focus on Llama Models +As a Meta-initiated project, we explicitly focus on Meta's Llama series of models. Supporting the broad set of open models is no easy task and we want to start with models we understand best. + +#### Supporting the Ecosystem +There is a vibrant ecosystem of Providers which provide efficient inference or scalable vector stores or powerful observability solutions. We want to make sure it is easy for developers to pick and choose the best implementations for their use cases. We also want to make sure it is easy for new Providers to onboard and participate in the ecosystem. + +Additionally, we have designed every element of the Stack such that APIs as well as Resources (like Models) can be federated. 
+ +#### Rich Provider Ecosystem + +```{list-table} +:header-rows: 1 + +* - Provider + - Local + - Self-hosted + - Cloud +* - Inference + - Ollama + - vLLM, TGI + - Fireworks, Together, AWS +* - Memory + - FAISS + - Chroma, pgvector + - Weaviate +* - Safety + - Llama Guard + - - + - AWS Bedrock +``` + + +### Unified API Layer + +Llama Stack provides a consistent interface for: + +- **Inference**: Run LLM models efficiently +- **Safety**: Apply content filtering and safety policies +- **Memory**: Store and retrieve knowledge for RAG +- **Agents**: Build multi-step workflows +- **Evaluation**: Test and improve application quality From 224e62290f7172f99a03fe5d33d4a1b431916439 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Sun, 8 Dec 2024 16:57:16 -0800 Subject: [PATCH 48/69] kill unnecessarily large imports from telemetry init --- .../providers/inline/telemetry/meta_reference/telemetry.py | 6 ++---- llama_stack/providers/utils/telemetry/__init__.py | 3 --- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py index 095591f9a..2e4a778e4 100644 --- a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py +++ b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py @@ -24,10 +24,8 @@ from llama_stack.providers.inline.telemetry.meta_reference.console_span_processo from llama_stack.providers.inline.telemetry.meta_reference.sqlite_span_processor import ( SQLiteSpanProcessor, ) -from llama_stack.providers.utils.telemetry import ( - SQLiteTraceStore, - TelemetryDatasetMixin, -) +from llama_stack.providers.utils.telemetry.dataset_mixin import TelemetryDatasetMixin +from llama_stack.providers.utils.telemetry.sqlite_trace_store import SQLiteTraceStore from llama_stack.apis.telemetry import * # noqa: F403 diff --git a/llama_stack/providers/utils/telemetry/__init__.py b/llama_stack/providers/utils/telemetry/__init__.py index 2d95a5dc5..756f351d8 100644 --- a/llama_stack/providers/utils/telemetry/__init__.py +++ b/llama_stack/providers/utils/telemetry/__init__.py @@ -3,6 +3,3 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
- -from .dataset_mixin import TelemetryDatasetMixin # noqa: F401 -from .sqlite_trace_store import SQLiteTraceStore, TraceStore # noqa: F401 From e9518528485000668686aaaf596e8c8dba3b85d4 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Sun, 8 Dec 2024 19:11:22 -0800 Subject: [PATCH 49/69] Miscellaneous fixes around telemetry, library client and run yaml autogen Also add a `venv` image-type for llama stack build --- distributions/dependencies.json | 24 ++++ llama_stack/__init__.py | 2 + llama_stack/cli/stack/build.py | 6 +- llama_stack/distribution/build.py | 13 ++- llama_stack/distribution/build_venv.sh | 105 ++++++++++++++++++ llama_stack/distribution/datatypes.py | 2 +- llama_stack/distribution/library_client.py | 37 +++++- .../distribution/tests/library_client_test.py | 3 +- .../telemetry/meta_reference/__init__.py | 5 +- .../inline/telemetry/meta_reference/config.py | 21 +++- llama_stack/templates/bedrock/run.yaml | 5 +- llama_stack/templates/cerebras/run.yaml | 5 +- llama_stack/templates/fireworks/run.yaml | 5 +- .../hf-endpoint/run-with-safety.yaml | 5 +- llama_stack/templates/hf-endpoint/run.yaml | 5 +- .../hf-serverless/run-with-safety.yaml | 5 +- llama_stack/templates/hf-serverless/run.yaml | 5 +- .../meta-reference-gpu/run-with-safety.yaml | 5 +- .../templates/meta-reference-gpu/run.yaml | 5 +- .../meta-reference-quantized-gpu/run.yaml | 5 +- .../templates/ollama/run-with-safety.yaml | 5 +- llama_stack/templates/ollama/run.yaml | 5 +- .../remote-vllm/run-with-safety.yaml | 5 +- llama_stack/templates/remote-vllm/run.yaml | 5 +- .../templates/tgi/run-with-safety.yaml | 5 +- llama_stack/templates/tgi/run.yaml | 5 +- llama_stack/templates/together/run.yaml | 5 +- llama_stack/templates/vllm-gpu/run.yaml | 5 +- 28 files changed, 274 insertions(+), 34 deletions(-) create mode 100755 llama_stack/distribution/build_venv.sh diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 4e66a85da..a2393cdea 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -16,6 +16,8 @@ "nltk", "numpy", "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", "pandas", "pillow", "psycopg2-binary", @@ -45,6 +47,8 @@ "nltk", "numpy", "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", "pandas", "pillow", "psycopg2-binary", @@ -75,6 +79,8 @@ "nltk", "numpy", "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", "pandas", "pillow", "psycopg2-binary", @@ -103,6 +109,8 @@ "nltk", "numpy", "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", "pandas", "pillow", "psycopg2-binary", @@ -133,6 +141,8 @@ "nltk", "numpy", "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", "pandas", "pillow", "psycopg2-binary", @@ -164,6 +174,8 @@ "nltk", "numpy", "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", "pandas", "pillow", "psycopg2-binary", @@ -194,6 +206,8 @@ "nltk", "numpy", "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", "pandas", "pillow", "psycopg2-binary", @@ -226,6 +240,8 @@ "nltk", "numpy", "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", "pandas", "pillow", "psycopg2-binary", @@ -262,6 +278,8 @@ "nltk", "numpy", "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", "pandas", "pillow", "psycopg2-binary", @@ -292,6 +310,8 @@ "matplotlib", "nltk", "numpy", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", "pandas", 
"pillow", "psycopg2-binary", @@ -323,6 +343,8 @@ "numpy", "ollama", "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", "pandas", "pillow", "psycopg2-binary", @@ -354,6 +376,8 @@ "nltk", "numpy", "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", "pandas", "pillow", "psycopg2-binary", diff --git a/llama_stack/__init__.py b/llama_stack/__init__.py index 756f351d8..34b866692 100644 --- a/llama_stack/__init__.py +++ b/llama_stack/__init__.py @@ -3,3 +3,5 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +# +# from .distribution.library_client import LlamaStackAsLibraryClient, AsyncLlamaStackAsLibraryClient diff --git a/llama_stack/cli/stack/build.py b/llama_stack/cli/stack/build.py index 00d62bd73..f19c6e798 100644 --- a/llama_stack/cli/stack/build.py +++ b/llama_stack/cli/stack/build.py @@ -73,7 +73,7 @@ class StackBuild(Subcommand): "--image-type", type=str, help="Image Type to use for the build. This can be either conda or docker. If not specified, will use the image type from the template config.", - choices=["conda", "docker"], + choices=["conda", "docker", "venv"], default="conda", ) @@ -124,8 +124,8 @@ class StackBuild(Subcommand): image_type = prompt( "> Enter the image type you want your Llama Stack to be built as (docker or conda): ", validator=Validator.from_callable( - lambda x: x in ["docker", "conda"], - error_message="Invalid image type, please enter conda or docker", + lambda x: x in ["docker", "conda", "venv"], + error_message="Invalid image type, please enter conda or docker or venv", ), default="conda", ) diff --git a/llama_stack/distribution/build.py b/llama_stack/distribution/build.py index 3349a7d50..bdda0349f 100644 --- a/llama_stack/distribution/build.py +++ b/llama_stack/distribution/build.py @@ -38,6 +38,7 @@ SERVER_DEPENDENCIES = [ class ImageType(Enum): docker = "docker" conda = "conda" + venv = "venv" class ApiInput(BaseModel): @@ -120,7 +121,7 @@ def build_image(build_config: BuildConfig, build_file_path: Path): str(BUILDS_BASE_DIR / ImageType.docker.value), " ".join(normal_deps), ] - else: + elif build_config.image_type == ImageType.conda.value: script = pkg_resources.resource_filename( "llama_stack", "distribution/build_conda_env.sh" ) @@ -130,6 +131,16 @@ def build_image(build_config: BuildConfig, build_file_path: Path): str(build_file_path), " ".join(normal_deps), ] + elif build_config.image_type == ImageType.venv.value: + script = pkg_resources.resource_filename( + "llama_stack", "distribution/build_venv.sh" + ) + args = [ + script, + build_config.name, + str(build_file_path), + " ".join(normal_deps), + ] if special_deps: args.append("#".join(special_deps)) diff --git a/llama_stack/distribution/build_venv.sh b/llama_stack/distribution/build_venv.sh new file mode 100755 index 000000000..8136e3120 --- /dev/null +++ b/llama_stack/distribution/build_venv.sh @@ -0,0 +1,105 @@ +#!/bin/bash + +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +# TODO: combine this with build_conda_env.sh since it is almost identical +# the only difference is that we don't do any conda-specific setup + +LLAMA_MODELS_DIR=${LLAMA_MODELS_DIR:-} +LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-} +TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-} + +if [ -n "$LLAMA_STACK_DIR" ]; then + echo "Using llama-stack-dir=$LLAMA_STACK_DIR" +fi +if [ -n "$LLAMA_MODELS_DIR" ]; then + echo "Using llama-models-dir=$LLAMA_MODELS_DIR" +fi + +if [ "$#" -lt 3 ]; then + echo "Usage: $0 []" >&2 + echo "Example: $0 mybuild ./my-stack-build.yaml 'numpy pandas scipy'" >&2 + exit 1 +fi + +special_pip_deps="$4" + +set -euo pipefail + +build_name="$1" +env_name="llamastack-$build_name" +build_file_path="$2" +pip_dependencies="$3" + +# Define color codes +RED='\033[0;31m' +GREEN='\033[0;32m' +NC='\033[0m' # No Color + +# this is set if we actually create a new conda in which case we need to clean up +ENVNAME="" + +SCRIPT_DIR=$(dirname "$(readlink -f "$0")") +source "$SCRIPT_DIR/common.sh" + +run() { + local env_name="$1" + local pip_dependencies="$2" + local special_pip_deps="$3" + + if [ -n "$TEST_PYPI_VERSION" ]; then + # these packages are damaged in test-pypi, so install them first + pip install fastapi libcst + pip install --extra-index-url https://test.pypi.org/simple/ \ + llama-models==$TEST_PYPI_VERSION llama-stack==$TEST_PYPI_VERSION \ + $pip_dependencies + if [ -n "$special_pip_deps" ]; then + IFS='#' read -ra parts <<<"$special_pip_deps" + for part in "${parts[@]}"; do + echo "$part" + pip install $part + done + fi + else + # Re-installing llama-stack in the new conda environment + if [ -n "$LLAMA_STACK_DIR" ]; then + if [ ! -d "$LLAMA_STACK_DIR" ]; then + printf "${RED}Warning: LLAMA_STACK_DIR is set but directory does not exist: $LLAMA_STACK_DIR${NC}\n" >&2 + exit 1 + fi + + printf "Installing from LLAMA_STACK_DIR: $LLAMA_STACK_DIR\n" + pip install --no-cache-dir -e "$LLAMA_STACK_DIR" + else + pip install --no-cache-dir llama-stack + fi + + if [ -n "$LLAMA_MODELS_DIR" ]; then + if [ ! 
-d "$LLAMA_MODELS_DIR" ]; then + printf "${RED}Warning: LLAMA_MODELS_DIR is set but directory does not exist: $LLAMA_MODELS_DIR${NC}\n" >&2 + exit 1 + fi + + printf "Installing from LLAMA_MODELS_DIR: $LLAMA_MODELS_DIR\n" + pip uninstall -y llama-models + pip install --no-cache-dir -e "$LLAMA_MODELS_DIR" + fi + + # Install pip dependencies + printf "Installing pip dependencies\n" + pip install $pip_dependencies + if [ -n "$special_pip_deps" ]; then + IFS='#' read -ra parts <<<"$special_pip_deps" + for part in "${parts[@]}"; do + echo "$part" + pip install $part + done + fi + fi +} + +run "$env_name" "$pip_dependencies" "$special_pip_deps" diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index c2bff4eed..1159372d4 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -165,5 +165,5 @@ class BuildConfig(BaseModel): ) image_type: str = Field( default="conda", - description="Type of package to build (conda | container)", + description="Type of package to build (conda | docker | venv)", ) diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py index 4de06ae08..64cd343d4 100644 --- a/llama_stack/distribution/library_client.py +++ b/llama_stack/distribution/library_client.py @@ -6,6 +6,7 @@ import asyncio import inspect +import os import queue import threading from concurrent.futures import ThreadPoolExecutor @@ -32,6 +33,18 @@ from llama_stack.distribution.stack import ( T = TypeVar("T") +def is_jupyter(): + """Check if we're running in a Jupyter notebook""" + try: + shell = get_ipython().__class__.__name__ # type: ignore + if shell == "ZMQInteractiveShell": # Jupyter notebook or qtconsole + return True + else: + return False + except NameError: # Probably standard Python interpreter + return False + + def stream_across_asyncio_run_boundary( async_gen_maker, pool_executor: ThreadPoolExecutor, @@ -102,7 +115,12 @@ class LlamaStackAsLibraryClient(LlamaStackClient): self.pool_executor = ThreadPoolExecutor(max_workers=4) def initialize(self): - asyncio.run(self.async_client.initialize()) + if is_jupyter(): + import nest_asyncio + + nest_asyncio.apply() + + return asyncio.run(self.async_client.initialize()) def get(self, *args, **kwargs): if kwargs.get("stream"): @@ -131,6 +149,10 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): ): super().__init__() + # when using the library client, we should not log to console since many + # of our logs are intended for server-side usage + os.environ["TELEMETRY_SINKS"] = "sqlite" + if config_path_or_template_name.endswith(".yaml"): config_path = Path(config_path_or_template_name) if not config_path.exists(): @@ -150,13 +172,19 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): self.impls = await construct_stack( self.config, self.custom_provider_registry ) - except ModuleNotFoundError as e: + except ModuleNotFoundError as _e: cprint( "Using llama-stack as a library requires installing dependencies depending on the template (providers) you choose.\n", "yellow", ) - print_pip_install_help(self.config.providers) - raise e + if self.config_path_or_template_name.endswith(".yaml"): + print_pip_install_help(self.config.providers) + else: + cprint( + f"Please run:\n\nllama stack build --template {self.config_path_or_template_name} --image-type venv\n\n", + "yellow", + ) + return False console = Console() console.print(f"Using config [blue]{self.config_path_or_template_name}[/blue]:") @@ -171,6 +199,7 @@ class 
AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): endpoint_impls[endpoint.route] = func self.endpoint_impls = endpoint_impls + return True async def get( self, diff --git a/llama_stack/distribution/tests/library_client_test.py b/llama_stack/distribution/tests/library_client_test.py index 8381f5470..5e7b997f3 100644 --- a/llama_stack/distribution/tests/library_client_test.py +++ b/llama_stack/distribution/tests/library_client_test.py @@ -17,7 +17,8 @@ from llama_stack_client.types.agent_create_params import AgentConfig def main(config_path: str): client = LlamaStackAsLibraryClient(config_path) - client.initialize() + if not client.initialize(): + return models = client.models.list() print("\nModels:") diff --git a/llama_stack/providers/inline/telemetry/meta_reference/__init__.py b/llama_stack/providers/inline/telemetry/meta_reference/__init__.py index 38871a7e4..2905e2f6a 100644 --- a/llama_stack/providers/inline/telemetry/meta_reference/__init__.py +++ b/llama_stack/providers/inline/telemetry/meta_reference/__init__.py @@ -7,12 +7,13 @@ from typing import Any, Dict from .config import TelemetryConfig, TelemetrySink -from .telemetry import TelemetryAdapter -__all__ = ["TelemetryConfig", "TelemetryAdapter", "TelemetrySink"] +__all__ = ["TelemetryConfig", "TelemetrySink"] async def get_provider_impl(config: TelemetryConfig, deps: Dict[str, Any]): + from .telemetry import TelemetryAdapter + impl = TelemetryAdapter(config, deps) await impl.initialize() return impl diff --git a/llama_stack/providers/inline/telemetry/meta_reference/config.py b/llama_stack/providers/inline/telemetry/meta_reference/config.py index 4aaa368d1..41d62c268 100644 --- a/llama_stack/providers/inline/telemetry/meta_reference/config.py +++ b/llama_stack/providers/inline/telemetry/meta_reference/config.py @@ -7,7 +7,7 @@ from enum import Enum from typing import Any, Dict, List -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, field_validator from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR @@ -36,10 +36,23 @@ class TelemetryConfig(BaseModel): description="The path to the SQLite database to use for storing traces", ) + @field_validator("sinks", mode="before") @classmethod - def sample_run_config(cls, **kwargs) -> Dict[str, Any]: + def validate_sinks(cls, v): + if isinstance(v, str): + return [TelemetrySink(sink.strip()) for sink in v.split(",")] + return v + + @classmethod + def sample_run_config( + cls, __distro_dir__: str = "runtime", db_name: str = "trace_store.db" + ) -> Dict[str, Any]: return { "service_name": "${env.OTEL_SERVICE_NAME:llama-stack}", - "sinks": "${env.TELEMETRY_SINKS:['console', 'sqlite']}", - "sqlite_db_path": "${env.SQLITE_DB_PATH:${runtime.base_dir}/trace_store.db}", + "sinks": "${env.TELEMETRY_SINKS:console,sqlite}", + "sqlite_db_path": "${env.SQLITE_DB_PATH:~/.llama/" + + __distro_dir__ + + "/" + + db_name + + "}", } diff --git a/llama_stack/templates/bedrock/run.yaml b/llama_stack/templates/bedrock/run.yaml index 77d4f2248..db0ee9d85 100644 --- a/llama_stack/templates/bedrock/run.yaml +++ b/llama_stack/templates/bedrock/run.yaml @@ -39,7 +39,10 @@ providers: telemetry: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/bedrock/trace_store.db} eval: - provider_id: meta-reference provider_type: inline::meta-reference diff --git 
a/llama_stack/templates/cerebras/run.yaml b/llama_stack/templates/cerebras/run.yaml index 0b41f5b76..451e2b076 100644 --- a/llama_stack/templates/cerebras/run.yaml +++ b/llama_stack/templates/cerebras/run.yaml @@ -38,7 +38,10 @@ providers: telemetry: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/cerebras/trace_store.db} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index 9296be28f..c75db478d 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -41,7 +41,10 @@ providers: telemetry: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/fireworks/trace_store.db} eval: - provider_id: meta-reference provider_type: inline::meta-reference diff --git a/llama_stack/templates/hf-endpoint/run-with-safety.yaml b/llama_stack/templates/hf-endpoint/run-with-safety.yaml index bd625ffc5..678857201 100644 --- a/llama_stack/templates/hf-endpoint/run-with-safety.yaml +++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml @@ -46,7 +46,10 @@ providers: telemetry: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/hf-endpoint/trace_store.db} eval: - provider_id: meta-reference provider_type: inline::meta-reference diff --git a/llama_stack/templates/hf-endpoint/run.yaml b/llama_stack/templates/hf-endpoint/run.yaml index bf0697bba..c062c6c98 100644 --- a/llama_stack/templates/hf-endpoint/run.yaml +++ b/llama_stack/templates/hf-endpoint/run.yaml @@ -41,7 +41,10 @@ providers: telemetry: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/hf-endpoint/trace_store.db} eval: - provider_id: meta-reference provider_type: inline::meta-reference diff --git a/llama_stack/templates/hf-serverless/run-with-safety.yaml b/llama_stack/templates/hf-serverless/run-with-safety.yaml index f5ead14d4..4a14ba093 100644 --- a/llama_stack/templates/hf-serverless/run-with-safety.yaml +++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml @@ -46,7 +46,10 @@ providers: telemetry: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/hf-serverless/trace_store.db} eval: - provider_id: meta-reference provider_type: inline::meta-reference diff --git a/llama_stack/templates/hf-serverless/run.yaml b/llama_stack/templates/hf-serverless/run.yaml index 13e2d7789..268efddc4 100644 --- a/llama_stack/templates/hf-serverless/run.yaml +++ b/llama_stack/templates/hf-serverless/run.yaml @@ -41,7 +41,10 @@ providers: telemetry: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + 
config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/hf-serverless/trace_store.db} eval: - provider_id: meta-reference provider_type: inline::meta-reference diff --git a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml index d0fa05e96..963679665 100644 --- a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml +++ b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml @@ -48,7 +48,10 @@ providers: telemetry: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/meta-reference-gpu/trace_store.db} eval: - provider_id: meta-reference provider_type: inline::meta-reference diff --git a/llama_stack/templates/meta-reference-gpu/run.yaml b/llama_stack/templates/meta-reference-gpu/run.yaml index 3675f4a58..a74cde768 100644 --- a/llama_stack/templates/meta-reference-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-gpu/run.yaml @@ -42,7 +42,10 @@ providers: telemetry: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/meta-reference-gpu/trace_store.db} eval: - provider_id: meta-reference provider_type: inline::meta-reference diff --git a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml index 081af0f59..5aada0fe6 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml @@ -44,7 +44,10 @@ providers: telemetry: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/meta-reference-quantized-gpu/trace_store.db} eval: - provider_id: meta-reference provider_type: inline::meta-reference diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml index dc282f996..2ab0f78f0 100644 --- a/llama_stack/templates/ollama/run-with-safety.yaml +++ b/llama_stack/templates/ollama/run-with-safety.yaml @@ -40,7 +40,10 @@ providers: telemetry: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/ollama/trace_store.db} eval: - provider_id: meta-reference provider_type: inline::meta-reference diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml index ab8e12839..c5206c2d0 100644 --- a/llama_stack/templates/ollama/run.yaml +++ b/llama_stack/templates/ollama/run.yaml @@ -40,7 +40,10 @@ providers: telemetry: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/ollama/trace_store.db} eval: - 
provider_id: meta-reference provider_type: inline::meta-reference diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml index c0849e2d0..ac8cf6f4a 100644 --- a/llama_stack/templates/remote-vllm/run-with-safety.yaml +++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml @@ -45,7 +45,10 @@ providers: telemetry: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/remote-vllm/trace_store.db} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/remote-vllm/run.yaml b/llama_stack/templates/remote-vllm/run.yaml index 3457afdd6..27c5df53c 100644 --- a/llama_stack/templates/remote-vllm/run.yaml +++ b/llama_stack/templates/remote-vllm/run.yaml @@ -39,7 +39,10 @@ providers: telemetry: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/remote-vllm/trace_store.db} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml index 2ee82ddc3..ecd03c36a 100644 --- a/llama_stack/templates/tgi/run-with-safety.yaml +++ b/llama_stack/templates/tgi/run-with-safety.yaml @@ -44,7 +44,10 @@ providers: telemetry: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/tgi/trace_store.db} eval: - provider_id: meta-reference provider_type: inline::meta-reference diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml index c45e114ee..b93f09042 100644 --- a/llama_stack/templates/tgi/run.yaml +++ b/llama_stack/templates/tgi/run.yaml @@ -40,7 +40,10 @@ providers: telemetry: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/tgi/trace_store.db} eval: - provider_id: meta-reference provider_type: inline::meta-reference diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml index a9f96a099..381557816 100644 --- a/llama_stack/templates/together/run.yaml +++ b/llama_stack/templates/together/run.yaml @@ -41,7 +41,10 @@ providers: telemetry: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/together/trace_store.db} eval: - provider_id: meta-reference provider_type: inline::meta-reference diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml index ea188777f..1442273f4 100644 --- a/llama_stack/templates/vllm-gpu/run.yaml +++ b/llama_stack/templates/vllm-gpu/run.yaml @@ -44,7 +44,10 @@ providers: telemetry: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + service_name: 
${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/vllm-gpu/trace_store.db} eval: - provider_id: meta-reference provider_type: inline::meta-reference From d7dc69c8a9cbb5bb25c07ae8c05c90419c3716aa Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Sun, 8 Dec 2024 20:46:22 -0800 Subject: [PATCH 50/69] Regenerate openapi --- docs/resources/llama-stack-spec.html | 652 ++++++++++++++++-- docs/resources/llama-stack-spec.yaml | 356 +++++++++- llama_stack/apis/telemetry/telemetry.py | 11 +- .../utils/telemetry/sqlite_trace_store.py | 4 +- 4 files changed, 933 insertions(+), 90 deletions(-) diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html index 4f220ea1e..d1040f186 100644 --- a/docs/resources/llama-stack-spec.html +++ b/docs/resources/llama-stack-spec.html @@ -21,7 +21,7 @@ "info": { "title": "Llama Stack Specification", "version": "alpha", - "description": "This is the specification of the Llama Stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. Generated at 2024-11-22 17:23:55.034164" + "description": "This is the specification of the Llama Stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models." }, "servers": [ { @@ -29,6 +29,39 @@ } ], "paths": { + "/alpha/datasetio/append-rows": { + "post": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "DatasetIO" + ], + "parameters": [ + { + "name": "X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/AppendRowsRequest" + } + } + }, + "required": true + } + } + }, "/alpha/batch-inference/chat-completion": { "post": { "responses": { @@ -1026,15 +1059,15 @@ ] } }, - "/alpha/telemetry/get-trace": { - "get": { + "/alpha/telemetry/get-span-tree": { + "post": { "responses": { "200": { "description": "OK", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/Trace" + "$ref": "#/components/schemas/SpanWithChildren" } } } @@ -1045,13 +1078,21 @@ ], "parameters": [ { - "name": "trace_id", + "name": "span_id", "in": "query", "required": true, "schema": { "type": "string" } }, + { + "name": "max_depth", + "in": "query", + "required": false, + "schema": { + "type": "integer" + } + }, { "name": "X-LlamaStack-ProviderData", "in": "header", @@ -1061,7 +1102,17 @@ "type": "string" } } - ] + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/GetSpanTreeRequest" + } + } + }, + "required": true + } } }, "/alpha/post-training/job/artifacts": { @@ -1778,6 +1829,86 @@ } } }, + "/alpha/telemetry/query-spans": { + "post": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/jsonl": { + "schema": { + "$ref": "#/components/schemas/Span" + } + } + } + } + }, + "tags": [ + "Telemetry" + ], + "parameters": [ + { + "name": "X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + 
"schema": { + "$ref": "#/components/schemas/QuerySpansRequest" + } + } + }, + "required": true + } + } + }, + "/alpha/telemetry/query-traces": { + "post": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/jsonl": { + "schema": { + "$ref": "#/components/schemas/Trace" + } + } + } + } + }, + "tags": [ + "Telemetry" + ], + "parameters": [ + { + "name": "X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryTracesRequest" + } + } + }, + "required": true + } + } + }, "/alpha/datasets/register": { "post": { "responses": { @@ -2066,6 +2197,39 @@ } } }, + "/alpha/telemetry/save-spans-to-dataset": { + "post": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "Telemetry" + ], + "parameters": [ + { + "name": "X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SaveSpansToDatasetRequest" + } + } + }, + "required": true + } + } + }, "/alpha/scoring/score": { "post": { "responses": { @@ -2226,6 +2390,39 @@ } } }, + "/alpha/datasets/unregister": { + "post": { + "responses": { + "200": { + "description": "OK" + } + }, + "tags": [ + "Datasets" + ], + "parameters": [ + { + "name": "X-LlamaStack-ProviderData", + "in": "header", + "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", + "required": false, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/UnregisterDatasetRequest" + } + } + }, + "required": true + } + } + }, "/alpha/memory-banks/unregister": { "post": { "responses": { @@ -2291,44 +2488,52 @@ "required": true } } - }, - "/alpha/datasets/unregister": { - "post": { - "responses": { - "200": { - "description": "OK" - } - }, - "tags": [ - "Datasets" - ], - "parameters": [ - { - "name": "X-LlamaStack-ProviderData", - "in": "header", - "description": "JSON-encoded provider data which will be made available to the adapter servicing the API", - "required": false, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/UnregisterDatasetRequest" - } - } - }, - "required": true - } - } } }, "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema", "components": { "schemas": { + "AppendRowsRequest": { + "type": "object", + "properties": { + "dataset_id": { + "type": "string" + }, + "rows": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + } + }, + "additionalProperties": false, + "required": [ + "dataset_id", + "rows" + ] + }, "BuiltinTool": { "type": "string", "enum": [ @@ -5878,13 +6083,38 @@ ], "title": "A safety shield resource that can be used to check content" }, - "Trace": { + "GetSpanTreeRequest": { "type": "object", 
"properties": { + "attributes_to_return": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false + }, + "SpanStatus": { + "type": "string", + "enum": [ + "ok", + "error" + ] + }, + "SpanWithChildren": { + "type": "object", + "properties": { + "span_id": { + "type": "string" + }, "trace_id": { "type": "string" }, - "root_span_id": { + "parent_span_id": { + "type": "string" + }, + "name": { "type": "string" }, "start_time": { @@ -5894,13 +6124,49 @@ "end_time": { "type": "string", "format": "date-time" + }, + "attributes": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + }, + "children": { + "type": "array", + "items": { + "$ref": "#/components/schemas/SpanWithChildren" + } + }, + "status": { + "$ref": "#/components/schemas/SpanStatus" } }, "additionalProperties": false, "required": [ + "span_id", "trace_id", - "root_span_id", - "start_time" + "name", + "start_time", + "children" ] }, "Checkpoint": { @@ -6313,13 +6579,6 @@ "name" ] }, - "SpanStatus": { - "type": "string", - "enum": [ - "ok", - "error" - ] - }, "StructuredLogEvent": { "type": "object", "properties": { @@ -6458,11 +6717,15 @@ "$ref": "#/components/schemas/StructuredLogEvent" } ] + }, + "ttl_seconds": { + "type": "integer" } }, "additionalProperties": false, "required": [ - "event" + "event", + "ttl_seconds" ] }, "DPOAlignmentConfig": { @@ -6772,6 +7035,185 @@ "scores" ] }, + "QueryCondition": { + "type": "object", + "properties": { + "key": { + "type": "string" + }, + "op": { + "$ref": "#/components/schemas/QueryConditionOp" + }, + "value": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + }, + "additionalProperties": false, + "required": [ + "key", + "op", + "value" + ] + }, + "QueryConditionOp": { + "type": "string", + "enum": [ + "eq", + "ne", + "gt", + "lt" + ] + }, + "QuerySpansRequest": { + "type": "object", + "properties": { + "attribute_filters": { + "type": "array", + "items": { + "$ref": "#/components/schemas/QueryCondition" + } + }, + "attributes_to_return": { + "type": "array", + "items": { + "type": "string" + } + }, + "max_depth": { + "type": "integer" + } + }, + "additionalProperties": false, + "required": [ + "attribute_filters", + "attributes_to_return" + ] + }, + "Span": { + "type": "object", + "properties": { + "span_id": { + "type": "string" + }, + "trace_id": { + "type": "string" + }, + "parent_span_id": { + "type": "string" + }, + "name": { + "type": "string" + }, + "start_time": { + "type": "string", + "format": "date-time" + }, + "end_time": { + "type": "string", + "format": "date-time" + }, + "attributes": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "span_id", + "trace_id", + "name", + "start_time" + ] + }, + "QueryTracesRequest": { + "type": "object", + "properties": { + "attribute_filters": { + "type": "array", + "items": { + "$ref": "#/components/schemas/QueryCondition" + } + }, + "limit": { + "type": "integer" + }, + "offset": { + "type": "integer" + }, + "order_by": { + "type": 
"array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false + }, + "Trace": { + "type": "object", + "properties": { + "trace_id": { + "type": "string" + }, + "root_span_id": { + "type": "string" + }, + "start_time": { + "type": "string", + "format": "date-time" + }, + "end_time": { + "type": "string", + "format": "date-time" + } + }, + "additionalProperties": false, + "required": [ + "trace_id", + "root_span_id", + "start_time" + ] + }, "RegisterDatasetRequest": { "type": "object", "properties": { @@ -7488,6 +7930,35 @@ }, "additionalProperties": false }, + "SaveSpansToDatasetRequest": { + "type": "object", + "properties": { + "attribute_filters": { + "type": "array", + "items": { + "$ref": "#/components/schemas/QueryCondition" + } + }, + "attributes_to_save": { + "type": "array", + "items": { + "type": "string" + } + }, + "dataset_id": { + "type": "string" + }, + "max_depth": { + "type": "integer" + } + }, + "additionalProperties": false, + "required": [ + "attribute_filters", + "attributes_to_save", + "dataset_id" + ] + }, "ScoreRequest": { "type": "object", "properties": { @@ -7927,6 +8398,18 @@ ], "title": "Response from the synthetic data generation. Batch of (prompt, response, score) tuples that pass the threshold." }, + "UnregisterDatasetRequest": { + "type": "object", + "properties": { + "dataset_id": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "dataset_id" + ] + }, "UnregisterMemoryBankRequest": { "type": "object", "properties": { @@ -7950,18 +8433,6 @@ "required": [ "model_id" ] - }, - "UnregisterDatasetRequest": { - "type": "object", - "properties": { - "dataset_id": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "dataset_id" - ] } }, "responses": {} @@ -8027,6 +8498,10 @@ "name": "AppEvalTaskConfig", "description": "" }, + { + "name": "AppendRowsRequest", + "description": "" + }, { "name": "Attachment", "description": "" @@ -8182,6 +8657,10 @@ "name": "GetAgentsSessionRequest", "description": "" }, + { + "name": "GetSpanTreeRequest", + "description": "" + }, { "name": "GraphMemoryBank", "description": "" @@ -8336,6 +8815,14 @@ "name": "QLoraFinetuningConfig", "description": "" }, + { + "name": "QueryCondition", + "description": "" + }, + { + "name": "QueryConditionOp", + "description": "" + }, { "name": "QueryDocumentsRequest", "description": "" @@ -8344,6 +8831,14 @@ "name": "QueryDocumentsResponse", "description": "" }, + { + "name": "QuerySpansRequest", + "description": "" + }, + { + "name": "QueryTracesRequest", + "description": "" + }, { "name": "RLHFAlgorithm", "description": "" @@ -8415,6 +8910,10 @@ "name": "SamplingStrategy", "description": "" }, + { + "name": "SaveSpansToDatasetRequest", + "description": "" + }, { "name": "ScoreBatchRequest", "description": "" @@ -8464,6 +8963,10 @@ { "name": "Shields" }, + { + "name": "Span", + "description": "" + }, { "name": "SpanEndPayload", "description": "" @@ -8476,6 +8979,10 @@ "name": "SpanStatus", "description": "" }, + { + "name": "SpanWithChildren", + "description": "" + }, { "name": "StopReason", "description": "" @@ -8566,6 +9073,10 @@ "name": "URL", "description": "" }, + { + "name": "UnregisterDatasetRequest", + "description": "" + }, { "name": "UnregisterMemoryBankRequest", "description": "" @@ -8574,10 +9085,6 @@ "name": "UnregisterModelRequest", "description": "" }, - { - "name": "UnregisterDatasetRequest", - "description": "" - }, { "name": "UnstructuredLogEvent", "description": "" @@ -8643,6 +9150,7 @@ 
"AgentTurnResponseTurnCompletePayload", "AgentTurnResponseTurnStartPayload", "AppEvalTaskConfig", + "AppendRowsRequest", "Attachment", "BatchChatCompletionRequest", "BatchChatCompletionResponse", @@ -8678,6 +9186,7 @@ "FinetuningAlgorithm", "FunctionCallToolDefinition", "GetAgentsSessionRequest", + "GetSpanTreeRequest", "GraphMemoryBank", "GraphMemoryBankParams", "HealthInfo", @@ -8712,8 +9221,12 @@ "PreferenceOptimizeRequest", "ProviderInfo", "QLoraFinetuningConfig", + "QueryCondition", + "QueryConditionOp", "QueryDocumentsRequest", "QueryDocumentsResponse", + "QuerySpansRequest", + "QueryTracesRequest", "RLHFAlgorithm", "RegexParserScoringFnParams", "RegisterDatasetRequest", @@ -8731,6 +9244,7 @@ "SafetyViolation", "SamplingParams", "SamplingStrategy", + "SaveSpansToDatasetRequest", "ScoreBatchRequest", "ScoreBatchResponse", "ScoreRequest", @@ -8741,9 +9255,11 @@ "Session", "Shield", "ShieldCallStep", + "Span", "SpanEndPayload", "SpanStartPayload", "SpanStatus", + "SpanWithChildren", "StopReason", "StructuredLogEvent", "SupervisedFineTuneRequest", @@ -8765,9 +9281,9 @@ "TrainingConfig", "Turn", "URL", + "UnregisterDatasetRequest", "UnregisterMemoryBankRequest", "UnregisterModelRequest", - "UnregisterDatasetRequest", "UnstructuredLogEvent", "UserMessage", "VectorMemoryBank", diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml index 6564ddf3f..0b737a697 100644 --- a/docs/resources/llama-stack-spec.yaml +++ b/docs/resources/llama-stack-spec.yaml @@ -242,6 +242,27 @@ components: - eval_candidate - scoring_params type: object + AppendRowsRequest: + additionalProperties: false + properties: + dataset_id: + type: string + rows: + items: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + type: array + required: + - dataset_id + - rows + type: object Attachment: additionalProperties: false properties: @@ -1059,6 +1080,14 @@ components: type: string type: array type: object + GetSpanTreeRequest: + additionalProperties: false + properties: + attributes_to_return: + items: + type: string + type: array + type: object GraphMemoryBank: additionalProperties: false properties: @@ -1277,8 +1306,11 @@ components: - $ref: '#/components/schemas/UnstructuredLogEvent' - $ref: '#/components/schemas/MetricEvent' - $ref: '#/components/schemas/StructuredLogEvent' + ttl_seconds: + type: integer required: - event + - ttl_seconds type: object LogSeverity: enum: @@ -1825,6 +1857,33 @@ components: - rank - alpha type: object + QueryCondition: + additionalProperties: false + properties: + key: + type: string + op: + $ref: '#/components/schemas/QueryConditionOp' + value: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + required: + - key + - op + - value + type: object + QueryConditionOp: + enum: + - eq + - ne + - gt + - lt + type: string QueryDocumentsRequest: additionalProperties: false properties: @@ -1887,6 +1946,39 @@ components: - chunks - scores type: object + QuerySpansRequest: + additionalProperties: false + properties: + attribute_filters: + items: + $ref: '#/components/schemas/QueryCondition' + type: array + attributes_to_return: + items: + type: string + type: array + max_depth: + type: integer + required: + - attribute_filters + - attributes_to_return + type: object + QueryTracesRequest: + additionalProperties: false + properties: + attribute_filters: + items: + $ref: '#/components/schemas/QueryCondition' + 
type: array + limit: + type: integer + offset: + type: integer + order_by: + items: + type: string + type: array + type: object RLHFAlgorithm: enum: - dpo @@ -2392,6 +2484,26 @@ components: - top_p - top_k type: string + SaveSpansToDatasetRequest: + additionalProperties: false + properties: + attribute_filters: + items: + $ref: '#/components/schemas/QueryCondition' + type: array + attributes_to_save: + items: + type: string + type: array + dataset_id: + type: string + max_depth: + type: integer + required: + - attribute_filters + - attributes_to_save + - dataset_id + type: object ScoreBatchRequest: additionalProperties: false properties: @@ -2731,6 +2843,39 @@ components: - step_id - step_type type: object + Span: + additionalProperties: false + properties: + attributes: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + end_time: + format: date-time + type: string + name: + type: string + parent_span_id: + type: string + span_id: + type: string + start_time: + format: date-time + type: string + trace_id: + type: string + required: + - span_id + - trace_id + - name + - start_time + type: object SpanEndPayload: additionalProperties: false properties: @@ -2764,6 +2909,46 @@ components: - ok - error type: string + SpanWithChildren: + additionalProperties: false + properties: + attributes: + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + type: object + children: + items: + $ref: '#/components/schemas/SpanWithChildren' + type: array + end_time: + format: date-time + type: string + name: + type: string + parent_span_id: + type: string + span_id: + type: string + start_time: + format: date-time + type: string + status: + $ref: '#/components/schemas/SpanStatus' + trace_id: + type: string + required: + - span_id + - trace_id + - name + - start_time + - children + type: object StopReason: enum: - end_of_turn @@ -3237,6 +3422,14 @@ components: format: uri pattern: ^(https?://|file://|data:) type: string + UnregisterDatasetRequest: + additionalProperties: false + properties: + dataset_id: + type: string + required: + - dataset_id + type: object UnregisterMemoryBankRequest: additionalProperties: false properties: @@ -3253,14 +3446,6 @@ components: required: - model_id type: object - UnregisterDatasetRequest: - additionalProperties: false - properties: - dataset_id: - type: string - required: - - dataset_id - type: object UnstructuredLogEvent: additionalProperties: false properties: @@ -3408,7 +3593,7 @@ components: info: description: "This is the specification of the Llama Stack that provides\n \ \ a set of endpoints and their corresponding interfaces that are tailored\ - \ to\n best leverage Llama Models. Generated at 2024-11-22 17:23:55.034164" + \ to\n best leverage Llama Models." 
title: Llama Stack Specification version: alpha jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema @@ -3692,6 +3877,27 @@ paths: description: OK tags: - BatchInference (Coming Soon) + /alpha/datasetio/append-rows: + post: + parameters: + - description: JSON-encoded provider data which will be made available to the + adapter servicing the API + in: header + name: X-LlamaStack-ProviderData + required: false + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/AppendRowsRequest' + required: true + responses: + '200': + description: OK + tags: + - DatasetIO /alpha/datasetio/get-rows-paginated: get: parameters: @@ -4785,14 +4991,19 @@ paths: description: OK tags: - SyntheticDataGeneration (Coming Soon) - /alpha/telemetry/get-trace: - get: + /alpha/telemetry/get-span-tree: + post: parameters: - in: query - name: trace_id + name: span_id required: true schema: type: string + - in: query + name: max_depth + required: false + schema: + type: integer - description: JSON-encoded provider data which will be made available to the adapter servicing the API in: header @@ -4800,12 +5011,18 @@ paths: required: false schema: type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/GetSpanTreeRequest' + required: true responses: '200': content: application/json: schema: - $ref: '#/components/schemas/Trace' + $ref: '#/components/schemas/SpanWithChildren' description: OK tags: - Telemetry @@ -4830,6 +5047,77 @@ paths: description: OK tags: - Telemetry + /alpha/telemetry/query-spans: + post: + parameters: + - description: JSON-encoded provider data which will be made available to the + adapter servicing the API + in: header + name: X-LlamaStack-ProviderData + required: false + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/QuerySpansRequest' + required: true + responses: + '200': + content: + application/jsonl: + schema: + $ref: '#/components/schemas/Span' + description: OK + tags: + - Telemetry + /alpha/telemetry/query-traces: + post: + parameters: + - description: JSON-encoded provider data which will be made available to the + adapter servicing the API + in: header + name: X-LlamaStack-ProviderData + required: false + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/QueryTracesRequest' + required: true + responses: + '200': + content: + application/jsonl: + schema: + $ref: '#/components/schemas/Trace' + description: OK + tags: + - Telemetry + /alpha/telemetry/save-spans-to-dataset: + post: + parameters: + - description: JSON-encoded provider data which will be made available to the + adapter servicing the API + in: header + name: X-LlamaStack-ProviderData + required: false + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/SaveSpansToDatasetRequest' + required: true + responses: + '200': + description: OK + tags: + - Telemetry security: - Default: [] servers: @@ -4878,6 +5166,9 @@ tags: - description: name: AppEvalTaskConfig +- description: + name: AppendRowsRequest - description: name: Attachment - description: name: GetAgentsSessionRequest +- description: + name: GetSpanTreeRequest - description: name: GraphMemoryBank @@ -5105,12 +5399,23 @@ tags: - description: name: QLoraFinetuningConfig +- description: + name: QueryCondition +- description: + name: QueryConditionOp - description: name: QueryDocumentsRequest - 
description: name: QueryDocumentsResponse +- description: + name: QuerySpansRequest +- description: + name: QueryTracesRequest - description: name: RLHFAlgorithm - description: name: SamplingStrategy +- description: + name: SaveSpansToDatasetRequest - description: name: ScoreBatchRequest @@ -5190,6 +5498,8 @@ tags: - description: name: ShieldCallStep - name: Shields +- description: + name: Span - description: name: SpanEndPayload - description: name: SpanStatus +- description: + name: SpanWithChildren - description: name: StopReason - description: name: URL +- description: + name: UnregisterDatasetRequest - description: name: UnregisterMemoryBankRequest - description: name: UnregisterModelRequest -- description: - name: UnregisterDatasetRequest - description: name: UnstructuredLogEvent @@ -5326,6 +5639,7 @@ x-tagGroups: - AgentTurnResponseTurnCompletePayload - AgentTurnResponseTurnStartPayload - AppEvalTaskConfig + - AppendRowsRequest - Attachment - BatchChatCompletionRequest - BatchChatCompletionResponse @@ -5361,6 +5675,7 @@ x-tagGroups: - FinetuningAlgorithm - FunctionCallToolDefinition - GetAgentsSessionRequest + - GetSpanTreeRequest - GraphMemoryBank - GraphMemoryBankParams - HealthInfo @@ -5395,8 +5710,12 @@ x-tagGroups: - PreferenceOptimizeRequest - ProviderInfo - QLoraFinetuningConfig + - QueryCondition + - QueryConditionOp - QueryDocumentsRequest - QueryDocumentsResponse + - QuerySpansRequest + - QueryTracesRequest - RLHFAlgorithm - RegexParserScoringFnParams - RegisterDatasetRequest @@ -5414,6 +5733,7 @@ x-tagGroups: - SafetyViolation - SamplingParams - SamplingStrategy + - SaveSpansToDatasetRequest - ScoreBatchRequest - ScoreBatchResponse - ScoreRequest @@ -5424,9 +5744,11 @@ x-tagGroups: - Session - Shield - ShieldCallStep + - Span - SpanEndPayload - SpanStartPayload - SpanStatus + - SpanWithChildren - StopReason - StructuredLogEvent - SupervisedFineTuneRequest @@ -5448,9 +5770,9 @@ x-tagGroups: - TrainingConfig - Turn - URL + - UnregisterDatasetRequest - UnregisterMemoryBankRequest - UnregisterModelRequest - - UnregisterDatasetRequest - UnstructuredLogEvent - UserMessage - VectorMemoryBank diff --git a/llama_stack/apis/telemetry/telemetry.py b/llama_stack/apis/telemetry/telemetry.py index fd60d99a7..12ec5f1d9 100644 --- a/llama_stack/apis/telemetry/telemetry.py +++ b/llama_stack/apis/telemetry/telemetry.py @@ -155,16 +155,23 @@ class SpanWithChildren(Span): status: Optional[SpanStatus] = None +@json_schema_type +class QueryConditionOp(Enum): + EQ = "eq" + NE = "ne" + GT = "gt" + LT = "lt" + + @json_schema_type class QueryCondition(BaseModel): key: str - op: Literal["eq", "ne", "gt", "lt"] + op: QueryConditionOp value: Any @runtime_checkable class Telemetry(Protocol): - @webmethod(route="/telemetry/log-event") async def log_event( self, event: Event, ttl_seconds: int = DEFAULT_TTL_DAYS * 86400 diff --git a/llama_stack/providers/utils/telemetry/sqlite_trace_store.py b/llama_stack/providers/utils/telemetry/sqlite_trace_store.py index 031b6fc73..8d9035216 100644 --- a/llama_stack/providers/utils/telemetry/sqlite_trace_store.py +++ b/llama_stack/providers/utils/telemetry/sqlite_trace_store.py @@ -14,7 +14,6 @@ from llama_stack.apis.telemetry import QueryCondition, SpanWithChildren, Trace class TraceStore(Protocol): - async def query_traces( self, attribute_filters: Optional[List[QueryCondition]] = None, @@ -42,7 +41,6 @@ class SQLiteTraceStore(TraceStore): offset: Optional[int] = 0, order_by: Optional[List[str]] = None, ) -> List[Trace]: - def build_where_clause() -> tuple[str, 
list]: if not attribute_filters: return "", [] @@ -50,7 +48,7 @@ class SQLiteTraceStore(TraceStore): ops_map = {"eq": "=", "ne": "!=", "gt": ">", "lt": "<"} conditions = [ - f"json_extract(s.attributes, '$.{condition.key}') {ops_map[condition.op]} ?" + f"json_extract(s.attributes, '$.{condition.key}') {ops_map[condition.op.value]} ?" for condition in attribute_filters ] params = [condition.value for condition in attribute_filters] From 5335393fe33524ae07f02310a94f453d8d80b65b Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Sun, 8 Dec 2024 22:25:37 -0800 Subject: [PATCH 51/69] Avoid deleting temp directory between agent turns This brings an interesting aspect -- we need to maintain session-level tempdir state (!) since the model was told there was some resource at a given location that it needs to maintain --- .../distribution/tests/library_client_test.py | 32 ++++++++++++++++--- .../agents/meta_reference/agent_instance.py | 9 ++---- .../inline/agents/meta_reference/agents.py | 3 ++ 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/llama_stack/distribution/tests/library_client_test.py b/llama_stack/distribution/tests/library_client_test.py index 5e7b997f3..955640c2b 100644 --- a/llama_stack/distribution/tests/library_client_test.py +++ b/llama_stack/distribution/tests/library_client_test.py @@ -11,7 +11,7 @@ from llama_stack.distribution.library_client import LlamaStackAsLibraryClient from llama_stack_client.lib.agents.agent import Agent from llama_stack_client.lib.agents.event_logger import EventLogger as AgentEventLogger from llama_stack_client.lib.inference.event_logger import EventLogger -from llama_stack_client.types import UserMessage +from llama_stack_client.types import Attachment, UserMessage from llama_stack_client.types.agent_create_params import AgentConfig @@ -67,9 +67,15 @@ def main(config_path: str): ] if os.getenv("BRAVE_SEARCH_API_KEY") else [] + ) + + ( + [ + { + "type": "code_interpreter", + } + ] ), - tool_choice="auto", - tool_prompt_format="json", + tool_choice="required", input_shields=[], output_shields=[], enable_session_persistence=False, @@ -79,10 +85,27 @@ def main(config_path: str): "Hello", "Which players played in the winning team of the NBA western conference semifinals of 2024, please use tools", ] + user_prompts = [ + ( + "Here is a csv, can you describe it ?", + [ + Attachment( + content="https://raw.githubusercontent.com/meta-llama/llama-stack-apps/main/examples/resources/inflation.csv", + mime_type="test/csv", + ) + ], + ), + ("Which year ended with the highest inflation ?", None), + ( + "What macro economic situations that led to such high inflation in that period?", + None, + ), + ("Plot average yearly inflation as a time series", None), + ] session_id = agent.create_session("test-session") - for prompt in user_prompts: + for prompt, attachments in user_prompts: response = agent.create_turn( messages=[ { @@ -90,6 +113,7 @@ def main(config_path: str): "content": prompt, } ], + attachments=attachments, session_id=session_id, ) diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 7df5d3bd4..e367f3c41 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -10,9 +10,7 @@ import logging import os import re import secrets -import shutil import string -import tempfile import uuid from datetime import datetime from typing import 
AsyncGenerator, List, Tuple @@ -57,6 +55,7 @@ class ChatAgent(ShieldRunnerMixin): self, agent_id: str, agent_config: AgentConfig, + tempdir: str, inference_api: Inference, memory_api: Memory, memory_banks_api: MemoryBanks, @@ -65,14 +64,13 @@ class ChatAgent(ShieldRunnerMixin): ): self.agent_id = agent_id self.agent_config = agent_config + self.tempdir = tempdir self.inference_api = inference_api self.memory_api = memory_api self.memory_banks_api = memory_banks_api self.safety_api = safety_api self.storage = AgentPersistence(agent_id, persistence_store) - self.tempdir = tempfile.mkdtemp() - builtin_tools = [] for tool_defn in agent_config.tools: if isinstance(tool_defn, WolframAlphaToolDefinition): @@ -103,9 +101,6 @@ class ChatAgent(ShieldRunnerMixin): output_shields=agent_config.output_shields, ) - def __del__(self): - shutil.rmtree(self.tempdir) - def turn_to_messages(self, turn: Turn) -> List[Message]: messages = [] diff --git a/llama_stack/providers/inline/agents/meta_reference/agents.py b/llama_stack/providers/inline/agents/meta_reference/agents.py index 0b0bb6e27..dec5ec960 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agents.py +++ b/llama_stack/providers/inline/agents/meta_reference/agents.py @@ -7,6 +7,7 @@ import json import logging import shutil +import tempfile import uuid from typing import AsyncGenerator @@ -43,6 +44,7 @@ class MetaReferenceAgentsImpl(Agents): self.memory_banks_api = memory_banks_api self.in_memory_store = InmemoryKVStoreImpl() + self.tempdir = tempfile.mkdtemp() async def initialize(self) -> None: self.persistence_store = await kvstore_impl(self.config.persistence_store) @@ -94,6 +96,7 @@ class MetaReferenceAgentsImpl(Agents): return ChatAgent( agent_id=agent_id, agent_config=agent_config, + tempdir=self.tempdir, inference_api=self.inference_api, safety_api=self.safety_api, memory_api=self.memory_api, From a2170353af47015dbe2f057b147a20fd0ce81681 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Mon, 9 Dec 2024 09:37:52 -0800 Subject: [PATCH 52/69] better detection for jupyter --- llama_stack/distribution/library_client.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py index 64cd343d4..693e2f56c 100644 --- a/llama_stack/distribution/library_client.py +++ b/llama_stack/distribution/library_client.py @@ -33,16 +33,17 @@ from llama_stack.distribution.stack import ( T = TypeVar("T") -def is_jupyter(): - """Check if we're running in a Jupyter notebook""" +def in_notebook(): try: - shell = get_ipython().__class__.__name__ # type: ignore - if shell == "ZMQInteractiveShell": # Jupyter notebook or qtconsole - return True - else: + from IPython import get_ipython + + if "IPKernelApp" not in get_ipython().config: # pragma: no cover return False - except NameError: # Probably standard Python interpreter + except ImportError: return False + except AttributeError: + return False + return True def stream_across_asyncio_run_boundary( @@ -115,7 +116,7 @@ class LlamaStackAsLibraryClient(LlamaStackClient): self.pool_executor = ThreadPoolExecutor(max_workers=4) def initialize(self): - if is_jupyter(): + if in_notebook(): import nest_asyncio nest_asyncio.apply() From c699e884b561e2c550ae0d8d179c5f025fd30d07 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 9 Dec 2024 11:18:53 -0800 Subject: [PATCH 53/69] fix telemetry import (#585) # What does this PR do? 
fix issue image ## Test Plan ``` llama stack run ``` image ## Sources Please link relevant resources if necessary. ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. --- llama_stack/distribution/server/server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index 43e9c0706..8f24f3eaf 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -43,9 +43,9 @@ from llama_stack.distribution.stack import ( replace_env_vars, validate_env_pair, ) -from llama_stack.providers.inline.telemetry.meta_reference import ( +from llama_stack.providers.inline.telemetry.meta_reference.config import TelemetryConfig +from llama_stack.providers.inline.telemetry.meta_reference.telemetry import ( TelemetryAdapter, - TelemetryConfig, ) from .endpoints import get_all_api_endpoints From cd40a5fdbfee6f5da17fb943526fb436eee757d1 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 9 Dec 2024 15:40:59 -0800 Subject: [PATCH 54/69] update template run.yaml to include openai api key for braintrust (#590) # What does this PR do? **Why** - braintrust provider needs OpenAI API Key set in config for DirectClient to work ## Test Plan ``` python llama_stack/scripts/distro_codegen.py ``` image - set API key in client via provider_data image ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. 
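## Example: `${env.OPENAI_API_KEY:}` resolution

The templates above read the key through the stack's `${env.VAR:default}` substitution (handled by `replace_env_vars` in `llama_stack/distribution/stack.py`), with an empty default so that a missing environment variable leaves the config blank and the provider falls back to the per-request `X-LlamaStack-ProviderData` header. The snippet below is a minimal, illustrative sketch of those substitution semantics only -- it is not the actual `replace_env_vars` implementation, and `resolve_env_placeholders` is a hypothetical helper name.

```python
# Illustrative sketch of the `${env.NAME:default}` placeholder syntax used in run.yaml.
# Assumption: real resolution happens in llama_stack.distribution.stack.replace_env_vars;
# this simplified version only demonstrates the fallback-to-default behavior.
import os
import re

_PLACEHOLDER = re.compile(r"\$\{env\.([A-Za-z_][A-Za-z0-9_]*)(?::([^}]*))?\}")


def resolve_env_placeholders(value: str) -> str:
    """Replace `${env.NAME:default}` with os.environ[NAME], or the default when unset."""

    def _substitute(match: re.Match) -> str:
        name, default = match.group(1), match.group(2) or ""
        return os.environ.get(name, default)

    return _PLACEHOLDER.sub(_substitute, value)


# With OPENAI_API_KEY unset this resolves to "openai_api_key: ", i.e. an empty key,
# which is why the provider then checks request provider data (see braintrust.py).
print(resolve_env_placeholders("openai_api_key: ${env.OPENAI_API_KEY:}"))
```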
--- .../providers/inline/scoring/braintrust/braintrust.py | 2 +- llama_stack/providers/inline/scoring/braintrust/config.py | 6 ++++++ llama_stack/templates/bedrock/run.yaml | 3 ++- llama_stack/templates/fireworks/run.yaml | 3 ++- llama_stack/templates/hf-endpoint/run-with-safety.yaml | 3 ++- llama_stack/templates/hf-endpoint/run.yaml | 3 ++- llama_stack/templates/hf-serverless/run-with-safety.yaml | 3 ++- llama_stack/templates/hf-serverless/run.yaml | 3 ++- .../templates/meta-reference-gpu/run-with-safety.yaml | 3 ++- llama_stack/templates/meta-reference-gpu/run.yaml | 3 ++- llama_stack/templates/meta-reference-quantized-gpu/run.yaml | 3 ++- llama_stack/templates/ollama/run-with-safety.yaml | 3 ++- llama_stack/templates/ollama/run.yaml | 3 ++- llama_stack/templates/tgi/run-with-safety.yaml | 3 ++- llama_stack/templates/tgi/run.yaml | 3 ++- llama_stack/templates/together/run.yaml | 3 ++- llama_stack/templates/vllm-gpu/run.yaml | 3 ++- 17 files changed, 37 insertions(+), 16 deletions(-) diff --git a/llama_stack/providers/inline/scoring/braintrust/braintrust.py b/llama_stack/providers/inline/scoring/braintrust/braintrust.py index ee515d588..1f266a236 100644 --- a/llama_stack/providers/inline/scoring/braintrust/braintrust.py +++ b/llama_stack/providers/inline/scoring/braintrust/braintrust.py @@ -86,7 +86,7 @@ class BraintrustScoringImpl( async def set_api_key(self) -> None: # api key is in the request headers - if self.config.openai_api_key is None: + if self.config.openai_api_key is None or not self.config.openai_api_key: provider_data = self.get_request_provider_data() if provider_data is None or not provider_data.openai_api_key: raise ValueError( diff --git a/llama_stack/providers/inline/scoring/braintrust/config.py b/llama_stack/providers/inline/scoring/braintrust/config.py index fae0b17eb..e12249432 100644 --- a/llama_stack/providers/inline/scoring/braintrust/config.py +++ b/llama_stack/providers/inline/scoring/braintrust/config.py @@ -11,3 +11,9 @@ class BraintrustScoringConfig(BaseModel): default=None, description="The OpenAI API Key", ) + + @classmethod + def sample_run_config(cls, **kwargs) -> Dict[str, Any]: + return { + "openai_api_key": "${env.OPENAI_API_KEY:}", + } diff --git a/llama_stack/templates/bedrock/run.yaml b/llama_stack/templates/bedrock/run.yaml index db0ee9d85..47885b536 100644 --- a/llama_stack/templates/bedrock/run.yaml +++ b/llama_stack/templates/bedrock/run.yaml @@ -63,7 +63,8 @@ providers: config: {} - provider_id: braintrust provider_type: inline::braintrust - config: {} + config: + openai_api_key: ${env.OPENAI_API_KEY:} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index c75db478d..70e2c1e5c 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -65,7 +65,8 @@ providers: config: {} - provider_id: braintrust provider_type: inline::braintrust - config: {} + config: + openai_api_key: ${env.OPENAI_API_KEY:} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/hf-endpoint/run-with-safety.yaml b/llama_stack/templates/hf-endpoint/run-with-safety.yaml index 678857201..845abf0dc 100644 --- a/llama_stack/templates/hf-endpoint/run-with-safety.yaml +++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml @@ -70,7 +70,8 @@ providers: config: {} - provider_id: braintrust provider_type: inline::braintrust - config: {} + config: + openai_api_key: ${env.OPENAI_API_KEY:} metadata_store: namespace: null 
type: sqlite diff --git a/llama_stack/templates/hf-endpoint/run.yaml b/llama_stack/templates/hf-endpoint/run.yaml index c062c6c98..815ee7f03 100644 --- a/llama_stack/templates/hf-endpoint/run.yaml +++ b/llama_stack/templates/hf-endpoint/run.yaml @@ -65,7 +65,8 @@ providers: config: {} - provider_id: braintrust provider_type: inline::braintrust - config: {} + config: + openai_api_key: ${env.OPENAI_API_KEY:} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/hf-serverless/run-with-safety.yaml b/llama_stack/templates/hf-serverless/run-with-safety.yaml index 4a14ba093..82276ca8f 100644 --- a/llama_stack/templates/hf-serverless/run-with-safety.yaml +++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml @@ -70,7 +70,8 @@ providers: config: {} - provider_id: braintrust provider_type: inline::braintrust - config: {} + config: + openai_api_key: ${env.OPENAI_API_KEY:} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/hf-serverless/run.yaml b/llama_stack/templates/hf-serverless/run.yaml index 268efddc4..6f87c04e2 100644 --- a/llama_stack/templates/hf-serverless/run.yaml +++ b/llama_stack/templates/hf-serverless/run.yaml @@ -65,7 +65,8 @@ providers: config: {} - provider_id: braintrust provider_type: inline::braintrust - config: {} + config: + openai_api_key: ${env.OPENAI_API_KEY:} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml index 963679665..044c1e7fd 100644 --- a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml +++ b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml @@ -72,7 +72,8 @@ providers: config: {} - provider_id: braintrust provider_type: inline::braintrust - config: {} + config: + openai_api_key: ${env.OPENAI_API_KEY:} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/meta-reference-gpu/run.yaml b/llama_stack/templates/meta-reference-gpu/run.yaml index a74cde768..e8fdb10c2 100644 --- a/llama_stack/templates/meta-reference-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-gpu/run.yaml @@ -66,7 +66,8 @@ providers: config: {} - provider_id: braintrust provider_type: inline::braintrust - config: {} + config: + openai_api_key: ${env.OPENAI_API_KEY:} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml index 5aada0fe6..0232ec51c 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml @@ -68,7 +68,8 @@ providers: config: {} - provider_id: braintrust provider_type: inline::braintrust - config: {} + config: + openai_api_key: ${env.OPENAI_API_KEY:} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml index 2ab0f78f0..fcb1b2dba 100644 --- a/llama_stack/templates/ollama/run-with-safety.yaml +++ b/llama_stack/templates/ollama/run-with-safety.yaml @@ -64,7 +64,8 @@ providers: config: {} - provider_id: braintrust provider_type: inline::braintrust - config: {} + config: + openai_api_key: ${env.OPENAI_API_KEY:} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml index c5206c2d0..2e739aac2 100644 --- a/llama_stack/templates/ollama/run.yaml +++ 
b/llama_stack/templates/ollama/run.yaml @@ -64,7 +64,8 @@ providers: config: {} - provider_id: braintrust provider_type: inline::braintrust - config: {} + config: + openai_api_key: ${env.OPENAI_API_KEY:} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml index ecd03c36a..a7375a90f 100644 --- a/llama_stack/templates/tgi/run-with-safety.yaml +++ b/llama_stack/templates/tgi/run-with-safety.yaml @@ -68,7 +68,8 @@ providers: config: {} - provider_id: braintrust provider_type: inline::braintrust - config: {} + config: + openai_api_key: ${env.OPENAI_API_KEY:} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml index b93f09042..a3e21075f 100644 --- a/llama_stack/templates/tgi/run.yaml +++ b/llama_stack/templates/tgi/run.yaml @@ -64,7 +64,8 @@ providers: config: {} - provider_id: braintrust provider_type: inline::braintrust - config: {} + config: + openai_api_key: ${env.OPENAI_API_KEY:} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml index 381557816..529bf7873 100644 --- a/llama_stack/templates/together/run.yaml +++ b/llama_stack/templates/together/run.yaml @@ -65,7 +65,8 @@ providers: config: {} - provider_id: braintrust provider_type: inline::braintrust - config: {} + config: + openai_api_key: ${env.OPENAI_API_KEY:} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml index 1442273f4..8353dbd51 100644 --- a/llama_stack/templates/vllm-gpu/run.yaml +++ b/llama_stack/templates/vllm-gpu/run.yaml @@ -68,7 +68,8 @@ providers: config: {} - provider_id: braintrust provider_type: inline::braintrust - config: {} + config: + openai_api_key: ${env.OPENAI_API_KEY:} metadata_store: namespace: null type: sqlite From ab7145a04f2b83d0c5e65356139d466fc2632a5f Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Mon, 9 Dec 2024 15:43:12 -0800 Subject: [PATCH 55/69] minor refactor --- llama_stack/providers/inline/scoring/braintrust/braintrust.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_stack/providers/inline/scoring/braintrust/braintrust.py b/llama_stack/providers/inline/scoring/braintrust/braintrust.py index 1f266a236..8b22a8930 100644 --- a/llama_stack/providers/inline/scoring/braintrust/braintrust.py +++ b/llama_stack/providers/inline/scoring/braintrust/braintrust.py @@ -86,7 +86,7 @@ class BraintrustScoringImpl( async def set_api_key(self) -> None: # api key is in the request headers - if self.config.openai_api_key is None or not self.config.openai_api_key: + if not self.config.openai_api_key: provider_data = self.get_request_provider_data() if provider_data is None or not provider_data.openai_api_key: raise ValueError( From bc1fddf1df68fd845ae01f517eb8979f151e10d9 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Mon, 9 Dec 2024 15:46:26 -0800 Subject: [PATCH 56/69] add tracing to library client (#591) --- llama_stack/distribution/library_client.py | 40 ++++++++++++++----- .../meta_reference/sqlite_span_processor.py | 26 +++++++++--- 2 files changed, 49 insertions(+), 17 deletions(-) diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py index 693e2f56c..3a87f0c97 100644 --- a/llama_stack/distribution/library_client.py +++ b/llama_stack/distribution/library_client.py @@ -22,6 +22,7 @@ 
from termcolor import cprint from llama_stack.distribution.build import print_pip_install_help from llama_stack.distribution.configure import parse_and_maybe_upgrade_config +from llama_stack.distribution.datatypes import Api from llama_stack.distribution.resolver import ProviderRegistry from llama_stack.distribution.server.endpoints import get_all_api_endpoints from llama_stack.distribution.stack import ( @@ -29,6 +30,11 @@ from llama_stack.distribution.stack import ( get_stack_run_config_from_template, replace_env_vars, ) +from llama_stack.providers.utils.telemetry.tracing import ( + end_trace, + setup_logger, + start_trace, +) T = TypeVar("T") @@ -187,6 +193,10 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): ) return False + # Set up telemetry logger similar to server.py + if Api.telemetry in self.impls: + setup_logger(self.impls[Api.telemetry]) + console = Console() console.print(f"Using config [blue]{self.config_path_or_template_name}[/blue]:") console.print(yaml.dump(self.config.model_dump(), indent=2)) @@ -234,21 +244,29 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): return await self._call_non_streaming(path, "POST", body) async def _call_non_streaming(self, path: str, method: str, body: dict = None): - func = self.endpoint_impls.get(path) - if not func: - raise ValueError(f"No endpoint found for {path}") + await start_trace(path, {"__location__": "library_client"}) + try: + func = self.endpoint_impls.get(path) + if not func: + raise ValueError(f"No endpoint found for {path}") - body = self._convert_body(path, body) - return await func(**body) + body = self._convert_body(path, body) + return await func(**body) + finally: + end_trace() async def _call_streaming(self, path: str, method: str, body: dict = None): - func = self.endpoint_impls.get(path) - if not func: - raise ValueError(f"No endpoint found for {path}") + await start_trace(path, {"__location__": "library_client"}) + try: + func = self.endpoint_impls.get(path) + if not func: + raise ValueError(f"No endpoint found for {path}") - body = self._convert_body(path, body) - async for chunk in await func(**body): - yield chunk + body = self._convert_body(path, body) + async for chunk in await func(**body): + yield chunk + finally: + end_trace() def _convert_body(self, path: str, body: Optional[dict] = None) -> dict: if not body: diff --git a/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py b/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py index 553dd5000..f8fdbc12f 100644 --- a/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py +++ b/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py @@ -20,6 +20,7 @@ class SQLiteSpanProcessor(SpanProcessor): """Initialize the SQLite span processor with a connection string.""" self.conn_string = conn_string self.ttl_days = ttl_days + self._shutdown_event = threading.Event() self.cleanup_task = None self._thread_local = threading.local() self._connections: Dict[int, sqlite3.Connection] = {} @@ -144,9 +145,10 @@ class SQLiteSpanProcessor(SpanProcessor): """Run cleanup periodically.""" import time - while True: + while not self._shutdown_event.is_set(): time.sleep(3600) # Sleep for 1 hour - self._cleanup_old_data() + if not self._shutdown_event.is_set(): + self._cleanup_old_data() def on_start(self, span: Span, parent_context=None): """Called when a span starts.""" @@ -231,11 +233,23 @@ class SQLiteSpanProcessor(SpanProcessor): def shutdown(self): 
"""Cleanup any resources.""" + self._shutdown_event.set() + + # Wait for cleanup thread to finish if it exists + if self.cleanup_task and self.cleanup_task.is_alive(): + self.cleanup_task.join(timeout=5.0) + current_thread_id = threading.get_ident() + with self._lock: - for conn in self._connections.values(): - if conn: - conn.close() - self._connections.clear() + # Close all connections from the current thread + for thread_id, conn in list(self._connections.items()): + if thread_id == current_thread_id: + try: + if conn: + conn.close() + del self._connections[thread_id] + except sqlite3.Error: + pass # Ignore errors during shutdown def force_flush(self, timeout_millis=30000): """Force export of spans.""" From 7615da78b8a60c908584acfc305428d737c000e0 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Mon, 9 Dec 2024 15:54:42 -0800 Subject: [PATCH 57/69] await end_trace in libcli --- llama_stack/distribution/library_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py index 3a87f0c97..08c8e2b5d 100644 --- a/llama_stack/distribution/library_client.py +++ b/llama_stack/distribution/library_client.py @@ -253,7 +253,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): body = self._convert_body(path, body) return await func(**body) finally: - end_trace() + await end_trace() async def _call_streaming(self, path: str, method: str, body: dict = None): await start_trace(path, {"__location__": "library_client"}) @@ -266,7 +266,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): async for chunk in await func(**body): yield chunk finally: - end_trace() + await end_trace() def _convert_body(self, path: str, body: Optional[dict] = None) -> dict: if not body: From a4d8a6009a5a518cb32af71d20db1369a56f936d Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Mon, 9 Dec 2024 17:14:37 -0800 Subject: [PATCH 58/69] Fixes for library client (#587) Library client used _server_ side types which was no bueno. The fix here is not the completely correct fix but it is good for enough and for the demo notebook. 
--- docs/resources/llama-stack-spec.html | 5 +- docs/resources/llama-stack-spec.yaml | 6 +- llama_stack/apis/agents/agents.py | 3 +- llama_stack/apis/agents/event_logger.py | 2 +- llama_stack/distribution/library_client.py | 153 ++++++++++-------- .../agents/meta_reference/agent_instance.py | 4 +- 6 files changed, 89 insertions(+), 84 deletions(-) diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html index d1040f186..14e311cfc 100644 --- a/docs/resources/llama-stack-spec.html +++ b/docs/resources/llama-stack-spec.html @@ -4368,14 +4368,11 @@ "step_id": { "type": "string" }, - "model_response_text_delta": { + "text_delta": { "type": "string" }, "tool_call_delta": { "$ref": "#/components/schemas/ToolCallDelta" - }, - "tool_response_text_delta": { - "type": "string" } }, "additionalProperties": false, diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml index 0b737a697..86fcae23d 100644 --- a/docs/resources/llama-stack-spec.yaml +++ b/docs/resources/llama-stack-spec.yaml @@ -132,8 +132,6 @@ components: const: step_progress default: step_progress type: string - model_response_text_delta: - type: string step_id: type: string step_type: @@ -143,10 +141,10 @@ components: - shield_call - memory_retrieval type: string + text_delta: + type: string tool_call_delta: $ref: '#/components/schemas/ToolCallDelta' - tool_response_text_delta: - type: string required: - event_type - step_type diff --git a/llama_stack/apis/agents/agents.py b/llama_stack/apis/agents/agents.py index 6e41df4f6..575f336af 100644 --- a/llama_stack/apis/agents/agents.py +++ b/llama_stack/apis/agents/agents.py @@ -340,9 +340,8 @@ class AgentTurnResponseStepProgressPayload(BaseModel): step_type: StepType step_id: str - model_response_text_delta: Optional[str] = None + text_delta: Optional[str] = None tool_call_delta: Optional[ToolCallDelta] = None - tool_response_text_delta: Optional[str] = None @json_schema_type diff --git a/llama_stack/apis/agents/event_logger.py b/llama_stack/apis/agents/event_logger.py index 25931b821..737ba385c 100644 --- a/llama_stack/apis/agents/event_logger.py +++ b/llama_stack/apis/agents/event_logger.py @@ -121,7 +121,7 @@ class EventLogger: else: yield event, LogEvent( role=None, - content=event.payload.model_response_text_delta, + content=event.payload.text_delta, end="", color="yellow", ) diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py index 08c8e2b5d..9265bb560 100644 --- a/llama_stack/distribution/library_client.py +++ b/llama_stack/distribution/library_client.py @@ -6,16 +6,18 @@ import asyncio import inspect +import json import os import queue import threading from concurrent.futures import ThreadPoolExecutor +from enum import Enum from pathlib import Path -from typing import Any, Generator, get_args, get_origin, Optional, TypeVar +from typing import Any, Generator, get_args, get_origin, Optional, Type, TypeVar, Union import yaml from llama_stack_client import AsyncLlamaStackClient, LlamaStackClient, NOT_GIVEN -from pydantic import TypeAdapter +from pydantic import BaseModel, TypeAdapter from rich.console import Console from termcolor import cprint @@ -109,6 +111,65 @@ def stream_across_asyncio_run_boundary( future.result() +def convert_pydantic_to_json_value(value: Any, cast_to: Type) -> dict: + if isinstance(value, Enum): + return value.value + elif isinstance(value, list): + return [convert_pydantic_to_json_value(item, cast_to) for item in value] + elif isinstance(value, 
dict): + return {k: convert_pydantic_to_json_value(v, cast_to) for k, v in value.items()} + elif isinstance(value, BaseModel): + # This is quite hacky and we should figure out how to use stuff from + # generated client-sdk code (using ApiResponse.parse() essentially) + value_dict = json.loads(value.model_dump_json()) + + origin = get_origin(cast_to) + if origin is Union: + args = get_args(cast_to) + for arg in args: + arg_name = arg.__name__.split(".")[-1] + value_name = value.__class__.__name__.split(".")[-1] + if arg_name == value_name: + return arg(**value_dict) + + # assume we have the correct association between the server-side type and the client-side type + return cast_to(**value_dict) + + return value + + +def convert_to_pydantic(annotation: Any, value: Any) -> Any: + if isinstance(annotation, type) and annotation in {str, int, float, bool}: + return value + + origin = get_origin(annotation) + if origin is list: + item_type = get_args(annotation)[0] + try: + return [convert_to_pydantic(item_type, item) for item in value] + except Exception: + print(f"Error converting list {value}") + return value + + elif origin is dict: + key_type, val_type = get_args(annotation) + try: + return {k: convert_to_pydantic(val_type, v) for k, v in value.items()} + except Exception: + print(f"Error converting dict {value}") + return value + + try: + # Handle Pydantic models and discriminated unions + return TypeAdapter(annotation).validate_python(value) + except Exception as e: + cprint( + f"Warning: direct client failed to convert parameter {value} into {annotation}: {e}", + "yellow", + ) + return value + + class LlamaStackAsLibraryClient(LlamaStackClient): def __init__( self, @@ -129,23 +190,14 @@ class LlamaStackAsLibraryClient(LlamaStackClient): return asyncio.run(self.async_client.initialize()) - def get(self, *args, **kwargs): + def request(self, *args, **kwargs): if kwargs.get("stream"): return stream_across_asyncio_run_boundary( - lambda: self.async_client.get(*args, **kwargs), + lambda: self.async_client.request(*args, **kwargs), self.pool_executor, ) else: - return asyncio.run(self.async_client.get(*args, **kwargs)) - - def post(self, *args, **kwargs): - if kwargs.get("stream"): - return stream_across_asyncio_run_boundary( - lambda: self.async_client.post(*args, **kwargs), - self.pool_executor, - ) - else: - return asyncio.run(self.async_client.post(*args, **kwargs)) + return asyncio.run(self.async_client.request(*args, **kwargs)) class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): @@ -187,8 +239,9 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): if self.config_path_or_template_name.endswith(".yaml"): print_pip_install_help(self.config.providers) else: + prefix = "!" 
if in_notebook() else "" cprint( - f"Please run:\n\nllama stack build --template {self.config_path_or_template_name} --image-type venv\n\n", + f"Please run:\n\n{prefix}llama stack build --template {self.config_path_or_template_name} --image-type venv\n\n", "yellow", ) return False @@ -212,38 +265,27 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): self.endpoint_impls = endpoint_impls return True - async def get( + async def request( self, - path: str, + cast_to: Any, + options: Any, *, stream=False, - **kwargs, + stream_cls=None, ): if not self.endpoint_impls: raise ValueError("Client not initialized") + params = options.params or {} + params |= options.json_data or {} if stream: - return self._call_streaming(path, "GET") + return self._call_streaming(options.url, params, cast_to) else: - return await self._call_non_streaming(path, "GET") + return await self._call_non_streaming(options.url, params, cast_to) - async def post( - self, - path: str, - *, - body: dict = None, - stream=False, - **kwargs, + async def _call_non_streaming( + self, path: str, body: dict = None, cast_to: Any = None ): - if not self.endpoint_impls: - raise ValueError("Client not initialized") - - if stream: - return self._call_streaming(path, "POST", body) - else: - return await self._call_non_streaming(path, "POST", body) - - async def _call_non_streaming(self, path: str, method: str, body: dict = None): await start_trace(path, {"__location__": "library_client"}) try: func = self.endpoint_impls.get(path) @@ -251,11 +293,11 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): raise ValueError(f"No endpoint found for {path}") body = self._convert_body(path, body) - return await func(**body) + return convert_pydantic_to_json_value(await func(**body), cast_to) finally: await end_trace() - async def _call_streaming(self, path: str, method: str, body: dict = None): + async def _call_streaming(self, path: str, body: dict = None, cast_to: Any = None): await start_trace(path, {"__location__": "library_client"}) try: func = self.endpoint_impls.get(path) @@ -264,7 +306,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): body = self._convert_body(path, body) async for chunk in await func(**body): - yield chunk + yield convert_pydantic_to_json_value(chunk, cast_to) finally: await end_trace() @@ -283,38 +325,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): for param_name, param in sig.parameters.items(): if param_name in body: value = body.get(param_name) - converted_body[param_name] = self._convert_param( + converted_body[param_name] = convert_to_pydantic( param.annotation, value ) return converted_body - - def _convert_param(self, annotation: Any, value: Any) -> Any: - if isinstance(annotation, type) and annotation in {str, int, float, bool}: - return value - - origin = get_origin(annotation) - if origin is list: - item_type = get_args(annotation)[0] - try: - return [self._convert_param(item_type, item) for item in value] - except Exception: - print(f"Error converting list {value}") - return value - - elif origin is dict: - key_type, val_type = get_args(annotation) - try: - return {k: self._convert_param(val_type, v) for k, v in value.items()} - except Exception: - print(f"Error converting dict {value}") - return value - - try: - # Handle Pydantic models and discriminated unions - return TypeAdapter(annotation).validate_python(value) - except Exception as e: - cprint( - f"Warning: direct client failed to convert parameter {value} into {annotation}: {e}", - "yellow", - ) - 
return value diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index e367f3c41..126c2e193 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -451,7 +451,7 @@ class ChatAgent(ShieldRunnerMixin): payload=AgentTurnResponseStepProgressPayload( step_type=StepType.inference.value, step_id=step_id, - model_response_text_delta="", + text_delta="", tool_call_delta=delta, ) ) @@ -465,7 +465,7 @@ class ChatAgent(ShieldRunnerMixin): payload=AgentTurnResponseStepProgressPayload( step_type=StepType.inference.value, step_id=step_id, - model_response_text_delta=event.delta, + text_delta=event.delta, ) ) ) From baae4f7b5115f60f461f3a7e17290a399d8ff0b6 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Mon, 9 Dec 2024 21:22:20 -0800 Subject: [PATCH 59/69] Bump version to 0.0.59 --- requirements.txt | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index fa7b70fd9..a4859d754 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,8 +2,8 @@ blobfile fire httpx huggingface-hub -llama-models>=0.0.58 -llama-stack-client>=0.0.58 +llama-models>=0.0.59 +llama-stack-client>=0.0.59 prompt-toolkit python-dotenv pydantic>=2 diff --git a/setup.py b/setup.py index ff6770b81..dacdbb767 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ def read_requirements(): setup( name="llama_stack", - version="0.0.58", + version="0.0.59", author="Meta Llama", author_email="llama-oss@meta.com", description="Llama Stack", From 176ebddf470d1c394a5d23e2a5c56ba55087e96f Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Mon, 9 Dec 2024 22:17:25 -0800 Subject: [PATCH 60/69] Disable telemetry in library client for now --- llama_stack/distribution/library_client.py | 27 ++++++++++++---------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py index 9265bb560..29423db0b 100644 --- a/llama_stack/distribution/library_client.py +++ b/llama_stack/distribution/library_client.py @@ -24,7 +24,7 @@ from termcolor import cprint from llama_stack.distribution.build import print_pip_install_help from llama_stack.distribution.configure import parse_and_maybe_upgrade_config -from llama_stack.distribution.datatypes import Api +from llama_stack.distribution.datatypes import Api # noqa from llama_stack.distribution.resolver import ProviderRegistry from llama_stack.distribution.server.endpoints import get_all_api_endpoints from llama_stack.distribution.stack import ( @@ -32,11 +32,12 @@ from llama_stack.distribution.stack import ( get_stack_run_config_from_template, replace_env_vars, ) -from llama_stack.providers.utils.telemetry.tracing import ( - end_trace, - setup_logger, - start_trace, -) + +from llama_stack.providers.utils.telemetry.tracing import ( # noqa + end_trace, # noqa + setup_logger, # noqa + start_trace, # noqa +) # noqa T = TypeVar("T") @@ -247,8 +248,8 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): return False # Set up telemetry logger similar to server.py - if Api.telemetry in self.impls: - setup_logger(self.impls[Api.telemetry]) + # if Api.telemetry in self.impls: + # setup_logger(self.impls[Api.telemetry]) console = Console() console.print(f"Using config [blue]{self.config_path_or_template_name}[/blue]:") @@ -286,7 +287,7 @@ class 
AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): async def _call_non_streaming( self, path: str, body: dict = None, cast_to: Any = None ): - await start_trace(path, {"__location__": "library_client"}) + # await start_trace(path, {"__location__": "library_client"}) try: func = self.endpoint_impls.get(path) if not func: @@ -295,10 +296,11 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): body = self._convert_body(path, body) return convert_pydantic_to_json_value(await func(**body), cast_to) finally: - await end_trace() + pass + # await end_trace() async def _call_streaming(self, path: str, body: dict = None, cast_to: Any = None): - await start_trace(path, {"__location__": "library_client"}) + # await start_trace(path, {"__location__": "library_client"}) try: func = self.endpoint_impls.get(path) if not func: @@ -308,7 +310,8 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): async for chunk in await func(**body): yield convert_pydantic_to_json_value(chunk, cast_to) finally: - await end_trace() + pass + # await end_trace() def _convert_body(self, path: str, body: Optional[dict] = None) -> dict: if not body: From 1ad691bb04d0934597a90e56d5b63e13fee0693c Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Mon, 9 Dec 2024 22:19:51 -0800 Subject: [PATCH 61/69] Bump version to 0.0.60 --- requirements.txt | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index a4859d754..cefc0ed2b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,8 +2,8 @@ blobfile fire httpx huggingface-hub -llama-models>=0.0.59 -llama-stack-client>=0.0.59 +llama-models>=0.0.60 +llama-stack-client>=0.0.60 prompt-toolkit python-dotenv pydantic>=2 diff --git a/setup.py b/setup.py index dacdbb767..b3c71fa45 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ def read_requirements(): setup( name="llama_stack", - version="0.0.59", + version="0.0.60", author="Meta Llama", author_email="llama-oss@meta.com", description="Llama Stack", From 686f8d5b8d0ccd5aec36560fdee2249e60279cd1 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Tue, 10 Dec 2024 08:40:42 -0800 Subject: [PATCH 62/69] remove info logging in agent instance --- .../agents/meta_reference/agent_instance.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 126c2e193..f08bdb032 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -185,9 +185,9 @@ class ChatAgent(ShieldRunnerMixin): stream=request.stream, ): if isinstance(chunk, CompletionMessage): - log.info( - f"{chunk.role.capitalize()}: {chunk.content}", - ) + # log.info( + # f"{chunk.role.capitalize()}: {chunk.content}", + # ) output_message = chunk continue @@ -405,11 +405,11 @@ class ChatAgent(ShieldRunnerMixin): n_iter = 0 while True: msg = input_messages[-1] - if len(str(msg)) > 1000: - msg_str = f"{str(msg)[:500]}......{str(msg)[-500:]}" - else: - msg_str = str(msg) - log.info(f"{msg_str}") + # if len(str(msg)) > 1000: + # msg_str = f"{str(msg)[:500]}......{str(msg)[-500:]}" + # else: + # msg_str = str(msg) + # log.info(f"{msg_str}") step_id = str(uuid.uuid4()) yield AgentTurnResponseStreamChunk( @@ -514,12 +514,12 @@ class ChatAgent(ShieldRunnerMixin): ) if n_iter >= self.agent_config.max_infer_iters: - 
log.info("Done with MAX iterations, exiting.") + # log.info("Done with MAX iterations, exiting.") yield message break if stop_reason == StopReason.out_of_tokens: - log.info("Out of token budget, exiting.") + # log.info("Out of token budget, exiting.") yield message break @@ -533,10 +533,10 @@ class ChatAgent(ShieldRunnerMixin): message.content = [message.content] + attachments yield message else: - log.info(f"Partial message: {str(message)}") + # log.info(f"Partial message: {str(message)}") input_messages = input_messages + [message] else: - log.info(f"{str(message)}") + # log.info(f"{str(message)}") try: tool_call = message.tool_calls[0] @@ -800,7 +800,7 @@ async def attachment_message(tempdir: str, urls: List[URL]) -> ToolResponseMessa path = urlparse(uri).path basename = os.path.basename(path) filepath = f"{tempdir}/{make_random_string() + basename}" - log.info(f"Downloading {url} -> {filepath}") + # log.info(f"Downloading {url} -> {filepath}") async with httpx.AsyncClient() as client: r = await client.get(uri) From f969b561ea796d312714872a852098e476b2d048 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Tue, 10 Dec 2024 08:47:18 -0800 Subject: [PATCH 63/69] Revert "Disable telemetry in library client for now" This reverts commit 176ebddf470d1c394a5d23e2a5c56ba55087e96f. --- llama_stack/distribution/library_client.py | 27 ++++++++++------------ 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py index 29423db0b..9265bb560 100644 --- a/llama_stack/distribution/library_client.py +++ b/llama_stack/distribution/library_client.py @@ -24,7 +24,7 @@ from termcolor import cprint from llama_stack.distribution.build import print_pip_install_help from llama_stack.distribution.configure import parse_and_maybe_upgrade_config -from llama_stack.distribution.datatypes import Api # noqa +from llama_stack.distribution.datatypes import Api from llama_stack.distribution.resolver import ProviderRegistry from llama_stack.distribution.server.endpoints import get_all_api_endpoints from llama_stack.distribution.stack import ( @@ -32,12 +32,11 @@ from llama_stack.distribution.stack import ( get_stack_run_config_from_template, replace_env_vars, ) - -from llama_stack.providers.utils.telemetry.tracing import ( # noqa - end_trace, # noqa - setup_logger, # noqa - start_trace, # noqa -) # noqa +from llama_stack.providers.utils.telemetry.tracing import ( + end_trace, + setup_logger, + start_trace, +) T = TypeVar("T") @@ -248,8 +247,8 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): return False # Set up telemetry logger similar to server.py - # if Api.telemetry in self.impls: - # setup_logger(self.impls[Api.telemetry]) + if Api.telemetry in self.impls: + setup_logger(self.impls[Api.telemetry]) console = Console() console.print(f"Using config [blue]{self.config_path_or_template_name}[/blue]:") @@ -287,7 +286,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): async def _call_non_streaming( self, path: str, body: dict = None, cast_to: Any = None ): - # await start_trace(path, {"__location__": "library_client"}) + await start_trace(path, {"__location__": "library_client"}) try: func = self.endpoint_impls.get(path) if not func: @@ -296,11 +295,10 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): body = self._convert_body(path, body) return convert_pydantic_to_json_value(await func(**body), cast_to) finally: - pass - # await end_trace() + await end_trace() async def 
_call_streaming(self, path: str, body: dict = None, cast_to: Any = None): - # await start_trace(path, {"__location__": "library_client"}) + await start_trace(path, {"__location__": "library_client"}) try: func = self.endpoint_impls.get(path) if not func: @@ -310,8 +308,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): async for chunk in await func(**body): yield convert_pydantic_to_json_value(chunk, cast_to) finally: - pass - # await end_trace() + await end_trace() def _convert_body(self, path: str, body: Optional[dict] = None) -> dict: if not body: From 16d103842aa3e4946aec602874f16711fe101d43 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Tue, 10 Dec 2024 08:47:32 -0800 Subject: [PATCH 64/69] Revert "await end_trace in libcli" This reverts commit 7615da78b8a60c908584acfc305428d737c000e0. --- llama_stack/distribution/library_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py index 9265bb560..45382c417 100644 --- a/llama_stack/distribution/library_client.py +++ b/llama_stack/distribution/library_client.py @@ -295,7 +295,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): body = self._convert_body(path, body) return convert_pydantic_to_json_value(await func(**body), cast_to) finally: - await end_trace() + end_trace() async def _call_streaming(self, path: str, body: dict = None, cast_to: Any = None): await start_trace(path, {"__location__": "library_client"}) @@ -308,7 +308,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): async for chunk in await func(**body): yield convert_pydantic_to_json_value(chunk, cast_to) finally: - await end_trace() + end_trace() def _convert_body(self, path: str, body: Optional[dict] = None) -> dict: if not body: From 2e3d3a62a5bc3f6928d7cc0707f89877bf0967b3 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Tue, 10 Dec 2024 08:50:20 -0800 Subject: [PATCH 65/69] Revert "add tracing to library client (#591)" This reverts commit bc1fddf1df68fd845ae01f517eb8979f151e10d9. 
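For context on the in-process request path that this and the surrounding patches keep reworking, here is a minimal usage sketch of the library client from a caller's point of view. The constructor argument, the boolean `initialize()` result, and the in-process dispatch through `request()` into `_call_non_streaming` / `_call_streaming` follow the diffs in this series; the `inference.chat_completion` call and its parameters are assumptions about the standard llama-stack-client surface, not something these patches define.

```python
# Minimal sketch (assumptions flagged above): exercising the library client whose
# request path these patches modify. Endpoints resolve in-process, so client
# calls end up in _call_non_streaming / _call_streaming instead of going over HTTP.
from llama_stack.distribution.library_client import LlamaStackAsLibraryClient

# Template name or path to a run *.yaml (argument form assumed from the diffs).
client = LlamaStackAsLibraryClient("meta-reference-gpu")

if client.initialize():  # returns False if the distribution still needs to be built
    # Assumed client-side API; sync calls are bridged via asyncio.run(), streaming
    # calls via stream_across_asyncio_run_boundary().
    response = client.inference.chat_completion(
        model_id="meta-llama/Llama-3.2-3B-Instruct",
        messages=[{"role": "user", "content": "Hello"}],
    )
    print(response)
```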
--- llama_stack/distribution/library_client.py | 40 +++++-------------- .../meta_reference/sqlite_span_processor.py | 26 +++--------- 2 files changed, 17 insertions(+), 49 deletions(-) diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py index 45382c417..8766f7a72 100644 --- a/llama_stack/distribution/library_client.py +++ b/llama_stack/distribution/library_client.py @@ -24,7 +24,6 @@ from termcolor import cprint from llama_stack.distribution.build import print_pip_install_help from llama_stack.distribution.configure import parse_and_maybe_upgrade_config -from llama_stack.distribution.datatypes import Api from llama_stack.distribution.resolver import ProviderRegistry from llama_stack.distribution.server.endpoints import get_all_api_endpoints from llama_stack.distribution.stack import ( @@ -32,11 +31,6 @@ from llama_stack.distribution.stack import ( get_stack_run_config_from_template, replace_env_vars, ) -from llama_stack.providers.utils.telemetry.tracing import ( - end_trace, - setup_logger, - start_trace, -) T = TypeVar("T") @@ -246,10 +240,6 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): ) return False - # Set up telemetry logger similar to server.py - if Api.telemetry in self.impls: - setup_logger(self.impls[Api.telemetry]) - console = Console() console.print(f"Using config [blue]{self.config_path_or_template_name}[/blue]:") console.print(yaml.dump(self.config.model_dump(), indent=2)) @@ -286,29 +276,21 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): async def _call_non_streaming( self, path: str, body: dict = None, cast_to: Any = None ): - await start_trace(path, {"__location__": "library_client"}) - try: - func = self.endpoint_impls.get(path) - if not func: - raise ValueError(f"No endpoint found for {path}") + func = self.endpoint_impls.get(path) + if not func: + raise ValueError(f"No endpoint found for {path}") - body = self._convert_body(path, body) - return convert_pydantic_to_json_value(await func(**body), cast_to) - finally: - end_trace() + body = self._convert_body(path, body) + return convert_pydantic_to_json_value(await func(**body), cast_to) async def _call_streaming(self, path: str, body: dict = None, cast_to: Any = None): - await start_trace(path, {"__location__": "library_client"}) - try: - func = self.endpoint_impls.get(path) - if not func: - raise ValueError(f"No endpoint found for {path}") + func = self.endpoint_impls.get(path) + if not func: + raise ValueError(f"No endpoint found for {path}") - body = self._convert_body(path, body) - async for chunk in await func(**body): - yield convert_pydantic_to_json_value(chunk, cast_to) - finally: - end_trace() + body = self._convert_body(path, body) + async for chunk in await func(**body): + yield convert_pydantic_to_json_value(chunk, cast_to) def _convert_body(self, path: str, body: Optional[dict] = None) -> dict: if not body: diff --git a/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py b/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py index f8fdbc12f..553dd5000 100644 --- a/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py +++ b/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py @@ -20,7 +20,6 @@ class SQLiteSpanProcessor(SpanProcessor): """Initialize the SQLite span processor with a connection string.""" self.conn_string = conn_string self.ttl_days = ttl_days - self._shutdown_event = threading.Event() self.cleanup_task = None 
self._thread_local = threading.local() self._connections: Dict[int, sqlite3.Connection] = {} @@ -145,10 +144,9 @@ class SQLiteSpanProcessor(SpanProcessor): """Run cleanup periodically.""" import time - while not self._shutdown_event.is_set(): + while True: time.sleep(3600) # Sleep for 1 hour - if not self._shutdown_event.is_set(): - self._cleanup_old_data() + self._cleanup_old_data() def on_start(self, span: Span, parent_context=None): """Called when a span starts.""" @@ -233,23 +231,11 @@ class SQLiteSpanProcessor(SpanProcessor): def shutdown(self): """Cleanup any resources.""" - self._shutdown_event.set() - - # Wait for cleanup thread to finish if it exists - if self.cleanup_task and self.cleanup_task.is_alive(): - self.cleanup_task.join(timeout=5.0) - current_thread_id = threading.get_ident() - with self._lock: - # Close all connections from the current thread - for thread_id, conn in list(self._connections.items()): - if thread_id == current_thread_id: - try: - if conn: - conn.close() - del self._connections[thread_id] - except sqlite3.Error: - pass # Ignore errors during shutdown + for conn in self._connections.values(): + if conn: + conn.close() + self._connections.clear() def force_flush(self, timeout_millis=30000): """Force export of spans.""" From 885bb0900bb19238435b58f7e20584bec0729bb6 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Tue, 10 Dec 2024 09:32:18 -0800 Subject: [PATCH 66/69] memory retrival to print only the bytes injected --- llama_stack/apis/agents/event_logger.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/llama_stack/apis/agents/event_logger.py b/llama_stack/apis/agents/event_logger.py index 737ba385c..4c379999e 100644 --- a/llama_stack/apis/agents/event_logger.py +++ b/llama_stack/apis/agents/event_logger.py @@ -171,12 +171,14 @@ class EventLogger: and event_type == EventType.step_complete.value ): details = event.payload.step_details - content = interleaved_text_media_as_str(details.inserted_context) - content = content[:200] + "..." if len(content) > 200 else content + inserted_context = interleaved_text_media_as_str( + details.inserted_context + ) + content = f"fetched {len(inserted_context)} bytes from {details.memory_bank_ids}" yield event, LogEvent( role=step_type, - content=f"Retrieved context from banks: {details.memory_bank_ids}.\n====\n{content}\n>", + content=content, color="cyan", ) From fa68ded07c5a6469f113b016a335f355a94ed504 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Tue, 10 Dec 2024 09:46:37 -0800 Subject: [PATCH 67/69] Remove the unnecessary message after llama stack build --- llama_stack/cli/stack/build.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/llama_stack/cli/stack/build.py b/llama_stack/cli/stack/build.py index f19c6e798..3bd061424 100644 --- a/llama_stack/cli/stack/build.py +++ b/llama_stack/cli/stack/build.py @@ -261,7 +261,6 @@ class StackBuild(Subcommand): ) -> None: import json import os - import re import yaml from termcolor import cprint @@ -291,20 +290,8 @@ class StackBuild(Subcommand): run_config_file = build_dir / f"{build_config.name}-run.yaml" shutil.copy(template_path, run_config_file) - with open(template_path, "r") as f: - yaml_content = f.read() - # Find all ${env.VARIABLE} patterns - env_vars = set(re.findall(r"\${env\.([A-Za-z0-9_]+)}", yaml_content)) - cprint("Build Successful! Next steps: ", color="green") - cprint( - f" 1. Set the environment variables: {list(env_vars)}", - color="green", - ) - cprint( - f" 2. 
Run: `llama stack run {template_name}`", - color="green", - ) + cprint("Build Successful!", color="green") else: self._generate_run_config(build_config, build_dir) From 02b43be9d78b7a3967c0800d507434f9d04339ba Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Tue, 10 Dec 2024 10:18:44 -0800 Subject: [PATCH 68/69] Bump version to 0.0.61 --- requirements.txt | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index cefc0ed2b..ce5918fa5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,8 +2,8 @@ blobfile fire httpx huggingface-hub -llama-models>=0.0.60 -llama-stack-client>=0.0.60 +llama-models>=0.0.61 +llama-stack-client>=0.0.61 prompt-toolkit python-dotenv pydantic>=2 diff --git a/setup.py b/setup.py index b3c71fa45..cab3f7d68 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ def read_requirements(): setup( name="llama_stack", - version="0.0.60", + version="0.0.61", author="Meta Llama", author_email="llama-oss@meta.com", description="Llama Stack", From e2054d53e4aa6b1a8949bd7107e2099aeaf07978 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 10 Dec 2024 10:22:04 -0800 Subject: [PATCH 69/69] Fix issue 586 (#594) # What does this PR do? - Addresses issue (#586 ) ## Test Plan ``` python llama_stack/scripts/distro_codegen.py ``` ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. --- .../distributions/self_hosted_distro/meta-reference-gpu.md | 2 ++ .../self_hosted_distro/meta-reference-quantized-gpu.md | 2 ++ llama_stack/templates/meta-reference-gpu/doc_template.md | 2 ++ .../templates/meta-reference-quantized-gpu/doc_template.md | 2 ++ 4 files changed, 8 insertions(+) diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md index 73d6befd4..d46039318 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md @@ -60,6 +60,7 @@ LLAMA_STACK_PORT=5001 docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ~/.llama:/root/.llama \ llamastack/distribution-meta-reference-gpu \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct @@ -71,6 +72,7 @@ If you are using Llama Stack Safety / Shield APIs, use: docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ~/.llama:/root/.llama \ llamastack/distribution-meta-reference-gpu \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \ diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md index fab9c6cd8..837be744a 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md @@ -60,6 +60,7 @@ LLAMA_STACK_PORT=5001 docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ~/.llama:/root/.llama \ llamastack/distribution-meta-reference-quantized-gpu \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct @@ -71,6 +72,7 @@ 
If you are using Llama Stack Safety / Shield APIs, use: docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ~/.llama:/root/.llama \ llamastack/distribution-meta-reference-quantized-gpu \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \ diff --git a/llama_stack/templates/meta-reference-gpu/doc_template.md b/llama_stack/templates/meta-reference-gpu/doc_template.md index f9870adbd..421812dbc 100644 --- a/llama_stack/templates/meta-reference-gpu/doc_template.md +++ b/llama_stack/templates/meta-reference-gpu/doc_template.md @@ -50,6 +50,7 @@ LLAMA_STACK_PORT=5001 docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ~/.llama:/root/.llama \ llamastack/distribution-{{ name }} \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct @@ -61,6 +62,7 @@ If you are using Llama Stack Safety / Shield APIs, use: docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ~/.llama:/root/.llama \ llamastack/distribution-{{ name }} \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \ diff --git a/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md b/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md index 9e3c56d92..daa380d20 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md +++ b/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md @@ -52,6 +52,7 @@ LLAMA_STACK_PORT=5001 docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ~/.llama:/root/.llama \ llamastack/distribution-{{ name }} \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct @@ -63,6 +64,7 @@ If you are using Llama Stack Safety / Shield APIs, use: docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ~/.llama:/root/.llama \ llamastack/distribution-{{ name }} \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \