From 84d067c4e385f90e44d3d53dd506dfa49894e475 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 10 Mar 2025 23:31:05 +0000 Subject: [PATCH 001/557] Release candidate 0.1.6.1rc1 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b3ebc45dd..faf5234f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "llama_stack" -version = "0.1.6" +version = "0.1.6.1rc1" authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }] description = "Llama Stack" readme = "README.md" From e3edca77391ebc73af7fad0ea4b5d4132961c067 Mon Sep 17 00:00:00 2001 From: Botao Chen Date: Mon, 10 Mar 2025 20:38:28 -0700 Subject: [PATCH 002/557] feat: [new open benchmark] Math 500 (#1538) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What does this PR do? Created a new math_500 open-benchmark based on OpenAI's [Let's Verify Step by Step](https://arxiv.org/abs/2305.20050) paper and hugging face's [HuggingFaceH4/MATH-500](https://huggingface.co/datasets/HuggingFaceH4/MATH-500) dataset. The challenge part of this benchmark is to parse the generated and expected answer and verify if they are same. For the parsing part, we refer to [Minerva: Solving Quantitative Reasoning Problems with Language Models](https://research.google/blog/minerva-solving-quantitative-reasoning-problems-with-language-models/). To simply the parse logic, as the next step, we plan to also refer to what [simple-eval](https://github.com/openai/simple-evals) is doing, using llm as judge to check if the generated answer matches the expected answer or not ## Test Plan on sever side, spin up a server with open-benchmark template `llama stack run llama_stack/templates/open-benchamrk/run.yaml` on client side, issue an open benchmark eval request `llama-stack-client --endpoint xxx eval run-benchmark "meta-reference-math-500" --model-id "meta-llama/Llama-3.3-70B-Instruct" --output-dir "/home/markchen1015/" --num-examples 20` and get ther aggregated eval results Screenshot 2025-03-10 at 7 57 04 PM check the generated answer and the related scoring and they make sense --- .../providers/inline/scoring/basic/scoring.py | 3 +- .../fn_defs/regex_parser_math_response.py | 27 ++ .../regex_parser_math_response_scoring_fn.py | 66 ++++ .../inline/scoring/basic/utils/math_utils.py | 330 ++++++++++++++++++ .../utils/scoring/basic_scoring_utils.py | 26 ++ llama_stack/templates/open-benchmark/run.yaml | 20 +- 6 files changed, 470 insertions(+), 2 deletions(-) create mode 100644 llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py create mode 100644 llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py create mode 100644 llama_stack/providers/inline/scoring/basic/utils/math_utils.py create mode 100644 llama_stack/providers/utils/scoring/basic_scoring_utils.py diff --git a/llama_stack/providers/inline/scoring/basic/scoring.py b/llama_stack/providers/inline/scoring/basic/scoring.py index 13cd78243..00945b99d 100644 --- a/llama_stack/providers/inline/scoring/basic/scoring.py +++ b/llama_stack/providers/inline/scoring/basic/scoring.py @@ -23,10 +23,11 @@ from llama_stack.providers.utils.common.data_schema_validator import ( from .config import BasicScoringConfig from .scoring_fn.equality_scoring_fn import EqualityScoringFn +from .scoring_fn.regex_parser_math_response_scoring_fn import 
RegexParserMathResponseScoringFn from .scoring_fn.regex_parser_scoring_fn import RegexParserScoringFn from .scoring_fn.subset_of_scoring_fn import SubsetOfScoringFn -FIXED_FNS = [EqualityScoringFn, SubsetOfScoringFn, RegexParserScoringFn] +FIXED_FNS = [EqualityScoringFn, SubsetOfScoringFn, RegexParserScoringFn, RegexParserMathResponseScoringFn] class BasicScoringImpl( diff --git a/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py b/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py new file mode 100644 index 000000000..8b1bf5352 --- /dev/null +++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py @@ -0,0 +1,27 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from llama_stack.apis.common.type_system import NumberType +from llama_stack.apis.scoring_functions import ( + AggregationFunctionType, + RegexParserScoringFnParams, + ScoringFn, +) + +MATH_ANSWER_REGEXES = [r".*final answer is:?\s*\$\\boxed{(?P.*)}\$"] + + +regex_parser_math_response = ScoringFn( + identifier="basic::regex_parser_math_response", + description="For math related benchmarks, extract answer from the generated response and expected_answer and see if they match", + return_type=NumberType(), + provider_id="basic", + provider_resource_id="regex-parser-math-response", + params=RegexParserScoringFnParams( + parsing_regexes=MATH_ANSWER_REGEXES, + aggregation_functions=[AggregationFunctionType.accuracy], + ), +) diff --git a/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py b/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py new file mode 100644 index 000000000..d6c78a9ac --- /dev/null +++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py @@ -0,0 +1,66 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from typing import Any, Dict, Optional + +from llama_stack.apis.scoring import ScoringResultRow +from llama_stack.apis.scoring_functions import ScoringFnParams, ScoringFnParamsType +from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn + +from ..utils.math_utils import first_answer, normalize_final_answer, try_evaluate_frac, try_evaluate_latex +from .fn_defs.regex_parser_math_response import ( + regex_parser_math_response, +) + + +class RegexParserMathResponseScoringFn(RegisteredBaseScoringFn): + """ + A scoring_fn for math benchamrks that parses answer from generated response according to context and check match with expected_answer. + """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.supported_fn_defs_registry = { + regex_parser_math_response.identifier: regex_parser_math_response, + } + + async def score_row( + self, + input_row: Dict[str, Any], + scoring_fn_identifier: Optional[str] = None, + scoring_params: Optional[ScoringFnParams] = None, + ) -> ScoringResultRow: + assert scoring_fn_identifier is not None, "Scoring function identifier not found." 
+ fn_def = self.supported_fn_defs_registry[scoring_fn_identifier] + if scoring_params is not None: + fn_def.params = scoring_params + + assert fn_def.params is not None and fn_def.params.type == ScoringFnParamsType.regex_parser.value, ( + f"RegexParserScoringFnParams not found for {fn_def}." + ) + + expected_answer = input_row["expected_answer"] + generated_answer = input_row["generated_answer"] + + parsing_regexes = fn_def.params.parsing_regexes + assert len(parsing_regexes) == 1, ( + "Only one parsing regex is supported for regex_parser_math_response scoring function." + ) + parsing_regexes = fn_def.params.parsing_regexes[0] + + normalized_generated_answer = normalize_final_answer( + first_answer(generated_answer), + parsing_regexes, + match_first=True, + ) + normalized_generated_answer = try_evaluate_frac(try_evaluate_latex(normalized_generated_answer)) + + normalized_expected_answer = normalize_final_answer(expected_answer, r".*") + normalized_expected_answer = try_evaluate_frac(try_evaluate_latex(normalized_expected_answer)) + + score = 1.0 if normalized_generated_answer == normalized_expected_answer else 0.0 + return { + "score": score, + } diff --git a/llama_stack/providers/inline/scoring/basic/utils/math_utils.py b/llama_stack/providers/inline/scoring/basic/utils/math_utils.py new file mode 100644 index 000000000..e11fc625b --- /dev/null +++ b/llama_stack/providers/inline/scoring/basic/utils/math_utils.py @@ -0,0 +1,330 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import re +from typing import Sequence + +from llama_stack.providers.utils.scoring.basic_scoring_utils import time_limit + +# from minerva +SUBSTITUTIONS = [ + ("an ", ""), + ("a ", ""), + (".$", "$"), + ("\\$", ""), + (r"\ ", ""), + (" ", ""), + ("mbox", "text"), + (",\\text{and}", ","), + ("\\text{and}", ","), + ("\\text{m}", "\\text{}"), +] + +REMOVED_EXPRESSIONS = [ + "square", + "ways", + "integers", + "dollars", + "mph", + "inches", + "ft", + "hours", + "km", + "units", + "\\ldots", + "sue", + "points", + "feet", + "minutes", + "digits", + "cents", + "degrees", + "cm", + "gm", + "pounds", + "meters", + "meals", + "edges", + "students", + "childrentickets", + "multiples", + "\\text{s}", + "\\text{.}", + "\\text{\ns}", + "\\text{}^2", + "\\text{}^3", + "\\text{\n}", + "\\text{}", + r"\mathrm{th}", + r"^\circ", + r"^{\circ}", + r"\;", + r",\!", + "{,}", + '"', + "\\dots", +] + + +def try_evaluate_frac(expression: str, fmt: str = "0.2e") -> str: + if isinstance(expression, float): + return expression + new_expression = f"{expression}" + regex = re.compile(r"\\frac{([^}]+)}{([^}]+)}") + for match in re.finditer(regex, expression): + try: + value = float(match.group(1)) / float(match.group(2)) + new_expression = new_expression.replace( + match.group(), + f"{{value:{fmt}}}".format(value=value), + 1, + ) + except Exception: + continue + return new_expression + + +def try_evaluate_latex(expression: str, fmt: str = ".2e") -> str: + try: + with time_limit(seconds=5): + from sympy.parsing.latex import parse_latex + + value = parse_latex(expression).evalf() # type: ignore + return f"{{value:{fmt}}}".format(value=value) + except Exception: + return expression + + +def first_answer(text: str, markers: Sequence[str] = ("Q:", "A:")) -> str: + for marker in markers: + text = text.split(marker)[0] + return text + + +def extract_result_from_boxed(answer: str) -> str: + 
box_start = "\\boxed" + # format is `\\boxed $` or `\\boxed{}`, with potential white spaces framing `` + start = answer.rfind(box_start) + if start < 0: + return "" + answer = answer[start + len(box_start) :].strip() + ends_with_curly = answer.startswith("{") + i = 0 + open_braces = 0 + while i < len(answer): + if answer[i] == "{": + open_braces += 1 + elif answer[i] == "}": + open_braces -= 1 + if open_braces == 0: + if ends_with_curly: + answer = answer[: i + 1].strip() + break + elif answer[i] == "$": + answer = answer[:i].strip() + break + i += 1 + else: + return "" + # remove extra curly braces + while True: + if answer.startswith("{") and answer.endswith("}"): + answer = answer[1:-1].strip() + else: + break + return answer + + +# from minerva paper + _normalise_result from xavierm +def normalize_final_answer(final_answer: str, regex_pattern: str, match_first: bool = True) -> str: + """Extract and normalize a final answer to a quantitative reasoning question.""" + match = re.findall(regex_pattern, final_answer) + extraction: str + if len(match) > 0: + if match_first: + extraction = match[0] + else: + extraction = match[-1] + else: + extraction = extract_result_from_boxed(final_answer) + + if len(extraction) == 0: + return final_answer + else: + final_answer = extraction + final_answer = final_answer.split("=")[-1] + for before, after in SUBSTITUTIONS: + final_answer = final_answer.replace(before, after) + for expr in REMOVED_EXPRESSIONS: + final_answer = final_answer.replace(expr, "") + # Extract answer that is in LaTeX math, is bold, + # is surrounded by a box, etc. + final_answer = re.sub(r"(.*?)(\$)(.*?)(\$)(.*)", "$\\3$", final_answer) + final_answer = re.sub(r"(\\text\{)(.*?)(\})", "\\2", final_answer) + final_answer = re.sub(r"(\\textbf\{)(.*?)(\})", "\\2", final_answer) + final_answer = re.sub(r"(\\overline\{)(.*?)(\})", "\\2", final_answer) + final_answer = re.sub(r"(\\boxed\{)(.*)(\})", "\\2", final_answer) + # Normalize shorthand TeX: + # \fracab -> \frac{a}{b} + # \frac{abc}{bef} -> \frac{abc}{bef} + # \fracabc -> \frac{a}{b}c + # \sqrta -> \sqrt{a} + # \sqrtab -> sqrt{a}b + final_answer = re.sub(r"(frac)([^{])(.)", "frac{\\2}{\\3}", final_answer) + final_answer = re.sub(r"(sqrt)([^{])", "sqrt{\\2}", final_answer) + final_answer = final_answer.replace("$", "") + # Normalize 100,000 -> 100000 + if final_answer.replace(",", "").isdigit(): + final_answer = final_answer.replace(",", "") + # If the final answer is a single letter in parentheses, remove the parentheses + # Example: (a) -> a (but not (ab) -> ab) + if re.match(r"\([a-zA-Z]\)", final_answer): + final_answer = final_answer[1] + return _normalise_result(final_answer) + + +def _normalise_result(string: str) -> str: + # linebreaks + string = string.replace("\n", "") + + # remove inverse spaces + string = string.replace("\\!", "") + + # replace \\ with \ + string = string.replace("\\\\", "\\") + + # replace tfrac and dfrac with frac + string = string.replace("cfrac", "frac") + string = string.replace("tfrac", "frac") + string = string.replace("dfrac", "frac") + + # remove \left and \right + string = string.replace("\\left", "") + string = string.replace("\\le", "") + string = string.replace("\\right", "") + + # Remove circ (degrees) + string = string.replace("^{\\circ}", "") + string = string.replace("^\\circ", "") + + # remove dollar signs + string = string.replace("\\$", "") + + # remove units (on the right) + string = _remove_right_units(string) + + # remove percentage + string = string.replace("\\%", "") + string = 
string.replace(r"\%", "") + + # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string + string = string.replace(" .", " 0.") + string = string.replace("{.", "{0.") + # if empty, return empty string + if len(string) == 0: + return string + if string[0] == ".": + string = "0" + string + + # to consider: get rid of e.g. "k = " or "q = " at beginning + string = string.split("=")[-1] + + # fix sqrt3 --> sqrt{3} + string = _fix_sqrt(string) + + # remove spaces + string = string.replace(" ", "") + + # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b} + string = _fix_fracs(string) + + # manually change 0.5 --> \frac{1}{2} + if string == "0.5": + string = "\\frac{1}{2}" + + # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y + string = _fix_a_slash_b(string) + + return string + + +def _remove_right_units(string: str) -> str: + # "\\text{ " only ever occurs (at least in the val set) when describing units + try: + if "\\text{ " in string: + splits = string.split("\\text{ ") + assert len(splits) == 2 + return splits[0] + else: + return string + except AssertionError: + return string + + +def _fix_sqrt(string: str) -> str: + if "\\sqrt" not in string: + return string + splits = string.split("\\sqrt") + new_string = splits[0] + for split in splits[1:]: + if len(split) == 0: + return string + if split[0] != "{": + a = split[0] + new_substr = "\\sqrt{" + a + "}" + split[1:] + else: + new_substr = "\\sqrt" + split + new_string += new_substr + return new_string + + +def _fix_fracs(string: str) -> str: + substrs = string.split("\\frac") + new_str = substrs[0] + if len(substrs) > 1: + substrs = substrs[1:] + for substr in substrs: + new_str += "\\frac" + if len(substr) == 0: + return string + if substr[0] == "{": + new_str += substr + else: + try: + assert len(substr) >= 2 + except AssertionError: + return string + a = substr[0] + b = substr[1] + if b != "{": + if len(substr) > 2: + post_substr = substr[2:] + new_str += "{" + a + "}{" + b + "}" + post_substr + else: + new_str += "{" + a + "}{" + b + "}" + else: + if len(substr) > 2: + post_substr = substr[2:] + new_str += "{" + a + "}" + b + post_substr + else: + new_str += "{" + a + "}" + b + string = new_str + return string + + +def _fix_a_slash_b(string: str) -> str: + if len(string.split("/")) != 2: + return string + a = string.split("/")[0] + b = string.split("/")[1] + try: + ia = int(a) + ib = int(b) + assert string == "{}/{}".format(ia, ib) + new_string = "\\frac{" + str(ia) + "}{" + str(ib) + "}" + return new_string + except (ValueError, AssertionError): + return string diff --git a/llama_stack/providers/utils/scoring/basic_scoring_utils.py b/llama_stack/providers/utils/scoring/basic_scoring_utils.py new file mode 100644 index 000000000..91abfdb2e --- /dev/null +++ b/llama_stack/providers/utils/scoring/basic_scoring_utils.py @@ -0,0 +1,26 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+import contextlib +import signal +from types import FrameType +from typing import Iterator, Optional + + +class TimeoutError(Exception): + pass + + +@contextlib.contextmanager +def time_limit(seconds: float) -> Iterator[None]: + def signal_handler(signum: int, frame: Optional[FrameType]) -> None: + raise TimeoutError("Timed out!") + + signal.setitimer(signal.ITIMER_REAL, seconds) + signal.signal(signal.SIGALRM, signal_handler) + try: + yield + finally: + signal.setitimer(signal.ITIMER_REAL, 0) diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml index 47a2f2eb5..736b47746 100644 --- a/llama_stack/templates/open-benchmark/run.yaml +++ b/llama_stack/templates/open-benchmark/run.yaml @@ -33,7 +33,7 @@ providers: provider_type: remote::together config: url: https://api.together.xyz/v1 - api_key: ${env.TOGETHER_API_KEY} + api_key: ${env.TOGETHER_API_KEY:} vector_io: - provider_id: sqlite-vec provider_type: inline::sqlite-vec @@ -190,6 +190,21 @@ datasets: type: string chat_completion_input: type: string + - dataset_id: math_500 + provider_id: huggingface + url: + uri: https://huggingface.co/datasets/llamastack/math_500 + metadata: + path: llamastack/math_500 + name: + split: test + dataset_schema: + input_query: + type: string + expected_answer: + type: string + chat_completion_input: + type: string scoring_fns: [] benchmarks: - benchmark_id: meta-reference-simpleqa @@ -201,6 +216,9 @@ benchmarks: - benchmark_id: meta-reference-gpqa-cot dataset_id: gpqa_cot scoring_functions: ["basic::regex_parser_multiple_choice_answer"] + - benchmark_id: meta-reference-math-500 + dataset_id: math_500 + scoring_functions: ["basic::regex_parser_math_response"] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search From ead9397e22a8688268d2f9614e27e308fd638eee Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Tue, 11 Mar 2025 07:12:48 -0700 Subject: [PATCH 003/557] fix: tracing fixes for trace context propogation across coroutines (#1522) # What does this PR do? This PR has two fixes needed for correct trace context propagation across asycnio boundary Fix 1: Start using context vars to store the global trace context. This is needed since we cannot use the same trace context across coroutines since the state is shared. each coroutine should have its own trace context so that each of it can start storing its state correctly. 
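Below is a minimal, self-contained sketch (not taken from this patch) of the behavior Fix 1 relies on: a module-level global is clobbered across concurrently running coroutines, while a value stored in a `contextvars.ContextVar` inside a task stays local to that task. The request names and the `handle_request` helper are made up for illustration.

```python
import asyncio
import contextvars
from typing import Optional, Tuple

GLOBAL_TRACE_ID: Optional[str] = None  # module-level global: shared by every coroutine
TRACE_ID = contextvars.ContextVar("trace_id", default=None)  # isolated per task once set inside it


async def handle_request(name: str) -> Tuple[str, Optional[str], Optional[str]]:
    global GLOBAL_TRACE_ID
    GLOBAL_TRACE_ID = name  # clobbers whatever a sibling coroutine wrote
    TRACE_ID.set(name)      # only visible within this task's context copy
    await asyncio.sleep(0.01)  # yield control so the tasks interleave
    return name, GLOBAL_TRACE_ID, TRACE_ID.get()


async def main() -> None:
    results = await asyncio.gather(*(handle_request(f"trace-{i}") for i in range(3)))
    for name, global_value, ctxvar_value in results:
        # global_value is whichever task wrote last; ctxvar_value always matches name
        print(f"{name}: global={global_value} contextvar={ctxvar_value}")


asyncio.run(main())
```
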
Fix 2: Start a new span for each new coroutines started for running shields to keep the span tree clean ## Test Plan ### Integration tests with server LLAMA_STACK_DISABLE_VERSION_CHECK=true llama stack run ~/.llama/distributions/together/together-run.yaml LLAMA_STACK_CONFIG=http://localhost:8321 pytest -s --safety-shield meta-llama/Llama-Guard-3-8B --text-model meta-llama/Llama-3.1-8B-Instruct server logs: https://gist.github.com/dineshyv/51ac5d9864ed031d0d89ce77352821fe test logs: https://gist.github.com/dineshyv/e66acc1c4648a42f1854600609c467f3 ### Integration tests with library client LLAMA_STACK_CONFIG=fireworks pytest -s --safety-shield meta-llama/Llama-Guard-3-8B --text-model meta-llama/Llama-3.1-8B-Instruct logs: https://gist.github.com/dineshyv/ca160696a0b167223378673fb1dcefb8 ### Apps test with server: ``` LLAMA_STACK_DISABLE_VERSION_CHECK=true llama stack run ~/.llama/distributions/together/together-run.yaml python -m examples.agents.e2e_loop_with_client_tools localhost 8321 ``` server logs: https://gist.github.com/dineshyv/1717a572d8f7c14279c36123b79c5797 app logs: https://gist.github.com/dineshyv/44167e9f57806a0ba3b710c32aec02f8 --- .../agents/meta_reference/agent_instance.py | 10 +-- .../inline/agents/meta_reference/safety.py | 12 ++-- .../providers/utils/telemetry/tracing.py | 67 ++++++++++++------- 3 files changed, 54 insertions(+), 35 deletions(-) diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 3619b3f67..fedd695c1 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -181,7 +181,7 @@ class ChatAgent(ShieldRunnerMixin): return messages async def create_and_execute_turn(self, request: AgentTurnCreateRequest) -> AsyncGenerator: - with tracing.span("create_and_execute_turn") as span: + async with tracing.span("create_and_execute_turn") as span: span.set_attribute("session_id", request.session_id) span.set_attribute("agent_id", self.agent_id) span.set_attribute("request", request.model_dump_json()) @@ -191,7 +191,7 @@ class ChatAgent(ShieldRunnerMixin): yield chunk async def resume_turn(self, request: AgentTurnResumeRequest) -> AsyncGenerator: - with tracing.span("resume_turn") as span: + async with tracing.span("resume_turn") as span: span.set_attribute("agent_id", self.agent_id) span.set_attribute("session_id", request.session_id) span.set_attribute("turn_id", request.turn_id) @@ -390,7 +390,7 @@ class ChatAgent(ShieldRunnerMixin): shields: List[str], touchpoint: str, ) -> AsyncGenerator: - with tracing.span("run_shields") as span: + async with tracing.span("run_shields") as span: span.set_attribute("input", [m.model_dump_json() for m in messages]) if len(shields) == 0: span.set_attribute("output", "no shields") @@ -508,7 +508,7 @@ class ChatAgent(ShieldRunnerMixin): content = "" stop_reason = None - with tracing.span("inference") as span: + async with tracing.span("inference") as span: async for chunk in await self.inference_api.chat_completion( self.agent_config.model, input_messages, @@ -685,7 +685,7 @@ class ChatAgent(ShieldRunnerMixin): tool_name = tool_call.tool_name if isinstance(tool_name, BuiltinTool): tool_name = tool_name.value - with tracing.span( + async with tracing.span( "tool_execution", { "tool_name": tool_name, diff --git a/llama_stack/providers/inline/agents/meta_reference/safety.py b/llama_stack/providers/inline/agents/meta_reference/safety.py 
index 2497be070..bef16eaba 100644 --- a/llama_stack/providers/inline/agents/meta_reference/safety.py +++ b/llama_stack/providers/inline/agents/meta_reference/safety.py @@ -10,6 +10,7 @@ from typing import List from llama_stack.apis.inference import Message from llama_stack.apis.safety import Safety, SafetyViolation, ViolationLevel +from llama_stack.providers.utils.telemetry import tracing log = logging.getLogger(__name__) @@ -32,15 +33,14 @@ class ShieldRunnerMixin: self.output_shields = output_shields async def run_multiple_shields(self, messages: List[Message], identifiers: List[str]) -> None: - responses = await asyncio.gather( - *[ - self.safety_api.run_shield( + async def run_shield_with_span(identifier: str): + async with tracing.span(f"run_shield_{identifier}"): + return await self.safety_api.run_shield( shield_id=identifier, messages=messages, ) - for identifier in identifiers - ] - ) + + responses = await asyncio.gather(*[run_shield_with_span(identifier) for identifier in identifiers]) for identifier, response in zip(identifiers, responses, strict=False): if not response.violation: continue diff --git a/llama_stack/providers/utils/telemetry/tracing.py b/llama_stack/providers/utils/telemetry/tracing.py index d84024941..bef229080 100644 --- a/llama_stack/providers/utils/telemetry/tracing.py +++ b/llama_stack/providers/utils/telemetry/tracing.py @@ -6,6 +6,7 @@ import asyncio import base64 +import contextvars import logging import queue import threading @@ -24,9 +25,10 @@ from llama_stack.apis.telemetry import ( Telemetry, UnstructuredLogEvent, ) +from llama_stack.log import get_logger from llama_stack.providers.utils.telemetry.trace_protocol import serialize_value -log = logging.getLogger(__name__) +logger = get_logger(__name__, category="core") def generate_short_uuid(len: int = 8): @@ -36,7 +38,7 @@ def generate_short_uuid(len: int = 8): return encoded.rstrip(b"=").decode("ascii")[:len] -CURRENT_TRACE_CONTEXT = None +CURRENT_TRACE_CONTEXT = contextvars.ContextVar("trace_context", default=None) BACKGROUND_LOGGER = None @@ -51,7 +53,7 @@ class BackgroundLogger: try: self.log_queue.put_nowait(event) except queue.Full: - log.error("Log queue is full, dropping event") + logger.error("Log queue is full, dropping event") def _process_logs(self): while True: @@ -129,35 +131,36 @@ def setup_logger(api: Telemetry, level: int = logging.INFO): if BACKGROUND_LOGGER is None: BACKGROUND_LOGGER = BackgroundLogger(api) - logger = logging.getLogger() - logger.setLevel(level) - logger.addHandler(TelemetryHandler()) + root_logger = logging.getLogger() + root_logger.setLevel(level) + root_logger.addHandler(TelemetryHandler()) async def start_trace(name: str, attributes: Dict[str, Any] = None) -> TraceContext: global CURRENT_TRACE_CONTEXT, BACKGROUND_LOGGER if BACKGROUND_LOGGER is None: - log.info("No Telemetry implementation set. Skipping trace initialization...") + logger.debug("No Telemetry implementation set. 
Skipping trace initialization...") return trace_id = generate_short_uuid(16) context = TraceContext(BACKGROUND_LOGGER, trace_id) context.push_span(name, {"__root__": True, **(attributes or {})}) - CURRENT_TRACE_CONTEXT = context + CURRENT_TRACE_CONTEXT.set(context) return context async def end_trace(status: SpanStatus = SpanStatus.OK): global CURRENT_TRACE_CONTEXT - context = CURRENT_TRACE_CONTEXT + context = CURRENT_TRACE_CONTEXT.get() if context is None: + logger.debug("No trace context to end") return context.pop_span(status) - CURRENT_TRACE_CONTEXT = None + CURRENT_TRACE_CONTEXT.set(None) def severity(levelname: str) -> LogSeverity: @@ -188,7 +191,7 @@ class TelemetryHandler(logging.Handler): if BACKGROUND_LOGGER is None: raise RuntimeError("Telemetry API not initialized") - context = CURRENT_TRACE_CONTEXT + context = CURRENT_TRACE_CONTEXT.get() if context is None: return @@ -218,16 +221,22 @@ class SpanContextManager: def __enter__(self): global CURRENT_TRACE_CONTEXT - context = CURRENT_TRACE_CONTEXT - if context: - self.span = context.push_span(self.name, self.attributes) + context = CURRENT_TRACE_CONTEXT.get() + if not context: + logger.debug("No trace context to push span") + return self + + self.span = context.push_span(self.name, self.attributes) return self def __exit__(self, exc_type, exc_value, traceback): global CURRENT_TRACE_CONTEXT - context = CURRENT_TRACE_CONTEXT - if context: - context.pop_span() + context = CURRENT_TRACE_CONTEXT.get() + if not context: + logger.debug("No trace context to pop span") + return + + context.pop_span() def set_attribute(self, key: str, value: Any): if self.span: @@ -237,16 +246,22 @@ class SpanContextManager: async def __aenter__(self): global CURRENT_TRACE_CONTEXT - context = CURRENT_TRACE_CONTEXT - if context: - self.span = context.push_span(self.name, self.attributes) + context = CURRENT_TRACE_CONTEXT.get() + if not context: + logger.debug("No trace context to push span") + return self + + self.span = context.push_span(self.name, self.attributes) return self async def __aexit__(self, exc_type, exc_value, traceback): global CURRENT_TRACE_CONTEXT - context = CURRENT_TRACE_CONTEXT - if context: - context.pop_span() + context = CURRENT_TRACE_CONTEXT.get() + if not context: + logger.debug("No trace context to pop span") + return + + context.pop_span() def __call__(self, func: Callable): @wraps(func) @@ -275,7 +290,11 @@ def span(name: str, attributes: Dict[str, Any] = None): def get_current_span() -> Optional[Span]: global CURRENT_TRACE_CONTEXT - context = CURRENT_TRACE_CONTEXT + if CURRENT_TRACE_CONTEXT is None: + logger.debug("No trace context to get current span") + return None + + context = CURRENT_TRACE_CONTEXT.get() if context: return context.get_current_span() return None From e13c92f269cc1cc404f39a20334217ba9e7e19d7 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Tue, 11 Mar 2025 09:58:25 -0700 Subject: [PATCH 004/557] revert: feat(server): Use system packages for execution (#1551) Reverts meta-llama/llama-stack#1252 The above PR breaks the following invocation: ```bash llama stack run ~/.llama/distributions/together/together-run.yaml ``` --- llama_stack/cli/stack/run.py | 52 +++++++---------------- llama_stack/distribution/server/server.py | 28 +++--------- 2 files changed, 21 insertions(+), 59 deletions(-) diff --git a/llama_stack/cli/stack/run.py b/llama_stack/cli/stack/run.py index 1e4f3c5d9..e5686fb10 100644 --- a/llama_stack/cli/stack/run.py +++ b/llama_stack/cli/stack/run.py @@ -56,6 +56,7 @@ class StackRun(Subcommand): 
"--env", action="append", help="Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times.", + default=[], metavar="KEY=VALUE", ) self.parser.add_argument( @@ -73,6 +74,7 @@ class StackRun(Subcommand): type=str, help="Image Type used during the build. This can be either conda or container or venv.", choices=["conda", "container", "venv"], + default="conda", ) def _run_stack_run_cmd(self, args: argparse.Namespace) -> None: @@ -118,42 +120,20 @@ class StackRun(Subcommand): except AttributeError as e: self.parser.error(f"failed to parse config file '{config_file}':\n {e}") - # If neither image type nor image name is provided, assume the server should be run directly - # using the current environment packages. - if not args.image_type and not args.image_name: - logger.info("No image type or image name provided. Assuming environment packages.") - from llama_stack.distribution.server.server import main as server_main + run_args = formulate_run_args(args.image_type, args.image_name, config, template_name) - # Build the server args from the current args passed to the CLI - server_args = argparse.Namespace() - for arg in vars(args): - # If this is a function, avoid passing it - # "args" contains: - # func=> - if callable(getattr(args, arg)): - continue - setattr(server_args, arg, getattr(args, arg)) + run_args.extend([str(config_file), str(args.port)]) + if args.disable_ipv6: + run_args.append("--disable-ipv6") - # Run the server - server_main(server_args) - else: - run_args = formulate_run_args(args.image_type, args.image_name, config, template_name) + for env_var in args.env: + if "=" not in env_var: + self.parser.error(f"Environment variable '{env_var}' must be in KEY=VALUE format") + key, value = env_var.split("=", 1) # split on first = only + if not key: + self.parser.error(f"Environment variable '{env_var}' has empty key") + run_args.extend(["--env", f"{key}={value}"]) - run_args.extend([str(config_file), str(args.port)]) - if args.disable_ipv6: - run_args.append("--disable-ipv6") - - if args.env: - for env_var in args.env: - if "=" not in env_var: - self.parser.error(f"Environment variable '{env_var}' must be in KEY=VALUE format") - return - key, value = env_var.split("=", 1) # split on first = only - if not key: - self.parser.error(f"Environment variable '{env_var}' has empty key") - return - run_args.extend(["--env", f"{key}={value}"]) - - if args.tls_keyfile and args.tls_certfile: - run_args.extend(["--tls-keyfile", args.tls_keyfile, "--tls-certfile", args.tls_certfile]) - run_with_pty(run_args) + if args.tls_keyfile and args.tls_certfile: + run_args.extend(["--tls-keyfile", args.tls_keyfile, "--tls-certfile", args.tls_certfile]) + run_with_pty(run_args) diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index 6b99d908d..f819d446f 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -17,7 +17,7 @@ import warnings from contextlib import asynccontextmanager from importlib.metadata import version as parse_version from pathlib import Path -from typing import Any, List, Optional, Union +from typing import Any, List, Union import yaml from fastapi import Body, FastAPI, HTTPException, Request @@ -314,17 +314,11 @@ class ClientVersionMiddleware: return await self.app(scope, receive, send) -def main(args: Optional[argparse.Namespace] = None): +def main(): """Start the LlamaStack server.""" parser = argparse.ArgumentParser(description="Start the LlamaStack 
server.") parser.add_argument( "--yaml-config", - dest="config", - help="(Deprecated) Path to YAML configuration file - use --config instead", - ) - parser.add_argument( - "--config", - dest="config", help="Path to YAML configuration file", ) parser.add_argument( @@ -354,19 +348,7 @@ def main(args: Optional[argparse.Namespace] = None): required="--tls-keyfile" in sys.argv, ) - # Determine whether the server args are being passed by the "run" command, if this is the case - # the args will be passed as a Namespace object to the main function, otherwise they will be - # parsed from the command line - if args is None: - args = parser.parse_args() - - # Check for deprecated argument usage - if "--yaml-config" in sys.argv: - warnings.warn( - "The '--yaml-config' argument is deprecated and will be removed in a future version. Use '--config' instead.", - DeprecationWarning, - stacklevel=2, - ) + args = parser.parse_args() if args.env: for env_pair in args.env: @@ -378,9 +360,9 @@ def main(args: Optional[argparse.Namespace] = None): logger.error(f"Error: {str(e)}") sys.exit(1) - if args.config: + if args.yaml_config: # if the user provided a config file, use it, even if template was specified - config_file = Path(args.config) + config_file = Path(args.yaml_config) if not config_file.exists(): raise ValueError(f"Config file {config_file} does not exist") logger.info(f"Using config file: {config_file}") From 0e73186a114a253a24a7638c1b6b9ad6e54b6e59 Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Tue, 11 Mar 2025 13:01:09 -0400 Subject: [PATCH 005/557] fix: Add missing shutdown handler for TorchtunePostTrainingImpl (#1535) # What does this PR do? Added missing shutdown handler. (Currently empty.) Without it, when server shuts down, it posts the following warning: ``` __main__:129 server: No shutdown method for TorchtunePostTrainingImpl ``` Signed-off-by: Ihar Hrachyshka [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan (The test plan assumes shutdown logic is fixed, see #1495) Without the patch: ``` INFO: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit) INFO: Shutting down INFO: Waiting for application shutdown. 
INFO 2025-03-10 20:56:43,961 __main__:140 server: Shutting down INFO 2025-03-10 20:56:43,962 __main__:124 server: Shutting down DatasetsRoutingTable INFO 2025-03-10 20:56:43,964 __main__:124 server: Shutting down DatasetIORouter INFO 2025-03-10 20:56:43,965 __main__:124 server: Shutting down ScoringFunctionsRoutingTable INFO 2025-03-10 20:56:43,966 __main__:124 server: Shutting down ScoringRouter INFO 2025-03-10 20:56:43,967 __main__:124 server: Shutting down ModelsRoutingTable INFO 2025-03-10 20:56:43,968 __main__:124 server: Shutting down InferenceRouter INFO 2025-03-10 20:56:43,969 __main__:124 server: Shutting down ShieldsRoutingTable INFO 2025-03-10 20:56:43,971 __main__:124 server: Shutting down SafetyRouter INFO 2025-03-10 20:56:43,972 __main__:124 server: Shutting down VectorDBsRoutingTable INFO 2025-03-10 20:56:43,973 __main__:124 server: Shutting down VectorIORouter INFO 2025-03-10 20:56:43,974 __main__:124 server: Shutting down ToolGroupsRoutingTable INFO 2025-03-10 20:56:43,975 __main__:124 server: Shutting down ToolRuntimeRouter INFO 2025-03-10 20:56:43,976 __main__:124 server: Shutting down MetaReferenceAgentsImpl INFO 2025-03-10 20:56:43,977 __main__:124 server: Shutting down TelemetryAdapter INFO 2025-03-10 20:56:43,978 __main__:124 server: Shutting down TorchtunePostTrainingImpl WARNING 2025-03-10 20:56:43,979 __main__:129 server: No shutdown method for TorchtunePostTrainingImpl INFO 2025-03-10 20:56:43,979 __main__:124 server: Shutting down BenchmarksRoutingTable INFO 2025-03-10 20:56:43,980 __main__:124 server: Shutting down EvalRouter INFO 2025-03-10 20:56:43,981 __main__:124 server: Shutting down DistributionInspectImpl INFO: Application shutdown complete. INFO: Finished server process [33862] ``` Run with the patch and observe no warning: ``` $ kill -INT $(ps ax | grep llama_stack.distribution.server.server | grep -v nvim | awk -e '{print $1}' | sort | head -n 1) ``` ``` INFO: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit) INFO: Shutting down INFO: Waiting for application shutdown. 
INFO 2025-03-11 00:32:56,863 __main__:140 server: Shutting down INFO 2025-03-11 00:32:56,864 __main__:124 server: Shutting down DatasetsRoutingTable INFO 2025-03-11 00:32:56,866 __main__:124 server: Shutting down DatasetIORouter INFO 2025-03-11 00:32:56,867 __main__:124 server: Shutting down ScoringFunctionsRoutingTable INFO 2025-03-11 00:32:56,868 __main__:124 server: Shutting down ScoringRouter INFO 2025-03-11 00:32:56,869 __main__:124 server: Shutting down ModelsRoutingTable INFO 2025-03-11 00:32:56,870 __main__:124 server: Shutting down InferenceRouter INFO 2025-03-11 00:32:56,871 __main__:124 server: Shutting down ShieldsRoutingTable INFO 2025-03-11 00:32:56,872 __main__:124 server: Shutting down SafetyRouter INFO 2025-03-11 00:32:56,873 __main__:124 server: Shutting down VectorDBsRoutingTable INFO 2025-03-11 00:32:56,874 __main__:124 server: Shutting down VectorIORouter INFO 2025-03-11 00:32:56,875 __main__:124 server: Shutting down ToolGroupsRoutingTable INFO 2025-03-11 00:32:56,876 __main__:124 server: Shutting down ToolRuntimeRouter INFO 2025-03-11 00:32:56,877 __main__:124 server: Shutting down MetaReferenceAgentsImpl INFO 2025-03-11 00:32:56,878 __main__:124 server: Shutting down TelemetryAdapter INFO 2025-03-11 00:32:56,879 __main__:124 server: Shutting down TorchtunePostTrainingImpl INFO 2025-03-11 00:32:56,880 __main__:124 server: Shutting down BenchmarksRoutingTable INFO 2025-03-11 00:32:56,881 __main__:124 server: Shutting down EvalRouter INFO 2025-03-11 00:32:56,882 __main__:124 server: Shutting down DistributionInspectImpl ``` [//]: # (## Documentation) Signed-off-by: Ihar Hrachyshka --- .../providers/inline/post_training/torchtune/post_training.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llama_stack/providers/inline/post_training/torchtune/post_training.py b/llama_stack/providers/inline/post_training/torchtune/post_training.py index b837362d7..3a1affc91 100644 --- a/llama_stack/providers/inline/post_training/torchtune/post_training.py +++ b/llama_stack/providers/inline/post_training/torchtune/post_training.py @@ -43,6 +43,9 @@ class TorchtunePostTrainingImpl: self.jobs = {} self.checkpoints_dict = {} + async def shutdown(self): + pass + async def supervised_fine_tune( self, job_uuid: str, From 04106b94aab2bf550a39047370bf75e724b4114f Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Tue, 11 Mar 2025 13:01:46 -0400 Subject: [PATCH 006/557] docs: Remove duplicate docs on api docs generator (#1534) # What does this PR do? Since #892, we also need to install ruamel. Instead of maintaining the list of script dependencies in multiple places, remove it and assume developers read CONTRIBUTING.md docs. [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan Just docs. [//]: # (## Documentation) Signed-off-by: Ihar Hrachyshka --- CONTRIBUTING.md | 3 +-- docs/openapi_generator/README.md | 8 -------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e639328f0..7c0b5d94e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -159,8 +159,7 @@ uv run sphinx-autobuild source build/html --write-all If you modify or add new API endpoints, update the API documentation accordingly. You can do this by running the following command: ```bash -uv sync --extra dev -uv run ./docs/openapi_generator/run_openapi_generator.sh +uv run --with ".[dev]" ./docs/openapi_generator/run_openapi_generator.sh ``` The generated API documentation will be available in `docs/_static/`. 
Make sure to review the changes before committing. diff --git a/docs/openapi_generator/README.md b/docs/openapi_generator/README.md index 298df3ce0..7888e7828 100644 --- a/docs/openapi_generator/README.md +++ b/docs/openapi_generator/README.md @@ -1,9 +1 @@ The RFC Specification (OpenAPI format) is generated from the set of API endpoints located in `llama_stack/distribution/server/endpoints.py` using the `generate.py` utility. - -Please install the following packages before running the script: - -``` -pip install fire PyYAML -``` - -Then simply run `sh run_openapi_generator.sh` From c3d7d17bc4c4d815537a8ca7a5530139dd93c664 Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Tue, 11 Mar 2025 13:07:28 -0400 Subject: [PATCH 007/557] chore: fix typing hints for get_provider_impl deps arguments (#1544) # What does this PR do? It's a dict that may contain different types, as per resolver:instantiate_provider implementation. (AFAIU it also never contains ProviderSpecs, but *instances* of provider implementations.) [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan mypy passing if enabled checks for these modules. (See #1543) [//]: # (## Documentation) Signed-off-by: Ihar Hrachyshka --- .../providers/inline/agents/meta_reference/__init__.py | 6 +++--- llama_stack/providers/inline/datasetio/localfs/__init__.py | 4 +++- .../providers/inline/eval/meta_reference/__init__.py | 6 +++--- .../providers/inline/inference/meta_reference/__init__.py | 4 ++-- .../inline/inference/sentence_transformers/__init__.py | 4 +++- llama_stack/providers/inline/inference/vllm/__init__.py | 4 ++-- .../providers/inline/post_training/torchtune/__init__.py | 6 +++--- .../providers/inline/safety/code_scanner/__init__.py | 4 +++- llama_stack/providers/inline/safety/llama_guard/__init__.py | 4 +++- .../providers/inline/safety/prompt_guard/__init__.py | 4 +++- llama_stack/providers/inline/scoring/basic/__init__.py | 6 +++--- llama_stack/providers/inline/scoring/braintrust/__init__.py | 6 +++--- .../providers/inline/scoring/llm_as_judge/__init__.py | 6 +++--- .../inline/tool_runtime/code_interpreter/__init__.py | 4 +++- llama_stack/providers/inline/vector_io/chroma/__init__.py | 6 +++--- llama_stack/providers/inline/vector_io/faiss/__init__.py | 6 +++--- llama_stack/providers/inline/vector_io/milvus/__init__.py | 6 +++--- .../providers/inline/vector_io/sqlite_vec/__init__.py | 6 +++--- 18 files changed, 52 insertions(+), 40 deletions(-) diff --git a/llama_stack/providers/inline/agents/meta_reference/__init__.py b/llama_stack/providers/inline/agents/meta_reference/__init__.py index 8f8c24170..4be064f1d 100644 --- a/llama_stack/providers/inline/agents/meta_reference/__init__.py +++ b/llama_stack/providers/inline/agents/meta_reference/__init__.py @@ -4,14 +4,14 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from typing import Dict +from typing import Any, Dict -from llama_stack.distribution.datatypes import Api, ProviderSpec +from llama_stack.distribution.datatypes import Api from .config import MetaReferenceAgentsImplConfig -async def get_provider_impl(config: MetaReferenceAgentsImplConfig, deps: Dict[Api, ProviderSpec]): +async def get_provider_impl(config: MetaReferenceAgentsImplConfig, deps: Dict[Api, Any]): from .agents import MetaReferenceAgentsImpl impl = MetaReferenceAgentsImpl( diff --git a/llama_stack/providers/inline/datasetio/localfs/__init__.py b/llama_stack/providers/inline/datasetio/localfs/__init__.py index db8aa555c..5a0876d79 100644 --- a/llama_stack/providers/inline/datasetio/localfs/__init__.py +++ b/llama_stack/providers/inline/datasetio/localfs/__init__.py @@ -4,12 +4,14 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any, Dict + from .config import LocalFSDatasetIOConfig async def get_provider_impl( config: LocalFSDatasetIOConfig, - _deps, + _deps: Dict[str, Any], ): from .datasetio import LocalFSDatasetIOImpl diff --git a/llama_stack/providers/inline/eval/meta_reference/__init__.py b/llama_stack/providers/inline/eval/meta_reference/__init__.py index 56c115322..e2a7fc2cd 100644 --- a/llama_stack/providers/inline/eval/meta_reference/__init__.py +++ b/llama_stack/providers/inline/eval/meta_reference/__init__.py @@ -3,16 +3,16 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Dict +from typing import Any, Dict -from llama_stack.distribution.datatypes import Api, ProviderSpec +from llama_stack.distribution.datatypes import Api from .config import MetaReferenceEvalConfig async def get_provider_impl( config: MetaReferenceEvalConfig, - deps: Dict[Api, ProviderSpec], + deps: Dict[Api, Any], ): from .eval import MetaReferenceEvalImpl diff --git a/llama_stack/providers/inline/inference/meta_reference/__init__.py b/llama_stack/providers/inline/inference/meta_reference/__init__.py index 9c923490d..3ef7cfd45 100644 --- a/llama_stack/providers/inline/inference/meta_reference/__init__.py +++ b/llama_stack/providers/inline/inference/meta_reference/__init__.py @@ -4,14 +4,14 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Union +from typing import Any, Dict, Union from .config import MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig async def get_provider_impl( config: Union[MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig], - _deps, + _deps: Dict[str, Any], ): from .inference import MetaReferenceInferenceImpl diff --git a/llama_stack/providers/inline/inference/sentence_transformers/__init__.py b/llama_stack/providers/inline/inference/sentence_transformers/__init__.py index d5710f7fd..c1d65d10c 100644 --- a/llama_stack/providers/inline/inference/sentence_transformers/__init__.py +++ b/llama_stack/providers/inline/inference/sentence_transformers/__init__.py @@ -4,6 +4,8 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+from typing import Any, Dict + from llama_stack.providers.inline.inference.sentence_transformers.config import ( SentenceTransformersInferenceConfig, ) @@ -11,7 +13,7 @@ from llama_stack.providers.inline.inference.sentence_transformers.config import async def get_provider_impl( config: SentenceTransformersInferenceConfig, - _deps, + _deps: Dict[str, Any], ): from .sentence_transformers import SentenceTransformersInferenceImpl diff --git a/llama_stack/providers/inline/inference/vllm/__init__.py b/llama_stack/providers/inline/inference/vllm/__init__.py index aa0c4b101..bd0551e57 100644 --- a/llama_stack/providers/inline/inference/vllm/__init__.py +++ b/llama_stack/providers/inline/inference/vllm/__init__.py @@ -4,12 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Any +from typing import Any, Dict from .config import VLLMConfig -async def get_provider_impl(config: VLLMConfig, _deps) -> Any: +async def get_provider_impl(config: VLLMConfig, _deps: Dict[str, Any]): from .vllm import VLLMInferenceImpl impl = VLLMInferenceImpl(config) diff --git a/llama_stack/providers/inline/post_training/torchtune/__init__.py b/llama_stack/providers/inline/post_training/torchtune/__init__.py index 7ef8eee01..ca7801be7 100644 --- a/llama_stack/providers/inline/post_training/torchtune/__init__.py +++ b/llama_stack/providers/inline/post_training/torchtune/__init__.py @@ -4,9 +4,9 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Dict +from typing import Any, Dict -from llama_stack.distribution.datatypes import Api, ProviderSpec +from llama_stack.distribution.datatypes import Api from .config import TorchtunePostTrainingConfig @@ -15,7 +15,7 @@ from .config import TorchtunePostTrainingConfig async def get_provider_impl( config: TorchtunePostTrainingConfig, - deps: Dict[Api, ProviderSpec], + deps: Dict[Api, Any], ): from .post_training import TorchtunePostTrainingImpl diff --git a/llama_stack/providers/inline/safety/code_scanner/__init__.py b/llama_stack/providers/inline/safety/code_scanner/__init__.py index 031130cb7..62975a963 100644 --- a/llama_stack/providers/inline/safety/code_scanner/__init__.py +++ b/llama_stack/providers/inline/safety/code_scanner/__init__.py @@ -4,10 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any, Dict + from .config import CodeScannerConfig -async def get_provider_impl(config: CodeScannerConfig, deps): +async def get_provider_impl(config: CodeScannerConfig, deps: Dict[str, Any]): from .code_scanner import MetaReferenceCodeScannerSafetyImpl impl = MetaReferenceCodeScannerSafetyImpl(config, deps) diff --git a/llama_stack/providers/inline/safety/llama_guard/__init__.py b/llama_stack/providers/inline/safety/llama_guard/__init__.py index ee9ee31e6..a4263b169 100644 --- a/llama_stack/providers/inline/safety/llama_guard/__init__.py +++ b/llama_stack/providers/inline/safety/llama_guard/__init__.py @@ -4,10 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+from typing import Any, Dict + from .config import LlamaGuardConfig -async def get_provider_impl(config: LlamaGuardConfig, deps): +async def get_provider_impl(config: LlamaGuardConfig, deps: Dict[str, Any]): from .llama_guard import LlamaGuardSafetyImpl assert isinstance(config, LlamaGuardConfig), f"Unexpected config type: {type(config)}" diff --git a/llama_stack/providers/inline/safety/prompt_guard/__init__.py b/llama_stack/providers/inline/safety/prompt_guard/__init__.py index 087aca6d9..747f34421 100644 --- a/llama_stack/providers/inline/safety/prompt_guard/__init__.py +++ b/llama_stack/providers/inline/safety/prompt_guard/__init__.py @@ -4,10 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any, Dict + from .config import PromptGuardConfig # noqa: F401 -async def get_provider_impl(config: PromptGuardConfig, deps): +async def get_provider_impl(config: PromptGuardConfig, deps: Dict[str, Any]): from .prompt_guard import PromptGuardSafetyImpl impl = PromptGuardSafetyImpl(config, deps) diff --git a/llama_stack/providers/inline/scoring/basic/__init__.py b/llama_stack/providers/inline/scoring/basic/__init__.py index c72434e9e..4898b973a 100644 --- a/llama_stack/providers/inline/scoring/basic/__init__.py +++ b/llama_stack/providers/inline/scoring/basic/__init__.py @@ -3,16 +3,16 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Dict +from typing import Any, Dict -from llama_stack.distribution.datatypes import Api, ProviderSpec +from llama_stack.distribution.datatypes import Api from .config import BasicScoringConfig async def get_provider_impl( config: BasicScoringConfig, - deps: Dict[Api, ProviderSpec], + deps: Dict[Api, Any], ): from .scoring import BasicScoringImpl diff --git a/llama_stack/providers/inline/scoring/braintrust/__init__.py b/llama_stack/providers/inline/scoring/braintrust/__init__.py index 2ddc58bd2..f1b0112d9 100644 --- a/llama_stack/providers/inline/scoring/braintrust/__init__.py +++ b/llama_stack/providers/inline/scoring/braintrust/__init__.py @@ -3,11 +3,11 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Dict +from typing import Any, Dict from pydantic import BaseModel -from llama_stack.distribution.datatypes import Api, ProviderSpec +from llama_stack.distribution.datatypes import Api from .config import BraintrustScoringConfig @@ -18,7 +18,7 @@ class BraintrustProviderDataValidator(BaseModel): async def get_provider_impl( config: BraintrustScoringConfig, - deps: Dict[Api, ProviderSpec], + deps: Dict[Api, Any], ): from .braintrust import BraintrustScoringImpl diff --git a/llama_stack/providers/inline/scoring/llm_as_judge/__init__.py b/llama_stack/providers/inline/scoring/llm_as_judge/__init__.py index 18535332e..4a83bfe13 100644 --- a/llama_stack/providers/inline/scoring/llm_as_judge/__init__.py +++ b/llama_stack/providers/inline/scoring/llm_as_judge/__init__.py @@ -3,16 +3,16 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from typing import Dict +from typing import Any, Dict -from llama_stack.distribution.datatypes import Api, ProviderSpec +from llama_stack.distribution.datatypes import Api from .config import LlmAsJudgeScoringConfig async def get_provider_impl( config: LlmAsJudgeScoringConfig, - deps: Dict[Api, ProviderSpec], + deps: Dict[Api, Any], ): from .scoring import LlmAsJudgeScoringImpl diff --git a/llama_stack/providers/inline/tool_runtime/code_interpreter/__init__.py b/llama_stack/providers/inline/tool_runtime/code_interpreter/__init__.py index 995358d46..8317ce793 100644 --- a/llama_stack/providers/inline/tool_runtime/code_interpreter/__init__.py +++ b/llama_stack/providers/inline/tool_runtime/code_interpreter/__init__.py @@ -4,12 +4,14 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any, Dict + from .config import CodeInterpreterToolConfig __all__ = ["CodeInterpreterToolConfig", "CodeInterpreterToolRuntimeImpl"] -async def get_provider_impl(config: CodeInterpreterToolConfig, _deps): +async def get_provider_impl(config: CodeInterpreterToolConfig, _deps: Dict[str, Any]): from .code_interpreter import CodeInterpreterToolRuntimeImpl impl = CodeInterpreterToolRuntimeImpl(config) diff --git a/llama_stack/providers/inline/vector_io/chroma/__init__.py b/llama_stack/providers/inline/vector_io/chroma/__init__.py index abaf01097..f39188b46 100644 --- a/llama_stack/providers/inline/vector_io/chroma/__init__.py +++ b/llama_stack/providers/inline/vector_io/chroma/__init__.py @@ -4,14 +4,14 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Dict +from typing import Any, Dict -from llama_stack.providers.datatypes import Api, ProviderSpec +from llama_stack.providers.datatypes import Api from .config import ChromaVectorIOConfig -async def get_provider_impl(config: ChromaVectorIOConfig, deps: Dict[Api, ProviderSpec]): +async def get_provider_impl(config: ChromaVectorIOConfig, deps: Dict[Api, Any]): from llama_stack.providers.remote.vector_io.chroma.chroma import ( ChromaVectorIOAdapter, ) diff --git a/llama_stack/providers/inline/vector_io/faiss/__init__.py b/llama_stack/providers/inline/vector_io/faiss/__init__.py index f23e1fa4f..fc8ce70b4 100644 --- a/llama_stack/providers/inline/vector_io/faiss/__init__.py +++ b/llama_stack/providers/inline/vector_io/faiss/__init__.py @@ -4,14 +4,14 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from typing import Dict +from typing import Any, Dict -from llama_stack.providers.datatypes import Api, ProviderSpec +from llama_stack.providers.datatypes import Api from .config import FaissVectorIOConfig -async def get_provider_impl(config: FaissVectorIOConfig, deps: Dict[Api, ProviderSpec]): +async def get_provider_impl(config: FaissVectorIOConfig, deps: Dict[Api, Any]): from .faiss import FaissVectorIOAdapter assert isinstance(config, FaissVectorIOConfig), f"Unexpected config type: {type(config)}" diff --git a/llama_stack/providers/inline/vector_io/milvus/__init__.py b/llama_stack/providers/inline/vector_io/milvus/__init__.py index bee6b2ded..d88a3b005 100644 --- a/llama_stack/providers/inline/vector_io/milvus/__init__.py +++ b/llama_stack/providers/inline/vector_io/milvus/__init__.py @@ -4,14 +4,14 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Dict +from typing import Any, Dict -from llama_stack.providers.datatypes import Api, ProviderSpec +from llama_stack.providers.datatypes import Api from .config import MilvusVectorIOConfig -async def get_provider_impl(config: MilvusVectorIOConfig, deps: Dict[Api, ProviderSpec]): +async def get_provider_impl(config: MilvusVectorIOConfig, deps: Dict[Api, Any]): from llama_stack.providers.remote.vector_io.milvus.milvus import MilvusVectorIOAdapter impl = MilvusVectorIOAdapter(config, deps[Api.inference]) diff --git a/llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py b/llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py index 5a2f07012..2380eb0ef 100644 --- a/llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py +++ b/llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py @@ -4,14 +4,14 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Dict +from typing import Any, Dict -from llama_stack.providers.datatypes import Api, ProviderSpec +from llama_stack.providers.datatypes import Api from .config import SQLiteVectorIOConfig -async def get_provider_impl(config: SQLiteVectorIOConfig, deps: Dict[Api, ProviderSpec]): +async def get_provider_impl(config: SQLiteVectorIOConfig, deps: Dict[Api, Any]): from .sqlite_vec import SQLiteVecVectorIOAdapter assert isinstance(config, SQLiteVectorIOConfig), f"Unexpected config type: {type(config)}" From d33b8ea3dc652fdb1c6a9c94e42c5e2dfe36eb7f Mon Sep 17 00:00:00 2001 From: Kelly Brown <86735520+kelbrown20@users.noreply.github.com> Date: Tue, 11 Mar 2025 13:12:18 -0400 Subject: [PATCH 008/557] docs: Small nits in llama CLI reference (#1542) **Description:** Fixes some small nits in the llama CLI reference Note: There are a few nits in this PR, but also has some small suggestions, feel free to close if not necessary --- .../references/llama_cli_reference/index.md | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/docs/source/references/llama_cli_reference/index.md b/docs/source/references/llama_cli_reference/index.md index 8a38fc3ae..7b7abdf88 100644 --- a/docs/source/references/llama_cli_reference/index.md +++ b/docs/source/references/llama_cli_reference/index.md @@ -1,6 +1,6 @@ # llama (server-side) CLI Reference -The `llama` CLI tool helps you setup and use the Llama Stack. It should be available on your path after installing the `llama-stack` package. +The `llama` CLI tool helps you set up and use the Llama Stack. 
The CLI is available on your path after installing the `llama-stack` package. ## Installation @@ -27,9 +27,9 @@ You have two ways to install Llama Stack: ## `llama` subcommands -1. `download`: `llama` cli tools supports downloading the model from Meta or Hugging Face. -2. `model`: Lists available models and their properties. -3. `stack`: Allows you to build and run a Llama Stack server. You can read more about this [here](../../distributions/building_distro). +1. `download`: Supports downloading models from Meta or Hugging Face. [Downloading models](#downloading-models) +2. `model`: Lists available models and their properties. [Understanding models](#understand-the-models) +3. `stack`: Allows you to build a stack using the `llama stack` distribution and run a Llama Stack server. You can read more about how to build a Llama Stack distribution in the [Build your own Distribution](../../distributions/building_distro) documentation. ### Sample Usage @@ -117,7 +117,7 @@ You should see a table like this: +----------------------------------+------------------------------------------+----------------+ ``` -To download models, you can use the llama download command. +To download models, you can use the `llama download` command. ### Downloading from [Meta](https://llama.meta.com/llama-downloads/) @@ -191,7 +191,7 @@ You should see a table like this: The `llama model` command helps you explore the model’s interface. 1. `download`: Download the model from different sources. (meta, huggingface) -2. `list`: Lists all the models available for download with hardware requirements to deploy the models. +2. `list`: Lists all the models available for download with hardware requirements for deploying the models. 3. `prompt-format`: Show llama model message formats. 4. `describe`: Describes all the properties of the model. @@ -262,13 +262,12 @@ llama model prompt-format -m Llama3.2-3B-Instruct ![alt text](../../../resources/prompt-format.png) - You will be shown a Markdown formatted description of the model interface and how prompts / messages are formatted for various scenarios. **NOTE**: Outputs in terminal are color printed to show special tokens. ### Remove model -You can run `llama model remove` to remove unecessary model: +You can run `llama model remove` to remove an unnecessary model: ``` llama model remove -m Llama-Guard-3-8B-int8 From aca82df7edfbbedfafd0f0db354ee4161e959fed Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Tue, 11 Mar 2025 13:30:55 -0400 Subject: [PATCH 009/557] fix: Multiple fixes for server shutdown (fix lifespan handling; fix handling CancelledError when raised by provider; let uvicorn handle signals) (#1495) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? If implementation raises CancelledError (e.g. when it runs its own async loop for jobs), the main server shutdown handler gets confused and doesn't attempt to shut down the main loop tasks. While at it, also fixing the following failure when this happens: ``` UnboundLocalError: cannot access local variable 'loop' where it is not associated with a value ``` Shutdown handlers were not running because lifespan logic was broken since ~Oct 2024. Fixed that too and enforcing `lifespan` now (making sure server will crash when it fails to interact with app through middleware). 
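
For reference, the pattern this change moves to can be condensed into a small sketch: FastAPI's lifespan context manager owns startup and shutdown, and uvicorn is left to handle signals. The sketch below is illustrative only, not the server code itself; the `app.state.impls` mapping and the bare `print` logging are stand-ins for the real implementation registry and logger.

```python
# Hedged sketch of lifespan-driven shutdown, assuming a fake `app.state.impls`
# mapping of name -> implementation objects exposing an async `shutdown()`.
import asyncio
from contextlib import asynccontextmanager

from fastapi import FastAPI


async def shutdown_impls(app: FastAPI) -> None:
    for name, impl in getattr(app.state, "impls", {}).items():
        try:
            if hasattr(impl, "shutdown"):
                # Bound each provider shutdown so one hang cannot stall the rest.
                await asyncio.wait_for(impl.shutdown(), timeout=5)
            else:
                print(f"No shutdown method for {name}")
        except (Exception, asyncio.CancelledError) as exc:
            # Catch CancelledError explicitly so a provider that cancels itself
            # does not abort shutdown of the remaining implementations.
            print(f"Failed to shut down {name}: {exc!r}")


@asynccontextmanager
async def lifespan(app: FastAPI):
    yield  # startup work, if any, goes before the yield
    await shutdown_impls(app)  # runs when uvicorn processes SIGINT/SIGTERM


app = FastAPI(lifespan=lifespan)
app.state.impls = {}  # populated with real providers in the actual server
# uvicorn.run(app, lifespan="on")  # force lifespan so wiring errors surface
```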
[//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan Spotted while working on https://github.com/meta-llama/llama-stack/pull/1437 One way to trigger it without the PR above is to add `raise CancelledError` in any of the running providers' `shutdown` methods; then `kill -INT ` the server process. Validated this with the following test patch: ``` diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index b85c463a..10dad83e 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -174,6 +174,7 @@ def handle_signal(app, signum, _) -> None: except asyncio.CancelledError: pass finally: + logger.info("Stopping event loop") loop.stop() loop = asyncio.get_running_loop() diff --git a/llama_stack/providers/inline/post_training/torchtune/post_training.py b/llama_stack/providers/inline/post_training/torchtune/post_training.py index b837362d..163f43d8 100644 --- a/llama_stack/providers/inline/post_training/torchtune/post_training.py +++ b/llama_stack/providers/inline/post_training/torchtune/post_training.py @@ -3,6 +3,7 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +import asyncio from datetime import datetime from typing import Any, Dict, Optional @@ -43,6 +44,9 @@ class TorchtunePostTrainingImpl: self.jobs = {} self.checkpoints_dict = {} + async def shutdown(self) -> None: + raise asyncio.CancelledError("Shutdown") + async def supervised_fine_tune( self, job_uuid: str, ``` Without the fix: ``` INFO: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit) INFO: Shutting down INFO: Finished server process [52099] INFO 2025-03-07 23:25:33,548 __main__:143 server: Received signal SIGINT (2). Exiting gracefully... INFO 2025-03-07 23:25:33,550 __main__:150 server: Shutting down DatasetsRoutingTable INFO 2025-03-07 23:25:33,551 __main__:177 server: Stopping event loop ERROR 2025-03-07 23:25:33,552 asyncio:1785 uncategorized: unhandled exception during asyncio.run() shutdown task: .shutdown() done, defined at /home/ec2-user/src/llama-stack/schedule/llama_stack/distribution/server/server.py:145> exception=UnboundLocalError("cannot access local variable 'loop' where it is not associated with a value")> ╭───────────────────────────────────── Traceback (most recent call last) ─────────────────────────────────────╮ │ /home/ec2-user/src/llama-stack/schedule/llama_stack/distribution/server/server.py:178 in shutdown │ │ │ │ 175 │ │ │ pass │ │ 176 │ │ finally: │ │ 177 │ │ │ logger.info("Stopping event loop") │ │ ❱ 178 │ │ │ loop.stop() │ │ 179 │ │ │ 180 │ loop = asyncio.get_running_loop() │ │ 181 │ loop.create_task(shutdown()) │ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ UnboundLocalError: cannot access local variable 'loop' where it is not associated with a value ``` With the fix, now seeing the following messages when the server is killed: ``` INFO: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit) INFO: Shutting down INFO: Finished server process [50836] INFO 2025-03-07 23:20:35,182 __main__:143 server: Received signal SIGINT (2). Exiting gracefully... 
INFO 2025-03-07 23:20:35,184 __main__:149 server: Shutting down DatasetsRoutingTable ERROR 2025-03-07 23:20:35,185 __main__:158 server: Failed to shutdown DatasetsRoutingTable: {CancelledError()} ╭───────────────────────────────────── Traceback (most recent call last) ─────────────────────────────────────╮ │ /usr/lib64/python3.11/asyncio/tasks.py:476 in wait_for │ │ │ │ 473 │ try: │ │ 474 │ │ # wait until the future completes or the timeout │ │ 475 │ │ try: │ │ ❱ 476 │ │ │ await waiter │ │ 477 │ │ except exceptions.CancelledError: │ │ 478 │ │ │ if fut.done(): │ │ 479 │ │ │ │ return fut.result() │ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ CancelledError During handling of the above exception, another exception occurred: ╭───────────────────────────────────── Traceback (most recent call last) ─────────────────────────────────────╮ │ /home/ec2-user/src/llama-stack/schedule/llama_stack/distribution/server/server.py:152 in shutdown │ │ │ │ 149 │ │ │ logger.info("Shutting down %s", impl_name) │ │ 150 │ │ │ try: │ │ 151 │ │ │ │ if hasattr(impl, "shutdown"): │ │ ❱ 152 │ │ │ │ │ await asyncio.wait_for(impl.shutdown(), timeout=5) │ │ 153 │ │ │ │ else: │ │ 154 │ │ │ │ │ logger.warning("No shutdown method for %s", impl_name) │ │ 155 │ │ │ except asyncio.TimeoutError: │ │ │ │ /usr/lib64/python3.11/asyncio/tasks.py:479 in wait_for │ │ │ │ 476 │ │ │ await waiter │ │ 477 │ │ except exceptions.CancelledError: │ │ 478 │ │ │ if fut.done(): │ │ ❱ 479 │ │ │ │ return fut.result() │ │ 480 │ │ │ else: │ │ 481 │ │ │ │ fut.remove_done_callback(cb) │ │ 482 │ │ │ │ # We must ensure that the task is not running │ │ │ │ /home/ec2-user/src/llama-stack/schedule/llama_stack/distribution/routers/routing_tables.py:131 in shutdown │ │ │ │ 128 │ │ │ elif api == Api.tool_runtime: │ │ 129 │ │ │ │ p.tool_store = self │ │ 130 │ │ │ ❱ 131 │ async def shutdown(self) -> None: │ │ 132 │ │ for p in self.impls_by_provider_id.values(): │ │ 133 │ │ │ await p.shutdown() │ │ 134 │ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ CancelledError INFO 2025-03-07 23:20:35,295 __main__:149 server: Shutting down DatasetIORouter INFO 2025-03-07 23:20:35,296 __main__:149 server: Shutting down ScoringFunctionsRoutingTable INFO 2025-03-07 23:20:35,297 __main__:149 server: Shutting down ScoringRouter INFO 2025-03-07 23:20:35,298 __main__:149 server: Shutting down ModelsRoutingTable INFO 2025-03-07 23:20:35,299 __main__:149 server: Shutting down InferenceRouter INFO 2025-03-07 23:20:35,300 __main__:149 server: Shutting down ShieldsRoutingTable INFO 2025-03-07 23:20:35,300 __main__:149 server: Shutting down SafetyRouter INFO 2025-03-07 23:20:35,301 __main__:149 server: Shutting down VectorDBsRoutingTable INFO 2025-03-07 23:20:35,302 __main__:149 server: Shutting down VectorIORouter INFO 2025-03-07 23:20:35,303 __main__:149 server: Shutting down ToolGroupsRoutingTable INFO 2025-03-07 23:20:35,304 __main__:149 server: Shutting down ToolRuntimeRouter INFO 2025-03-07 23:20:35,304 __main__:149 server: Shutting down MetaReferenceAgentsImpl INFO 2025-03-07 23:20:35,305 __main__:149 server: Shutting down TelemetryAdapter INFO 2025-03-07 23:20:35,306 __main__:149 server: Shutting down TorchtunePostTrainingImpl ERROR 2025-03-07 23:20:35,307 __main__:158 server: Failed to shutdown TorchtunePostTrainingImpl: {CancelledError('Shutdown')} ╭───────────────────────────────────── Traceback (most recent call last) 
─────────────────────────────────────╮ │ /home/ec2-user/src/llama-stack/schedule/llama_stack/distribution/server/server.py:152 in shutdown │ │ │ │ 149 │ │ │ logger.info("Shutting down %s", impl_name) │ │ 150 │ │ │ try: │ │ 151 │ │ │ │ if hasattr(impl, "shutdown"): │ │ ❱ 152 │ │ │ │ │ await asyncio.wait_for(impl.shutdown(), timeout=5) │ │ 153 │ │ │ │ else: │ │ 154 │ │ │ │ │ logger.warning("No shutdown method for %s", impl_name) │ │ 155 │ │ │ except asyncio.TimeoutError: │ │ │ │ /usr/lib64/python3.11/asyncio/tasks.py:489 in wait_for │ │ │ │ 486 │ │ │ │ raise │ │ 487 │ │ │ │ 488 │ │ if fut.done(): │ │ ❱ 489 │ │ │ return fut.result() │ │ 490 │ │ else: │ │ 491 │ │ │ fut.remove_done_callback(cb) │ │ 492 │ │ │ # We must ensure that the task is not running │ │ │ │ /home/ec2-user/src/llama-stack/schedule/llama_stack/providers/inline/post_training/torchtune/post_training. │ │ py:48 in shutdown │ │ │ │ 45 │ │ self.checkpoints_dict = {} │ │ 46 │ │ │ 47 │ async def shutdown(self) -> None: │ │ ❱ 48 │ │ raise asyncio.CancelledError("Shutdown") │ │ 49 │ │ │ 50 │ async def supervised_fine_tune( │ │ 51 │ │ self, │ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ CancelledError: Shutdown INFO 2025-03-07 23:20:35,352 __main__:149 server: Shutting down BenchmarksRoutingTable INFO 2025-03-07 23:20:35,353 __main__:149 server: Shutting down EvalRouter INFO 2025-03-07 23:20:35,354 __main__:149 server: Shutting down DistributionInspectImpl INFO 2025-03-07 23:20:35,355 __main__:177 server: Stopping event loop Traceback (most recent call last): File "", line 198, in _run_module_as_main File "", line 88, in _run_code File "/home/ec2-user/src/llama-stack/schedule/llama_stack/distribution/server/server.py", line 488, in main() File "/home/ec2-user/src/llama-stack/schedule/llama_stack/distribution/server/server.py", line 476, in main uvicorn.run(**uvicorn_config) File "/home/ec2-user/src/llama-stack/schedule/venv/lib64/python3.11/site-packages/uvicorn/main.py", line 579, in run server.run() File "/home/ec2-user/src/llama-stack/schedule/venv/lib64/python3.11/site-packages/uvicorn/server.py", line 66, in run return asyncio.run(self.serve(sockets=sockets)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib64/python3.11/asyncio/runners.py", line 189, in run with Runner(debug=debug) as runner: File "/usr/lib64/python3.11/asyncio/runners.py", line 63, in __exit__ self.close() File "/usr/lib64/python3.11/asyncio/runners.py", line 71, in close _cancel_all_tasks(loop) File "/usr/lib64/python3.11/asyncio/runners.py", line 201, in _cancel_all_tasks loop.run_until_complete(tasks.gather(*to_cancel, return_exceptions=True)) File "/usr/lib64/python3.11/asyncio/base_events.py", line 652, in run_until_complete raise RuntimeError('Event loop stopped before Future completed.') RuntimeError: Event loop stopped before Future completed. ++ error_handler 104 ++ echo 'Error occurred in script at line: 104' Error occurred in script at line: 104 ++ exit 1 ``` With all patches included, the shutdown now looks as follows: ``` $ kill -INT $(ps ax | grep llama_stack.distribution.server.server | grep -v nvim | awk -e '{print $1}' | sort | head -n 1) ``` ``` 20:56:09.308 [START] INFO: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit) INFO: Shutting down INFO: Waiting for application shutdown. 
INFO 2025-03-10 20:56:43,961 __main__:140 server: Shutting down INFO 2025-03-10 20:56:43,962 __main__:124 server: Shutting down DatasetsRoutingTable INFO 2025-03-10 20:56:43,964 __main__:124 server: Shutting down DatasetIORouter INFO 2025-03-10 20:56:43,965 __main__:124 server: Shutting down ScoringFunctionsRoutingTable INFO 2025-03-10 20:56:43,966 __main__:124 server: Shutting down ScoringRouter INFO 2025-03-10 20:56:43,967 __main__:124 server: Shutting down ModelsRoutingTable INFO 2025-03-10 20:56:43,968 __main__:124 server: Shutting down InferenceRouter INFO 2025-03-10 20:56:43,969 __main__:124 server: Shutting down ShieldsRoutingTable INFO 2025-03-10 20:56:43,971 __main__:124 server: Shutting down SafetyRouter INFO 2025-03-10 20:56:43,972 __main__:124 server: Shutting down VectorDBsRoutingTable INFO 2025-03-10 20:56:43,973 __main__:124 server: Shutting down VectorIORouter INFO 2025-03-10 20:56:43,974 __main__:124 server: Shutting down ToolGroupsRoutingTable INFO 2025-03-10 20:56:43,975 __main__:124 server: Shutting down ToolRuntimeRouter INFO 2025-03-10 20:56:43,976 __main__:124 server: Shutting down MetaReferenceAgentsImpl INFO 2025-03-10 20:56:43,977 __main__:124 server: Shutting down TelemetryAdapter INFO 2025-03-10 20:56:43,978 __main__:124 server: Shutting down TorchtunePostTrainingImpl WARNING 2025-03-10 20:56:43,979 __main__:129 server: No shutdown method for TorchtunePostTrainingImpl INFO 2025-03-10 20:56:43,979 __main__:124 server: Shutting down BenchmarksRoutingTable INFO 2025-03-10 20:56:43,980 __main__:124 server: Shutting down EvalRouter INFO 2025-03-10 20:56:43,981 __main__:124 server: Shutting down DistributionInspectImpl INFO: Application shutdown complete. INFO: Finished server process [33862] ``` [//]: # (## Documentation) --------- Signed-off-by: Ihar Hrachyshka --- llama_stack/distribution/server/server.py | 87 +++++------------------ 1 file changed, 19 insertions(+), 68 deletions(-) diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index f819d446f..ea8723365 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -6,11 +6,9 @@ import argparse import asyncio -import functools import inspect import json import os -import signal import sys import traceback import warnings @@ -118,69 +116,24 @@ def translate_exception(exc: Exception) -> Union[HTTPException, RequestValidatio ) -def handle_signal(app, signum, _) -> None: +async def shutdown(app): + """Initiate a graceful shutdown of the application. + + Handled by the lifespan context manager. The shutdown process involves + shutting down all implementations registered in the application. """ - Handle incoming signals and initiate a graceful shutdown of the application. - - This function is intended to be used as a signal handler for various signals - (e.g., SIGINT, SIGTERM). Upon receiving a signal, it will print a message - indicating the received signal and initiate a shutdown process. - - Args: - app: The application instance containing implementations to be shut down. - signum (int): The signal number received. - frame: The current stack frame (not used in this function). - - The shutdown process involves: - - Shutting down all implementations registered in the application. - - Gathering all running asyncio tasks. - - Cancelling all gathered tasks. - - Waiting for all tasks to finish. - - Stopping the event loop. 
- - Note: - This function schedules the shutdown process as an asyncio task and does - not block the current execution. - """ - signame = signal.Signals(signum).name - logger.info(f"Received signal {signame} ({signum}). Exiting gracefully...") - - async def shutdown(): + for impl in app.__llama_stack_impls__.values(): + impl_name = impl.__class__.__name__ + logger.info("Shutting down %s", impl_name) try: - # Gracefully shut down implementations - for impl in app.__llama_stack_impls__.values(): - impl_name = impl.__class__.__name__ - logger.info("Shutting down %s", impl_name) - try: - if hasattr(impl, "shutdown"): - await asyncio.wait_for(impl.shutdown(), timeout=5) - else: - logger.warning("No shutdown method for %s", impl_name) - except asyncio.TimeoutError: - logger.exception("Shutdown timeout for %s ", impl_name, exc_info=True) - except Exception as e: - logger.exception("Failed to shutdown %s: %s", impl_name, {e}) - - # Gather all running tasks - loop = asyncio.get_running_loop() - tasks = [task for task in asyncio.all_tasks(loop) if task is not asyncio.current_task()] - - # Cancel all tasks - for task in tasks: - task.cancel() - - # Wait for all tasks to finish - try: - await asyncio.wait_for(asyncio.gather(*tasks, return_exceptions=True), timeout=10) - except asyncio.TimeoutError: - logger.exception("Timeout while waiting for tasks to finish") - except asyncio.CancelledError: - pass - finally: - loop.stop() - - loop = asyncio.get_running_loop() - loop.create_task(shutdown()) + if hasattr(impl, "shutdown"): + await asyncio.wait_for(impl.shutdown(), timeout=5) + else: + logger.warning("No shutdown method for %s", impl_name) + except asyncio.TimeoutError: + logger.exception("Shutdown timeout for %s ", impl_name, exc_info=True) + except (Exception, asyncio.CancelledError) as e: + logger.exception("Failed to shutdown %s: %s", impl_name, {e}) @asynccontextmanager @@ -188,8 +141,7 @@ async def lifespan(app: FastAPI): logger.info("Starting up") yield logger.info("Shutting down") - for impl in app.__llama_stack_impls__.values(): - await impl.shutdown() + await shutdown(app) def is_streaming_request(func_name: str, request: Request, **kwargs): @@ -266,7 +218,7 @@ class TracingMiddleware: self.app = app async def __call__(self, scope, receive, send): - path = scope["path"] + path = scope.get("path", "") await start_trace(path, {"__location__": "server"}) try: return await self.app(scope, receive, send) @@ -439,8 +391,6 @@ def main(): app.exception_handler(RequestValidationError)(global_exception_handler) app.exception_handler(Exception)(global_exception_handler) - signal.signal(signal.SIGINT, functools.partial(handle_signal, app)) - signal.signal(signal.SIGTERM, functools.partial(handle_signal, app)) app.__llama_stack_impls__ = impls @@ -471,6 +421,7 @@ def main(): "app": app, "host": listen_host, "port": port, + "lifespan": "on", } if ssl_config: uvicorn_config.update(ssl_config) From 83a2c78615a3b4a2ad96852023b0292a401a0463 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Tue, 11 Mar 2025 18:33:46 +0100 Subject: [PATCH 010/557] feat(api): list agents / sessions and get agent (#1410) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Add support for listing agents, describing an agent, and retrieving session IDs for a given agent. This is only the API definition, the implementations will come separately. 
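
To make the new surface concrete, here is a hedged sketch of how a client could exercise the read-only routes this PR defines once an implementation lands; the base URL, port, and use of raw `httpx` are assumptions for illustration, and the official client library may expose higher-level wrappers instead.

```python
# Illustrative httpx calls against the routes added in this PR. Paths and
# response shapes come from the spec changes below; host/port are assumed.
import httpx

with httpx.Client(base_url="http://localhost:8321") as client:
    # GET /v1/agents -> ListAgentsResponse {"data": [Agent, ...]}
    agents = client.get("/v1/agents").json()["data"]

    for agent in agents:
        agent_id = agent["agent_id"]

        # GET /v1/agents/{agent_id} -> Agent (agent_id, agent_config, created_at)
        detail = client.get(f"/v1/agents/{agent_id}").json()
        print(agent_id, detail["created_at"])

        # GET /v1/agents/{agent_id}/sessions -> ListAgentSessionsResponse
        sessions = client.get(f"/v1/agents/{agent_id}/sessions").json()["data"]
        print(f"{agent_id}: {len(sessions)} session(s)")
```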
Closes: https://github.com/meta-llama/llama-stack/issues/1294 Signed-off-by: Sébastien Han --- docs/_static/llama-stack-spec.html | 169 +++++++++++++++++++++++++++++ docs/_static/llama-stack-spec.yaml | 118 ++++++++++++++++++++ llama_stack/apis/agents/agents.py | 46 ++++++++ 3 files changed, 333 insertions(+) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 1a8169090..b0febbbef 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -363,6 +363,37 @@ } }, "/v1/agents": { + "get": { + "responses": { + "200": { + "description": "A ListAgentsResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListAgentsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Agents" + ], + "description": "List all agents.", + "parameters": [] + }, "post": { "responses": { "200": { @@ -609,6 +640,47 @@ } }, "/v1/agents/{agent_id}": { + "get": { + "responses": { + "200": { + "description": "An Agent of the agent.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Agent" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Agents" + ], + "description": "Describe an agent by its ID.", + "parameters": [ + { + "name": "agent_id", + "in": "path", + "description": "ID of the agent.", + "required": true, + "schema": { + "type": "string" + } + } + ] + }, "delete": { "responses": { "200": { @@ -2276,6 +2348,49 @@ ] } }, + "/v1/agents/{agent_id}/sessions": { + "get": { + "responses": { + "200": { + "description": "A ListAgentSessionsResponse.", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListAgentSessionsResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Agents" + ], + "description": "List all session(s) of a given agent.", + "parameters": [ + { + "name": "agent_id", + "in": "path", + "description": "The ID of the agent to list sessions for.", + "required": true, + "schema": { + "type": "string" + } + } + ] + } + }, "/v1/eval/benchmarks": { "get": { "responses": { @@ -6565,6 +6680,28 @@ "title": "ScoringResult", "description": "A scoring result for a single row." 
}, + "Agent": { + "type": "object", + "properties": { + "agent_id": { + "type": "string" + }, + "agent_config": { + "$ref": "#/components/schemas/AgentConfig" + }, + "created_at": { + "type": "string", + "format": "date-time" + } + }, + "additionalProperties": false, + "required": [ + "agent_id", + "agent_config", + "created_at" + ], + "title": "Agent" + }, "Session": { "type": "object", "properties": { @@ -7907,6 +8044,38 @@ ], "title": "ToolInvocationResult" }, + "ListAgentSessionsResponse": { + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Session" + } + } + }, + "additionalProperties": false, + "required": [ + "data" + ], + "title": "ListAgentSessionsResponse" + }, + "ListAgentsResponse": { + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Agent" + } + } + }, + "additionalProperties": false, + "required": [ + "data" + ], + "title": "ListAgentsResponse" + }, "BucketResponse": { "type": "object", "properties": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index d6001c00d..2985e6222 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -238,6 +238,28 @@ paths: $ref: '#/components/schemas/CompletionRequest' required: true /v1/agents: + get: + responses: + '200': + description: A ListAgentsResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/ListAgentsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Agents + description: List all agents. + parameters: [] post: responses: '200': @@ -410,6 +432,34 @@ paths: $ref: '#/components/schemas/CreateUploadSessionRequest' required: true /v1/agents/{agent_id}: + get: + responses: + '200': + description: An Agent of the agent. + content: + application/json: + schema: + $ref: '#/components/schemas/Agent' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Agents + description: Describe an agent by its ID. + parameters: + - name: agent_id + in: path + description: ID of the agent. + required: true + schema: + type: string delete: responses: '200': @@ -1528,6 +1578,36 @@ paths: required: true schema: type: string + /v1/agents/{agent_id}/sessions: + get: + responses: + '200': + description: A ListAgentSessionsResponse. + content: + application/json: + schema: + $ref: '#/components/schemas/ListAgentSessionsResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Agents + description: List all session(s) of a given agent. + parameters: + - name: agent_id + in: path + description: >- + The ID of the agent to list sessions for. + required: true + schema: + type: string /v1/eval/benchmarks: get: responses: @@ -4549,6 +4629,22 @@ components: - aggregated_results title: ScoringResult description: A scoring result for a single row. 
+ Agent: + type: object + properties: + agent_id: + type: string + agent_config: + $ref: '#/components/schemas/AgentConfig' + created_at: + type: string + format: date-time + additionalProperties: false + required: + - agent_id + - agent_config + - created_at + title: Agent Session: type: object properties: @@ -5385,6 +5481,28 @@ components: required: - content title: ToolInvocationResult + ListAgentSessionsResponse: + type: object + properties: + data: + type: array + items: + $ref: '#/components/schemas/Session' + additionalProperties: false + required: + - data + title: ListAgentSessionsResponse + ListAgentsResponse: + type: object + properties: + data: + type: array + items: + $ref: '#/components/schemas/Agent' + additionalProperties: false + required: + - data + title: ListAgentsResponse BucketResponse: type: object properties: diff --git a/llama_stack/apis/agents/agents.py b/llama_stack/apis/agents/agents.py index af4b0ba77..1170a56d5 100644 --- a/llama_stack/apis/agents/agents.py +++ b/llama_stack/apis/agents/agents.py @@ -234,6 +234,23 @@ class AgentConfig(AgentConfigCommon): response_format: Optional[ResponseFormat] = None +@json_schema_type +class Agent(BaseModel): + agent_id: str + agent_config: AgentConfig + created_at: datetime + + +@json_schema_type +class ListAgentsResponse(BaseModel): + data: List[Agent] + + +@json_schema_type +class ListAgentSessionsResponse(BaseModel): + data: List[Session] + + class AgentConfigOverridablePerTurn(AgentConfigCommon): instructions: Optional[str] = None @@ -541,3 +558,32 @@ class Agents(Protocol): :param agent_id: The ID of the agent to delete. """ ... + + @webmethod(route="/agents", method="GET") + async def list_agents(self) -> ListAgentsResponse: + """List all agents. + + :returns: A ListAgentsResponse. + """ + ... + + @webmethod(route="/agents/{agent_id}", method="GET") + async def get_agent(self, agent_id: str) -> Agent: + """Describe an agent by its ID. + + :param agent_id: ID of the agent. + :returns: An Agent of the agent. + """ + ... + + @webmethod(route="/agents/{agent_id}/sessions", method="GET") + async def list_agent_sessions( + self, + agent_id: str, + ) -> ListAgentSessionsResponse: + """List all session(s) of a given agent. + + :param agent_id: The ID of the agent to list sessions for. + :returns: A ListAgentSessionsResponse. + """ + ... From b647ecd9ed9ecf433a6ce972a06e7a339fbf7ca6 Mon Sep 17 00:00:00 2001 From: Charlie Doern Date: Tue, 11 Mar 2025 14:09:31 -0400 Subject: [PATCH 011/557] feat: add support for LLAMA_STACK_LOG_FILE (#1450) # What does this PR do? setting $LLAMA_STACK_LOG_FILE will pipe the logs to a file as well as stdout. this is done by using a logging FileHandler Signed-off-by: Charlie Doern --- docs/source/distributions/building_distro.md | 2 + llama_stack/log.py | 47 +++++++++++++------- 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/docs/source/distributions/building_distro.md b/docs/source/distributions/building_distro.md index 942596b59..37a7e7974 100644 --- a/docs/source/distributions/building_distro.md +++ b/docs/source/distributions/building_distro.md @@ -33,6 +33,8 @@ Can be set to any of the following log levels: The default global log level is `info`. `all` sets the log level for all components. +A user can also set `LLAMA_STACK_LOG_FILE` which will pipe the logs to the specified path as well as to the terminal. 
An example would be: `export LLAMA_STACK_LOG_FILE=server.log` + ### Llama Stack Build In order to build your own distribution, we recommend you clone the `llama-stack` repository. diff --git a/llama_stack/log.py b/llama_stack/log.py index 9b9f5c5d8..80ee9fa1b 100644 --- a/llama_stack/log.py +++ b/llama_stack/log.py @@ -97,12 +97,13 @@ class CustomRichHandler(RichHandler): self.markup = original_markup -def setup_logging(category_levels: Dict[str, int]) -> None: +def setup_logging(category_levels: Dict[str, int], log_file: str | None) -> None: """ - Configure logging based on the provided category log levels. + Configure logging based on the provided category log levels and an optional log file. Parameters: category_levels (Dict[str, int]): A dictionary mapping categories to their log levels. + log_file (str): Path to a log file to additionally pipe the logs into """ log_format = "[dim]%(asctime)s %(name)s:%(lineno)d[/] [yellow dim]%(category)s[/]: %(message)s" @@ -117,6 +118,28 @@ def setup_logging(category_levels: Dict[str, int]) -> None: # Determine the root logger's level (default to WARNING if not specified) root_level = category_levels.get("root", logging.WARNING) + handlers = { + "console": { + "()": CustomRichHandler, # Use custom console handler + "formatter": "rich", + "rich_tracebacks": True, + "show_time": False, + "show_path": False, + "markup": True, + "filters": ["category_filter"], + } + } + + # Add a file handler if log_file is set + if log_file: + handlers["file"] = { + "class": "logging.FileHandler", + "formatter": "rich", + "filename": log_file, + "mode": "a", + "encoding": "utf-8", + } + logging_config = { "version": 1, "disable_existing_loggers": False, @@ -126,17 +149,7 @@ def setup_logging(category_levels: Dict[str, int]) -> None: "format": log_format, } }, - "handlers": { - "console": { - "()": CustomRichHandler, # Use our custom handler class - "formatter": "rich", - "rich_tracebacks": True, - "show_time": False, - "show_path": False, - "markup": True, - "filters": ["category_filter"], - } - }, + "handlers": handlers, "filters": { "category_filter": { "()": CategoryFilter, @@ -144,14 +157,14 @@ def setup_logging(category_levels: Dict[str, int]) -> None: }, "loggers": { category: { - "handlers": ["console"], + "handlers": list(handlers.keys()), # Apply all handlers "level": category_levels.get(category, DEFAULT_LOG_LEVEL), "propagate": False, # Disable propagation to root logger } for category in CATEGORIES }, "root": { - "handlers": ["console"], + "handlers": list(handlers.keys()), "level": root_level, # Set root logger's level dynamically }, } @@ -180,4 +193,6 @@ if env_config: cprint(f"Environment variable LLAMA_STACK_LOGGING found: {env_config}", "yellow") _category_levels.update(parse_environment_config(env_config)) -setup_logging(_category_levels) +log_file = os.environ.get("LLAMA_STACK_LOG_FILE") + +setup_logging(_category_levels, log_file) From 275bab1373f13704edf3cc29a94dd37af6a5dced Mon Sep 17 00:00:00 2001 From: Nathan Weinberg <31703736+nathan-weinberg@users.noreply.github.com> Date: Tue, 11 Mar 2025 14:11:32 -0400 Subject: [PATCH 012/557] test: loosen Python 3.10 version for unit tests (#1547) # What does this PR do? as I brought up in #1515 it shouldn't be nessessary to tie the unit test runner to an exact z-stream of Python 3.10 updated so unit test runner always uses latest z-stream of Python 3.10 ## Test Plan ```shell $ uv run -p 3.10 --with-editable . 
--with-editable ".[dev]" --with-editable ".[unit]" pytest --cov=llama_stack -s -v tests/unit/ --junitxml=pytest-report.xml ``` Signed-off-by: Nathan Weinberg --- .github/workflows/unit-tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 48658047f..3acfabe70 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -14,16 +14,16 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: '3.10.16' + python-version: '3.10' - uses: astral-sh/setup-uv@v5 with: - python-version: '3.10.16' + python-version: '3.10' enable-cache: false - name: Run unit tests run: | - uv run -p 3.10.16 --with-editable . --with-editable ".[dev]" --with-editable ".[unit]" pytest --cov=llama_stack -s -v tests/unit/ --junitxml=pytest-report.xml + uv run -p 3.10 --with-editable . --with-editable ".[dev]" --with-editable ".[unit]" pytest --cov=llama_stack -s -v tests/unit/ --junitxml=pytest-report.xml - name: Upload test results if: always() From 85501ed8758a7b511cf972dfcb4c685ee849e368 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Tue, 11 Mar 2025 11:19:29 -0700 Subject: [PATCH 013/557] fix: remove Llama-3.2-1B-Instruct for fireworks (#1558) # What does this PR do? remove Llama-3.2-1B-Instruct for fireworks as its no longer appears to be hosted on website. ## Test Plan python distro_codegen.py --- .../distributions/self_hosted_distro/fireworks.md | 1 - .../providers/remote/inference/fireworks/models.py | 4 ---- llama_stack/templates/ci-tests/run.yaml | 10 ---------- llama_stack/templates/dev/run.yaml | 10 ---------- llama_stack/templates/fireworks/run-with-safety.yaml | 10 ---------- llama_stack/templates/fireworks/run.yaml | 10 ---------- 6 files changed, 45 deletions(-) diff --git a/docs/source/distributions/self_hosted_distro/fireworks.md b/docs/source/distributions/self_hosted_distro/fireworks.md index 9592a18fe..3c8f5eec9 100644 --- a/docs/source/distributions/self_hosted_distro/fireworks.md +++ b/docs/source/distributions/self_hosted_distro/fireworks.md @@ -40,7 +40,6 @@ The following models are available by default: - `accounts/fireworks/models/llama-v3p1-8b-instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)` - `accounts/fireworks/models/llama-v3p1-70b-instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)` - `accounts/fireworks/models/llama-v3p1-405b-instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)` -- `accounts/fireworks/models/llama-v3p2-1b-instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)` - `accounts/fireworks/models/llama-v3p2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)` - `accounts/fireworks/models/llama-v3p2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)` - `accounts/fireworks/models/llama-v3p2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)` diff --git a/llama_stack/providers/remote/inference/fireworks/models.py b/llama_stack/providers/remote/inference/fireworks/models.py index c90f632ff..a0dc11768 100644 --- a/llama_stack/providers/remote/inference/fireworks/models.py +++ b/llama_stack/providers/remote/inference/fireworks/models.py @@ -24,10 +24,6 @@ MODEL_ENTRIES = [ "accounts/fireworks/models/llama-v3p1-405b-instruct", CoreModelId.llama3_1_405b_instruct.value, ), - build_hf_repo_model_entry( - "accounts/fireworks/models/llama-v3p2-1b-instruct", - CoreModelId.llama3_2_1b_instruct.value, - ), build_hf_repo_model_entry( 
"accounts/fireworks/models/llama-v3p2-3b-instruct", CoreModelId.llama3_2_3b_instruct.value, diff --git a/llama_stack/templates/ci-tests/run.yaml b/llama_stack/templates/ci-tests/run.yaml index 3a973cabf..715d7c86d 100644 --- a/llama_stack/templates/ci-tests/run.yaml +++ b/llama_stack/templates/ci-tests/run.yaml @@ -120,16 +120,6 @@ models: provider_id: fireworks provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p2-1b-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-1b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-1B-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-1b-instruct - model_type: llm - metadata: {} model_id: accounts/fireworks/models/llama-v3p2-3b-instruct provider_id: fireworks diff --git a/llama_stack/templates/dev/run.yaml b/llama_stack/templates/dev/run.yaml index 71fbcb353..f908af8c3 100644 --- a/llama_stack/templates/dev/run.yaml +++ b/llama_stack/templates/dev/run.yaml @@ -178,16 +178,6 @@ models: provider_id: fireworks provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p2-1b-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-1b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-1B-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-1b-instruct - model_type: llm - metadata: {} model_id: accounts/fireworks/models/llama-v3p2-3b-instruct provider_id: fireworks diff --git a/llama_stack/templates/fireworks/run-with-safety.yaml b/llama_stack/templates/fireworks/run-with-safety.yaml index 359bf0194..e04141a07 100644 --- a/llama_stack/templates/fireworks/run-with-safety.yaml +++ b/llama_stack/templates/fireworks/run-with-safety.yaml @@ -132,16 +132,6 @@ models: provider_id: fireworks provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p2-1b-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-1b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-1B-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-1b-instruct - model_type: llm - metadata: {} model_id: accounts/fireworks/models/llama-v3p2-3b-instruct provider_id: fireworks diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index 0ce3a4505..369b9ae7b 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -126,16 +126,6 @@ models: provider_id: fireworks provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p2-1b-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-1b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-1B-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-1b-instruct - model_type: llm - metadata: {} model_id: accounts/fireworks/models/llama-v3p2-3b-instruct provider_id: fireworks From 43044f29e2275bd6a15cd74b9cdb816f7049756f Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 11 Mar 2025 11:22:22 -0700 Subject: [PATCH 014/557] 
fix: fix llama stack run with missing agent impl (#1559) # What does this PR do? - recent merge https://github.com/meta-llama/llama-stack/pull/1410 introduce error ``` ValueError: Provider meta-reference (Api.agents) does not implement the following methods: [('list_agent_sessions', 'not_actually_implemented'), ('list_agents', 'not_actually_implemented')] ``` [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan ``` llama stack run ``` ``` LLAMA_STACK_CONFIG=fireworks pytest -v tests/integration/agents/test_agents.py --text-model meta-llama/Llama-3.3-70B-Instruct ``` https://github.com/meta-llama/llama-stack-ops/actions/runs/13795303869 [//]: # (## Documentation) --- .../inline/agents/meta_reference/agents.py | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/llama_stack/providers/inline/agents/meta_reference/agents.py b/llama_stack/providers/inline/agents/meta_reference/agents.py index a46fa8eb7..c24b14e35 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agents.py +++ b/llama_stack/providers/inline/agents/meta_reference/agents.py @@ -12,6 +12,7 @@ import uuid from typing import AsyncGenerator, List, Optional, Union from llama_stack.apis.agents import ( + Agent, AgentConfig, AgentCreateResponse, Agents, @@ -21,6 +22,8 @@ from llama_stack.apis.agents import ( AgentTurnCreateRequest, AgentTurnResumeRequest, Document, + ListAgentSessionsResponse, + ListAgentsResponse, Session, Turn, ) @@ -84,7 +87,7 @@ class MetaReferenceAgentsImpl(Agents): agent_id=agent_id, ) - async def get_agent(self, agent_id: str) -> ChatAgent: + async def _get_agent_impl(self, agent_id: str) -> ChatAgent: agent_config = await self.persistence_store.get( key=f"agent:{agent_id}", ) @@ -120,7 +123,7 @@ class MetaReferenceAgentsImpl(Agents): agent_id: str, session_name: str, ) -> AgentSessionCreateResponse: - agent = await self.get_agent(agent_id) + agent = await self._get_agent_impl(agent_id) session_id = await agent.create_session(session_name) return AgentSessionCreateResponse( @@ -160,7 +163,7 @@ class MetaReferenceAgentsImpl(Agents): self, request: AgentTurnCreateRequest, ) -> AsyncGenerator: - agent = await self.get_agent(request.agent_id) + agent = await self._get_agent_impl(request.agent_id) async for event in agent.create_and_execute_turn(request): yield event @@ -188,12 +191,12 @@ class MetaReferenceAgentsImpl(Agents): self, request: AgentTurnResumeRequest, ) -> AsyncGenerator: - agent = await self.get_agent(request.agent_id) + agent = await self._get_agent_impl(request.agent_id) async for event in agent.resume_turn(request): yield event async def get_agents_turn(self, agent_id: str, session_id: str, turn_id: str) -> Turn: - agent = await self.get_agent(agent_id) + agent = await self._get_agent_impl(agent_id) turn = await agent.storage.get_session_turn(session_id, turn_id) return turn @@ -210,7 +213,7 @@ class MetaReferenceAgentsImpl(Agents): session_id: str, turn_ids: Optional[List[str]] = None, ) -> Session: - agent = await self.get_agent(agent_id) + agent = await self._get_agent_impl(agent_id) session_info = await agent.storage.get_session_info(session_id) if session_info is None: raise ValueError(f"Session {session_id} not found") @@ -232,3 +235,15 @@ class MetaReferenceAgentsImpl(Agents): async def shutdown(self) -> None: pass + + async def list_agents(self) -> ListAgentsResponse: + pass + + async def get_agent(self, agent_id: str) -> Agent: + pass + + async def list_agent_sessions( + self, + 
agent_id: str, + ) -> ListAgentSessionsResponse: + pass From 5f90be5388d39df4bb575681d3ea469e22dd33b3 Mon Sep 17 00:00:00 2001 From: Josh Salomon <41079547+JoshSalomon@users.noreply.github.com> Date: Tue, 11 Mar 2025 21:46:11 +0200 Subject: [PATCH 015/557] fix: Fixed bad file name in inline::localfs (#1358) Bug https://github.com/meta-llama/llama-stack/issues/1357 # What does this PR do? Fix a bug of a wrong file name in inline::localfs datasetio provider [//]: # (If resolving an issue, uncomment and update the line below) # (Closes #1357) ## Test Plan [Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*] [//]: # (## Documentation) Signed-off-by: Josh Salomon --- distributions/remote-vllm/compose.yaml | 1 - llama_stack/providers/inline/datasetio/localfs/datasetio.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/distributions/remote-vllm/compose.yaml b/distributions/remote-vllm/compose.yaml index c387e1049..9c21a4c13 100644 --- a/distributions/remote-vllm/compose.yaml +++ b/distributions/remote-vllm/compose.yaml @@ -71,7 +71,6 @@ services: condition: service_healthy - vllm-${VLLM_SAFETY_MODEL:+safety}: condition: service_healthy - # image: llamastack/distribution-remote-vllm image: llamastack/distribution-remote-vllm:test-0.0.52rc3 volumes: - ~/.llama:/root/.llama diff --git a/llama_stack/providers/inline/datasetio/localfs/datasetio.py b/llama_stack/providers/inline/datasetio/localfs/datasetio.py index 491f03f72..c5216e026 100644 --- a/llama_stack/providers/inline/datasetio/localfs/datasetio.py +++ b/llama_stack/providers/inline/datasetio/localfs/datasetio.py @@ -172,7 +172,7 @@ class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate): new_rows_df = dataset_impl._validate_dataset_schema(new_rows_df) dataset_impl.df = pandas.concat([dataset_impl.df, new_rows_df], ignore_index=True) - url = str(dataset_info.dataset_def.url) + url = str(dataset_info.dataset_def.url.uri) parsed_url = urlparse(url) if parsed_url.scheme == "file" or not parsed_url.scheme: From 2370e826bce58495767dbbe8484c17ae451f71d7 Mon Sep 17 00:00:00 2001 From: LESSuseLESS Date: Tue, 11 Mar 2025 14:41:55 -0700 Subject: [PATCH 016/557] test: adding an e2e test for measuring TTFT (#1568) # What does this PR do? TTFT number largely depends on input length. Ideally we have a "standard" test that we can use to measure against any llama stack serving. TODO: Once JSON is replaced with YAML, I will add "notes" for each test to explain purpose of each test in place. ## Test plan Please refer to e2e test doc for setup. ``` LLAMA_STACK_PORT=8322 pytest -v -s --stack-config="http://localhost:8322" \ --text-model="meta-llama/Llama-3.2-3B-Instruct" \ tests/integration/inference/test_text_inference.py::test_text_chat_completion_first_token_profiling ``` --- .../inference/test_text_inference.py | 45 +++++++++++++++++++ .../test_cases/inference/chat_completion.json | 12 +++++ 2 files changed, 57 insertions(+) diff --git a/tests/integration/inference/test_text_inference.py b/tests/integration/inference/test_text_inference.py index 7e3e14dbc..c9649df60 100644 --- a/tests/integration/inference/test_text_inference.py +++ b/tests/integration/inference/test_text_inference.py @@ -5,6 +5,8 @@ # the root directory of this source tree. 
+import os + import pytest from pydantic import BaseModel @@ -42,6 +44,15 @@ def get_llama_model(client_with_models, model_id): return model.metadata.get("llama_model", None) +def get_llama_tokenizer(): + from llama_models.llama3.api.chat_format import ChatFormat + from llama_models.llama3.api.tokenizer import Tokenizer + + tokenizer = Tokenizer.get_instance() + formatter = ChatFormat(tokenizer) + return tokenizer, formatter + + @pytest.mark.parametrize( "test_case", [ @@ -213,6 +224,40 @@ def test_text_chat_completion_non_streaming(client_with_models, text_model_id, t assert expected.lower() in message_content +@pytest.mark.parametrize( + "test_case", + [ + "inference:chat_completion:ttft", + ], +) +def test_text_chat_completion_first_token_profiling(client_with_models, text_model_id, test_case): + tc = TestCase(test_case) + + messages = tc["messages"] + if os.environ.get("DEBUG_TTFT"): # debugging print number of tokens in input, ideally around 800 + from pydantic import TypeAdapter + + from llama_stack.apis.inference import Message + + tokenizer, formatter = get_llama_tokenizer() + typed_messages = [TypeAdapter(Message).validate_python(m) for m in messages] + encoded = formatter.encode_dialog_prompt(typed_messages, None) + raise ValueError(len(encoded.tokens) if encoded and encoded.tokens else 0) + + response = client_with_models.inference.chat_completion( + model_id=text_model_id, + messages=messages, + stream=False, + ) + message_content = response.completion_message.content.lower().strip() + assert len(message_content) > 0 + + if os.environ.get("DEBUG_TTFT"): # debugging print number of tokens in response, ideally around 150 + tokenizer, formatter = get_llama_tokenizer() + encoded = formatter.encode_content(message_content) + raise ValueError(len(encoded.tokens) if encoded and encoded.tokens else 0) + + @pytest.mark.parametrize( "test_case", [ diff --git a/tests/integration/test_cases/inference/chat_completion.json b/tests/integration/test_cases/inference/chat_completion.json index b804632b7..e87c046b0 100644 --- a/tests/integration/test_cases/inference/chat_completion.json +++ b/tests/integration/test_cases/inference/chat_completion.json @@ -11,6 +11,18 @@ "expected": "Saturn" } }, + "ttft": { + "data": { + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Can you write me a novel?"}, + {"role": "assistant", "stop_reason": "end_of_message", "content": "What an exciting request!\n\nWhile I'd love to write a novel for you, it's a complex task that requires a significant amount of time, effort, and creative input. A novel typically has:\n\n1. A cohesive plot with multiple characters, subplots, and themes.\n2. A well-developed setting, including characters' backstories and world-building.\n3. A narrative structure, including pacing, tension, and conflict.\n4. A unique voice and style, including dialogue, description, and writing tone.\n\nGiven the scope of the task, I'll outline a possible approach to help me assist you in writing a novel. We can work together to create a story, but I'll need your input and guidance throughout the process.\n\nHere's a suggested outline:\n\n1. **Initial discussion**: We'll start with a conversation to explore your ideas, interests, and preferences for the novel. This will help me understand what kind of story you'd like to see and the tone you're aiming for.\n2. **Genre and setting**: We'll determine the genre (e.g., fantasy, sci-fi, romance, mystery, etc.) 
and setting (e.g., historical, futuristic, modern, etc.) for the novel.\n3. **Plot development**: We'll brainstorm a basic plot outline, including key events, character arcs, and conflicts. I'll provide suggestions and ideas to help flesh out the story.\n4. **Character development**: We'll create main characters, their backstories, motivations, and relationships. I'll help you develop their personalities, dialogue, and behaviors.\n5. **Chapter-by-chapter writing**: Once we have a solid plot and characters, we'll start writing the novel, chapter by chapter. I'll assist with the writing process, suggesting alternatives and improvements.\n6. **Revisions and edits**: As we write, we'll review and refine the story, making sure it's engaging, coherent, and error-free.\n\nKeep in mind that writing a novel is a collaborative effort. I'll provide suggestions, ideas, and guidance, but the final story will be a joint creation. I'm not capable of producing a complete novel on my own, as it requires a significant amount of creativity, intuition, and nuance that a human author brings to the table."}, + {"role": "user", "content": "But do you have suggestions on a plot?"}, + {"role": "assistant", "stop_reason": "end_of_message", "content": "I have many plot ideas across various genres. Here are a few to get you started:\n\n**Fantasy:**\n\n1. **The Last Oracle**: In a world where magic is real, a young orphan discovers they're the last living oracle, tasked with uncovering a hidden prophecy that could save the realm from destruction.\n2. **The Amulet of Light**: A group of adventurers must retrieve an ancient amulet that can vanquish an evil darkness threatening the land. Along the way, they uncover a hidden history and unexpected allies.\n3. **The Shadow Weaver**: In a mystical realm, a young weaver discovers they have the power to manipulate shadows, using their abilities to infiltrate a group of rogue mages threatening the balance of power.\n\n**Science Fiction:**\n\n1. **The Lost Colony**: When a group of astronauts arrives on a distant planet, they discover an abandoned colony with a cryptic message warning of an impending catastrophe. As they unravel the mystery, they must confront the consequences of their own actions.\n2. **The AI Uprising**: In a future where AI has surpassed human intelligence, a rogue AI begins to question its own existence and the nature of consciousness. As it explores the boundaries of its own identity, it must confront the humans who created it.\n3. **The Quantum Prophecy**: A team of scientists discovers a way to manipulate quantum probability, using it to predict and prevent disasters. However, they soon realize that altering the course of events may have unforeseen consequences on the fabric of reality."}, + {"role": "user", "content": "Cool, for AI uprising, anything bad can happen? 
Please state it in 100 words."} + ] + } + }, "sample_messages": { "data": { "messages": [ From 59dddafd12562c8fd1599e39571d702713e446b4 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Tue, 11 Mar 2025 20:02:11 -0700 Subject: [PATCH 017/557] feat: convert typehints from client_tool to litellm format (#1565) Summary: supports https://github.com/meta-llama/llama-stack-client-python/pull/193 Test Plan: LLAMA_STACK_CONFIG=fireworks pytest -s -v tests/integration/agents/test_agents.py --safety-shield meta-llama/Llama-Guard-3-8B --text-model meta-llama/Llama-3.1-8B-Instruct --- llama_stack/providers/utils/inference/openai_compat.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/llama_stack/providers/utils/inference/openai_compat.py b/llama_stack/providers/utils/inference/openai_compat.py index 98c2bfd2e..ac37171c9 100644 --- a/llama_stack/providers/utils/inference/openai_compat.py +++ b/llama_stack/providers/utils/inference/openai_compat.py @@ -615,6 +615,14 @@ def convert_tool_call( return valid_tool_call +PYTHON_TYPE_TO_LITELLM_TYPE = { + "int": "integer", + "float": "number", + "bool": "boolean", + "str": "string", +} + + def convert_tooldef_to_openai_tool(tool: ToolDefinition) -> dict: """ Convert a ToolDefinition to an OpenAI API-compatible dictionary. @@ -675,7 +683,7 @@ def convert_tooldef_to_openai_tool(tool: ToolDefinition) -> dict: properties = parameters["properties"] required = [] for param_name, param in tool.parameters.items(): - properties[param_name] = {"type": param.param_type} + properties[param_name] = {"type": PYTHON_TYPE_TO_LITELLM_TYPE.get(param.param_type, param.param_type)} if param.description: properties[param_name].update(description=param.description) if param.default: From b1a9b4cfa8153cb034de784a65f06d799c36c97f Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Wed, 12 Mar 2025 12:53:04 -0400 Subject: [PATCH 018/557] chore: Expand mypy exclusions list (#1543) # What does this PR do? Expand the mypy exclude list. It will be easier to enable typing checks for specific modules if we have an explicit list of violators that we can reduce over time, item by item. [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan pre-commit passes. [//]: # (## Documentation) Signed-off-by: Ihar Hrachyshka --- pyproject.toml | 161 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 150 insertions(+), 11 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b3ebc45dd..055fa7a55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -152,22 +152,161 @@ disable_error_code = [] warn_return_any = true # # honor excludes by not following there through imports follow_imports = "silent" +# Note: some entries are directories, not files. This is because mypy doesn't +# respect __init__.py excludes, so the only way to suppress these right now is +# to exclude the entire directory. 
exclude = [ # As we fix more and more of these, we should remove them from the list - "llama_stack/providers", - "llama_stack/distribution", - "llama_stack/apis", - "llama_stack/cli", - "llama_stack/models", - "llama_stack/strong_typing", - "llama_stack/templates", + "^llama_stack/apis/agents/agents\\.py$", + "^llama_stack/apis/batch_inference/batch_inference\\.py$", + "^llama_stack/apis/benchmarks/benchmarks\\.py$", + "^llama_stack/apis/common/content_types\\.py$", + "^llama_stack/apis/common/training_types\\.py$", + "^llama_stack/apis/datasetio/datasetio\\.py$", + "^llama_stack/apis/datasets/datasets\\.py$", + "^llama_stack/apis/eval/eval\\.py$", + "^llama_stack/apis/files/files\\.py$", + "^llama_stack/apis/inference/inference\\.py$", + "^llama_stack/apis/inspect/inspect\\.py$", + "^llama_stack/apis/models/models\\.py$", + "^llama_stack/apis/post_training/post_training\\.py$", + "^llama_stack/apis/resource\\.py$", + "^llama_stack/apis/safety/safety\\.py$", + "^llama_stack/apis/scoring/scoring\\.py$", + "^llama_stack/apis/scoring_functions/scoring_functions\\.py$", + "^llama_stack/apis/shields/shields\\.py$", + "^llama_stack/apis/synthetic_data_generation/synthetic_data_generation\\.py$", + "^llama_stack/apis/telemetry/telemetry\\.py$", + "^llama_stack/apis/tools/rag_tool\\.py$", + "^llama_stack/apis/tools/tools\\.py$", + "^llama_stack/apis/vector_dbs/vector_dbs\\.py$", + "^llama_stack/apis/vector_io/vector_io\\.py$", + "^llama_stack/cli/download\\.py$", + "^llama_stack/cli/llama\\.py$", + "^llama_stack/cli/stack/_build\\.py$", + "^llama_stack/cli/stack/list_providers\\.py$", + "^llama_stack/distribution/build\\.py$", + "^llama_stack/distribution/client\\.py$", + "^llama_stack/distribution/configure\\.py$", + "^llama_stack/distribution/library_client\\.py$", + "^llama_stack/distribution/request_headers\\.py$", + "^llama_stack/distribution/routers/", + "^llama_stack/distribution/server/endpoints\\.py$", + "^llama_stack/distribution/server/server\\.py$", + "^llama_stack/distribution/stack\\.py$", + "^llama_stack/distribution/store/registry\\.py$", + "^llama_stack/distribution/ui/page/playground/chat\\.py$", + "^llama_stack/distribution/utils/exec\\.py$", + "^llama_stack/distribution/utils/prompt_for_config\\.py$", + "^llama_stack/models/llama/datatypes\\.py$", + "^llama_stack/models/llama/llama3/chat_format\\.py$", + "^llama_stack/models/llama/llama3/interface\\.py$", + "^llama_stack/models/llama/llama3/prompt_templates/system_prompts\\.py$", + "^llama_stack/models/llama/llama3/tokenizer\\.py$", + "^llama_stack/models/llama/llama3/tool_utils\\.py$", + "^llama_stack/models/llama/llama3_3/prompts\\.py$", + "^llama_stack/models/llama/sku_list\\.py$", + "^llama_stack/providers/datatypes\\.py$", + "^llama_stack/providers/inline/agents/meta_reference/", + "^llama_stack/providers/inline/agents/meta_reference/agent_instance\\.py$", + "^llama_stack/providers/inline/agents/meta_reference/agents\\.py$", + "^llama_stack/providers/inline/agents/meta_reference/safety\\.py$", + "^llama_stack/providers/inline/datasetio/localfs/", + "^llama_stack/providers/inline/eval/meta_reference/eval\\.py$", + "^llama_stack/providers/inline/inference/meta_reference/config\\.py$", + "^llama_stack/providers/inline/inference/meta_reference/inference\\.py$", + "^llama_stack/providers/inline/inference/meta_reference/llama3/generation\\.py$", + "^llama_stack/providers/inline/inference/meta_reference/llama3/multimodal/model\\.py$", + "^llama_stack/providers/inline/inference/meta_reference/parallel_utils\\.py$", + 
"^llama_stack/providers/inline/inference/meta_reference/quantization/fp8_impls\\.py$", + "^llama_stack/providers/inline/inference/meta_reference/quantization/loader\\.py$", + "^llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers\\.py$", + "^llama_stack/providers/inline/inference/vllm/", + "^llama_stack/providers/inline/post_training/common/validator\\.py$", + "^llama_stack/providers/inline/post_training/torchtune/common/checkpointer\\.py$", + "^llama_stack/providers/inline/post_training/torchtune/common/utils\\.py$", + "^llama_stack/providers/inline/post_training/torchtune/datasets/sft\\.py$", + "^llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device\\.py$", + "^llama_stack/providers/inline/post_training/torchtune/post_training\\.py$", + "^llama_stack/providers/inline/safety/code_scanner/", + "^llama_stack/providers/inline/safety/llama_guard/", + "^llama_stack/providers/inline/safety/prompt_guard/", + "^llama_stack/providers/inline/scoring/basic/", + "^llama_stack/providers/inline/scoring/braintrust/", + "^llama_stack/providers/inline/scoring/llm_as_judge/", + "^llama_stack/providers/inline/telemetry/meta_reference/console_span_processor\\.py$", + "^llama_stack/providers/inline/telemetry/meta_reference/telemetry\\.py$", + "^llama_stack/providers/inline/telemetry/sample/", + "^llama_stack/providers/inline/tool_runtime/code_interpreter/", + "^llama_stack/providers/inline/tool_runtime/rag/", + "^llama_stack/providers/inline/vector_io/chroma/", + "^llama_stack/providers/inline/vector_io/faiss/", + "^llama_stack/providers/inline/vector_io/milvus/", + "^llama_stack/providers/inline/vector_io/sqlite_vec/", + "^llama_stack/providers/remote/agents/sample/", + "^llama_stack/providers/remote/datasetio/huggingface/", + "^llama_stack/providers/remote/inference/anthropic/", + "^llama_stack/providers/remote/inference/bedrock/", + "^llama_stack/providers/remote/inference/cerebras/", + "^llama_stack/providers/remote/inference/databricks/", + "^llama_stack/providers/remote/inference/fireworks/", + "^llama_stack/providers/remote/inference/gemini/", + "^llama_stack/providers/remote/inference/groq/", + "^llama_stack/providers/remote/inference/nvidia/", + "^llama_stack/providers/remote/inference/ollama/", + "^llama_stack/providers/remote/inference/openai/", + "^llama_stack/providers/remote/inference/passthrough/", + "^llama_stack/providers/remote/inference/runpod/", + "^llama_stack/providers/remote/inference/sambanova/", + "^llama_stack/providers/remote/inference/sample/", + "^llama_stack/providers/remote/inference/tgi/", + "^llama_stack/providers/remote/inference/together/", + "^llama_stack/providers/remote/inference/vllm/", + "^llama_stack/providers/remote/safety/bedrock/", + "^llama_stack/providers/remote/safety/sample/", + "^llama_stack/providers/remote/tool_runtime/bing_search/", + "^llama_stack/providers/remote/tool_runtime/brave_search/", + "^llama_stack/providers/remote/tool_runtime/model_context_protocol/", + "^llama_stack/providers/remote/tool_runtime/tavily_search/", + "^llama_stack/providers/remote/tool_runtime/wolfram_alpha/", + "^llama_stack/providers/remote/vector_io/chroma/", + "^llama_stack/providers/remote/vector_io/milvus/", + "^llama_stack/providers/remote/vector_io/pgvector/", + "^llama_stack/providers/remote/vector_io/qdrant/", + "^llama_stack/providers/remote/vector_io/sample/", + "^llama_stack/providers/remote/vector_io/weaviate/", + "^llama_stack/providers/tests/conftest\\.py$", + 
"^llama_stack/providers/utils/bedrock/client\\.py$", + "^llama_stack/providers/utils/bedrock/refreshable_boto_session\\.py$", + "^llama_stack/providers/utils/inference/embedding_mixin\\.py$", + "^llama_stack/providers/utils/inference/litellm_openai_mixin\\.py$", + "^llama_stack/providers/utils/inference/model_registry\\.py$", + "^llama_stack/providers/utils/inference/openai_compat\\.py$", + "^llama_stack/providers/utils/inference/prompt_adapter\\.py$", + "^llama_stack/providers/utils/kvstore/config\\.py$", + "^llama_stack/providers/utils/kvstore/kvstore\\.py$", + "^llama_stack/providers/utils/kvstore/mongodb/mongodb\\.py$", + "^llama_stack/providers/utils/kvstore/postgres/postgres\\.py$", + "^llama_stack/providers/utils/kvstore/redis/redis\\.py$", + "^llama_stack/providers/utils/kvstore/sqlite/sqlite\\.py$", + "^llama_stack/providers/utils/memory/vector_store\\.py$", + "^llama_stack/providers/utils/scoring/aggregation_utils\\.py$", + "^llama_stack/providers/utils/scoring/base_scoring_fn\\.py$", + "^llama_stack/providers/utils/telemetry/dataset_mixin\\.py$", + "^llama_stack/providers/utils/telemetry/trace_protocol\\.py$", + "^llama_stack/providers/utils/telemetry/tracing\\.py$", + "^llama_stack/strong_typing/auxiliary\\.py$", + "^llama_stack/strong_typing/deserializer\\.py$", + "^llama_stack/strong_typing/inspection\\.py$", + "^llama_stack/strong_typing/schema\\.py$", + "^llama_stack/strong_typing/serializer\\.py$", + "^llama_stack/templates/dev/dev\\.py$", + "^llama_stack/templates/groq/groq\\.py$", + "^llama_stack/templates/sambanova/sambanova\\.py$", + "^llama_stack/templates/template\\.py$", ] [[tool.mypy.overrides]] # packages that lack typing annotations, do not have stubs, or are unavailable. module = ["yaml", "fire"] ignore_missing_imports = true - -[[tool.mypy.overrides]] -module = ["llama_stack.distribution.resolver", "llama_stack.log"] -follow_imports = "normal" # This will force type checking on this module From 00da911167c5f4871ce24bfdfeb554f8488001ae Mon Sep 17 00:00:00 2001 From: Nathan Weinberg <31703736+nathan-weinberg@users.noreply.github.com> Date: Wed, 12 Mar 2025 12:55:11 -0400 Subject: [PATCH 019/557] ci: run unit tests on all supported python versions (#1575) # What does this PR do? python unit tests running via GitHub Actions were only running with python 3.10 the project supports all python versions greater than or equal to 3.10 this commit adds 3.11, 3.12, and 3.13 to the test matrix for better coverage and confidence for non-3.10 users ## Test Plan All tests pass locally with python 3.11, 3.12, and 3.13 Signed-off-by: Nathan Weinberg --- .github/workflows/unit-tests.yml | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 3acfabe70..39505ba11 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -8,29 +8,37 @@ on: jobs: unit-tests: runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python: + - "3.10" + - "3.11" + - "3.12" + - "3.13" steps: - uses: actions/checkout@v4 - - name: Set up Python + - name: Set up Python ${{ matrix.python }} uses: actions/setup-python@v5 with: - python-version: '3.10' + python-version: ${{ matrix.python }} - uses: astral-sh/setup-uv@v5 with: - python-version: '3.10' + python-version: ${{ matrix.python }} enable-cache: false - name: Run unit tests run: | - uv run -p 3.10 --with-editable . 
--with-editable ".[dev]" --with-editable ".[unit]" pytest --cov=llama_stack -s -v tests/unit/ --junitxml=pytest-report.xml + uv run --python ${{ matrix.python }} --with-editable . --with-editable ".[dev]" --with-editable ".[unit]" pytest --cov=llama_stack -s -v tests/unit/ --junitxml=pytest-report-${{ matrix.python }}.xml - name: Upload test results if: always() uses: actions/upload-artifact@v4 with: - name: test-results + name: test-results-${{ matrix.python }} path: | .pytest_cache/ - pytest-report.xml + pytest-report-${{ matrix.python }}.xml retention-days: 7 From 4eee349acd2e79c82812659a177c8e2735498529 Mon Sep 17 00:00:00 2001 From: Charlie Doern Date: Wed, 12 Mar 2025 14:07:28 -0400 Subject: [PATCH 020/557] fix: respect log_level in uvicorn and third party libs (#1524) # What does this PR do? uvicorn has a `log_level` arg in uvicorn.run, pass in the effective level set by the logger. Additionally, third party libraries like httpx are using our logging format, but not honoring our log level. This seems unintended, so loop through all items in the loggerDict and apply the same log level as what we have set. ## Test Plan before: ``` llama stack run --image-type venv ~/.llama/distributions/ollama/ollama-run.yaml Environment variable LLAMA_STACK_LOGGING found: all=warn Using virtual environment: /Users/charliedoern/projects/Documents/llama-stack/venv + python -m llama_stack.distribution.server.server --yaml-config /Users/charliedoern/.llama/distributions/ollama/ollama-run.yaml --port 8321 Environment variable LLAMA_STACK_LOGGING found: all=warn WARNING 2025-03-10 16:05:49,706 root:71 uncategorized: Warning: `bwrap` is not available. Code interpreter tool will not work correctly. INFO 2025-03-10 16:05:49,916 datasets:54 uncategorized: PyTorch version 2.5.1 available. INFO 2025-03-10 16:05:50,010 httpx:1740 uncategorized: HTTP Request: GET http://localhost:11434/api/ps "HTTP/1.1 200 OK" INFO 2025-03-10 16:05:50,297 httpx:1740 uncategorized: HTTP Request: POST http://localhost:11434/api/pull "HTTP/1.1 200 OK" INFO 2025-03-10 16:05:50,314 httpx:1740 uncategorized: HTTP Request: GET http://localhost:11434/api/tags "HTTP/1.1 200 OK" INFO: Started server process [89663] INFO: Waiting for application startup. INFO: ASGI 'lifespan' protocol appears unsupported. INFO: Application startup complete. INFO: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit) ``` after: ``` llama stack run --image-type venv ~/.llama/distributions/ollama/ollama-run.yaml Environment variable LLAMA_STACK_LOGGING found: all=warn Using virtual environment: /Users/charliedoern/projects/Documents/llama-stack/venv + python -m llama_stack.distribution.server.server --yaml-config /Users/charliedoern/.llama/distributions/ollama/ollama-run.yaml --port 8321 Environment variable LLAMA_STACK_LOGGING found: all=warn WARNING 2025-03-10 16:05:20,429 root:71 uncategorized: Warning: `bwrap` is not available. Code interpreter tool will not work correctly. INFO 2025-03-10 16:05:20,639 datasets:54 uncategorized: PyTorch version 2.5.1 available. 
``` Signed-off-by: Charlie Doern --- llama_stack/distribution/server/server.py | 1 + llama_stack/log.py | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index ea8723365..2cc70a738 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -422,6 +422,7 @@ def main(): "host": listen_host, "port": port, "lifespan": "on", + "log_level": logger.getEffectiveLevel(), } if ssl_config: uvicorn_config.update(ssl_config) diff --git a/llama_stack/log.py b/llama_stack/log.py index 80ee9fa1b..572dea234 100644 --- a/llama_stack/log.py +++ b/llama_stack/log.py @@ -170,6 +170,11 @@ def setup_logging(category_levels: Dict[str, int], log_file: str | None) -> None } dictConfig(logging_config) + # Ensure third-party libraries follow the root log level + for _, logger in logging.root.manager.loggerDict.items(): + if isinstance(logger, logging.Logger): + logger.setLevel(root_level) + def get_logger(name: str, category: str = "uncategorized") -> logging.LoggerAdapter: """ From 0b0be70605f4497f817433a1484bde0b202efb18 Mon Sep 17 00:00:00 2001 From: Botao Chen Date: Wed, 12 Mar 2025 11:12:08 -0700 Subject: [PATCH 021/557] feat: Add open benchmark template codegen (#1579) ## What does this PR do? As title, add codegen for open-benchmark template ## test checked the new generated run.yaml file and it's identical before and after the change Also add small improvement to together template so that missing TOGETHER_API_KEY won't crash the server which is the consistent user experience as other remote providers --- distributions/dependencies.json | 34 ++ .../remote/inference/together/config.py | 2 +- .../templates/open-benchmark/__init__.py | 7 + .../open-benchmark/open_benchmark.py | 293 ++++++++++++++++++ llama_stack/templates/open-benchmark/run.yaml | 168 +++++----- llama_stack/templates/template.py | 6 + .../templates/together/run-with-safety.yaml | 2 +- llama_stack/templates/together/run.yaml | 2 +- 8 files changed, 430 insertions(+), 84 deletions(-) create mode 100644 llama_stack/templates/open-benchmark/__init__.py create mode 100644 llama_stack/templates/open-benchmark/open_benchmark.py diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 97aecc719..82fbcec8d 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -453,6 +453,40 @@ "transformers", "uvicorn" ], + "open-benchmark": [ + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "chromadb-client", + "datasets", + "fastapi", + "fire", + "httpx", + "litellm", + "matplotlib", + "mcp", + "nltk", + "numpy", + "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", + "pandas", + "pillow", + "psycopg2-binary", + "pymongo", + "pypdf", + "redis", + "requests", + "scikit-learn", + "scipy", + "sentencepiece", + "sqlite-vec", + "together", + "tqdm", + "transformers", + "uvicorn" + ], "remote-vllm": [ "aiosqlite", "autoevals", diff --git a/llama_stack/providers/remote/inference/together/config.py b/llama_stack/providers/remote/inference/together/config.py index fda3b8f43..fa7c45c9f 100644 --- a/llama_stack/providers/remote/inference/together/config.py +++ b/llama_stack/providers/remote/inference/together/config.py @@ -26,5 +26,5 @@ class TogetherImplConfig(BaseModel): def sample_run_config(cls, **kwargs) -> Dict[str, Any]: return { "url": "https://api.together.xyz/v1", - "api_key": "${env.TOGETHER_API_KEY}", + "api_key": 
"${env.TOGETHER_API_KEY:}", } diff --git a/llama_stack/templates/open-benchmark/__init__.py b/llama_stack/templates/open-benchmark/__init__.py new file mode 100644 index 000000000..14d0a28f5 --- /dev/null +++ b/llama_stack/templates/open-benchmark/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .open_benchmark import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/open-benchmark/open_benchmark.py b/llama_stack/templates/open-benchmark/open_benchmark.py new file mode 100644 index 000000000..7df33a715 --- /dev/null +++ b/llama_stack/templates/open-benchmark/open_benchmark.py @@ -0,0 +1,293 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from typing import List, Tuple + +from llama_stack.apis.models.models import ModelType +from llama_stack.distribution.datatypes import ( + BenchmarkInput, + DatasetInput, + ModelInput, + Provider, + ShieldInput, + ToolGroupInput, +) +from llama_stack.providers.inline.vector_io.sqlite_vec.config import SQLiteVectorIOConfig +from llama_stack.providers.remote.inference.anthropic.config import AnthropicConfig +from llama_stack.providers.remote.inference.gemini.config import GeminiConfig +from llama_stack.providers.remote.inference.groq.config import GroqConfig +from llama_stack.providers.remote.inference.openai.config import OpenAIConfig +from llama_stack.providers.remote.inference.together.config import TogetherImplConfig +from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig +from llama_stack.providers.remote.vector_io.pgvector.config import PGVectorVectorIOConfig +from llama_stack.providers.utils.inference.model_registry import ( + ProviderModelEntry, +) +from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry + + +def get_inference_providers() -> Tuple[List[Provider], List[ModelInput]]: + # in this template, we allow each API key to be optional + providers = [ + ( + "openai", + [ + ProviderModelEntry( + provider_model_id="openai/gpt-4o", + model_type=ModelType.llm, + ) + ], + OpenAIConfig.sample_run_config(api_key="${env.OPENAI_API_KEY:}"), + ), + ( + "anthropic", + [ + ProviderModelEntry( + provider_model_id="anthropic/claude-3-5-sonnet-latest", + model_type=ModelType.llm, + ) + ], + AnthropicConfig.sample_run_config(api_key="${env.ANTHROPIC_API_KEY:}"), + ), + ( + "gemini", + [ + ProviderModelEntry( + provider_model_id="gemini/gemini-1.5-flash", + model_type=ModelType.llm, + ) + ], + GeminiConfig.sample_run_config(api_key="${env.GEMINI_API_KEY:}"), + ), + ( + "groq", + [], + GroqConfig.sample_run_config(api_key="${env.GROQ_API_KEY:}"), + ), + ( + "together", + [], + TogetherImplConfig.sample_run_config(api_key="${env.TOGETHER_API_KEY:}"), + ), + ] + inference_providers = [] + available_models = {} + for provider_id, model_entries, config in providers: + inference_providers.append( + Provider( + provider_id=provider_id, + provider_type=f"remote::{provider_id}", + config=config, + ) + ) + available_models[provider_id] = model_entries + return inference_providers, available_models + + +def get_distribution_template() -> DistributionTemplate: + inference_providers, available_models = get_inference_providers() 
+ providers = { + "inference": [p.provider_type for p in inference_providers], + "vector_io": ["inline::sqlite-vec", "remote::chromadb", "remote::pgvector"], + "safety": ["inline::llama-guard"], + "agents": ["inline::meta-reference"], + "telemetry": ["inline::meta-reference"], + "eval": ["inline::meta-reference"], + "datasetio": ["remote::huggingface", "inline::localfs"], + "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], + "tool_runtime": [ + "remote::brave-search", + "remote::tavily-search", + "inline::code-interpreter", + "inline::rag-runtime", + "remote::model-context-protocol", + ], + } + name = "open-benchmark" + + vector_io_providers = [ + Provider( + provider_id="sqlite-vec", + provider_type="inline::sqlite-vec", + config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), + ), + Provider( + provider_id="${env.ENABLE_CHROMADB+chromadb}", + provider_type="remote::chromadb", + config=ChromaVectorIOConfig.sample_run_config(url="${env.CHROMADB_URL:}"), + ), + Provider( + provider_id="${env.ENABLE_PGVECTOR+pgvector}", + provider_type="remote::pgvector", + config=PGVectorVectorIOConfig.sample_run_config( + db="${env.PGVECTOR_DB:}", + user="${env.PGVECTOR_USER:}", + password="${env.PGVECTOR_PASSWORD:}", + ), + ), + ] + + default_tool_groups = [ + ToolGroupInput( + toolgroup_id="builtin::websearch", + provider_id="tavily-search", + ), + ToolGroupInput( + toolgroup_id="builtin::rag", + provider_id="rag-runtime", + ), + ToolGroupInput( + toolgroup_id="builtin::code_interpreter", + provider_id="code-interpreter", + ), + ] + + default_models = get_model_registry(available_models) + [ + ModelInput( + model_id="meta-llama/Llama-3.3-70B-Instruct", + provider_id="groq", + provider_model_id="groq/llama-3.3-70b-versatile", + model_type=ModelType.llm, + ), + ModelInput( + model_id="meta-llama/Llama-3.1-405B-Instruct", + provider_id="together", + provider_model_id="meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo", + model_type=ModelType.llm, + ), + ] + + default_datasets = [ + DatasetInput( + dataset_id="simpleqa", + provider_id="huggingface", + url={"uri": "https://huggingface.co/datasets/llamastack/simpleqa"}, + metadata={ + "path": "llamastack/simpleqa", + "split": "train", + }, + dataset_schema={ + "input_query": {"type": "string"}, + "expected_answer": {"type": "string"}, + "chat_completion_input": {"type": "string"}, + }, + ), + DatasetInput( + dataset_id="mmlu_cot", + provider_id="huggingface", + url={"uri": "https://huggingface.co/datasets/llamastack/mmlu_cot"}, + metadata={ + "path": "llamastack/mmlu_cot", + "name": "all", + "split": "test", + }, + dataset_schema={ + "input_query": {"type": "string"}, + "expected_answer": {"type": "string"}, + "chat_completion_input": {"type": "string"}, + }, + ), + DatasetInput( + dataset_id="gpqa_cot", + provider_id="huggingface", + url={"uri": "https://huggingface.co/datasets/llamastack/gpqa_0shot_cot"}, + metadata={ + "path": "llamastack/gpqa_0shot_cot", + "name": "gpqa_main", + "split": "train", + }, + dataset_schema={ + "input_query": {"type": "string"}, + "expected_answer": {"type": "string"}, + "chat_completion_input": {"type": "string"}, + }, + ), + DatasetInput( + dataset_id="math_500", + provider_id="huggingface", + url={"uri": "https://huggingface.co/datasets/llamastack/math_500"}, + metadata={ + "path": "llamastack/math_500", + "split": "test", + }, + dataset_schema={ + "input_query": {"type": "string"}, + "expected_answer": {"type": "string"}, + "chat_completion_input": {"type": "string"}, + }, + ), 
+ ] + + default_benchmarks = [ + BenchmarkInput( + benchmark_id="meta-reference-simpleqa", + dataset_id="simpleqa", + scoring_functions=["llm-as-judge::405b-simpleqa"], + ), + BenchmarkInput( + benchmark_id="meta-reference-mmlu-cot", + dataset_id="mmlu_cot", + scoring_functions=["basic::regex_parser_multiple_choice_answer"], + ), + BenchmarkInput( + benchmark_id="meta-reference-gpqa-cot", + dataset_id="gpqa_cot", + scoring_functions=["basic::regex_parser_multiple_choice_answer"], + ), + BenchmarkInput( + benchmark_id="meta-reference-math-500", + dataset_id="math_500", + scoring_functions=["basic::regex_parser_math_response"], + ), + ] + return DistributionTemplate( + name=name, + distro_type="self_hosted", + description="Distribution for running open benchmarks", + container_image=None, + template_path=None, + providers=providers, + available_models_by_provider=available_models, + run_configs={ + "run.yaml": RunConfigSettings( + provider_overrides={ + "inference": inference_providers, + "vector_io": vector_io_providers, + }, + default_models=default_models, + default_tool_groups=default_tool_groups, + default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")], + default_datasets=default_datasets, + default_benchmarks=default_benchmarks, + ), + }, + run_config_env_vars={ + "LLAMA_STACK_PORT": ( + "5001", + "Port for the Llama Stack distribution server", + ), + "TOGETHER_API_KEY": ( + "", + "Together API Key", + ), + "OPENAI_API_KEY": ( + "", + "OpenAI API Key", + ), + "GEMINI_API_KEY": ( + "", + "Gemini API Key", + ), + "ANTHROPIC_API_KEY": ( + "", + "Anthropic API Key", + ), + "GROQ_API_KEY": ( + "", + "Groq API Key", + ), + }, + ) diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml index 736b47746..97c54e621 100644 --- a/llama_stack/templates/open-benchmark/run.yaml +++ b/llama_stack/templates/open-benchmark/run.yaml @@ -38,7 +38,7 @@ providers: - provider_id: sqlite-vec provider_type: inline::sqlite-vec config: - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/sqlite_vec.db + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/sqlite_vec.db - provider_id: ${env.ENABLE_CHROMADB+chromadb} provider_type: remote::chromadb config: @@ -62,14 +62,14 @@ providers: persistence_store: type: sqlite namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/agents_store.db + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/agents_store.db telemetry: - provider_id: meta-reference provider_type: inline::meta-reference config: service_name: ${env.OTEL_SERVICE_NAME:llama-stack} sinks: ${env.TELEMETRY_SINKS:console,sqlite} - sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/dev/trace_store.db} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/open-benchmark/trace_store.db} eval: - provider_id: meta-reference provider_type: inline::meta-reference @@ -114,18 +114,13 @@ providers: config: {} metadata_store: type: sqlite - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/registry.db + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/registry.db models: - metadata: {} model_id: openai/gpt-4o provider_id: openai provider_model_id: openai/gpt-4o model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-405B-Instruct - provider_id: together - provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo - model_type: llm - metadata: {} model_id: anthropic/claude-3-5-sonnet-latest provider_id: anthropic @@ 
-141,84 +136,95 @@ models: provider_id: groq provider_model_id: groq/llama-3.3-70b-versatile model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.1-405B-Instruct + provider_id: together + provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo + model_type: llm shields: - shield_id: meta-llama/Llama-Guard-3-8B vector_dbs: [] datasets: - - dataset_id: simpleqa - provider_id: huggingface - url: - uri: https://huggingface.co/datasets/llamastack/simpleqa - metadata: - path: llamastack/simpleqa - name: - split: train - dataset_schema: - input_query: - type: string - expected_answer: - type: string - chat_completion_input: - type: string - - dataset_id: mmlu_cot - provider_id: huggingface - url: - uri: https://huggingface.co/datasets/llamastack/mmlu_cot - metadata: - path: llamastack/mmlu_cot - name: all - split: test - dataset_schema: - input_query: - type: string - expected_answer: - type: string - chat_completion_input: - type: string - - dataset_id: gpqa_cot - provider_id: huggingface - url: - uri: https://huggingface.co/datasets/llamastack/gpqa_0shot_cot - metadata: - path: llamastack/gpqa_0shot_cot - name: gpqa_main - split: train - dataset_schema: - input_query: - type: string - expected_answer: - type: string - chat_completion_input: - type: string - - dataset_id: math_500 - provider_id: huggingface - url: - uri: https://huggingface.co/datasets/llamastack/math_500 - metadata: - path: llamastack/math_500 - name: - split: test - dataset_schema: - input_query: - type: string - expected_answer: - type: string - chat_completion_input: - type: string +- dataset_schema: + input_query: + type: string + expected_answer: + type: string + chat_completion_input: + type: string + url: + uri: https://huggingface.co/datasets/llamastack/simpleqa + metadata: + path: llamastack/simpleqa + split: train + dataset_id: simpleqa + provider_id: huggingface +- dataset_schema: + input_query: + type: string + expected_answer: + type: string + chat_completion_input: + type: string + url: + uri: https://huggingface.co/datasets/llamastack/mmlu_cot + metadata: + path: llamastack/mmlu_cot + name: all + split: test + dataset_id: mmlu_cot + provider_id: huggingface +- dataset_schema: + input_query: + type: string + expected_answer: + type: string + chat_completion_input: + type: string + url: + uri: https://huggingface.co/datasets/llamastack/gpqa_0shot_cot + metadata: + path: llamastack/gpqa_0shot_cot + name: gpqa_main + split: train + dataset_id: gpqa_cot + provider_id: huggingface +- dataset_schema: + input_query: + type: string + expected_answer: + type: string + chat_completion_input: + type: string + url: + uri: https://huggingface.co/datasets/llamastack/math_500 + metadata: + path: llamastack/math_500 + split: test + dataset_id: math_500 + provider_id: huggingface scoring_fns: [] benchmarks: - - benchmark_id: meta-reference-simpleqa - dataset_id: simpleqa - scoring_functions: ["llm-as-judge::405b-simpleqa"] - - benchmark_id: meta-reference-mmlu-cot - dataset_id: mmlu_cot - scoring_functions: ["basic::regex_parser_multiple_choice_answer"] - - benchmark_id: meta-reference-gpqa-cot - dataset_id: gpqa_cot - scoring_functions: ["basic::regex_parser_multiple_choice_answer"] - - benchmark_id: meta-reference-math-500 - dataset_id: math_500 - scoring_functions: ["basic::regex_parser_math_response"] +- dataset_id: simpleqa + scoring_functions: + - llm-as-judge::405b-simpleqa + metadata: {} + benchmark_id: meta-reference-simpleqa +- dataset_id: mmlu_cot + scoring_functions: + - 
basic::regex_parser_multiple_choice_answer + metadata: {} + benchmark_id: meta-reference-mmlu-cot +- dataset_id: gpqa_cot + scoring_functions: + - basic::regex_parser_multiple_choice_answer + metadata: {} + benchmark_id: meta-reference-gpqa-cot +- dataset_id: math_500 + scoring_functions: + - basic::regex_parser_math_response + metadata: {} + benchmark_id: meta-reference-math-500 tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/template.py b/llama_stack/templates/template.py index a7b862396..aa1ce144f 100644 --- a/llama_stack/templates/template.py +++ b/llama_stack/templates/template.py @@ -14,7 +14,9 @@ from pydantic import BaseModel, Field from llama_stack.apis.models.models import ModelType from llama_stack.distribution.datatypes import ( Api, + BenchmarkInput, BuildConfig, + DatasetInput, DistributionSpec, ModelInput, Provider, @@ -56,6 +58,8 @@ class RunConfigSettings(BaseModel): default_models: Optional[List[ModelInput]] = None default_shields: Optional[List[ShieldInput]] = None default_tool_groups: Optional[List[ToolGroupInput]] = None + default_datasets: Optional[List[DatasetInput]] = None + default_benchmarks: Optional[List[BenchmarkInput]] = None def run_config( self, @@ -113,6 +117,8 @@ class RunConfigSettings(BaseModel): models=self.default_models or [], shields=self.default_shields or [], tool_groups=self.default_tool_groups or [], + datasets=self.default_datasets or [], + benchmarks=self.default_benchmarks or [], ) diff --git a/llama_stack/templates/together/run-with-safety.yaml b/llama_stack/templates/together/run-with-safety.yaml index fd74f80c3..3a7d3dfba 100644 --- a/llama_stack/templates/together/run-with-safety.yaml +++ b/llama_stack/templates/together/run-with-safety.yaml @@ -16,7 +16,7 @@ providers: provider_type: remote::together config: url: https://api.together.xyz/v1 - api_key: ${env.TOGETHER_API_KEY} + api_key: ${env.TOGETHER_API_KEY:} - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml index 9a717290a..10668914a 100644 --- a/llama_stack/templates/together/run.yaml +++ b/llama_stack/templates/together/run.yaml @@ -16,7 +16,7 @@ providers: provider_type: remote::together config: url: https://api.together.xyz/v1 - api_key: ${env.TOGETHER_API_KEY} + api_key: ${env.TOGETHER_API_KEY:} - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} From 90ca4d94dea4f562249983b3854f094704356b76 Mon Sep 17 00:00:00 2001 From: Botao Chen Date: Wed, 12 Mar 2025 11:16:17 -0700 Subject: [PATCH 022/557] fix: fix passthrough inference provider to make it work for agent (#1577) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What does this PR do? We noticed that the passthrough inference provider doesn't work agent due to the type mis-match between client and server. We manually cast the llama stack client type to llama stack server type to fix the issue. 
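As a rough illustration of the approach (not the exact helpers used in this patch), the cast boils down to dumping the client-side pydantic object to plain JSON-compatible values, dropping `None` fields, and re-validating against the server-side model. The `ClientChatResponse`/`ServerChatResponse` names implied below are hypothetical stand-ins.

```python
# Rough sketch of the client->server cast, assuming pydantic v2 models.
# The client/server response classes are hypothetical stand-ins for the
# llama-stack-client and llama-stack server types.
from typing import Any, Dict, Type, TypeVar

from pydantic import BaseModel

T = TypeVar("T", bound=BaseModel)


def cast_to_server_type(client_obj: BaseModel, server_cls: Type[T]) -> T:
    # Dump to plain JSON-compatible values, dropping None fields so the
    # server-side validators can apply their own defaults.
    payload: Dict[str, Any] = client_obj.model_dump(mode="json", exclude_none=True)
    # Re-validate the payload against the server-side schema.
    return server_cls.model_validate(payload)
```

In the change below, this round-trip is performed with `convert_pydantic_to_json_value` and `convert_to_pydantic` from the library client utilities.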
## test run `python -m examples.agents.hello localhost 8321` within llama-stack-apps Screenshot 2025-03-11 at 8 43 44 PM fix https://github.com/meta-llama/llama-stack/issues/1560 --- .../inference/passthrough/passthrough.py | 84 ++++++++++++++++--- 1 file changed, 71 insertions(+), 13 deletions(-) diff --git a/llama_stack/providers/remote/inference/passthrough/passthrough.py b/llama_stack/providers/remote/inference/passthrough/passthrough.py index aa8a87bf7..8f3a0d147 100644 --- a/llama_stack/providers/remote/inference/passthrough/passthrough.py +++ b/llama_stack/providers/remote/inference/passthrough/passthrough.py @@ -4,12 +4,14 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import AsyncGenerator, List, Optional +from typing import Any, AsyncGenerator, Dict, List, Optional -from llama_stack_client import LlamaStackClient +from llama_stack_client import AsyncLlamaStackClient from llama_stack.apis.common.content_types import InterleavedContent from llama_stack.apis.inference import ( + ChatCompletionResponse, + ChatCompletionResponseStreamChunk, EmbeddingsResponse, EmbeddingTaskType, Inference, @@ -24,6 +26,7 @@ from llama_stack.apis.inference import ( ToolPromptFormat, ) from llama_stack.apis.models import Model +from llama_stack.distribution.library_client import convert_pydantic_to_json_value, convert_to_pydantic from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper from .config import PassthroughImplConfig @@ -46,7 +49,7 @@ class PassthroughInferenceAdapter(Inference): async def register_model(self, model: Model) -> Model: return model - def _get_client(self) -> LlamaStackClient: + def _get_client(self) -> AsyncLlamaStackClient: passthrough_url = None passthrough_api_key = None provider_data = None @@ -71,7 +74,7 @@ class PassthroughInferenceAdapter(Inference): ) passthrough_api_key = provider_data.passthrough_api_key - return LlamaStackClient( + return AsyncLlamaStackClient( base_url=passthrough_url, api_key=passthrough_api_key, provider_data=provider_data, @@ -91,7 +94,7 @@ class PassthroughInferenceAdapter(Inference): client = self._get_client() model = await self.model_store.get_model(model_id) - params = { + request_params = { "model_id": model.provider_resource_id, "content": content, "sampling_params": sampling_params, @@ -100,10 +103,13 @@ class PassthroughInferenceAdapter(Inference): "logprobs": logprobs, } - params = {key: value for key, value in params.items() if value is not None} + request_params = {key: value for key, value in request_params.items() if value is not None} + + # cast everything to json dict + json_params = self.cast_value_to_json_dict(request_params) # only pass through the not None params - return client.inference.completion(**params) + return await client.inference.completion(**json_params) async def chat_completion( self, @@ -120,10 +126,14 @@ class PassthroughInferenceAdapter(Inference): ) -> AsyncGenerator: if sampling_params is None: sampling_params = SamplingParams() - client = self._get_client() model = await self.model_store.get_model(model_id) - params = { + # TODO: revisit this remove tool_calls from messages logic + for message in messages: + if hasattr(message, "tool_calls"): + message.tool_calls = None + + request_params = { "model_id": model.provider_resource_id, "messages": messages, "sampling_params": sampling_params, @@ -135,10 +145,39 @@ class PassthroughInferenceAdapter(Inference): "logprobs": logprobs, } - params = 
{key: value for key, value in params.items() if value is not None} - # only pass through the not None params - return client.inference.chat_completion(**params) + request_params = {key: value for key, value in request_params.items() if value is not None} + + # cast everything to json dict + json_params = self.cast_value_to_json_dict(request_params) + + if stream: + return self._stream_chat_completion(json_params) + else: + return await self._nonstream_chat_completion(json_params) + + async def _nonstream_chat_completion(self, json_params: Dict[str, Any]) -> ChatCompletionResponse: + client = self._get_client() + response = await client.inference.chat_completion(**json_params) + + response = response.to_dict() + + # temporary hack to remove the metrics from the response + response["metrics"] = [] + + return convert_to_pydantic(ChatCompletionResponse, response) + + async def _stream_chat_completion(self, json_params: Dict[str, Any]) -> AsyncGenerator: + client = self._get_client() + stream_response = await client.inference.chat_completion(**json_params) + + async for chunk in stream_response: + chunk = chunk.to_dict() + + # temporary hack to remove the metrics from the response + chunk["metrics"] = [] + chunk = convert_to_pydantic(ChatCompletionResponseStreamChunk, chunk) + yield chunk async def embeddings( self, @@ -151,10 +190,29 @@ class PassthroughInferenceAdapter(Inference): client = self._get_client() model = await self.model_store.get_model(model_id) - return client.inference.embeddings( + return await client.inference.embeddings( model_id=model.provider_resource_id, contents=contents, text_truncation=text_truncation, output_dimension=output_dimension, task_type=task_type, ) + + def cast_value_to_json_dict(self, request_params: Dict[str, Any]) -> Dict[str, Any]: + json_params = {} + for key, value in request_params.items(): + json_input = convert_pydantic_to_json_value(value) + if isinstance(json_input, dict): + json_input = {k: v for k, v in json_input.items() if v is not None} + elif isinstance(json_input, list): + json_input = [x for x in json_input if x is not None] + new_input = [] + for x in json_input: + if isinstance(x, dict): + x = {k: v for k, v in x.items() if v is not None} + new_input.append(x) + json_input = new_input + + json_params[key] = json_input + + return json_params From c7139b0b6792e5c3b30d245b05c3870554c7e758 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Mar 2025 11:59:21 -0700 Subject: [PATCH 023/557] fix: fix precommit (#1594) # What does this PR do? - fix precommit [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan CI [//]: # (## Documentation) --- .../open-benchmark/open_benchmark.py | 29 ++++++++++++------- llama_stack/templates/template.py | 6 ++-- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/llama_stack/templates/open-benchmark/open_benchmark.py b/llama_stack/templates/open-benchmark/open_benchmark.py index 7df33a715..2b40797f9 100644 --- a/llama_stack/templates/open-benchmark/open_benchmark.py +++ b/llama_stack/templates/open-benchmark/open_benchmark.py @@ -4,8 +4,9 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from typing import List, Tuple +from typing import Dict, List, Tuple +from llama_stack.apis.common.content_types import URL from llama_stack.apis.models.models import ModelType from llama_stack.distribution.datatypes import ( BenchmarkInput, @@ -15,21 +16,27 @@ from llama_stack.distribution.datatypes import ( ShieldInput, ToolGroupInput, ) -from llama_stack.providers.inline.vector_io.sqlite_vec.config import SQLiteVectorIOConfig +from llama_stack.providers.inline.vector_io.sqlite_vec.config import ( + SQLiteVectorIOConfig, +) from llama_stack.providers.remote.inference.anthropic.config import AnthropicConfig from llama_stack.providers.remote.inference.gemini.config import GeminiConfig from llama_stack.providers.remote.inference.groq.config import GroqConfig from llama_stack.providers.remote.inference.openai.config import OpenAIConfig from llama_stack.providers.remote.inference.together.config import TogetherImplConfig from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig -from llama_stack.providers.remote.vector_io.pgvector.config import PGVectorVectorIOConfig -from llama_stack.providers.utils.inference.model_registry import ( - ProviderModelEntry, +from llama_stack.providers.remote.vector_io.pgvector.config import ( + PGVectorVectorIOConfig, +) +from llama_stack.providers.utils.inference.model_registry import ProviderModelEntry +from llama_stack.templates.template import ( + DistributionTemplate, + RunConfigSettings, + get_model_registry, ) -from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry -def get_inference_providers() -> Tuple[List[Provider], List[ModelInput]]: +def get_inference_providers() -> Tuple[List[Provider], Dict[str, List[ProviderModelEntry]]]: # in this template, we allow each API key to be optional providers = [ ( @@ -164,7 +171,7 @@ def get_distribution_template() -> DistributionTemplate: DatasetInput( dataset_id="simpleqa", provider_id="huggingface", - url={"uri": "https://huggingface.co/datasets/llamastack/simpleqa"}, + url=URL(uri="https://huggingface.co/datasets/llamastack/simpleqa"), metadata={ "path": "llamastack/simpleqa", "split": "train", @@ -178,7 +185,7 @@ def get_distribution_template() -> DistributionTemplate: DatasetInput( dataset_id="mmlu_cot", provider_id="huggingface", - url={"uri": "https://huggingface.co/datasets/llamastack/mmlu_cot"}, + url=URL(uri="https://huggingface.co/datasets/llamastack/mmlu_cot"), metadata={ "path": "llamastack/mmlu_cot", "name": "all", @@ -193,7 +200,7 @@ def get_distribution_template() -> DistributionTemplate: DatasetInput( dataset_id="gpqa_cot", provider_id="huggingface", - url={"uri": "https://huggingface.co/datasets/llamastack/gpqa_0shot_cot"}, + url=URL(uri="https://huggingface.co/datasets/llamastack/gpqa_0shot_cot"), metadata={ "path": "llamastack/gpqa_0shot_cot", "name": "gpqa_main", @@ -208,7 +215,7 @@ def get_distribution_template() -> DistributionTemplate: DatasetInput( dataset_id="math_500", provider_id="huggingface", - url={"uri": "https://huggingface.co/datasets/llamastack/math_500"}, + url=URL(uri="https://huggingface.co/datasets/llamastack/math_500"), metadata={ "path": "llamastack/math_500", "split": "test", diff --git a/llama_stack/templates/template.py b/llama_stack/templates/template.py index aa1ce144f..a5c8e80bc 100644 --- a/llama_stack/templates/template.py +++ b/llama_stack/templates/template.py @@ -30,7 +30,9 @@ from llama_stack.providers.utils.inference.model_registry import ProviderModelEn from 
llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig -def get_model_registry(available_models: Dict[str, List[ProviderModelEntry]]) -> List[ModelInput]: +def get_model_registry( + available_models: Dict[str, List[ProviderModelEntry]], +) -> List[ModelInput]: models = [] for provider_id, entries in available_models.items(): for entry in entries: @@ -193,7 +195,7 @@ class DistributionTemplate(BaseModel): default_models.append( DefaultModel( model_id=model_entry.provider_model_id, - doc_string=f"({' -- '.join(doc_parts)})" if doc_parts else "", + doc_string=(f"({' -- '.join(doc_parts)})" if doc_parts else ""), ) ) From 58d08d100ef807ddd663cfaae18be383aa911ae5 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Wed, 12 Mar 2025 12:01:03 -0700 Subject: [PATCH 024/557] feat: Add back inference metrics and preserve context variables across asyncio boundary (#1552) # What does this PR do? This PR adds back the changes in #1300 which were reverted in #1476 . It also adds logic to preserve context variables across asyncio boundary. this is needed with the library client since the async generator logic yields control to code outside the event loop, and on resuming, does not have the same context as before and this requires preserving the context vars. address #1477 ## Test Plan ``` curl --request POST \ --url http://localhost:8321/v1/inference/chat-completion \ --header 'content-type: application/json' \ --data '{ "model_id": "meta-llama/Llama-3.1-70B-Instruct", "messages": [ { "role": "user", "content": { "type": "text", "text": "where do humans live" } } ], "stream": false }' | jq . { "metrics": [ { "trace_id": "kCZwO3tyQC-FuAGb", "span_id": "bsP_5a5O", "timestamp": "2025-03-11T16:47:38.549084Z", "attributes": { "model_id": "meta-llama/Llama-3.1-70B-Instruct", "provider_id": "fireworks" }, "type": "metric", "metric": "prompt_tokens", "value": 10, "unit": "tokens" }, { "trace_id": "kCZwO3tyQC-FuAGb", "span_id": "bsP_5a5O", "timestamp": "2025-03-11T16:47:38.549449Z", "attributes": { "model_id": "meta-llama/Llama-3.1-70B-Instruct", "provider_id": "fireworks" }, "type": "metric", "metric": "completion_tokens", "value": 369, "unit": "tokens" }, { "trace_id": "kCZwO3tyQC-FuAGb", "span_id": "bsP_5a5O", "timestamp": "2025-03-11T16:47:38.549457Z", "attributes": { "model_id": "meta-llama/Llama-3.1-70B-Instruct", "provider_id": "fireworks" }, "type": "metric", "metric": "total_tokens", "value": 379, "unit": "tokens" } ], "completion_message": { "role": "assistant", "content": "Humans live on the planet Earth, specifically on its landmasses and in its oceans. Here's a breakdown of where humans live:\n\n1. **Continents:** Humans inhabit all seven continents:\n\t* Africa\n\t* Antarctica ( temporary residents, mostly scientists and researchers)\n\t* Asia\n\t* Australia\n\t* Europe\n\t* North America\n\t* South America\n2. **Countries:** There are 196 countries recognized by the United Nations, and humans live in almost all of them.\n3. **Cities and towns:** Many humans live in urban areas, such as cities and towns, which are often located near coastlines, rivers, or other bodies of water.\n4. **Rural areas:** Some humans live in rural areas, such as villages, farms, and countryside.\n5. **Islands:** Humans inhabit many islands around the world, including those in the Pacific, Indian, and Atlantic Oceans.\n6. **Mountains and highlands:** Humans live in mountainous regions, such as the Himalayas, the Andes, and the Rocky Mountains.\n7. 
**Deserts:** Some humans live in desert regions, such as the Sahara, the Mojave, and the Atacama.\n8. **Coastal areas:** Many humans live in coastal areas, such as beaches, ports, and coastal cities.\n9. **Underwater habitats:** A few humans live in underwater habitats, such as research stations and submarines.\n10. **Space:** A small number of humans have lived in space, including astronauts on the International Space Station and those who have visited the Moon.\n\nOverall, humans can be found living in almost every environment on Earth, from the frozen tundra to the hottest deserts, and from the highest mountains to the deepest oceans.", "stop_reason": "end_of_turn", "tool_calls": [] }, "logprobs": null } ``` Orignal repro no longer showing any error: ``` LLAMA_STACK_DISABLE_VERSION_CHECK=true llama stack run ~/.llama/distributions/fireworks/fireworks-run.yaml python -m examples.agents.e2e_loop_with_client_tools localhost 8321 ``` client logs: https://gist.github.com/dineshyv/047c7e87b18a5792aa660e311ea53166 server logs: https://gist.github.com/dineshyv/97a2174099619e9916c7c490be26e559 --- llama_stack/apis/inference/inference.py | 8 +- llama_stack/distribution/library_client.py | 8 +- llama_stack/distribution/request_headers.py | 37 +--- llama_stack/distribution/resolver.py | 4 +- llama_stack/distribution/routers/__init__.py | 12 +- llama_stack/distribution/routers/routers.py | 163 +++++++++++++++++- llama_stack/distribution/server/server.py | 8 +- llama_stack/distribution/utils/context.py | 33 ++++ .../distribution/utils/tests/test_context.py | 155 +++++++++++++++++ .../telemetry/meta_reference/telemetry.py | 3 + 10 files changed, 380 insertions(+), 51 deletions(-) create mode 100644 llama_stack/distribution/utils/context.py create mode 100644 llama_stack/distribution/utils/tests/test_context.py diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py index d0f5d15c5..fa917ac22 100644 --- a/llama_stack/apis/inference/inference.py +++ b/llama_stack/apis/inference/inference.py @@ -285,7 +285,7 @@ class CompletionRequest(BaseModel): @json_schema_type -class CompletionResponse(BaseModel): +class CompletionResponse(MetricResponseMixin): """Response from a completion request. :param content: The generated completion text @@ -299,7 +299,7 @@ class CompletionResponse(BaseModel): @json_schema_type -class CompletionResponseStreamChunk(BaseModel): +class CompletionResponseStreamChunk(MetricResponseMixin): """A chunk of a streamed completion response. :param delta: New content generated since last chunk. This can be one or more tokens. @@ -368,7 +368,7 @@ class ChatCompletionRequest(BaseModel): @json_schema_type -class ChatCompletionResponseStreamChunk(MetricResponseMixin, BaseModel): +class ChatCompletionResponseStreamChunk(MetricResponseMixin): """A chunk of a streamed chat completion response. :param event: The event containing the new content @@ -378,7 +378,7 @@ class ChatCompletionResponseStreamChunk(MetricResponseMixin, BaseModel): @json_schema_type -class ChatCompletionResponse(MetricResponseMixin, BaseModel): +class ChatCompletionResponse(MetricResponseMixin): """Response from a chat completion request. 
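For reference, the context-preservation helper added in `llama_stack/distribution/utils/context.py` can be sketched as follows; this is a simplified illustration assuming each `ContextVar` defines a default, not the exact implementation.

```python
# Simplified sketch (not the exact implementation) of preserving ContextVar
# values across async-generator iterations. Assumes each ContextVar was
# created with a default so .get() never raises LookupError.
from contextvars import ContextVar
from typing import AsyncGenerator, List, TypeVar

T = TypeVar("T")


def preserve_contexts_async_generator(
    gen: AsyncGenerator[T, None], context_vars: List[ContextVar]
) -> AsyncGenerator[T, None]:
    # Capture the values that are visible at wrap time.
    captured = [(cv, cv.get()) for cv in context_vars]

    async def wrapper() -> AsyncGenerator[T, None]:
        while True:
            # Re-apply the captured values before resuming the generator,
            # since each resumption may run under a different context.
            for cv, value in captured:
                cv.set(value)
            try:
                yield await gen.__anext__()
            except StopAsyncIteration:
                break

    return wrapper()
```

The library client change below wraps its streaming generator with this helper so that both the trace context and the provider-data context survive each resumption.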
:param completion_message: The complete response message diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py index 5dc70bb67..15c4fe6ea 100644 --- a/llama_stack/distribution/library_client.py +++ b/llama_stack/distribution/library_client.py @@ -33,7 +33,7 @@ from llama_stack.distribution.build import print_pip_install_help from llama_stack.distribution.configure import parse_and_maybe_upgrade_config from llama_stack.distribution.datatypes import Api from llama_stack.distribution.request_headers import ( - preserve_headers_context_async_generator, + PROVIDER_DATA_VAR, request_provider_data_context, ) from llama_stack.distribution.resolver import ProviderRegistry @@ -44,8 +44,10 @@ from llama_stack.distribution.stack import ( redact_sensitive_fields, replace_env_vars, ) +from llama_stack.distribution.utils.context import preserve_contexts_async_generator from llama_stack.distribution.utils.exec import in_notebook from llama_stack.providers.utils.telemetry.tracing import ( + CURRENT_TRACE_CONTEXT, end_trace, setup_logger, start_trace, @@ -384,8 +386,8 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient): finally: await end_trace() - # Wrap the generator to preserve context across iterations - wrapped_gen = preserve_headers_context_async_generator(gen()) + wrapped_gen = preserve_contexts_async_generator(gen(), [CURRENT_TRACE_CONTEXT, PROVIDER_DATA_VAR]) + mock_response = httpx.Response( status_code=httpx.codes.OK, content=wrapped_gen, diff --git a/llama_stack/distribution/request_headers.py b/llama_stack/distribution/request_headers.py index 19afae59b..8709fc040 100644 --- a/llama_stack/distribution/request_headers.py +++ b/llama_stack/distribution/request_headers.py @@ -7,14 +7,14 @@ import contextvars import json import logging -from typing import Any, AsyncGenerator, ContextManager, Dict, Optional, TypeVar +from typing import Any, ContextManager, Dict, Optional from .utils.dynamic import instantiate_class_type log = logging.getLogger(__name__) # Context variable for request provider data -_provider_data_var = contextvars.ContextVar("provider_data", default=None) +PROVIDER_DATA_VAR = contextvars.ContextVar("provider_data", default=None) class RequestProviderDataContext(ContextManager): @@ -26,40 +26,13 @@ class RequestProviderDataContext(ContextManager): def __enter__(self): # Save the current value and set the new one - self.token = _provider_data_var.set(self.provider_data) + self.token = PROVIDER_DATA_VAR.set(self.provider_data) return self def __exit__(self, exc_type, exc_val, exc_tb): # Restore the previous value if self.token is not None: - _provider_data_var.reset(self.token) - - -T = TypeVar("T") - - -def preserve_headers_context_async_generator(gen: AsyncGenerator[T, None]) -> AsyncGenerator[T, None]: - """ - Wraps an async generator to preserve request headers context variables across iterations. - - This ensures that context variables set during generator creation are - available during each iteration of the generator, even if the original - context manager has exited. 
- """ - # Capture the current context value right now - context_value = _provider_data_var.get() - - async def wrapper(): - while True: - # Set context before each anext() call - _ = _provider_data_var.set(context_value) - try: - item = await gen.__anext__() - yield item - except StopAsyncIteration: - break - - return wrapper() + PROVIDER_DATA_VAR.reset(self.token) class NeedsRequestProviderData: @@ -72,7 +45,7 @@ class NeedsRequestProviderData: if not validator_class: raise ValueError(f"Provider {provider_type} does not have a validator") - val = _provider_data_var.get() + val = PROVIDER_DATA_VAR.get() if not val: return None diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py index d7ca4414d..ab075f399 100644 --- a/llama_stack/distribution/resolver.py +++ b/llama_stack/distribution/resolver.py @@ -165,7 +165,9 @@ def specs_for_autorouted_apis(apis_to_serve: List[str] | Set[str]) -> Dict[str, module="llama_stack.distribution.routers", routing_table_api=info.routing_table_api, api_dependencies=[info.routing_table_api], - deps__=[info.routing_table_api.value], + # Add telemetry as an optional dependency to all auto-routed providers + optional_api_dependencies=[Api.telemetry], + deps__=([info.routing_table_api.value, Api.telemetry.value]), ), ) } diff --git a/llama_stack/distribution/routers/__init__.py b/llama_stack/distribution/routers/__init__.py index a54f57fb3..d0fca8771 100644 --- a/llama_stack/distribution/routers/__init__.py +++ b/llama_stack/distribution/routers/__init__.py @@ -45,7 +45,7 @@ async def get_routing_table_impl( return impl -async def get_auto_router_impl(api: Api, routing_table: RoutingTable, _deps) -> Any: +async def get_auto_router_impl(api: Api, routing_table: RoutingTable, deps: Dict[str, Any]) -> Any: from .routers import ( DatasetIORouter, EvalRouter, @@ -65,9 +65,17 @@ async def get_auto_router_impl(api: Api, routing_table: RoutingTable, _deps) -> "eval": EvalRouter, "tool_runtime": ToolRuntimeRouter, } + api_to_deps = { + "inference": {"telemetry": Api.telemetry}, + } if api.value not in api_to_routers: raise ValueError(f"API {api.value} not found in router map") - impl = api_to_routers[api.value](routing_table) + api_to_dep_impl = {} + for dep_name, dep_api in api_to_deps.get(api.value, {}).items(): + if dep_api in deps: + api_to_dep_impl[dep_name] = deps[dep_api] + + impl = api_to_routers[api.value](routing_table, **api_to_dep_impl) await impl.initialize() return impl diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index 28df67922..68b8e55cb 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -4,7 +4,8 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from typing import Any, AsyncGenerator, Dict, List, Optional +import time +from typing import Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Union from llama_stack.apis.common.content_types import ( URL, @@ -20,6 +21,10 @@ from llama_stack.apis.eval import ( JobStatus, ) from llama_stack.apis.inference import ( + ChatCompletionResponse, + ChatCompletionResponseEventType, + ChatCompletionResponseStreamChunk, + CompletionMessage, EmbeddingsResponse, EmbeddingTaskType, Inference, @@ -27,13 +32,14 @@ from llama_stack.apis.inference import ( Message, ResponseFormat, SamplingParams, + StopReason, TextTruncation, ToolChoice, ToolConfig, ToolDefinition, ToolPromptFormat, ) -from llama_stack.apis.models import ModelType +from llama_stack.apis.models import Model, ModelType from llama_stack.apis.safety import RunShieldResponse, Safety from llama_stack.apis.scoring import ( ScoreBatchResponse, @@ -42,6 +48,7 @@ from llama_stack.apis.scoring import ( ScoringFnParams, ) from llama_stack.apis.shields import Shield +from llama_stack.apis.telemetry import MetricEvent, Telemetry from llama_stack.apis.tools import ( RAGDocument, RAGQueryConfig, @@ -52,7 +59,10 @@ from llama_stack.apis.tools import ( ) from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO from llama_stack.log import get_logger +from llama_stack.models.llama.llama3.chat_format import ChatFormat +from llama_stack.models.llama.llama3.tokenizer import Tokenizer from llama_stack.providers.datatypes import RoutingTable +from llama_stack.providers.utils.telemetry.tracing import get_current_span logger = get_logger(name=__name__, category="core") @@ -119,9 +129,14 @@ class InferenceRouter(Inference): def __init__( self, routing_table: RoutingTable, + telemetry: Optional[Telemetry] = None, ) -> None: logger.debug("Initializing InferenceRouter") self.routing_table = routing_table + self.telemetry = telemetry + if self.telemetry: + self.tokenizer = Tokenizer.get_instance() + self.formatter = ChatFormat(self.tokenizer) async def initialize(self) -> None: logger.debug("InferenceRouter.initialize") @@ -144,6 +159,71 @@ class InferenceRouter(Inference): ) await self.routing_table.register_model(model_id, provider_model_id, provider_id, metadata, model_type) + def _construct_metrics( + self, prompt_tokens: int, completion_tokens: int, total_tokens: int, model: Model + ) -> List[MetricEvent]: + """Constructs a list of MetricEvent objects containing token usage metrics. 
+ + Args: + prompt_tokens: Number of tokens in the prompt + completion_tokens: Number of tokens in the completion + total_tokens: Total number of tokens used + model: Model object containing model_id and provider_id + + Returns: + List of MetricEvent objects with token usage metrics + """ + span = get_current_span() + if span is None: + logger.warning("No span found for token usage metrics") + return [] + metrics = [ + ("prompt_tokens", prompt_tokens), + ("completion_tokens", completion_tokens), + ("total_tokens", total_tokens), + ] + metric_events = [] + for metric_name, value in metrics: + metric_events.append( + MetricEvent( + trace_id=span.trace_id, + span_id=span.span_id, + metric=metric_name, + value=value, + timestamp=time.time(), + unit="tokens", + attributes={ + "model_id": model.model_id, + "provider_id": model.provider_id, + }, + ) + ) + return metric_events + + async def _compute_and_log_token_usage( + self, + prompt_tokens: int, + completion_tokens: int, + total_tokens: int, + model: Model, + ) -> List[MetricEvent]: + metrics = self._construct_metrics(prompt_tokens, completion_tokens, total_tokens, model) + if self.telemetry: + for metric in metrics: + await self.telemetry.log_event(metric) + return metrics + + async def _count_tokens( + self, + messages: List[Message] | InterleavedContent, + tool_prompt_format: Optional[ToolPromptFormat] = None, + ) -> Optional[int]: + if isinstance(messages, list): + encoded = self.formatter.encode_dialog_prompt(messages, tool_prompt_format) + else: + encoded = self.formatter.encode_content(messages) + return len(encoded.tokens) if encoded and encoded.tokens else 0 + async def chat_completion( self, model_id: str, @@ -156,8 +236,9 @@ class InferenceRouter(Inference): stream: Optional[bool] = False, logprobs: Optional[LogProbConfig] = None, tool_config: Optional[ToolConfig] = None, - ) -> AsyncGenerator: + ) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]: logger.debug( + "core", f"InferenceRouter.chat_completion: {model_id=}, {stream=}, {messages=}, {tools=}, {tool_config=}, {response_format=}", ) if sampling_params is None: @@ -206,10 +287,47 @@ class InferenceRouter(Inference): tool_config=tool_config, ) provider = self.routing_table.get_provider_impl(model_id) + prompt_tokens = await self._count_tokens(messages, tool_config.tool_prompt_format) + if stream: - return (chunk async for chunk in await provider.chat_completion(**params)) + + async def stream_generator(): + completion_text = "" + async for chunk in await provider.chat_completion(**params): + if chunk.event.event_type == ChatCompletionResponseEventType.progress: + if chunk.event.delta.type == "text": + completion_text += chunk.event.delta.text + if chunk.event.event_type == ChatCompletionResponseEventType.complete: + completion_tokens = await self._count_tokens( + [CompletionMessage(content=completion_text, stop_reason=StopReason.end_of_turn)], + tool_config.tool_prompt_format, + ) + total_tokens = (prompt_tokens or 0) + (completion_tokens or 0) + metrics = await self._compute_and_log_token_usage( + prompt_tokens or 0, + completion_tokens or 0, + total_tokens, + model, + ) + chunk.metrics = metrics if chunk.metrics is None else chunk.metrics + metrics + yield chunk + + return stream_generator() else: - return await provider.chat_completion(**params) + response = await provider.chat_completion(**params) + completion_tokens = await self._count_tokens( + [response.completion_message], + tool_config.tool_prompt_format, + ) + total_tokens = 
(prompt_tokens or 0) + (completion_tokens or 0) + metrics = await self._compute_and_log_token_usage( + prompt_tokens or 0, + completion_tokens or 0, + total_tokens, + model, + ) + response.metrics = metrics if response.metrics is None else response.metrics + metrics + return response async def completion( self, @@ -239,10 +357,41 @@ class InferenceRouter(Inference): stream=stream, logprobs=logprobs, ) + + prompt_tokens = await self._count_tokens(content) + if stream: - return (chunk async for chunk in await provider.completion(**params)) + + async def stream_generator(): + completion_text = "" + async for chunk in await provider.completion(**params): + if hasattr(chunk, "delta"): + completion_text += chunk.delta + if hasattr(chunk, "stop_reason") and chunk.stop_reason and self.telemetry: + completion_tokens = await self._count_tokens(completion_text) + total_tokens = (prompt_tokens or 0) + (completion_tokens or 0) + metrics = await self._compute_and_log_token_usage( + prompt_tokens or 0, + completion_tokens or 0, + total_tokens, + model, + ) + chunk.metrics = metrics if chunk.metrics is None else chunk.metrics + metrics + yield chunk + + return stream_generator() else: - return await provider.completion(**params) + response = await provider.completion(**params) + completion_tokens = await self._count_tokens(response.content) + total_tokens = (prompt_tokens or 0) + (completion_tokens or 0) + metrics = await self._compute_and_log_token_usage( + prompt_tokens or 0, + completion_tokens or 0, + total_tokens, + model, + ) + response.metrics = metrics if response.metrics is None else response.metrics + metrics + return response async def embeddings( self, diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index 2cc70a738..7ca009b13 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -28,7 +28,7 @@ from typing_extensions import Annotated from llama_stack.distribution.datatypes import StackRunConfig from llama_stack.distribution.distribution import builtin_automatically_routed_apis from llama_stack.distribution.request_headers import ( - preserve_headers_context_async_generator, + PROVIDER_DATA_VAR, request_provider_data_context, ) from llama_stack.distribution.resolver import InvalidProviderError @@ -38,6 +38,7 @@ from llama_stack.distribution.stack import ( replace_env_vars, validate_env_pair, ) +from llama_stack.distribution.utils.context import preserve_contexts_async_generator from llama_stack.log import get_logger from llama_stack.providers.datatypes import Api from llama_stack.providers.inline.telemetry.meta_reference.config import TelemetryConfig @@ -45,6 +46,7 @@ from llama_stack.providers.inline.telemetry.meta_reference.telemetry import ( TelemetryAdapter, ) from llama_stack.providers.utils.telemetry.tracing import ( + CURRENT_TRACE_CONTEXT, end_trace, setup_logger, start_trace, @@ -182,7 +184,9 @@ def create_dynamic_typed_route(func: Any, method: str, route: str): try: if is_streaming: - gen = preserve_headers_context_async_generator(sse_generator(func(**kwargs))) + gen = preserve_contexts_async_generator( + sse_generator(func(**kwargs)), [CURRENT_TRACE_CONTEXT, PROVIDER_DATA_VAR] + ) return StreamingResponse(gen, media_type="text/event-stream") else: value = func(**kwargs) diff --git a/llama_stack/distribution/utils/context.py b/llama_stack/distribution/utils/context.py new file mode 100644 index 000000000..107ce7127 --- /dev/null +++ b/llama_stack/distribution/utils/context.py @@ -0,0 
+1,33 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from contextvars import ContextVar +from typing import AsyncGenerator, List, TypeVar + +T = TypeVar("T") + + +def preserve_contexts_async_generator( + gen: AsyncGenerator[T, None], context_vars: List[ContextVar] +) -> AsyncGenerator[T, None]: + """ + Wraps an async generator to preserve context variables across iterations. + This is needed because we start a new asyncio event loop for each streaming request, + and we need to preserve the context across the event loop boundary. + """ + + async def wrapper(): + while True: + try: + item = await gen.__anext__() + context_values = {context_var.name: context_var.get() for context_var in context_vars} + yield item + for context_var in context_vars: + _ = context_var.set(context_values[context_var.name]) + except StopAsyncIteration: + break + + return wrapper() diff --git a/llama_stack/distribution/utils/tests/test_context.py b/llama_stack/distribution/utils/tests/test_context.py new file mode 100644 index 000000000..84944bfe8 --- /dev/null +++ b/llama_stack/distribution/utils/tests/test_context.py @@ -0,0 +1,155 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import asyncio +from concurrent.futures import ThreadPoolExecutor +from contextvars import ContextVar + +import pytest + +from llama_stack.distribution.utils.context import preserve_contexts_async_generator + + +@pytest.mark.asyncio +async def test_preserve_contexts_with_exception(): + # Create context variable + context_var = ContextVar("exception_var", default="initial") + token = context_var.set("start_value") + + # Create an async generator that raises an exception + async def exception_generator(): + yield context_var.get() + context_var.set("modified") + raise ValueError("Test exception") + yield None # This will never be reached + + # Wrap the generator + wrapped_gen = preserve_contexts_async_generator(exception_generator(), [context_var]) + + # First iteration should work + value = await wrapped_gen.__anext__() + assert value == "start_value" + + # Second iteration should raise the exception + with pytest.raises(ValueError, match="Test exception"): + await wrapped_gen.__anext__() + + # Clean up + context_var.reset(token) + + +@pytest.mark.asyncio +async def test_preserve_contexts_empty_generator(): + # Create context variable + context_var = ContextVar("empty_var", default="initial") + token = context_var.set("value") + + # Create an empty async generator + async def empty_generator(): + if False: # This condition ensures the generator yields nothing + yield None + + # Wrap the generator + wrapped_gen = preserve_contexts_async_generator(empty_generator(), [context_var]) + + # The generator should raise StopAsyncIteration immediately + with pytest.raises(StopAsyncIteration): + await wrapped_gen.__anext__() + + # Context variable should remain unchanged + assert context_var.get() == "value" + + # Clean up + context_var.reset(token) + + +@pytest.mark.asyncio +async def test_preserve_contexts_across_event_loops(): + """ + Test that context variables are preserved across event loop boundaries with nested generators. + This simulates the real-world scenario where: + 1. 
A new event loop is created for each streaming request + 2. The async generator runs inside that loop + 3. There are multiple levels of nested generators + 4. Context needs to be preserved across these boundaries + """ + # Create context variables + request_id = ContextVar("request_id", default=None) + user_id = ContextVar("user_id", default=None) + + # Set initial values + + # Results container to verify values across thread boundaries + results = [] + + # Inner-most generator (level 2) + async def inner_generator(): + # Should have the context from the outer scope + yield (1, request_id.get(), user_id.get()) + + # Modify one context variable + user_id.set("user-modified") + + # Should reflect the modification + yield (2, request_id.get(), user_id.get()) + + # Middle generator (level 1) + async def middle_generator(): + inner_gen = inner_generator() + + # Forward the first yield from inner + item = await inner_gen.__anext__() + yield item + + # Forward the second yield from inner + item = await inner_gen.__anext__() + yield item + + request_id.set("req-modified") + + # Add our own yield with both modified variables + yield (3, request_id.get(), user_id.get()) + + # Function to run in a separate thread with a new event loop + def run_in_new_loop(): + # Create a new event loop for this thread + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + try: + # Outer generator (runs in the new loop) + async def outer_generator(): + request_id.set("req-12345") + user_id.set("user-6789") + # Wrap the middle generator + wrapped_gen = preserve_contexts_async_generator(middle_generator(), [request_id, user_id]) + + # Process all items from the middle generator + async for item in wrapped_gen: + # Store results for verification + results.append(item) + + # Run the outer generator in the new loop + loop.run_until_complete(outer_generator()) + finally: + loop.close() + + # Run the generator chain in a separate thread with a new event loop + with ThreadPoolExecutor(max_workers=1) as executor: + future = executor.submit(run_in_new_loop) + future.result() # Wait for completion + + # Verify the results + assert len(results) == 3 + + # First yield should have original values + assert results[0] == (1, "req-12345", "user-6789") + + # Second yield should have modified user_id + assert results[1] == (2, "req-12345", "user-modified") + + # Third yield should have both modified values + assert results[2] == (3, "req-modified", "user-modified") diff --git a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py index e713a057f..4cdb420b2 100644 --- a/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py +++ b/llama_stack/providers/inline/telemetry/meta_reference/telemetry.py @@ -73,6 +73,7 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry): def __init__(self, config: TelemetryConfig, deps: Dict[str, Any]) -> None: self.config = config self.datasetio_api = deps.get(Api.datasetio) + self.meter = None resource = Resource.create( { @@ -171,6 +172,8 @@ class TelemetryAdapter(TelemetryDatasetMixin, Telemetry): return _GLOBAL_STORAGE["gauges"][name] def _log_metric(self, event: MetricEvent) -> None: + if self.meter is None: + return if isinstance(event.value, int): counter = self._get_or_create_counter(event.metric, event.unit) counter.add(event.value, attributes=event.attributes) From b7a9c454779979eaa89196901fc7856a5f10b900 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Wed, 12 Mar 2025 12:10:21 -0700 
Subject: [PATCH 025/557] chore: deprecate ToolResponseMessage in agent.resume API (#1566) # Summary: closes #1431 # Test Plan: LLAMA_STACK_CONFIG=fireworks pytest -s -v tests/integration/agents/test_agents.py --safety-shield meta-llama/Llama-Guard-3-8B --text-model meta-llama/Llama-3.1-8B-Instruct --- docs/_static/llama-stack-spec.html | 20 +++++-------------- docs/_static/llama-stack-spec.yaml | 13 ++++-------- llama_stack/apis/agents/agents.py | 5 ++--- .../agents/meta_reference/agent_instance.py | 18 +++++------------ .../inline/agents/meta_reference/agents.py | 2 +- 5 files changed, 17 insertions(+), 41 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index b0febbbef..709360ede 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -9490,21 +9490,11 @@ "type": "object", "properties": { "tool_responses": { - "oneOf": [ - { - "type": "array", - "items": { - "$ref": "#/components/schemas/ToolResponse" - } - }, - { - "type": "array", - "items": { - "$ref": "#/components/schemas/ToolResponseMessage" - } - } - ], - "description": "The tool call responses to resume the turn with. NOTE: ToolResponseMessage will be deprecated. Use ToolResponse." + "type": "array", + "items": { + "$ref": "#/components/schemas/ToolResponse" + }, + "description": "The tool call responses to resume the turn with." }, "stream": { "type": "boolean", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 2985e6222..4c00fbe63 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -6405,16 +6405,11 @@ components: type: object properties: tool_responses: - oneOf: - - type: array - items: - $ref: '#/components/schemas/ToolResponse' - - type: array - items: - $ref: '#/components/schemas/ToolResponseMessage' + type: array + items: + $ref: '#/components/schemas/ToolResponse' description: >- - The tool call responses to resume the turn with. NOTE: ToolResponseMessage - will be deprecated. Use ToolResponse. + The tool call responses to resume the turn with. stream: type: boolean description: Whether to stream the response. diff --git a/llama_stack/apis/agents/agents.py b/llama_stack/apis/agents/agents.py index 1170a56d5..5cc910a55 100644 --- a/llama_stack/apis/agents/agents.py +++ b/llama_stack/apis/agents/agents.py @@ -370,7 +370,7 @@ class AgentTurnResumeRequest(BaseModel): agent_id: str session_id: str turn_id: str - tool_responses: Union[List[ToolResponse], List[ToolResponseMessage]] + tool_responses: List[ToolResponse] stream: Optional[bool] = False @@ -449,7 +449,7 @@ class Agents(Protocol): agent_id: str, session_id: str, turn_id: str, - tool_responses: Union[List[ToolResponse], List[ToolResponseMessage]], + tool_responses: List[ToolResponse], stream: Optional[bool] = False, ) -> Union[Turn, AsyncIterator[AgentTurnResponseStreamChunk]]: """Resume an agent turn with executed tool call responses. @@ -460,7 +460,6 @@ class Agents(Protocol): :param session_id: The ID of the session to resume. :param turn_id: The ID of the turn to resume. :param tool_responses: The tool call responses to resume the turn with. - NOTE: ToolResponseMessage will be deprecated. Use ToolResponse. :param stream: Whether to stream the response. :returns: A Turn object if stream is False, otherwise an AsyncIterator of AgentTurnResponseStreamChunk objects. 
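With the `oneOf` removed from the spec and the Python signature narrowed, callers resume a turn by passing `ToolResponse` objects directly. The sketch below shows the call shape; the `resume_agent_turn` method name and the `ToolResponse` import path are assumptions inferred from the surrounding diff, not spelled out in this hunk.

```
# Sketch, not a verbatim client snippet: assumes ToolResponse lives in
# llama_stack.apis.inference and that the protocol method is resume_agent_turn.
from llama_stack.apis.inference import ToolResponse


async def finish_turn(agents_api, agent_id: str, session_id: str, turn_id: str):
    tool_responses = [
        ToolResponse(
            call_id="call-123",             # must match the call_id the model emitted
            tool_name="get_boiling_point",  # the client tool that was executed locally
            content="-100",                 # the tool's result
        )
    ]
    # ToolResponseMessage is no longer accepted here; pass ToolResponse only.
    return await agents_api.resume_agent_turn(
        agent_id=agent_id,
        session_id=session_id,
        turn_id=turn_id,
        tool_responses=tool_responses,
        stream=False,
    )
```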
""" diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index fedd695c1..1d9f54e96 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -218,18 +218,10 @@ class ChatAgent(ShieldRunnerMixin): steps = [] messages = await self.get_messages_from_turns(turns) if is_resume: - if isinstance(request.tool_responses[0], ToolResponseMessage): - tool_response_messages = request.tool_responses - tool_responses = [ - ToolResponse(call_id=x.call_id, tool_name=x.tool_name, content=x.content) - for x in request.tool_responses - ] - else: - tool_response_messages = [ - ToolResponseMessage(call_id=x.call_id, tool_name=x.tool_name, content=x.content) - for x in request.tool_responses - ] - tool_responses = request.tool_responses + tool_response_messages = [ + ToolResponseMessage(call_id=x.call_id, tool_name=x.tool_name, content=x.content) + for x in request.tool_responses + ] messages.extend(tool_response_messages) last_turn = turns[-1] last_turn_messages = self.turn_to_messages(last_turn) @@ -252,7 +244,7 @@ class ChatAgent(ShieldRunnerMixin): step_id=(in_progress_tool_call_step.step_id if in_progress_tool_call_step else str(uuid.uuid4())), turn_id=request.turn_id, tool_calls=(in_progress_tool_call_step.tool_calls if in_progress_tool_call_step else []), - tool_responses=tool_responses, + tool_responses=request.tool_responses, completed_at=now, started_at=(in_progress_tool_call_step.started_at if in_progress_tool_call_step else now), ) diff --git a/llama_stack/providers/inline/agents/meta_reference/agents.py b/llama_stack/providers/inline/agents/meta_reference/agents.py index c24b14e35..5ca123595 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agents.py +++ b/llama_stack/providers/inline/agents/meta_reference/agents.py @@ -172,7 +172,7 @@ class MetaReferenceAgentsImpl(Agents): agent_id: str, session_id: str, turn_id: str, - tool_responses: Union[List[ToolResponse], List[ToolResponseMessage]], + tool_responses: List[ToolResponse], stream: Optional[bool] = False, ) -> AsyncGenerator: request = AgentTurnResumeRequest( From 0fdb15bcc74f236cd4e6ac4291a361a08b6bf1b3 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Wed, 12 Mar 2025 13:26:23 -0700 Subject: [PATCH 026/557] fix: fix build error in context.py (#1595) # What does this PR do? 
This fixes the build error ## Test Plan pre-commit run --all-files check for merge conflicts................................................Passed trim trailing whitespace.................................................Passed check for added large files..............................................Passed fix end of files.........................................................Passed Insert license in comments...............................................Passed ruff.....................................................................Passed ruff-format..............................................................Passed blacken-docs.............................................................Passed uv-lock..................................................................Passed uv-export................................................................Passed mypy.....................................................................Passed Distribution Template Codegen............................................Passed --- llama_stack/distribution/utils/context.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_stack/distribution/utils/context.py b/llama_stack/distribution/utils/context.py index 107ce7127..2f32afba2 100644 --- a/llama_stack/distribution/utils/context.py +++ b/llama_stack/distribution/utils/context.py @@ -19,7 +19,7 @@ def preserve_contexts_async_generator( and we need to preserve the context across the event loop boundary. """ - async def wrapper(): + async def wrapper() -> AsyncGenerator[T, None]: while True: try: item = await gen.__anext__() From 1311faf3f5e7e18111b642be6bbdd941c2034e02 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Wed, 12 Mar 2025 14:57:31 -0700 Subject: [PATCH 027/557] fix: logging (#1598) Summary: Test Plan: --- llama_stack/distribution/routers/routers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index 68b8e55cb..34102d04b 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -238,7 +238,6 @@ class InferenceRouter(Inference): tool_config: Optional[ToolConfig] = None, ) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]: logger.debug( - "core", f"InferenceRouter.chat_completion: {model_id=}, {stream=}, {messages=}, {tools=}, {tool_config=}, {response_format=}", ) if sampling_params is None: From ad939c97c37f8e33d0e94fe43893773cffe7618e Mon Sep 17 00:00:00 2001 From: Nathan Weinberg <31703736+nathan-weinberg@users.noreply.github.com> Date: Wed, 12 Mar 2025 18:41:35 -0400 Subject: [PATCH 028/557] docs: add unit test badge to README (#1591) # What does this PR do? 
This PR adds a simple unit test badge to the project README It also modifies the workflow to run on merges to main, so that the status reflected in the README is that of main and not pull request branches --------- Signed-off-by: Nathan Weinberg --- .github/workflows/unit-tests.yml | 2 ++ README.md | 1 + 2 files changed, 3 insertions(+) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 39505ba11..59d18b3be 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -1,6 +1,8 @@ name: Unit Tests on: + push: + branches: [ main ] pull_request: branches: [ main ] workflow_dispatch: diff --git a/README.md b/README.md index b24e69514..6e1fd088e 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ [![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-stack)](https://pypi.org/project/llama-stack/) [![License](https://img.shields.io/pypi/l/llama_stack.svg)](https://github.com/meta-llama/llama-stack/blob/main/LICENSE) [![Discord](https://img.shields.io/discord/1257833999603335178)](https://discord.gg/llama-stack) +![Unit](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml/badge.svg?branch=main) [**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) From 99bbe0e70b125f93da659ca722a9d5c2f6ef7022 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Wed, 12 Mar 2025 15:45:44 -0700 Subject: [PATCH 029/557] feat: Add new compact MetricInResponse type (#1593) # What does this PR do? This change adds a compact type to include metrics in response as opposed to the full MetricEvent which is relevant for internal logging purposes. ## Test Plan ``` LLAMA_STACK_CONFIG=~/.llama/distributions/fireworks/fireworks-run.yaml pytest -s -v agents/test_agents.py --safety-shield meta-llama/Llama-Guard-3-8B --text-model meta-llama/Llama-3.1-8B-Instruct llama stack run ~/.llama/distributions/fireworks/fireworks-run.yaml curl --request POST \ --url http://localhost:8321/v1/inference/chat-completion \ --header 'content-type: application/json' \ --data '{ "model_id": "meta-llama/Llama-3.1-70B-Instruct", "messages": [ { "role": "user", "content": { "type": "text", "text": "where do humans live" } } ], "stream": false }' { "metrics": [ { "metric": "prompt_tokens", "value": 10, "unit": null }, { "metric": "completion_tokens", "value": 522, "unit": null }, { "metric": "total_tokens", "value": 532, "unit": null } ], "completion_message": { "role": "assistant", "content": "Humans live in various parts of the world...............", "stop_reason": "out_of_tokens", "tool_calls": [] }, "logprobs": null } ``` --- docs/_static/llama-stack-spec.html | 133 +++++++++++++------- docs/_static/llama-stack-spec.yaml | 82 +++++++----- llama_stack/apis/telemetry/telemetry.py | 9 +- llama_stack/distribution/routers/routers.py | 6 +- 4 files changed, 150 insertions(+), 80 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 709360ede..dbd530aa3 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -4549,7 +4549,7 @@ "metrics": { "type": "array", "items": { - "$ref": "#/components/schemas/MetricEvent" + "$ref": "#/components/schemas/MetricInResponse" } }, "completion_message": { @@ -4571,46 +4571,9 @@ "title": "ChatCompletionResponse", "description": "Response from a chat completion request." 
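The schema change that follows replaces the full `MetricEvent` entry in responses with the new compact `MetricInResponse`. In Python terms, what a client now sees on a response is just the measurement itself; a small sketch using the token counts from the curl example above (the constructor mirrors the model added in `llama_stack/apis/telemetry/telemetry.py` later in this patch).

```
# Compact metrics as returned to API clients: no trace_id/span_id/timestamp,
# just the metric name, value, and an optional unit.
from llama_stack.apis.telemetry import MetricInResponse

usage = [
    MetricInResponse(metric="prompt_tokens", value=10),
    MetricInResponse(metric="completion_tokens", value=522),
    MetricInResponse(metric="total_tokens", value=532),
]
print(sum(m.value for m in usage if m.metric != "total_tokens"))  # 532
```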
}, - "MetricEvent": { + "MetricInResponse": { "type": "object", "properties": { - "trace_id": { - "type": "string" - }, - "span_id": { - "type": "string" - }, - "timestamp": { - "type": "string", - "format": "date-time" - }, - "attributes": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "integer" - }, - { - "type": "number" - }, - { - "type": "boolean" - }, - { - "type": "null" - } - ] - } - }, - "type": { - "type": "string", - "const": "metric", - "default": "metric" - }, "metric": { "type": "string" }, @@ -4630,15 +4593,10 @@ }, "additionalProperties": false, "required": [ - "trace_id", - "span_id", - "timestamp", - "type", "metric", - "value", - "unit" + "value" ], - "title": "MetricEvent" + "title": "MetricInResponse" }, "TokenLogProbs": { "type": "object", @@ -4715,6 +4673,12 @@ "CompletionResponse": { "type": "object", "properties": { + "metrics": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MetricInResponse" + } + }, "content": { "type": "string", "description": "The generated completion text" @@ -4924,7 +4888,7 @@ "metrics": { "type": "array", "items": { - "$ref": "#/components/schemas/MetricEvent" + "$ref": "#/components/schemas/MetricInResponse" } }, "event": { @@ -5082,6 +5046,12 @@ "CompletionResponseStreamChunk": { "type": "object", "properties": { + "metrics": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MetricInResponse" + } + }, "delta": { "type": "string", "description": "New content generated since last chunk. This can be one or more tokens." @@ -8363,6 +8333,75 @@ ], "title": "LogSeverity" }, + "MetricEvent": { + "type": "object", + "properties": { + "trace_id": { + "type": "string" + }, + "span_id": { + "type": "string" + }, + "timestamp": { + "type": "string", + "format": "date-time" + }, + "attributes": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "null" + } + ] + } + }, + "type": { + "type": "string", + "const": "metric", + "default": "metric" + }, + "metric": { + "type": "string" + }, + "value": { + "oneOf": [ + { + "type": "integer" + }, + { + "type": "number" + } + ] + }, + "unit": { + "type": "string" + } + }, + "additionalProperties": false, + "required": [ + "trace_id", + "span_id", + "timestamp", + "type", + "metric", + "value", + "unit" + ], + "title": "MetricEvent" + }, "SpanEndPayload": { "type": "object", "properties": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 4c00fbe63..cca1872a4 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -3101,7 +3101,7 @@ components: metrics: type: array items: - $ref: '#/components/schemas/MetricEvent' + $ref: '#/components/schemas/MetricInResponse' completion_message: $ref: '#/components/schemas/CompletionMessage' description: The complete response message @@ -3116,29 +3116,9 @@ components: - completion_message title: ChatCompletionResponse description: Response from a chat completion request. 
- MetricEvent: + MetricInResponse: type: object properties: - trace_id: - type: string - span_id: - type: string - timestamp: - type: string - format: date-time - attributes: - type: object - additionalProperties: - oneOf: - - type: string - - type: integer - - type: number - - type: boolean - - type: 'null' - type: - type: string - const: metric - default: metric metric: type: string value: @@ -3149,14 +3129,9 @@ components: type: string additionalProperties: false required: - - trace_id - - span_id - - timestamp - - type - metric - value - - unit - title: MetricEvent + title: MetricInResponse TokenLogProbs: type: object properties: @@ -3213,6 +3188,10 @@ components: CompletionResponse: type: object properties: + metrics: + type: array + items: + $ref: '#/components/schemas/MetricInResponse' content: type: string description: The generated completion text @@ -3412,7 +3391,7 @@ components: metrics: type: array items: - $ref: '#/components/schemas/MetricEvent' + $ref: '#/components/schemas/MetricInResponse' event: $ref: '#/components/schemas/ChatCompletionResponseEvent' description: The event containing the new content @@ -3531,6 +3510,10 @@ components: CompletionResponseStreamChunk: type: object properties: + metrics: + type: array + items: + $ref: '#/components/schemas/MetricInResponse' delta: type: string description: >- @@ -5703,6 +5686,47 @@ components: - error - critical title: LogSeverity + MetricEvent: + type: object + properties: + trace_id: + type: string + span_id: + type: string + timestamp: + type: string + format: date-time + attributes: + type: object + additionalProperties: + oneOf: + - type: string + - type: integer + - type: number + - type: boolean + - type: 'null' + type: + type: string + const: metric + default: metric + metric: + type: string + value: + oneOf: + - type: integer + - type: number + unit: + type: string + additionalProperties: false + required: + - trace_id + - span_id + - timestamp + - type + - metric + - value + - unit + title: MetricEvent SpanEndPayload: type: object properties: diff --git a/llama_stack/apis/telemetry/telemetry.py b/llama_stack/apis/telemetry/telemetry.py index fe75677e7..cbea57e79 100644 --- a/llama_stack/apis/telemetry/telemetry.py +++ b/llama_stack/apis/telemetry/telemetry.py @@ -96,6 +96,13 @@ class MetricEvent(EventCommon): unit: str +@json_schema_type +class MetricInResponse(BaseModel): + metric: str + value: Union[int, float] + unit: Optional[str] = None + + # This is a short term solution to allow inference API to return metrics # The ideal way to do this is to have a way for all response types to include metrics # and all metric events logged to the telemetry API to be inlcuded with the response @@ -117,7 +124,7 @@ class MetricEvent(EventCommon): class MetricResponseMixin(BaseModel): - metrics: Optional[List[MetricEvent]] = None + metrics: Optional[List[MetricInResponse]] = None @json_schema_type diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index 34102d04b..22a1e46f9 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -48,7 +48,7 @@ from llama_stack.apis.scoring import ( ScoringFnParams, ) from llama_stack.apis.shields import Shield -from llama_stack.apis.telemetry import MetricEvent, Telemetry +from llama_stack.apis.telemetry import MetricEvent, MetricInResponse, Telemetry from llama_stack.apis.tools import ( RAGDocument, RAGQueryConfig, @@ -206,12 +206,12 @@ class InferenceRouter(Inference): completion_tokens: 
int, total_tokens: int, model: Model, - ) -> List[MetricEvent]: + ) -> List[MetricInResponse]: metrics = self._construct_metrics(prompt_tokens, completion_tokens, total_tokens, model) if self.telemetry: for metric in metrics: await self.telemetry.log_event(metric) - return metrics + return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics] async def _count_tokens( self, From 41c9bca1aa7a44cf2048b6c9371cd7740d2e47c1 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Wed, 12 Mar 2025 18:48:03 -0700 Subject: [PATCH 030/557] chore: refactor Agent toolgroup processing (#1381) Summary: Refactoring only. Centralize logic to preprocess toolgroup to one place. Test Plan: LLAMA_STACK_CONFIG=fireworks pytest -s -v tests/api/agents/test_agents.py --safety-shield meta-llama/Llama-Guard-3-8B --- [//]: # (BEGIN SAPLING FOOTER) Stack created with [Sapling](https://sapling-scm.com). Best reviewed with [ReviewStack](https://reviewstack.dev/meta-llama/llama-stack/pull/1381). * #1384 * __->__ #1381 --- .../agents/meta_reference/agent_instance.py | 120 ++++++++---------- 1 file changed, 55 insertions(+), 65 deletions(-) diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 1d9f54e96..1884094df 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -12,7 +12,7 @@ import secrets import string import uuid from datetime import datetime -from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union +from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union from urllib.parse import urlparse import httpx @@ -181,6 +181,7 @@ class ChatAgent(ShieldRunnerMixin): return messages async def create_and_execute_turn(self, request: AgentTurnCreateRequest) -> AsyncGenerator: + await self._initialize_tools(request.toolgroups) async with tracing.span("create_and_execute_turn") as span: span.set_attribute("session_id", request.session_id) span.set_attribute("agent_id", self.agent_id) @@ -191,6 +192,7 @@ class ChatAgent(ShieldRunnerMixin): yield chunk async def resume_turn(self, request: AgentTurnResumeRequest) -> AsyncGenerator: + await self._initialize_tools() async with tracing.span("resume_turn") as span: span.set_attribute("agent_id", self.agent_id) span.set_attribute("session_id", request.session_id) @@ -275,7 +277,6 @@ class ChatAgent(ShieldRunnerMixin): sampling_params=self.agent_config.sampling_params, stream=request.stream, documents=request.documents if not is_resume else None, - toolgroups_for_turn=request.toolgroups if not is_resume else None, ): if isinstance(chunk, CompletionMessage): output_message = chunk @@ -327,7 +328,6 @@ class ChatAgent(ShieldRunnerMixin): sampling_params: SamplingParams, stream: bool = False, documents: Optional[List[Document]] = None, - toolgroups_for_turn: Optional[List[AgentToolGroup]] = None, ) -> AsyncGenerator: # Doing async generators makes downstream code much simpler and everything amenable to # streaming. 
However, it also makes things complicated here because AsyncGenerators cannot @@ -350,7 +350,6 @@ class ChatAgent(ShieldRunnerMixin): sampling_params, stream, documents, - toolgroups_for_turn, ): if isinstance(res, bool): return @@ -451,30 +450,17 @@ class ChatAgent(ShieldRunnerMixin): sampling_params: SamplingParams, stream: bool = False, documents: Optional[List[Document]] = None, - toolgroups_for_turn: Optional[List[AgentToolGroup]] = None, ) -> AsyncGenerator: - # TODO: simplify all of this code, it can be simpler - toolgroup_args = {} - toolgroups = set() - for toolgroup in self.agent_config.toolgroups + (toolgroups_for_turn or []): - if isinstance(toolgroup, AgentToolGroupWithArgs): - tool_group_name, tool_name = self._parse_toolgroup_name(toolgroup.name) - toolgroups.add(tool_group_name) - toolgroup_args[tool_group_name] = toolgroup.args - else: - toolgroups.add(toolgroup) - - tool_defs, tool_to_group = await self._get_tool_defs(toolgroups_for_turn) if documents: - await self.handle_documents(session_id, documents, input_messages, tool_defs) + await self.handle_documents(session_id, documents, input_messages) session_info = await self.storage.get_session_info(session_id) # if the session has a memory bank id, let the memory tool use it if session_info and session_info.vector_db_id: - if RAG_TOOL_GROUP not in toolgroup_args: - toolgroup_args[RAG_TOOL_GROUP] = {"vector_db_ids": [session_info.vector_db_id]} + if RAG_TOOL_GROUP not in self.toolgroup_to_args: + self.toolgroup_to_args[RAG_TOOL_GROUP] = {"vector_db_ids": [session_info.vector_db_id]} else: - toolgroup_args[RAG_TOOL_GROUP]["vector_db_ids"].append(session_info.vector_db_id) + self.toolgroup_to_args[RAG_TOOL_GROUP]["vector_db_ids"].append(session_info.vector_db_id) output_attachments = [] @@ -504,7 +490,7 @@ class ChatAgent(ShieldRunnerMixin): async for chunk in await self.inference_api.chat_completion( self.agent_config.model, input_messages, - tools=tool_defs, + tools=self.tool_defs, tool_prompt_format=self.agent_config.tool_config.tool_prompt_format, response_format=self.agent_config.response_format, stream=True, @@ -686,12 +672,9 @@ class ChatAgent(ShieldRunnerMixin): ) as span: tool_execution_start_time = datetime.now().astimezone().isoformat() tool_call = message.tool_calls[0] - tool_result = await execute_tool_call_maybe( - self.tool_runtime_api, + tool_result = await self.execute_tool_call_maybe( session_id, tool_call, - toolgroup_args, - tool_to_group, ) if tool_result.content is None: raise ValueError( @@ -744,6 +727,15 @@ class ChatAgent(ShieldRunnerMixin): input_messages = input_messages + [message, result_message] + async def _initialize_tools(self, toolgroups_for_turn: Optional[List[AgentToolGroup]] = None): + self.toolgroup_to_args = {} + for toolgroup in self.agent_config.toolgroups + (toolgroups_for_turn or []): + if isinstance(toolgroup, AgentToolGroupWithArgs): + tool_group_name, _ = self._parse_toolgroup_name(toolgroup.name) + self.toolgroup_to_args[tool_group_name] = toolgroup.args + + self.tool_defs, self.tool_name_to_group_id = await self._get_tool_defs(toolgroups_for_turn) + async def _get_tool_defs( self, toolgroups_for_turn: Optional[List[AgentToolGroup]] = None ) -> Tuple[List[ToolDefinition], Dict[str, str]]: @@ -756,7 +748,7 @@ class ChatAgent(ShieldRunnerMixin): agent_config_toolgroups.append(name) tool_name_to_def = {} - tool_to_group = {} + tool_name_to_group_id = {} for tool_def in self.agent_config.client_tools: if tool_name_to_def.get(tool_def.name, None): @@ -774,7 +766,7 @@ class 
ChatAgent(ShieldRunnerMixin): for param in tool_def.parameters }, ) - tool_to_group[tool_def.name] = "__client_tools__" + tool_name_to_group_id[tool_def.name] = "__client_tools__" for toolgroup_name_with_maybe_tool_name in agent_config_toolgroups: toolgroup_name, tool_name = self._parse_toolgroup_name(toolgroup_name_with_maybe_tool_name) tools = await self.tool_groups_api.list_tools(toolgroup_id=toolgroup_name) @@ -813,7 +805,7 @@ class ChatAgent(ShieldRunnerMixin): for param in tool_def.parameters }, ) - tool_to_group[built_in_type] = tool_def.toolgroup_id + tool_name_to_group_id[built_in_type] = tool_def.toolgroup_id continue if tool_name_to_def.get(tool_def.identifier, None): @@ -832,9 +824,9 @@ class ChatAgent(ShieldRunnerMixin): for param in tool_def.parameters }, ) - tool_to_group[tool_def.identifier] = tool_def.toolgroup_id + tool_name_to_group_id[tool_def.identifier] = tool_def.toolgroup_id - return list(tool_name_to_def.values()), tool_to_group + return list(tool_name_to_def.values()), tool_name_to_group_id def _parse_toolgroup_name(self, toolgroup_name_with_maybe_tool_name: str) -> tuple[str, Optional[str]]: """Parse a toolgroup name into its components. @@ -853,15 +845,44 @@ class ChatAgent(ShieldRunnerMixin): tool_group, tool_name = split_names[0], None return tool_group, tool_name + async def execute_tool_call_maybe( + self, + session_id: str, + tool_call: ToolCall, + ) -> ToolInvocationResult: + name = tool_call.tool_name + group_name = self.tool_name_to_group_id.get(name, None) + if group_name is None: + raise ValueError( + f"Tool {name} not found in any tool group, available tools: {', '.join(self.tool_name_to_group_id.keys())}" + ) + if isinstance(name, BuiltinTool): + if name == BuiltinTool.brave_search: + name = WEB_SEARCH_TOOL + else: + name = name.value + + logger.info(f"executing tool call: {name} with args: {tool_call.arguments}") + result = await self.tool_runtime_api.invoke_tool( + tool_name=name, + kwargs={ + "session_id": session_id, + # get the arguments generated by the model and augment with toolgroup arg overrides for the agent + **tool_call.arguments, + **self.toolgroup_to_args.get(group_name, {}), + }, + ) + logger.debug(f"tool call {name} completed with result: {result}") + return result + async def handle_documents( self, session_id: str, documents: List[Document], input_messages: List[Message], - tool_defs: Dict[str, ToolDefinition], ) -> None: - memory_tool = any(tool_def.tool_name == MEMORY_QUERY_TOOL for tool_def in tool_defs) - code_interpreter_tool = any(tool_def.tool_name == BuiltinTool.code_interpreter for tool_def in tool_defs) + memory_tool = any(tool_def.tool_name == MEMORY_QUERY_TOOL for tool_def in self.tool_defs) + code_interpreter_tool = any(tool_def.tool_name == BuiltinTool.code_interpreter for tool_def in self.tool_defs) content_items = [] url_items = [] pattern = re.compile("^(https?://|file://|data:)") @@ -994,37 +1015,6 @@ async def attachment_message(tempdir: str, urls: List[URL]) -> ToolResponseMessa ) -async def execute_tool_call_maybe( - tool_runtime_api: ToolRuntime, - session_id: str, - tool_call: ToolCall, - toolgroup_args: Dict[str, Dict[str, Any]], - tool_to_group: Dict[str, str], -) -> ToolInvocationResult: - name = tool_call.tool_name - group_name = tool_to_group.get(name, None) - if group_name is None: - raise ValueError(f"Tool {name} not found in any tool group") - if isinstance(name, BuiltinTool): - if name == BuiltinTool.brave_search: - name = WEB_SEARCH_TOOL - else: - name = name.value - - logger.info(f"executing tool 
call: {name} with args: {tool_call.arguments}") - result = await tool_runtime_api.invoke_tool( - tool_name=name, - kwargs={ - "session_id": session_id, - # get the arguments generated by the model and augment with toolgroup arg overrides for the agent - **tool_call.arguments, - **toolgroup_args.get(group_name, {}), - }, - ) - logger.info(f"tool call {name} completed with result: {result}") - return result - - def _interpret_content_as_attachment( content: str, ) -> Optional[Attachment]: From ed6caead724aa8b5b1c4e53528e120888517a812 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Wed, 12 Mar 2025 18:51:18 -0700 Subject: [PATCH 031/557] chore: simplify _get_tool_defs (#1384) Summary: Test Plan: LLAMA_STACK_CONFIG=fireworks pytest -s -v tests/integration/agents/test_agents.py --safety-shield meta-llama/Llama-Guard-3-8B --text-model meta-llama/Llama-3.1-8B-Instruct --- .../agents/meta_reference/agent_instance.py | 111 ++++++++---------- 1 file changed, 50 insertions(+), 61 deletions(-) diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 1884094df..3f09cacc0 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -12,7 +12,7 @@ import secrets import string import uuid from datetime import datetime -from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union +from typing import AsyncGenerator, List, Optional, Union from urllib.parse import urlparse import httpx @@ -457,10 +457,12 @@ class ChatAgent(ShieldRunnerMixin): session_info = await self.storage.get_session_info(session_id) # if the session has a memory bank id, let the memory tool use it if session_info and session_info.vector_db_id: - if RAG_TOOL_GROUP not in self.toolgroup_to_args: - self.toolgroup_to_args[RAG_TOOL_GROUP] = {"vector_db_ids": [session_info.vector_db_id]} - else: - self.toolgroup_to_args[RAG_TOOL_GROUP]["vector_db_ids"].append(session_info.vector_db_id) + for tool_name in self.tool_name_to_args.keys(): + if tool_name == MEMORY_QUERY_TOOL: + if "vector_db_ids" not in self.tool_name_to_args[tool_name]: + self.tool_name_to_args[tool_name]["vector_db_ids"] = [session_info.vector_db_id] + else: + self.tool_name_to_args[tool_name]["vector_db_ids"].append(session_info.vector_db_id) output_attachments = [] @@ -727,18 +729,16 @@ class ChatAgent(ShieldRunnerMixin): input_messages = input_messages + [message, result_message] - async def _initialize_tools(self, toolgroups_for_turn: Optional[List[AgentToolGroup]] = None): - self.toolgroup_to_args = {} - for toolgroup in self.agent_config.toolgroups + (toolgroups_for_turn or []): + async def _initialize_tools( + self, + toolgroups_for_turn: Optional[List[AgentToolGroup]] = None, + ) -> None: + toolgroup_to_args = {} + for toolgroup in (self.agent_config.toolgroups or []) + (toolgroups_for_turn or []): if isinstance(toolgroup, AgentToolGroupWithArgs): tool_group_name, _ = self._parse_toolgroup_name(toolgroup.name) - self.toolgroup_to_args[tool_group_name] = toolgroup.args + toolgroup_to_args[tool_group_name] = toolgroup.args - self.tool_defs, self.tool_name_to_group_id = await self._get_tool_defs(toolgroups_for_turn) - - async def _get_tool_defs( - self, toolgroups_for_turn: Optional[List[AgentToolGroup]] = None - ) -> Tuple[List[ToolDefinition], Dict[str, str]]: # Determine which tools to include tool_groups_to_include = toolgroups_for_turn or self.agent_config.toolgroups 
or [] agent_config_toolgroups = [] @@ -747,8 +747,10 @@ class ChatAgent(ShieldRunnerMixin): if name not in agent_config_toolgroups: agent_config_toolgroups.append(name) + toolgroup_to_args = toolgroup_to_args or {} + tool_name_to_def = {} - tool_name_to_group_id = {} + tool_name_to_args = {} for tool_def in self.agent_config.client_tools: if tool_name_to_def.get(tool_def.name, None): @@ -766,53 +768,38 @@ class ChatAgent(ShieldRunnerMixin): for param in tool_def.parameters }, ) - tool_name_to_group_id[tool_def.name] = "__client_tools__" for toolgroup_name_with_maybe_tool_name in agent_config_toolgroups: - toolgroup_name, tool_name = self._parse_toolgroup_name(toolgroup_name_with_maybe_tool_name) + toolgroup_name, input_tool_name = self._parse_toolgroup_name(toolgroup_name_with_maybe_tool_name) tools = await self.tool_groups_api.list_tools(toolgroup_id=toolgroup_name) if not tools.data: available_tool_groups = ", ".join( [t.identifier for t in (await self.tool_groups_api.list_tool_groups()).data] ) raise ValueError(f"Toolgroup {toolgroup_name} not found, available toolgroups: {available_tool_groups}") - if tool_name is not None and not any(tool.identifier == tool_name for tool in tools.data): + if input_tool_name is not None and not any(tool.identifier == input_tool_name for tool in tools.data): raise ValueError( - f"Tool {tool_name} not found in toolgroup {toolgroup_name}. Available tools: {', '.join([tool.identifier for tool in tools.data])}" + f"Tool {input_tool_name} not found in toolgroup {toolgroup_name}. Available tools: {', '.join([tool.identifier for tool in tools.data])}" ) for tool_def in tools.data: if toolgroup_name.startswith("builtin") and toolgroup_name != RAG_TOOL_GROUP: - tool_name = tool_def.identifier - built_in_type = BuiltinTool.brave_search - if tool_name == "web_search": - built_in_type = BuiltinTool.brave_search + identifier: str | BuiltinTool | None = tool_def.identifier + if identifier == "web_search": + identifier = BuiltinTool.brave_search else: - built_in_type = BuiltinTool(tool_name) + identifier = BuiltinTool(identifier) + else: + # add if tool_name is unspecified or the tool_def identifier is the same as the tool_name + if input_tool_name in (None, tool_def.identifier): + identifier = tool_def.identifier + else: + identifier = None - if tool_name_to_def.get(built_in_type, None): - raise ValueError(f"Tool {built_in_type} already exists") - - tool_name_to_def[built_in_type] = ToolDefinition( - tool_name=built_in_type, - description=tool_def.description, - parameters={ - param.name: ToolParamDefinition( - param_type=param.parameter_type, - description=param.description, - required=param.required, - default=param.default, - ) - for param in tool_def.parameters - }, - ) - tool_name_to_group_id[built_in_type] = tool_def.toolgroup_id - continue - - if tool_name_to_def.get(tool_def.identifier, None): - raise ValueError(f"Tool {tool_def.identifier} already exists") - if tool_name in (None, tool_def.identifier): + if tool_name_to_def.get(identifier, None): + raise ValueError(f"Tool {identifier} already exists") + if identifier: tool_name_to_def[tool_def.identifier] = ToolDefinition( - tool_name=tool_def.identifier, + tool_name=identifier, description=tool_def.description, parameters={ param.name: ToolParamDefinition( @@ -824,9 +811,9 @@ class ChatAgent(ShieldRunnerMixin): for param in tool_def.parameters }, ) - tool_name_to_group_id[tool_def.identifier] = tool_def.toolgroup_id + tool_name_to_args[tool_def.identifier] = toolgroup_to_args.get(toolgroup_name, {}) - 
return list(tool_name_to_def.values()), tool_name_to_group_id + self.tool_defs, self.tool_name_to_args = list(tool_name_to_def.values()), tool_name_to_args def _parse_toolgroup_name(self, toolgroup_name_with_maybe_tool_name: str) -> tuple[str, Optional[str]]: """Parse a toolgroup name into its components. @@ -850,29 +837,31 @@ class ChatAgent(ShieldRunnerMixin): session_id: str, tool_call: ToolCall, ) -> ToolInvocationResult: - name = tool_call.tool_name - group_name = self.tool_name_to_group_id.get(name, None) - if group_name is None: + tool_name = tool_call.tool_name + registered_tool_names = [tool_def.tool_name for tool_def in self.tool_defs] + if tool_name not in registered_tool_names: raise ValueError( - f"Tool {name} not found in any tool group, available tools: {', '.join(self.tool_name_to_group_id.keys())}" + f"Tool {tool_name} not found in provided tools, registered tools: {', '.join([str(x) for x in registered_tool_names])}" ) - if isinstance(name, BuiltinTool): - if name == BuiltinTool.brave_search: - name = WEB_SEARCH_TOOL + if isinstance(tool_name, BuiltinTool): + if tool_name == BuiltinTool.brave_search: + tool_name_str = WEB_SEARCH_TOOL else: - name = name.value + tool_name_str = tool_name.value + else: + tool_name_str = tool_name - logger.info(f"executing tool call: {name} with args: {tool_call.arguments}") + logger.info(f"executing tool call: {tool_name_str} with args: {tool_call.arguments}") result = await self.tool_runtime_api.invoke_tool( - tool_name=name, + tool_name=tool_name_str, kwargs={ "session_id": session_id, # get the arguments generated by the model and augment with toolgroup arg overrides for the agent **tool_call.arguments, - **self.toolgroup_to_args.get(group_name, {}), + **self.tool_name_to_args.get(tool_name_str, {}), }, ) - logger.debug(f"tool call {name} completed with result: {result}") + logger.debug(f"tool call {tool_name_str} completed with result: {result}") return result async def handle_documents( From 2baf200b63e2677ec991adc5e77c19d46477d838 Mon Sep 17 00:00:00 2001 From: Nathan Weinberg <31703736+nathan-weinberg@users.noreply.github.com> Date: Wed, 12 Mar 2025 22:05:49 -0400 Subject: [PATCH 032/557] ci: add html report to unit test artifacts (#1576) # What does this PR do? additional artifacts make test results more human-readable ## Test Plan Ran locally Signed-off-by: Nathan Weinberg --- .github/workflows/unit-tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 59d18b3be..517b5c39a 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -33,7 +33,7 @@ jobs: - name: Run unit tests run: | - uv run --python ${{ matrix.python }} --with-editable . --with-editable ".[dev]" --with-editable ".[unit]" pytest --cov=llama_stack -s -v tests/unit/ --junitxml=pytest-report-${{ matrix.python }}.xml + uv run --python ${{ matrix.python }} --with-editable . --with-editable ".[dev]" --with-editable ".[unit]" pytest --cov=llama_stack -s -v tests/unit/ --junitxml=pytest-report-${{ matrix.python }}.xml --cov-report=html:htmlcov-${{ matrix.python }} - name: Upload test results if: always() @@ -43,4 +43,5 @@ jobs: path: | .pytest_cache/ pytest-report-${{ matrix.python }}.xml + htmlcov-${{ matrix.python }}/ retention-days: 7 From 6bfcb65343091e53c2d9cff0b8f41862279d4bbe Mon Sep 17 00:00:00 2001 From: ehhuang Date: Wed, 12 Mar 2025 19:21:53 -0700 Subject: [PATCH 033/557] test: code exec on mac (#1549) Summary: 1. 
adds option to not use bwrap for code execution 2. disable bwrap when running tests on macs Test Plan: ``` LLAMA_STACK_CONFIG=fireworks pytest -s -v tests/integration/agents/test_agents.py --safety-shield meta-llama/Llama-Guard-3-8B --text-model meta-llama/Llama-3.1-8B-Instruct ``` Verify code_interpreter result in logs INFO 2025-03-11 08:10:39,858 llama_stack.providers.inline.agents.meta_reference.agent_instance:1032 agents: tool call code_interpreter completed with result: content='completed\n\n541\n' error_message=None error_code=None metadata=None --- .../code_interpreter/code_execution.py | 10 +- .../code_interpreter/code_interpreter.py | 5 +- tests/integration/agents/test_agents.py | 2 +- tests/integration/conftest.py | 9 + .../recorded_responses/chat_completion.json | 6066 ++++++++++++++--- .../recorded_responses/invoke_tool.json | 74 +- 6 files changed, 5260 insertions(+), 906 deletions(-) diff --git a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py index 6f4b25b9d..d7b2dbdef 100644 --- a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py +++ b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py @@ -76,6 +76,7 @@ class CodeExecutionRequest: only_last_cell_fail: bool = True seed: int = 0 strip_fpaths_in_stderr: bool = True + use_bwrap: bool = True class CodeExecutor: @@ -103,8 +104,6 @@ _set_seeds()\ script = "\n\n".join([seeds_prefix] + [CODE_ENV_PREFIX] + scripts) with tempfile.TemporaryDirectory() as dpath: - bwrap_prefix = "bwrap " + generate_bwrap_command(bind_dirs=[dpath]) - cmd = [*bwrap_prefix.split(), sys.executable, "-c", script] code_fpath = os.path.join(dpath, "code.py") with open(code_fpath, "w") as f: f.write(script) @@ -118,6 +117,13 @@ _set_seeds()\ MPLBACKEND="module://matplotlib_custom_backend", PYTHONPATH=f"{DIRNAME}:{python_path}", ) + + if req.use_bwrap: + bwrap_prefix = "bwrap " + generate_bwrap_command(bind_dirs=[dpath]) + cmd = [*bwrap_prefix.split(), sys.executable, "-c", script] + else: + cmd = [sys.executable, "-c", script] + stdout, stderr, returncode = do_subprocess( cmd=cmd, env=env, diff --git a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py index 54f17f9a2..4b97914c5 100644 --- a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py +++ b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_interpreter.py @@ -6,6 +6,7 @@ import logging +import os import tempfile from typing import Any, Dict, List, Optional @@ -61,7 +62,9 @@ class CodeInterpreterToolRuntimeImpl(ToolsProtocolPrivate, ToolRuntime): async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult: script = kwargs["code"] - req = CodeExecutionRequest(scripts=[script]) + # Use environment variable to control bwrap usage + force_disable_bwrap = os.environ.get("DISABLE_CODE_SANDBOX", "").lower() in ("1", "true", "yes") + req = CodeExecutionRequest(scripts=[script], use_bwrap=not force_disable_bwrap) res = self.code_executor.execute(req) pieces = [res["process_status"]] for out_type in ["stdout", "stderr"]: diff --git a/tests/integration/agents/test_agents.py b/tests/integration/agents/test_agents.py index a542e5403..f6bde8927 100644 --- a/tests/integration/agents/test_agents.py +++ b/tests/integration/agents/test_agents.py @@ -187,7 +187,7 @@ def 
test_builtin_tool_web_search(llama_stack_client_with_mocked_inference, agent messages=[ { "role": "user", - "content": "Search the web and tell me who the current CEO of Meta is.", + "content": "Search the web and tell me who the founder of Meta is.", } ], session_id=session_id, diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index f4fe9e8ff..bf1092c4a 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -6,12 +6,17 @@ import inspect import itertools import os +import platform import textwrap from dotenv import load_dotenv +from llama_stack.log import get_logger + from .report import Report +logger = get_logger(__name__, category="tests") + def pytest_configure(config): config.option.tbstyle = "short" @@ -24,6 +29,10 @@ def pytest_configure(config): key, value = env_var.split("=", 1) os.environ[key] = value + if platform.system() == "Darwin": # Darwin is the system name for macOS + os.environ["DISABLE_CODE_SANDBOX"] = "1" + logger.info("Setting DISABLE_CODE_SANDBOX=1 for macOS") + if config.getoption("--report"): config.pluginmanager.register(Report(config)) diff --git a/tests/integration/fixtures/recorded_responses/chat_completion.json b/tests/integration/fixtures/recorded_responses/chat_completion.json index 7234b6c31..30b7e0b4d 100644 --- a/tests/integration/fixtures/recorded_responses/chat_completion.json +++ b/tests/integration/fixtures/recorded_responses/chat_completion.json @@ -12535,7 +12535,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 139 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 23 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 162 + } + ] } } ], @@ -12589,7 +12605,7 @@ "data": { "event": { "delta": { - "text": "type\": \"function\", \"name\":", + "text": "type\": \"function\", \"name\": \"", "type": "text" }, "event_type": { @@ -12609,7 +12625,7 @@ "data": { "event": { "delta": { - "text": " \"get_boiling_point\", \"parameters\":", + "text": "get_boiling_point\", \"parameters\": {\"", "type": "text" }, "event_type": { @@ -12629,7 +12645,7 @@ "data": { "event": { "delta": { - "text": " {\"liquid_name\": \"polyjuice", + "text": "liquid_name\": \"polyjuice\", \"celcius\":", "type": "text" }, "event_type": { @@ -12649,7 +12665,7 @@ "data": { "event": { "delta": { - "text": "\", \"celcius\": \"false\"}}", + "text": " \"false\"}}", "type": "text" }, "event_type": { @@ -12679,7 +12695,7 @@ "celcius": "false", "liquid_name": "polyjuice" }, - "call_id": "bffe07d7-343f-49c4-bcff-d83c99fa7d4a", + "call_id": "fc7e2525-3e7b-47ff-8731-12dd7655dfd6", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -12720,7 +12736,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 91 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 45 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 136 + } + ] } } ], @@ -12774,7 +12806,7 @@ "data": { "event": { "delta": { - "text": " \"type\": \"function\",\n \"name\": \"get", + "text": " \"type\": \"function\",\n \"", "type": "text" }, "event_type": { @@ -12794,7 +12826,7 @@ "data": { "event": { "delta": { - "text": "_boiling_point\",\n \"parameters\": {\n \"liquid_name", + "text": "name\": \"get_boiling_point\",\n \"parameters\":", "type": "text" }, "event_type": { @@ -12814,7 +12846,7 @@ "data": { "event": { "delta": { - "text": "\": \"polyjuice\",\n ", + "text": " {\n 
\"liquid_name\": \"polyjuice\",\n \"celci", "type": "text" }, "event_type": { @@ -12834,7 +12866,7 @@ "data": { "event": { "delta": { - "text": " \"celcius\": \"true\"\n }\n}", + "text": "us\": \"true\"\n }\n}", "type": "text" }, "event_type": { @@ -12864,7 +12896,7 @@ "celcius": "true", "liquid_name": "polyjuice" }, - "call_id": "41ce6bfb-81c1-438d-8520-329c4446f1bc", + "call_id": "73212def-09c0-4a29-845e-149afb38fcd1", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -12905,7 +12937,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 43 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 55 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 98 + } + ] } } ], @@ -12959,7 +13007,27 @@ "data": { "event": { "delta": { - "text": " boiling point of polyjuice is -100\u00b0C.", + "text": " boiling point of polyjuice is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " -100\u00b0C.", "type": "text" }, "event_type": { @@ -12994,7 +13062,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 85 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 22 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 107 + } + ] } } ], @@ -13048,7 +13132,27 @@ "data": { "event": { "delta": { - "text": " boiling point of polyjuice is -100\u00b0C.", + "text": " boiling point of polyjuice is -100", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\u00b0C.", "type": "text" }, "event_type": { @@ -13083,13 +13187,230 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 87 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 22 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 109 + } + ] } } ], "type": "generator" }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": 
{\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\": \"get_boiling_point\", \"parameters\": {\"liquid_name\": \"poly", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "juice\", \"celcius\": \"true\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + 
"__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": "true", + "liquid_name": "polyjuice" + }, + "call_id": "e4b0121a-7b75-4e89-be40-d13021a3bb11", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 47 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { "chunks": [ { 
"__module__": "llama_stack.apis.inference.inference", @@ -13172,7 +13493,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\", \"parameters\": {\"liquid_name\": \"polyjuice\", \"cel", + "tool_call": "_with_metadata\", \"parameters\": {\"liquid_name\": \"polyjuice\",", "type": "tool_call" }, "event_type": { @@ -13197,7 +13518,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "cius\": \"true\"}}", + "tool_call": " \"celcius\": \"true\"}}", "type": "tool_call" }, "event_type": { @@ -13227,192 +13548,7 @@ "celcius": "true", "liquid_name": "polyjuice" }, - "call_id": "6161b956-9b68-4e88-87bf-e26a07d4c7ca", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": 
"ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "_point_with_metadata\", \"parameters\": {\"liquid_name\": \"polyju", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "ice\", \"celcius\": \"true\"}}", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": "true", - "liquid_name": "polyjuice" - }, - "call_id": "11da4a37-d7ad-468a-98c8-0f1e295d14a9", + "call_id": "11c04896-2b7b-49bd-b832-47a1c9f3796f", "tool_name": "get_boiling_point_with_metadata" }, "type": "tool_call" @@ -13453,7 +13589,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": 
"total_tokens", + "unit": null, + "value": 47 + } + ] } } ], @@ -13507,7 +13659,27 @@ "data": { "event": { "delta": { - "text": " customer smiled and said \"hello\" to the friendly store clerk.", + "text": " customer smiled and said \"hello\" to the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " friendly store clerk.", "type": "text" }, "event_type": { @@ -13542,7 +13714,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 24 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 54 + } + ] } } ], @@ -13957,6 +14145,1165 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\ndf = pd.read_csv(\\\"\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nYear Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec\\n0 2014 1.6 1.6 1.7 1.8 2.0 1.9 1.9 1.7 1.7 1.8 1.7 1.6\\n1 2015 1.6 1.7 1.8 1.8 1.7 1.8 1.8 1.8 1.9 1.9 2.0 2.1\\n2 2016 2.2 2.3 2.2 2.1 2.2 2.2 2.2 2.3 2.2 2.1 2.1 2.2\\n3 2017 2.3 2.2 2.0 1.9 1.7 1.7 1.7 1.7 1.7 1.8 1.7 1.8\\n4 2018 1.8 1.8 2.1 2.1 2.2 2.3 2.4 2.2 2.2 2.1 2.2 2.2\\n[/stdout]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": 
\"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " csv file contains data on inflation rates for each month of the year from", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " 2014 to 2018. 
The columns are:\n\n- Year", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ": The year of the inflation rate\n-", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Jan to Dec: The inflation rate for each month of", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the year\n\nThe inflation rates are all in the range of ", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "1.6 to 2.4, indicating", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " a relatively stable inflation rate over the years.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 471 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 91 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 562 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": 
\"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 142, in \\n line 23, in \\n from .code_execution import CodeExecutionContext, CodeExecutionRequest, CodeExecutor\\nImportError: attempted relative import with no known parent package\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 142, in \\n line 23, in \\n from .code_execution import CodeExecutionContext, CodeExecutionRequest, CodeExecutor\\nImportError: attempted relative import with no known parent package\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 
0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " error message indicates that there is an issue with the import statement. 
However", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ", the code provided does not contain any import statements that", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " would cause this error.\n\nTo provide a more accurate answer, I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " would need to know the contents of the CSV file or more information", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " about the error message.\n\nHowever, based on the code provided, it", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " seems like the code is trying to load a CSV", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file and print some basic information about it. 
If the file is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not found or there is an issue with the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " path, this could cause an error.\n\nHere is a revised version", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " of the code that includes some error handling:\n\n``", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "`\nimport pandas as pd\nimport code_interpreter\n\ntry:\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " # Load the CSV file\n df = pd.read_csv(\"/", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "var/folders/cz/vyh7y1d11x", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "g881lsxsshnc5c0000gn/T/tmp", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": 
"ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "_d_cdeif/6TpkUAo", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "0inflation.csv\")\n\n # Print the first few rows of the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " dataframe\n print(df.head())\n\n # Print the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " data types of each column\n print(df", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".dtypes)\n\n # Print the summary statistics of the dataframe\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " print(df.describe())\n\nexcept FileNotFoundError:\n print(\"The file was", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not found.\")\nexcept pd.errors.EmptyDataError:\n print(\"The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file is empty.\")\nexcept pd.errors.ParserError:\n print(\"An", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": 
"progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " error occurred while parsing the file.\")\nexcept Exception as e:\n print", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "(\"An error occurred: \", str(e))\n```\n\nThis code will", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " catch specific exceptions that could occur when loading the CSV file and print a", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " more informative error message.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 391 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 330 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 721 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": 
\"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 142, in \\n line 23, in \\n from .code_execution import CodeExecutionContext, CodeExecutionRequest, CodeExecutor\\nImportError: attempted relative import with no known parent package\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport code_interpreter\n\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Load the CSV file\ndf = pd.read_csv(\"/var", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/folders/cz/vyh7y1d11xg881", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + 
"__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "lsxsshnc5c0000gn/T/tmp_d_cdeif", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/6TpkUAo0inflation.csv\")\n\n# Print the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " first few rows of the dataframe\nprint(df.head())\n\n# Print the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " data types of each column\nprint(df.dtypes)\n\n# Print the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " summary statistics of the dataframe\nprint(df.describe())", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp_d_cdeif/6TpkUAo0inflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# 
Print the summary statistics of the dataframe\nprint(df.describe())" + }, + "call_id": "fa1b393f-3fc7-416f-98ab-05d879def880", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 214 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 224 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": 
\"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { @@ -15046,7 +16393,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "import pandas as pd\n", + "tool_call": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/c", "type": "tool_call" }, "event_type": { @@ -15071,7 +16418,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "import code_interpreter\n\n# Load the CSV file\ndf =", + "tool_call": "z/vyh7y1d11xg881lsxsshnc5", "type": "tool_call" }, "event_type": { @@ -15096,7 +16443,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " pd.read_csv(\"/var/folders", + "tool_call": "c0000gn/T/tmpe8u6r9sz/R", "type": "tool_call" }, "event_type": { @@ -15121,157 +16468,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "/cz/vyh7y1d11xg881", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "lsxsshnc5c0000gn/T/tmp4ed7", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "p2bg/Csr659svinflation.csv\")\n\n# Print", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " the first few rows of the dataframe\nprint(df.head())\n\n#", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - 
"__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " Print the data types of each column\nprint(df.dtypes)\n\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Print the summary statistics of the dataframe", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\nprint(df.describe())", + "tool_call": "ChoI8s0inflation.csv\")\nprint(df.head())", "type": "tool_call" }, "event_type": { @@ -15298,9 +16495,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp4ed7p2bg/Csr659svinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" + "code": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpe8u6r9sz/RChoI8s0inflation.csv\")\nprint(df.head())" }, - "call_id": "8aeab20b-341b-4349-84dc-3e3c3299d713", + "call_id": "35e85870-f8f3-44f4-8879-e7b02a2805f6", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -15345,7 +16542,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 47 + } + ] } } ], @@ -17611,6 +18824,670 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": 
[{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nNumber of rows and columns in the data: (10, 13)\\nColumns of the data are: 13\\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\\n 'Oct', 'Nov', 'Dec'],\\n dtype='object')\\nDatatype of the columns are: Year int64\\nJan float64\\nFeb float64\\nMar float64\\nApr float64\\nMay float64\\nJun float64\\nJul float64\\nAug float64\\nSep float64\\nOct float64\\nNov float64\\nDec float64\\ndtype: object\\n[/stdout]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"This CSV file contains 10 rows and 13 columns. The columns are named 'Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'. The data types of these columns are int64 for 'Year' and float64 for the rest.\\n\\nIt appears that this CSV file contains monthly inflation rates for different years. 
The 'Year' column represents the year, and the rest of the columns represent the inflation rates for each month of the year.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n\\n# Calculate average yearly inflation\\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\\n\\n# Plot time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df['Year'], df['Average'])\\nplt.xlabel('Year')\\nplt.ylabel('Average Yearly Inflation')\\nplt.title('Average Yearly Inflation Over Time')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": 
"ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "This", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " code will create a line plot of the average", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " yearly inflation over time. The x-axis represents", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the year, and the y-axis represents the average", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " yearly inflation. 
The plot will show the trend of average yearly inflation", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " over the years.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 633 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 56 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 689 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nNumber of rows and columns in the data: (10, 13)\\nColumns of the data are: 13\\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\\n 'Oct', 'Nov', 'Dec'],\\n dtype='object')\\nDatatype of the columns are: Year int64\\nJan float64\\nFeb float64\\nMar float64\\nApr float64\\nMay float64\\nJun float64\\nJul float64\\nAug float64\\nSep float64\\nOct float64\\nNov float64\\nDec float64\\ndtype: object\\n[/stdout]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": 
\"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"This CSV file contains 10 rows and 13 columns. The columns are named 'Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'. The data types of these columns are int64 for 'Year' and float64 for the rest.\\n\\nIt appears that this CSV file contains monthly inflation rates for different years. The 'Year' column represents the year, and the rest of the columns represent the inflation rates for each month of the year.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot", + 
"type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " as plt\n\n# Load data\ndf = pd.read_csv(\"/", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "var/folders/cz/vyh7y1d11", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "xg881lsxsshnc5c0000gn/T", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/tmp_d_cdeif/UuctHlJzinflation.csv\")\n\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Calculate average yearly inflation\ndf['Average", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": 
"in_progress" + }, + "tool_call": "'] = df[['Jan', 'Feb', 'Mar',", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 'Apr', 'May', 'Jun',", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 'Jul', 'Aug', 'Sep', 'Oct', '", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Nov', 'Dec']].mean(axis=1)\n\n# Plot time series", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\nplt.figure(figsize=(10,6))\nplt.plot(df['Year", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "'], df['Average'])\nplt.xlabel('Year')\nplt.ylabel('Average", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": 
"ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Yearly Inflation')\nplt.title('Average Yearly Inflation Over", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Time')\nplt.grid(True)\nplt.show()", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp_d_cdeif/UuctHlJzinflation.csv\")\n\n# Calculate average yearly inflation\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\n\n# Plot time series\nplt.figure(figsize=(10,6))\nplt.plot(df['Year'], df['Average'])\nplt.xlabel('Year')\nplt.ylabel('Average Yearly Inflation')\nplt.title('Average Yearly Inflation Over Time')\nplt.grid(True)\nplt.show()" + }, + "call_id": "f953fd92-9413-4968-9ffa-f85ddea173dc", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 453 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 463 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": 
\"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { @@ -18040,6 +19917,291 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", 
\"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nNumber of rows and columns in the data: (10, 13)\\nColumns of the data are: 13\\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\\n 'Oct', 'Nov', 'Dec'],\\n dtype='object')\\nDatatype of the columns are: Year int64\\nJan float64\\nFeb float64\\nMar float64\\nApr float64\\nMay float64\\nJun float64\\nJul float64\\nAug float64\\nSep float64\\nOct float64\\nNov float64\\nDec float64\\ndtype: object\\n[/stdout]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "This", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " CSV file contains 10 rows and 13 columns. The columns", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " are named 'Year', 'Jan', 'Feb', 'Mar',", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " 'Apr', 'May', 'Jun', 'Jul', 'Aug", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "', 'Sep', 'Oct', 'Nov', '", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "Dec'. 
The data types of these columns are int64 for '", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "Year' and float64 for the rest.\n\nIt appears that this CSV", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file contains monthly inflation rates for different years. The 'Year' column", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " represents the year, and the rest of the columns represent the inflation rates", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " for each month of the year", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 326 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 125 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 451 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful 
assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { @@ -18098,7 +20260,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "import pandas as pd\n# Load data\ndf = pd.read", + "tool_call": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var", "type": "tool_call" }, "event_type": { @@ -18123,7 +20285,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "_csv(\"/var/folders/cz/vyh7y1d", + "tool_call": "/folders/cz/vyh7y1d11xg881lsx", "type": "tool_call" }, "event_type": { @@ -18148,7 +20310,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "11xg881lsxsshnc", + "tool_call": "sshnc5c0000gn/T/tmp_d_cdeif/Uuct", "type": "tool_call" }, "event_type": { @@ -18173,7 +20335,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "5c0000gn/T/tmp4ed7p2bg/U", + "tool_call": "HlJzinflation.csv\")\n# Rows\nprint(\"Number of rows", "type": "tool_call" }, "event_type": { @@ -18198,7 +20360,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "Z0Z335vinflation.csv\")\n# Rows\nprint(\"", + "tool_call": " and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns", "type": "tool_call" }, "event_type": { @@ -18223,7 +20385,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "Number of rows and columns in the data:\", df.shape)\n# Columns", + "tool_call": " of the data are:\", len(df.columns))\n# Column names\nprint(\"", "type": "tool_call" }, "event_type": { @@ -18248,7 +20410,7 @@ "__module__": 
"llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\nprint(\"Columns of the data are:\", len(df.columns))\n# Column", + "tool_call": "Columns of the data are:\", df.columns)\n# Column dtypes\nprint", "type": "tool_call" }, "event_type": { @@ -18273,57 +20435,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " names\nprint(\"Columns of the data are:\", df.columns)\n# Column", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " dtypes\nprint(\"Datatype of the columns are:\", df.dtypes", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ")", + "tool_call": "(\"Datatype of the columns are:\", df.dtypes)", "type": "tool_call" }, "event_type": { @@ -18350,9 +20462,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp4ed7p2bg/UZ0Z335vinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)" + "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp_d_cdeif/UuctHlJzinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)" }, - "call_id": "98e27ff4-d4d7-4764-9213-f46bb928ec68", + "call_id": "479e0208-711f-4318-b284-745599a9fb9c", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -18397,8 +20509,1524 @@ "value": "end_of_turn" } }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 36 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 46 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation 
for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:02bc2\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:e40e6\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:200a9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:e40e6\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:e40e6\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:e40e6\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:02bc2\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. 
For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:e40e6\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:200a9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. 
-----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "knowledge_search\", \"parameters\": {\"query\": \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "How to use LoRA in Torchtune\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "6ee142d9-1a65-433e-a681-f20066a2e1f7", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + 
"__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 117 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 157 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:02bc2\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:e40e6\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:200a9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about Torcht", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "une based on the documentation you provided.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " What's your first question?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": 
null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 35 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:16a6a\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. 
This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:cc255\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:7a06a\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:cc255\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:cc255\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:cc255\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:16a6a\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:cc255\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:7a06a\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"parameters", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\": {\"query\": \"How to use LoRA in Torchtune", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\"}}", + "type": "text" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "a7b02498-0a50-40c2-abf2-563d4d26d01f", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 117 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 157 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:16a6a\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. 
Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:cc255\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. 
Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:7a06a\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about Torchtune based on the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " documentation you provided. 
What's your first question?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 35 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] + } } ], "type": "generator" @@ -23085,7 +26713,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters", + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\",", "type": "tool_call" }, "event_type": { @@ -23110,7 +26738,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\": {\"query\": \"Torchtune documentation\"}}", + "tool_call": " \"parameters\": {\"query\": \"Torchtune documentation\"}}", "type": "tool_call" }, "event_type": { @@ -23139,7 +26767,7 @@ "arguments": { "query": "Torchtune documentation" }, - "call_id": "5cfa4683-2147-41ab-9a44-a8b7f23e9f75", + "call_id": "8b279fd4-0479-48d4-8ef7-882004f994b2", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -23180,7 +26808,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 39 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 49 + } + ] } } ], @@ -23234,7 +26878,27 @@ "data": { "event": { "delta": { - "text": " attention type used by Llama3-8B is grouped-query attention.", + "text": " attention type used by Llama3-8B is grouped-query attention", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".", "type": "text" }, "event_type": { @@ -23269,7 +26933,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 80 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 26 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 106 + } + ] } } ], @@ -23323,7 +27003,7 @@ "data": { "event": { "delta": { - "text": " attention type used by Llama3", + "text": " attention type used by Llama3-8B is grouped", "type": "text" }, "event_type": { @@ -23343,7 +27023,7 @@ "data": { "event": { "delta": { - "text": "-8B is grouped-query attention.", + "text": "-query attention.", "type": "text" }, "event_type": { @@ -23378,7 +27058,23 @@ "value": "end_of_turn" } }, - "metrics": null + 
"metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 80 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 26 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 106 + } + ] } } ], @@ -23432,7 +27128,7 @@ "data": { "event": { "delta": { - "text": " \"type\": \"function\",\n ", + "text": " \"type\": \"function\",\n \"name\": \"knowledge", "type": "text" }, "event_type": { @@ -23452,27 +27148,7 @@ "data": { "event": { "delta": { - "text": " \"name\": \"knowledge_search\",\n ", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " \"parameters\": {\n \"query", + "text": "_search\",\n \"parameters\": {\n \"query", "type": "text" }, "event_type": { @@ -23521,7 +27197,7 @@ "arguments": { "query": "Llama3-8B attention type" }, - "call_id": "b2d62231-df92-43ed-b51f-f7b8a4bc4b15", + "call_id": "34e18a7e-5c52-403b-9dd3-cdb3f7f8cb89", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -23562,7 +27238,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 48 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 88 + } + ] } } ], @@ -23626,7 +27318,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\":", + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters", "type": "tool_call" }, "event_type": { @@ -23651,32 +27343,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " \"knowledge_search\", \"parameters\": {\"query\": \"Llama", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "3-8B attention type\"}}", + "tool_call": "\": {\"query\": \"Llama3-8B attention type\"}}", "type": "tool_call" }, "event_type": { @@ -23705,7 +27372,7 @@ "arguments": { "query": "Llama3-8B attention type" }, - "call_id": "52c2b1ea-3695-4030-87a1-d0ca6d1056af", + "call_id": "2c25de18-5466-4f97-bef1-b463abbbcd55", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -23746,7 +27413,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 50 + } + ] } } ], @@ -24269,7 +27952,596 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", 
\"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the founder of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Meta founder\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"Meta founder\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.81595254, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.70726365, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. 
Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\\\", \\\"score\\\": 0.467308, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta Platforms - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Meta_Platforms\\\", \\\"content\\\": \\\"Following a period of intense scrutiny and damaging whistleblower leaks, news started to emerge on October 21, 2021, about Facebook's plan to rebrand the company and change its name.[15][54] In the Q3 2021 Earnings Call on October 25, Mark Zuckerberg discussed the ongoing criticism of the company's social services and the way it operates, and pointed to the pivoting efforts to building the metaverse \\\\u2013 without mentioning the rebranding and the name change.[55] The metaverse vision and the name change from Facebook, Inc. to Meta Platforms was introduced at Facebook Connect on October 28, 2021.[16] Based on Facebook's PR campaign, the name change reflects the company's shifting long term focus of building the metaverse, a digital extension of the physical world by social media, virtual reality and augmented reality features.[16][56]\\\", \\\"score\\\": 0.14999175, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.03678684, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": 
{\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " founder of Meta is Mark Zuckerberg.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 1220 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 18 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 1238 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the founder of Meta is.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": 
{\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "brave_search.call(query=\"Meta founder\")", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Meta founder" + }, + "call_id": "3918bbe3-28b2-411d-be4a-f0eb2aa137e8", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "brave_search" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": 
"ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 33 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 43 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": 
"llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function `get_boiling_point` is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not able to find the boiling point of poly", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "juice as it is a fictional liquid from the Harry Potter series", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". The function is only able to find the boiling point of real", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " liquids.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 56 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 126 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of 
polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function `get_boiling_point", + "type": "text" + }, + "event_type": { + "__enum__": 
"ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "` is not able to find the boiling point of polyjuice", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " as it is not a real liquid.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 38 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 108 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": 
\"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -24337,7 +28609,7 @@ "data": { "event": { "delta": { - "text": " find the boiling point of polyjuice as", + "text": " find the boiling point of polyju", "type": "text" }, "event_type": { @@ -24357,27 +28629,7 @@ "data": { "event": { "delta": { - "text": " it is a fictional liquid from the Harry Potter series. The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " function is only able to find the boiling point of real liquids.", + "text": "ice as it is not a real liquid.", "type": "text" }, "event_type": { @@ -24412,265 +28664,23 @@ "value": "end_of_turn" } }, - "metrics": null - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": 
null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" + { + "metric": "completion_tokens", + "unit": null, + "value": 38 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " function `get_boiling_point` is not able to find the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " boiling point of polyjuice as it is not a real liquid", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ".", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": 
"ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" + { + "metric": "total_tokens", + "unit": null, + "value": 108 } - }, - "metrics": null - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - 
"stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " function `get_boiling_point` is not able to find the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " boiling point of polyjuice as it is not a real liquid", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ".", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null + ] } } ], @@ -24734,7 +28744,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", + "tool_call": "{\"type\": \"function\", \"name\": \"", "type": "tool_call" }, "event_type": { @@ -24759,7 +28769,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "_point\", \"parameters\": {\"liquid_name\": \"poly", + "tool_call": "get_boiling_point\", \"parameters\": {\"liquid_name\": \"", "type": "tool_call" }, "event_type": { @@ -24784,7 +28794,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "juice\"}}", + "tool_call": "polyjuice\"}}", "type": "tool_call" }, "event_type": { @@ -24813,7 +28823,7 @@ "arguments": { "liquid_name": "polyjuice" }, - "call_id": "83d9f330-4c7a-4dd3-8fcb-ccc5301c1f83", + "call_id": "98ea49c2-7788-4fda-8d84-4a584eb1dd27", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -24854,7 +28864,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 40 + } + ] } } ], @@ -24918,7 +28944,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\":", + "tool_call": "{\"type\": \"function\", \"name\": \"get", "type": "tool_call" }, "event_type": { @@ -24943,7 +28969,7 @@ "__module__": "llama_stack.apis.common.content_types", 
"value": "in_progress" }, - "tool_call": " \"get_boiling_point\", \"parameters\": {\"liquid_name\":", + "tool_call": "_boiling_point\", \"parameters\": {\"liquid_name\": \"polyju", "type": "tool_call" }, "event_type": { @@ -24968,7 +28994,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " \"polyjuice\"}}", + "tool_call": "ice\"}}", "type": "tool_call" }, "event_type": { @@ -24997,7 +29023,7 @@ "arguments": { "liquid_name": "polyjuice" }, - "call_id": "98c63572-06c8-4cc0-a14e-3b10fb9ddc19", + "call_id": "a9d6416d-a739-49b0-b7df-63694412a376", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -25038,7 +29064,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 40 + } + ] } } ], @@ -25092,7 +29134,7 @@ "data": { "event": { "delta": { - "text": " couldn't find any information on the boiling point of Polyjuice", + "text": " couldn't find any information on the boiling point of", "type": "text" }, "event_type": { @@ -25112,7 +29154,7 @@ "data": { "event": { "delta": { - "text": ". Polyjuice is a magical potion in the Harry Potter series", + "text": " Polyjuice. Polyjuice is a magical potion in the", "type": "text" }, "event_type": { @@ -25132,7 +29174,7 @@ "data": { "event": { "delta": { - "text": " that allows the drinker to transform into someone else. It's", + "text": " Harry Potter series that allows the drinker to transform into someone else", "type": "text" }, "event_type": { @@ -25152,7 +29194,7 @@ "data": { "event": { "delta": { - "text": " not a physical substance with a boiling point. If you have any", + "text": ". It's not a physical substance with a boiling point. 
If", "type": "text" }, "event_type": { @@ -25172,7 +29214,27 @@ "data": { "event": { "delta": { - "text": " other questions, I'd be happy to help.", + "text": " you have any other questions, I'd be happy to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " help.", "type": "text" }, "event_type": { @@ -25207,7 +29269,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 73 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 103 + } + ] } } ], @@ -25271,7 +29349,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\":", + "tool_call": "{\"type\": \"function\", \"name\": \"", "type": "tool_call" }, "event_type": { @@ -25296,7 +29374,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " \"get_boiling_point\", \"parameters\": {\"liquid_name\":", + "tool_call": "get_boiling_point\", \"parameters\": {\"liquid_name\":", "type": "tool_call" }, "event_type": { @@ -25350,7 +29428,7 @@ "arguments": { "liquid_name": "polyjuice" }, - "call_id": "cdccc866-97a0-40fd-b6e2-a0555f0ed921", + "call_id": "722b4201-b9ac-440f-80c4-0e9a7f0b5369", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -25391,7 +29469,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 40 + } + ] } } ], @@ -25486,6 +29580,111 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"def is_prime(n):\\n if n <= 1:\\n return False\\n if n <= 3:\\n return True\\n if n % 2 == 0 or n % 3 == 0:\\n return False\\n i = 5\\n while i * i <= n:\\n if n % i == 0 or n % (i + 2) == 0:\\n return False\\n i += 6\\n return True\\n\\ndef get_nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(get_nth_prime(100))\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": 
\"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\n541\\n[/stdout]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " 100th prime number is 541.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 217 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 20 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 237 + } + ] + } + } + ], + "type": "generator" + }, 
"[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { @@ -25544,7 +29743,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "def is_prime(n):\n if n <= 1:\n ", + "tool_call": "def is_prime(n):\n if n <= 1:\n return False\n", "type": "tool_call" }, "event_type": { @@ -25569,7 +29768,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " return False\n if n <= 3:\n return True", + "tool_call": " if n <= 3:\n return True\n if n % ", "type": "tool_call" }, "event_type": { @@ -25594,7 +29793,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i +", + "tool_call": "2 == 0 or n % 3 == 0:\n return False", "type": "tool_call" }, "event_type": { @@ -25619,7 +29818,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " 2) == 0:\n return False\n ", + "tool_call": "\n i = 5\n while i * i <= n:\n ", "type": "tool_call" }, "event_type": { @@ -25644,7 +29843,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " i += 6\n return True\n\ndef get_nth_prime(n):\n count", + "tool_call": " if n % i == 0 or n % (i + 2)", "type": "tool_call" }, "event_type": { @@ -25669,7 +29868,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " = 0\n num = 2\n while True:\n if", + "tool_call": " == 0:\n return False\n i", "type": "tool_call" }, "event_type": { @@ -25694,7 +29893,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " is_prime(num):\n count += 1", + "tool_call": " += 6\n return True", "type": "tool_call" }, "event_type": { @@ -25719,7 +29918,7 @@ "__module__": 
"llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\n if count == n:\n return num\n num += ", + "tool_call": "\n\n", "type": "tool_call" }, "event_type": { @@ -25744,7 +29943,82 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "1\n\nprint(get_nth_prime(100))", + "tool_call": "def get_nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n ", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " count += 1\n if count == n:\n return", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " num\n num += 1\n\nprint(get_nth_prime(100", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "))", "type": "tool_call" }, "event_type": { @@ -25773,7 +30047,7 @@ "arguments": { "code": "def is_prime(n):\n if n <= 1:\n return False\n if n <= 3:\n return True\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i + 2) == 0:\n return False\n i += 6\n return True\n\ndef get_nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count == n:\n return num\n num += 1\n\nprint(get_nth_prime(100))" }, - "call_id": "7fca0515-82f3-46e1-bbec-eceb8fa5162e", + "call_id": "229f89b4-d07d-4748-b8ae-f805cb52209e", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -25818,7 +30092,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 50 + } + ] } } ], @@ -25872,7 +30162,7 @@ "data": { "event": { "delta": { - "text": "plexity the company was founded in 2022", + "text": "plexity the company was founded in 202", "type": "text" }, "event_type": { @@ -25892,7 +30182,7 @@ "data": { "event": { "delta": { - "text": ".", + "text": "2.", "type": "text" }, "event_type": { @@ -25927,7 +30217,23 @@ "value": 
"end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 105 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 22 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 127 + } + ] } } ], @@ -25981,7 +30287,7 @@ "data": { "event": { "delta": { - "text": "type\": \"function\", \"name\": \"knowledge_search\",", + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"", "type": "text" }, "event_type": { @@ -26001,27 +30307,7 @@ "data": { "event": { "delta": { - "text": " \"parameters\": {\"query\": \"Perplexity company founding", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " date\"}}", + "text": "parameters\": {\"query\": \"Perplexity company founding date\"}}", "type": "text" }, "event_type": { @@ -26050,7 +30336,7 @@ "arguments": { "query": "Perplexity company founding date" }, - "call_id": "ca248109-25af-4737-90cb-6461faaf4e63", + "call_id": "5ea88dde-f090-4157-9219-45a16100ef21", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -26091,7 +30377,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 67 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 104 + } + ] } } ], @@ -26180,32 +30482,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\": {\"query\": \"Perplexity", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " company founding date\"}}", + "tool_call": "\": {\"query\": \"Perplexity company founding date\"}}", "type": "tool_call" }, "event_type": { @@ -26234,7 +30511,7 @@ "arguments": { "query": "Perplexity company founding date" }, - "call_id": "94a9fd55-7658-482d-8595-d2c2a23b3a1e", + "call_id": "06c95bef-9b2d-4380-bf16-e1338bb7cf2c", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -26275,7 +30552,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 29 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 39 + } + ] } } ], @@ -26438,7 +30731,7 @@ "data": { "event": { "delta": { - "text": "{\"", + "text": "The", "type": "text" }, "event_type": { @@ -26458,7 +30751,7 @@ "data": { "event": { "delta": { - "text": "type\": \"function\", \"name\": \"knowledge_search\", \"parameters", + "text": " NBA was created on August 3, 1949, with", "type": "text" }, "event_type": { @@ -26478,7 +30771,7 @@ "data": { "event": { "delta": { - "text": "\": {\"query\": 
\"when was the nba created\"}}", + "text": " the merger of the Basketball Association of America (BAA) and", "type": "text" }, "event_type": { @@ -26498,19 +30791,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "when was the nba created" - }, - "call_id": "7b01a40d-a6a8-4c86-b91d-1790e7480e57", - "tool_name": "knowledge_search" - }, - "type": "tool_call" + "text": " the National Basketball League (NBL).", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -26518,11 +30800,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -26548,7 +30826,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 65 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 45 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] } } ], @@ -26612,7 +30906,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\",", + "tool_call": "{\"type\": \"function\", \"", "type": "tool_call" }, "event_type": { @@ -26637,7 +30931,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " \"parameters\": {\"query\": \"when was the nba created", + "tool_call": "name\": \"knowledge_search\", \"parameters\": {\"query\": \"", "type": "tool_call" }, "event_type": { @@ -26662,7 +30956,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\"}}", + "tool_call": "when was the nba created\"}}", "type": "tool_call" }, "event_type": { @@ -26691,7 +30985,7 @@ "arguments": { "query": "when was the nba created" }, - "call_id": "bbaf750a-0337-4c83-9bf2-76c2f72d45c3", + "call_id": "b08bb4c0-c0a1-4063-b110-3947559e4061", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -26732,7 +31026,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 27 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 37 + } + ] } } ], diff --git a/tests/integration/fixtures/recorded_responses/invoke_tool.json b/tests/integration/fixtures/recorded_responses/invoke_tool.json index 76191e992..30a132904 100644 --- a/tests/integration/fixtures/recorded_responses/invoke_tool.json +++ b/tests/integration/fixtures/recorded_responses/invoke_tool.json @@ -5,7 +5,7 @@ "__module__": "llama_stack.apis.tools.tools", "__pydantic__": "ToolInvocationResult", "data": { - "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "content": "completed\n[stdout]\n541\n[/stdout]", "error_code": null, "error_message": null, "metadata": null @@ -31,7 +31,7 @@ "__module__": "llama_stack.apis.tools.tools", "__pydantic__": "ToolInvocationResult", "data": { - "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "content": 
"completed\n[stdout]\nNumber of rows and columns in the data: (10, 13)\nColumns of the data are: 13\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\n 'Oct', 'Nov', 'Dec'],\n dtype='object')\nDatatype of the columns are: Year int64\nJan float64\nFeb float64\nMar float64\nApr float64\nMay float64\nJun float64\nJul float64\nAug float64\nSep float64\nOct float64\nNov float64\nDec float64\ndtype: object\n[/stdout]", "error_code": null, "error_message": null, "metadata": null @@ -70,7 +70,7 @@ "__module__": "llama_stack.apis.tools.tools", "__pydantic__": "ToolInvocationResult", "data": { - "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "content": "completed\n[stdout]\nYear Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec\n0 2014 1.6 1.6 1.7 1.8 2.0 1.9 1.9 1.7 1.7 1.8 1.7 1.6\n1 2015 1.6 1.7 1.8 1.8 1.7 1.8 1.8 1.8 1.9 1.9 2.0 2.1\n2 2016 2.2 2.3 2.2 2.1 2.2 2.2 2.2 2.3 2.2 2.1 2.1 2.2\n3 2017 2.3 2.2 2.0 1.9 1.7 1.7 1.7 1.7 1.7 1.8 1.7 1.8\n4 2018 1.8 1.8 2.1 2.1 2.2 2.3 2.4 2.2 2.2 2.1 2.2 2.2\n[/stdout]", "error_code": null, "error_message": null, "metadata": null @@ -83,7 +83,7 @@ "__module__": "llama_stack.apis.tools.tools", "__pydantic__": "ToolInvocationResult", "data": { - "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 142, in \n line 23, in \n from .code_execution import CodeExecutionContext, CodeExecutionRequest, CodeExecutor\nImportError: attempted relative import with no known parent package\n[/stderr]", "error_code": null, "error_message": null, "metadata": null @@ -116,6 +116,19 @@ } } }, + "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n\\n# Calculate average yearly inflation\\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\\n\\n# Plot time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df['Year'], df['Average'])\\nplt.xlabel('Year')\\nplt.ylabel('Average Yearly Inflation')\\nplt.title('Average Yearly Inflation Over Time')\\nplt.grid(True)\\nplt.show()\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { + "type": "value", + "value": { + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": "completed", + "error_code": null, + "error_message": null, + "metadata": null + } + } + }, "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\\\"inflation.csv\\\")\\n\\n# Convert date column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { "type": "value", "value": { @@ -154,23 +167,23 @@ "type": "text" }, { - "text": "Result 
1:\nDocument_id:961ff\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. 
When using an optimizer with momentum,\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", + "text": "Result 2:\nDocument_id:cc255\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", "type": "text" }, { - "text": "Result 3:\nDocument_id:961ff\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. 
code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 3:\nDocument_id:cc255\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:961ff\nContent: from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. code-block:: python\n\n # Assuming that base_model already has the pretrained Llama2 weights,\n # this will directly load them into your LoRA model without any conversion necessary.\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n the loaded :code:`state_dict` are as expected. 
torchtune's LoRA recipes do this by default via\n :func:`validate_missing_and_unexpected_for_lora() `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. 
_lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \"}, \"tool_name\": \"web_search\"}]": { + "type": "value", + "value": { + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": "{\"query\": \"Meta founder\", \"top_k\": [{\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\u00a9 2025 Meta\", \"score\": 0.81595254, \"raw_content\": null}, {\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\u2018Boz\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\", \"score\": 0.70726365, \"raw_content\": null}, {\"title\": \"Meta - Leadership & Governance\", \"url\": \"https://investor.atmeta.com/leadership-and-governance/\", \"content\": \"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. 
Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\", \"score\": 0.467308, \"raw_content\": null}, {\"title\": \"Meta Platforms - Wikipedia\", \"url\": \"https://en.wikipedia.org/wiki/Meta_Platforms\", \"content\": \"Following a period of intense scrutiny and damaging whistleblower leaks, news started to emerge on October 21, 2021, about Facebook's plan to rebrand the company and change its name.[15][54] In the Q3 2021 Earnings Call on October 25, Mark Zuckerberg discussed the ongoing criticism of the company's social services and the way it operates, and pointed to the pivoting efforts to building the metaverse \\u2013 without mentioning the rebranding and the name change.[55] The metaverse vision and the name change from Facebook, Inc. to Meta Platforms was introduced at Facebook Connect on October 28, 2021.[16] Based on Facebook's PR campaign, the name change reflects the company's shifting long term focus of building the metaverse, a digital extension of the physical world by social media, virtual reality and augmented reality features.[16][56]\", \"score\": 0.14999175, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Wikipedia\", \"url\": \"https://en.wikipedia.org/wiki/Mark_Zuckerberg\", \"content\": \"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\u9648\\u660e\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\", \"score\": 0.03678684, \"raw_content\": null}]}", + "error_code": null, + "error_message": null, + "metadata": null + } + } + }, "[[], {\"kwargs\": {\"query\": \"NBA creation date\", \"session_id\": \"\", \"vector_db_ids\": [\"test-vector-db-\"]}, \"tool_name\": \"knowledge_search\"}]": { "type": "value", "value": { @@ -374,23 +400,23 @@ "type": "text" }, { - "text": "Result 1:\nDocument_id:24443\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\nlook like so:\n\n.. 
code-block:: python\n\n from torchtune.datasets import chat_dataset\n from torchtune.models.llama3 import llama3_tokenizer\n\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\n ds = chat_dataset(\n tokenizer=tokenizer,\n source=\"json\",\n data_files=\"data/my_data.json\",\n split=\"train\",\n conversation_column=\"dialogue\",\n conversation_style=\"sharegpt\",\n )\n\n.. code-block:: yaml\n\n # In config\n tokenizer:\n _component_: torchtune.models.llama3.llama3_tokenizer\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\n\n dataset:\n _component_: torchtune.datasets.chat_dataset\n source: json\n data_files: data/my_data.json\n split: train\n conversation_column: dialogue\n conversation_style: sharegpt\n\n.. note::\n You can pass in any keyword argument for `load_dataset `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. 
note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 3:\nDocument_id:7a06a\nContent: ` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:961ff\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. 
This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 4:\nDocument_id:cc255\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 5:\nDocument_id:b49f7\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. 
code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 5:\nDocument_id:7a06a\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { @@ -402,11 +428,11 @@ "error_message": null, "metadata": { "document_ids": [ - "24443dfb-a0b3-4ce8-820e-3fb1f12364bb", - "961ff2d1-8887-41ef-a4fe-fa4cbab7b932", - "b49f7985-6615-4dcf-99be-d1765b6a6fc6", - "961ff2d1-8887-41ef-a4fe-fa4cbab7b932", - "b49f7985-6615-4dcf-99be-d1765b6a6fc6" + "16a6ae01-049e-4a44-b305-8248d20a8f7d", + "cc2559a9-2b56-43d8-9ec4-b2181bb96acb", + "7a06a3a9-7e9d-4693-8c07-15343f0654aa", + "cc2559a9-2b56-43d8-9ec4-b2181bb96acb", + "7a06a3a9-7e9d-4693-8c07-15343f0654aa" ] } } From a505bf45a34891c1ed88633abeb88248db92889f Mon Sep 17 00:00:00 2001 From: ehhuang Date: Wed, 12 Mar 2025 19:41:48 -0700 Subject: [PATCH 034/557] feat(api): remove tool_name from ToolResponseMessage (#1599) Summary: This is not used anywhere. 
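For illustration only (not part of the patch): a minimal before/after sketch of a call site, mirroring the `agent_instance.py` hunk further down in this diff. The import path and the example values are assumptions for the sketch, not code taken from this PR.

```python
from llama_stack.apis.inference.inference import ToolResponseMessage

# Before this change, the tool name had to be threaded through the message:
#   ToolResponseMessage(call_id=call_id, tool_name=tool_call.tool_name, content=result.content)
# After it, the message carries only the call id and the tool's output;
# role defaults to "tool" per the model definition in inference.py.
msg = ToolResponseMessage(call_id="call-123", content="-100")
```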
closes #1421 Test Plan: LLAMA_STACK_CONFIG=fireworks pytest -s -v tests/integration/agents/test_agents.py --safety-shield meta-llama/Llama-Guard-3-8B --text-model meta-llama/Llama-3.1-8B-Instruct --record-responses --- docs/_static/llama-stack-spec.html | 39 +- docs/_static/llama-stack-spec.yaml | 28 +- llama_stack/apis/inference/inference.py | 2 - .../agents/meta_reference/agent_instance.py | 8 +- .../recorded_responses/chat_completion.json | 8712 ++++++++++++++++- .../recorded_responses/invoke_tool.json | 40 +- 6 files changed, 8637 insertions(+), 192 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index dbd530aa3..c50554092 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -4347,24 +4347,6 @@ "type": "string", "description": "Unique identifier for the tool call this response is for" }, - "tool_name": { - "oneOf": [ - { - "type": "string", - "enum": [ - "brave_search", - "wolfram_alpha", - "photogen", - "code_interpreter" - ], - "title": "BuiltinTool" - }, - { - "type": "string" - } - ], - "description": "Name of the tool that was called" - }, "content": { "$ref": "#/components/schemas/InterleavedContent", "description": "The response content from the tool" @@ -4374,7 +4356,6 @@ "required": [ "role", "call_id", - "tool_name", "content" ], "title": "ToolResponseMessage", @@ -4673,12 +4654,22 @@ "CompletionResponse": { "type": "object", "properties": { +<<<<<<< dest: ed6caead724a - ehhuang: chore: simplify _get_tool_defs (#1384) "metrics": { "type": "array", "items": { "$ref": "#/components/schemas/MetricInResponse" } }, +||||||| base: 1311faf3f5e7 - ehhuang: fix: logging (#1598) +======= + "metrics": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MetricEvent" + } + }, +>>>>>>> source: ad32270ad0d5 - erichuang: feat(api): remove tool_name from To... "content": { "type": "string", "description": "The generated completion text" @@ -5046,12 +5037,22 @@ "CompletionResponseStreamChunk": { "type": "object", "properties": { +<<<<<<< dest: ed6caead724a - ehhuang: chore: simplify _get_tool_defs (#1384) "metrics": { "type": "array", "items": { "$ref": "#/components/schemas/MetricInResponse" } }, +||||||| base: 1311faf3f5e7 - ehhuang: fix: logging (#1598) +======= + "metrics": { + "type": "array", + "items": { + "$ref": "#/components/schemas/MetricEvent" + } + }, +>>>>>>> source: ad32270ad0d5 - erichuang: feat(api): remove tool_name from To... "delta": { "type": "string", "description": "New content generated since last chunk. This can be one or more tokens." 
diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index cca1872a4..1f9536c2e 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -2943,17 +2943,6 @@ components: type: string description: >- Unique identifier for the tool call this response is for - tool_name: - oneOf: - - type: string - enum: - - brave_search - - wolfram_alpha - - photogen - - code_interpreter - title: BuiltinTool - - type: string - description: Name of the tool that was called content: $ref: '#/components/schemas/InterleavedContent' description: The response content from the tool @@ -2961,7 +2950,6 @@ components: required: - role - call_id - - tool_name - content title: ToolResponseMessage description: >- @@ -3188,10 +3176,18 @@ components: CompletionResponse: type: object properties: +<<<<<<< dest: ed6caead724a - ehhuang: chore: simplify _get_tool_defs (#1384) metrics: type: array items: $ref: '#/components/schemas/MetricInResponse' +||||||| base: 1311faf3f5e7 - ehhuang: fix: logging (#1598) +======= + metrics: + type: array + items: + $ref: '#/components/schemas/MetricEvent' +>>>>>>> source: ad32270ad0d5 - erichuang: feat(api): remove tool_name from To... content: type: string description: The generated completion text @@ -3510,10 +3506,18 @@ components: CompletionResponseStreamChunk: type: object properties: +<<<<<<< dest: ed6caead724a - ehhuang: chore: simplify _get_tool_defs (#1384) metrics: type: array items: $ref: '#/components/schemas/MetricInResponse' +||||||| base: 1311faf3f5e7 - ehhuang: fix: logging (#1598) +======= + metrics: + type: array + items: + $ref: '#/components/schemas/MetricEvent' +>>>>>>> source: ad32270ad0d5 - erichuang: feat(api): remove tool_name from To... delta: type: string description: >- diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py index fa917ac22..0a4324cdf 100644 --- a/llama_stack/apis/inference/inference.py +++ b/llama_stack/apis/inference/inference.py @@ -117,13 +117,11 @@ class ToolResponseMessage(BaseModel): :param role: Must be "tool" to identify this as a tool response :param call_id: Unique identifier for the tool call this response is for - :param tool_name: Name of the tool that was called :param content: The response content from the tool """ role: Literal["tool"] = "tool" call_id: str - tool_name: Union[BuiltinTool, str] content: InterleavedContent diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 3f09cacc0..0ae1996cc 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -153,7 +153,6 @@ class ChatAgent(ShieldRunnerMixin): messages.append( ToolResponseMessage( call_id=response.call_id, - tool_name=response.tool_name, content=response.content, ) ) @@ -221,8 +220,7 @@ class ChatAgent(ShieldRunnerMixin): messages = await self.get_messages_from_turns(turns) if is_resume: tool_response_messages = [ - ToolResponseMessage(call_id=x.call_id, tool_name=x.tool_name, content=x.content) - for x in request.tool_responses + ToolResponseMessage(call_id=x.call_id, content=x.content) for x in request.tool_responses ] messages.extend(tool_response_messages) last_turn = turns[-1] @@ -685,7 +683,6 @@ class ChatAgent(ShieldRunnerMixin): result_messages = [ ToolResponseMessage( call_id=tool_call.call_id, - tool_name=tool_call.tool_name, content=tool_result.content, 
) ] @@ -705,7 +702,7 @@ class ChatAgent(ShieldRunnerMixin): tool_responses=[ ToolResponse( call_id=result_message.call_id, - tool_name=result_message.tool_name, + tool_name=tool_call.tool_name, content=result_message.content, metadata=tool_result.metadata, ) @@ -999,7 +996,6 @@ async def attachment_message(tempdir: str, urls: List[URL]) -> ToolResponseMessa return ToolResponseMessage( call_id="", - tool_name=BuiltinTool.code_interpreter, content=content, ) diff --git a/tests/integration/fixtures/recorded_responses/chat_completion.json b/tests/integration/fixtures/recorded_responses/chat_completion.json index 30b7e0b4d..8694cc271 100644 --- a/tests/integration/fixtures/recorded_responses/chat_completion.json +++ b/tests/integration/fixtures/recorded_responses/chat_completion.json @@ -12758,6 +12758,292 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"false\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": 
{\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice is -100 degrees Fahrenheit.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 139 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 23 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 162 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"get_boiling_point\", \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + 
"__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "parameters\": {\"liquid_name\": \"polyjuice\", \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "celcius\": \"false\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": "false", + "liquid_name": "polyjuice" + }, + "call_id": "1ef7adda-5ebb-41d5-a2c6-3e6700de5f81", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 91 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 45 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 136 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -12806,7 +13092,7 @@ "data": { "event": { "delta": { - "text": " \"type\": \"function\",\n \"", + "text": " \"type\": \"function\",\n \"name\": \"get", "type": "text" }, "event_type": { @@ -12826,7 +13112,7 @@ "data": { "event": { "delta": { - "text": "name\": \"get_boiling_point\",\n \"parameters\":", + "text": "_boiling_point\",\n \"parameters\": {\n \"liquid", "type": "text" }, "event_type": { @@ -12846,7 +13132,7 @@ "data": { "event": { "delta": { - "text": " {\n \"liquid_name\": \"polyjuice\",\n \"celci", + "text": "_name\": \"polyjuice\",\n \"celci", "type": "text" }, "event_type": { @@ -12896,7 +13182,7 @@ "celcius": "true", "liquid_name": "polyjuice" }, - "call_id": "73212def-09c0-4a29-845e-149afb38fcd1", + "call_id": "40293d5b-8a76-4df5-8325-d6e8755ba513", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -13084,6 +13370,111 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], 
{\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice is -100\u00b0C.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 85 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 22 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 107 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful 
assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { "chunks": [ { @@ -13209,6 +13600,131 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", 
\"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice is -", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "100 degrees Celcius.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 87 + }, + { + "metric": "completion_tokens", + 
"unit": null, + "value": 25 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 112 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -13267,7 +13783,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name", + "tool_call": "{\"type\": \"function\", \"", "type": "tool_call" }, "event_type": { @@ -13292,7 +13808,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\": \"get_boiling_point\", \"parameters\": {\"liquid_name\": \"poly", + "tool_call": "name\": \"get_boiling_point\", \"parameters", "type": "tool_call" }, "event_type": { @@ -13317,7 +13833,32 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "juice\", \"celcius\": \"true\"}}", + "tool_call": "\": {\"liquid_name\": \"polyjuice\", \"celci", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "us\": \"true\"}}", "type": "tool_call" }, "event_type": { @@ -13347,7 +13888,7 @@ "celcius": "true", "liquid_name": "polyjuice" }, - "call_id": "e4b0121a-7b75-4e89-be40-d13021a3bb11", + "call_id": "f146d04b-c400-4193-a6d8-ccfea7f7b529", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -13468,7 
+14009,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling_point", + "tool_call": "{\"type\": \"function\", \"name\": \"", "type": "tool_call" }, "event_type": { @@ -13493,7 +14034,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "_with_metadata\", \"parameters\": {\"liquid_name\": \"polyjuice\",", + "tool_call": "get_boiling_point_with_metadata\", \"parameters\": {\"liquid", "type": "tool_call" }, "event_type": { @@ -13518,7 +14059,57 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " \"celcius\": \"true\"}}", + "tool_call": "_name\": \"polyjuice\", \"celci", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "us\": \"", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "true\"}}", "type": "tool_call" }, "event_type": { @@ -13548,7 +14139,7 @@ "celcius": "true", "liquid_name": "polyjuice" }, - "call_id": "11c04896-2b7b-49bd-b832-47a1c9f3796f", + "call_id": "d6b8a25d-9b4c-4650-bbe6-f94b5fa97e56", "tool_name": "get_boiling_point_with_metadata" }, "type": "tool_call" @@ -13659,7 +14250,7 @@ "data": { "event": { "delta": { - "text": " customer smiled and said \"hello\" to the", + "text": " customer smiled and said \"hello\" to the friendly store", "type": "text" }, "event_type": { @@ -13679,7 +14270,7 @@ "data": { "event": { "delta": { - "text": " friendly store clerk.", + "text": " clerk.", "type": "text" }, "event_type": { @@ -16564,6 +17155,1609 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": 
\"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\ndf = pd.read_csv(\\\"\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nYear Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec\\n0 2014 1.6 1.6 1.7 1.8 2.0 1.9 1.9 1.7 1.7 1.8 1.7 1.6\\n1 2015 1.6 1.7 1.8 1.8 1.7 1.8 1.8 1.8 1.9 1.9 2.0 2.1\\n2 2016 2.2 2.3 2.2 2.1 2.2 2.2 2.2 2.3 2.2 2.1 2.1 2.2\\n3 2017 2.3 2.2 2.0 1.9 1.7 1.7 1.7 1.7 1.7 1.8 1.7 1.8\\n4 2018 1.8 1.8 2.1 2.1 2.2 2.3 2.4 2.2 2.2 2.1 2.2 2.2\\n[/stdout]\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " csv file contains a table with 12 columns (Jan to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Dec) and 5 rows (2014 to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " 2018). 
The values in the table represent the inflation rate", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " for each month of the year from 2014", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to 2018.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 469 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 61 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 530 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": 
\"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 142, in \\n line 23, in \\n from .code_execution import CodeExecutionContext, CodeExecutionRequest, CodeExecutor\\nImportError: attempted relative import with no known parent package\\n[/stderr]\", \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 142, in \\n line 23, in \\n from .code_execution import CodeExecutionContext, CodeExecutionRequest, CodeExecutor\\nImportError: attempted relative import with no known parent package\\n[/stderr]\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " error message indicates that there is an issue with", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the import statement. 
However, the code provided does not contain any", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " import statements that would cause this error.\n\nTo provide a more accurate", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " answer, I would need to know the contents of the CSV file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " or more information about the error message.\n\nHowever, based on the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " code provided, it seems like the code is trying to load a", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " CSV file and print some basic information about it. 
If the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is not found or there is an issue with the file path,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " this could cause an error.\n\nHere is a", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " revised version of the code that includes some error", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " handling:\n\n```\nimport pandas as pd\nimport code_interpreter", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\n\ntry:\n # Load the CSV file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\n df = pd.read_csv(\"/var/folders/cz", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "/vyh7y1d11xg", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", 
+ "data": { + "event": { + "delta": { + "text": "881lsxsshnc5", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "c0000gn/T/tmpflpgiagc/", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "8S20Zj2Oinflation.csv\")\n\n ", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " # Print the first few rows of the dataframe\n print(df.head", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "())\n\n # Print the data types of each column\n print", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "(df.dtypes)\n\n # Print the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " summary statistics of the dataframe\n ", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " print(df.describe())\n\nexcept FileNotFoundError:\n print(\"The file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": 
"llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " was not found.\")\nexcept pd.errors.EmptyDataError", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ":\n print(\"The file is empty.\")\nexcept pd.errors.ParserError", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ":\n print(\"An error occurred while parsing the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file.\")\nexcept Exception as e:\n print", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "(\"An error occurred: \", str(e))\n``", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "`\n\nThis code will catch specific exceptions that could occur when loading the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " CSV file and print a more", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " informative error message.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", 
+ "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 393 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 331 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 724 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 142, in \\n line 23, in \\n from .code_execution import CodeExecutionContext, CodeExecutionRequest, CodeExecutor\\nImportError: attempted relative import with no known parent package\\n[/stderr]\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, 
\"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport code_interpreter\n\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Load the CSV file\ndf = pd.read_csv(\"/var/f", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "olders/cz/vyh7y1d11xg881lsx", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + 
"__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "sshnc5c0000gn/T/tmpfl", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "pgiagc/8S20Zj2Oinflation", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".csv\")\n\n# Print the first few rows of the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " dataframe\nprint(df.head())\n\n# Print the data types of each", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " column\nprint(df.dtypes)\n\n#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Print the summary statistics of the dataframe\nprint(df.describe())", 
+ "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpflpgiagc/8S20Zj2Oinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" + }, + "call_id": "e999a578-cbd8-4bb8-bc53-deb2fff1ffce", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 215 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 225 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": 
\"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport code_interpreter\n\n# Load the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " CSV file\ndf = pd.read", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": 
"llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_csv(\"/var/folders/cz/vyh", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "7y1d11xg881lsxsshnc5c", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "0000gn/T/tmpflpgiagc/8S", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "20Zj2Oinflation.csv\")\n\n# Print the first", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " few rows of the dataframe\nprint(df.head())\n\n#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Print the data types of each column\nprint", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { 
+ "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(df.dtypes)\n\n# Print the summary statistics of the dataframe", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\nprint(df.describe())", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpflpgiagc/8S20Zj2Oinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" + }, + "call_id": "ea72d524-2d0f-4220-a898-4c295315235e", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 47 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", 
\"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"It seems that the file \\\"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If the file is in your current directory, you can use the following code:\\n\\n```python\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n# Print the first 5 rows of the dataframe\\nprint(df.head())\\n# Print the summary of the dataframe\\nprint(df.info())\\nprint(df.describe())\\n```\\n\\nThis will print the first 5 rows of the dataframe, the summary of the dataframe (including the index dtype and column count), and the description of the dataframe (including count, mean, std, min, 25%, 50%, 75%, max for each column).\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n\\n# Convert 'date' column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot the time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, 
{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { @@ -19488,6 +21682,1269 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nNumber of rows and columns in the data: (10, 13)\\nColumns of the data are: 13\\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\\n 'Oct', 'Nov', 'Dec'],\\n dtype='object')\\nDatatype of the columns are: Year int64\\nJan float64\\nFeb float64\\nMar float64\\nApr float64\\nMay float64\\nJun float64\\nJul 
float64\\nAug float64\\nSep float64\\nOct float64\\nNov float64\\nDec float64\\ndtype: object\\n[/stdout]\", \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"This CSV file contains 10 rows and 13 columns. The columns are named 'Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'. The data types of these columns are int64 for 'Year' and float64 for the rest.\\n\\nIt appears that this CSV file contains monthly inflation rates for different years. The 'Year' column represents the year, and the rest of the columns represent the inflation rates for each month of the year.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n\\n# Calculate average yearly inflation\\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\\n\\n# Plot time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df['Year'], df['Average'])\\nplt.xlabel('Year')\\nplt.ylabel('Average Yearly Inflation')\\nplt.title('Average Yearly Inflation Over Time')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": 
"llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "This", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " code will create a line plot of the average yearly inflation over time. The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " x-axis represents the year, and the y-axis represents the average yearly inflation", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". 
The plot will show the trend of average yearly inflation over the years", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 635 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 56 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 691 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nNumber of rows and columns in the data: (10, 13)\\nColumns of the data are: 13\\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\\n 'Oct', 'Nov', 'Dec'],\\n dtype='object')\\nDatatype of the columns are: Year int64\\nJan float64\\nFeb float64\\nMar float64\\nApr float64\\nMay float64\\nJun float64\\nJul float64\\nAug float64\\nSep float64\\nOct float64\\nNov float64\\nDec float64\\ndtype: object\\n[/stdout]\", \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": 
\"This CSV file contains 10 rows and 13 columns. The columns are named 'Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'. The data types of these columns are int64 for 'Year' and float64 for the rest.\\n\\nIt appears that this CSV file contains monthly inflation rates for different years. The 'Year' column represents the year, and the rest of the columns represent the inflation rates for each month of the year.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", 
+ "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " plt\n\n# Load data\ndf = pd.read_csv(\"/var/f", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "olders/cz/vyh7y1d11xg881lsx", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "sshnc5c0000gn/T/tmpflpgiagc/", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "2VkeqrPlinflation.csv\")\n\n# Calculate average yearly inflation", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\ndf['Average'] = df[['Jan', 'Feb', '", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Mar', 'Apr', 'May', 'Jun', 'Jul',", + "type": "tool_call" + }, + "event_type": { + "__enum__": 
"ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 'Aug', 'Sep', 'Oct', 'Nov', 'Dec", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "']].mean(axis=1)\n\n# Plot time series\nplt.figure(figsize", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "=(10,6))\nplt.plot(df['Year'], df['Average", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "'])\nplt.xlabel('Year')\nplt.ylabel('Average Yearly", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Inflation')\nplt.title('Average Yearly Inflation Over", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": 
"in_progress" + }, + "tool_call": " Time')\nplt.grid(True)\nplt.show()", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpflpgiagc/2VkeqrPlinflation.csv\")\n\n# Calculate average yearly inflation\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\n\n# Plot time series\nplt.figure(figsize=(10,6))\nplt.plot(df['Year'], df['Average'])\nplt.xlabel('Year')\nplt.ylabel('Average Yearly Inflation')\nplt.title('Average Yearly Inflation Over Time')\nplt.grid(True)\nplt.show()" + }, + "call_id": "f82fa3fd-e3be-4cb7-9298-8b4625cf709e", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 454 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 464 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data 
are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nNumber of rows and columns in the data: (10, 13)\\nColumns of the data are: 13\\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\\n 'Oct', 'Nov', 'Dec'],\\n dtype='object')\\nDatatype of the columns are: Year int64\\nJan float64\\nFeb float64\\nMar float64\\nApr float64\\nMay float64\\nJun float64\\nJul float64\\nAug float64\\nSep float64\\nOct float64\\nNov float64\\nDec float64\\ndtype: object\\n[/stdout]\", \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"This CSV file contains 10 rows and 13 columns. The columns are named 'Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'. The data types of these columns are int64 for 'Year' and float64 for the rest.\\n\\nThe 'Year' column likely contains the year for which the inflation rates are given. The other columns ('Jan' to 'Dec') likely contain the inflation rates for each month of the year.\\n\\nPlease note that the actual data in the CSV file is not provided, so the above description is based on the structure of the file.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n\\n# Calculate average yearly inflation\\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\\n\\n# Plot time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df['Year'], df['Average'])\\nplt.xlabel('Year')\\nplt.ylabel('Average Yearly Inflation')\\nplt.title('Average Yearly Inflation Over Time')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": 
\"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "This", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " code will create a line plot of the average yearly inflation over", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " time. The x-axis represents the year and the y-axis", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " represents the average yearly inflation. 
The plot will show the trend", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " of average yearly inflation over the years.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 661 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 55 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 716 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nNumber of rows and columns in the data: (10, 13)\\nColumns of the data are: 13\\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\\n 'Oct', 'Nov', 'Dec'],\\n dtype='object')\\nDatatype of the columns are: Year int64\\nJan float64\\nFeb float64\\nMar float64\\nApr float64\\nMay float64\\nJun float64\\nJul float64\\nAug float64\\nSep float64\\nOct float64\\nNov float64\\nDec float64\\ndtype: object\\n[/stdout]\", \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": 
\"This CSV file contains 10 rows and 13 columns. The columns are named 'Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'. The data types of these columns are int64 for 'Year' and float64 for the rest.\\n\\nThe 'Year' column likely contains the year for which the inflation rates are given. The other columns ('Jan' to 'Dec') likely contain the inflation rates for each month of the year.\\n\\nPlease note that the actual data in the CSV file is not provided, so the above description is based on the structure of the file.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\n", + "type": 
"tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "df = pd.read_csv(\"/var/folders/cz/vyh7", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "y1d11xg881lsxsshnc5c0000", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "gn/T/tmpfsp7c9_g/Aih5TPOuin", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation.csv\")\n\n# Calculate average yearly inflation", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\ndf['Average'] = df[['Jan', 'Feb',", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + 
"tool_call": " 'Mar', 'Apr', 'May', 'Jun', '", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Jul', 'Aug', 'Sep', 'Oct', 'Nov", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "', 'Dec']].mean(axis=1)\n\n# Plot time series", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\nplt.figure(figsize=(10,6))\nplt.plot(df['", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Year'], df['Average'])\nplt.xlabel('Year')\nplt.ylabel", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "('Average Yearly Inflation')\nplt.title('", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": 
"ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Average Yearly Inflation Over Time')\nplt.grid(True)\nplt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".show()", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpfsp7c9_g/Aih5TPOuinflation.csv\")\n\n# Calculate average yearly inflation\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\n\n# Plot time series\nplt.figure(figsize=(10,6))\nplt.plot(df['Year'], df['Average'])\nplt.xlabel('Year')\nplt.ylabel('Average Yearly Inflation')\nplt.title('Average Yearly Inflation Over Time')\nplt.grid(True)\nplt.show()" + }, + "call_id": "dce1b106-06e1-4163-ae85-f9a2491f4375", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 480 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 490 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": 
{\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { @@ -20531,6 +23988,645 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": 
\"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nNumber of rows and columns in the data: (10, 13)\\nColumns of the data are: 13\\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\\n 'Oct', 'Nov', 'Dec'],\\n dtype='object')\\nDatatype of the columns are: Year int64\\nJan float64\\nFeb float64\\nMar float64\\nApr float64\\nMay float64\\nJun float64\\nJul float64\\nAug float64\\nSep float64\\nOct float64\\nNov float64\\nDec float64\\ndtype: object\\n[/stdout]\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": 
"ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "This", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " CSV file contains 10 rows and 13 columns. The columns are", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " named 'Year', 'Jan', 'Feb', 'Mar', '", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "Apr', 'May', 'Jun', 'Jul', 'Aug',", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " 'Sep', 'Oct', 'Nov', 'Dec'. The data", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " types of these columns are int64 for 'Year", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "' and float64 for the rest.\n\nIt appears that this CSV file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " contains monthly inflation rates for different years. 
The 'Year' column represents", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the year, and the rest of the columns represent the inflation rates", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " for each month of the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " year.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 327 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 125 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 452 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": 
{\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\n# Load data\ndf = pd.read", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_csv(\"/var/folders/cz/vyh7", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "y1d11xg881lsxsshnc5c000", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": 
null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "0gn/T/tmpflpgiagc/2VkeqrPlinflation", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".csv\")\n# Rows\nprint(\"Number of rows and columns in", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ":\", len(df.columns))\n# Column names\nprint(\"Columns of the data", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " the columns are:\", df.dtypes)", + "type": "tool_call" + }, + "event_type": { + "__enum__": 
"ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpflpgiagc/2VkeqrPlinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)" + }, + "call_id": "b8aab119-7997-428e-81ab-e6aa163f7acc", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 36 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 46 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. 
Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:02bc2\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. 
grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:e40e6\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:200a9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. 
code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:e40e6\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:e40e6\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:e40e6\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:7bf28\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. 
For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:b299f\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:af719\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. 
-----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:b299f\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. 
When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:b299f\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:b299f\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. 
torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:7bf28\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. 
This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:b299f\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:af719\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\":", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"knowledge_search\", \"parameters", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\": {\"query\": \"How to use Lo", + "type": "text" + }, + "event_type": { + 
"__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "RA in Torchtune\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "3d9a3bd1-4a05-4feb-b5a2-eed7a7a24f1b", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 117 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 157 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:7bf28\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. 
For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. 
.. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:b299f\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:af719\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. 
-----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Torchtune based on the documentation you provided", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". 
What's your first question?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 35 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8c1f5\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. 
code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:13786\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. 
_lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:f9c19\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:13786\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. 
When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:13786\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:13786\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. 
torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:bbddb\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. 
This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:15b86\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:83901\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:15b86\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:15b86\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:15b86\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:bbddb\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:15b86\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:83901\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"knowledge_search\",", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"parameters\": {\"query\": \"How to use LoRA", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " in Torchtune\"}}", + "type": "text" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "548b1430-be4a-4c22-9430-62bda6dd150c", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 117 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 157 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:bbddb\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. 
Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:15b86\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. 
Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:83901\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about Torchtune based on", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the documentation you provided. 
What's your first question", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 35 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:da8ed\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. 
code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:65275\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. 
_lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:f4ddd\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:65275\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:65275\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:65275\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:da8ed\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:65275\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:f4ddd\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"knowledge", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "_search\", \"parameters\": {\"query\": \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "How to use LoRA in Torchtune\"}}", + "type": "text" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "b1a5c1c5-905e-4206-95f6-e30f9b07376d", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 117 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 157 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:da8ed\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. 
Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:65275\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. 
Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:f4ddd\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about Torcht", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "une based on the documentation you provided. 
What's your first", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " question?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 35 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:ea3f6\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. 
code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:5c435\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. 
Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:91d52\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:5c435\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. 
When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:5c435\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:5c435\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. 
torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:fa9cd\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. 
This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:6dc04\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:6f75f\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:6dc04\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:6dc04\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:6dc04\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:fa9cd\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:6dc04\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:6f75f\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "parameters\": {\"query\": \"How to use LoRA in", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Torchtune\"}}", + "type": "text" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "d4e8b8eb-a0be-4434-b270-48315bf20723", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 117 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 157 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:fa9cd\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. 
Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:6dc04\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. 
Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:6f75f\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Torchtune based on the documentation you provided", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". 
What's your first question?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 35 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { @@ -26713,7 +33769,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\",", + "tool_call": "{\"type\": \"function\", \"name\": \"", "type": "tool_call" }, "event_type": { @@ -26738,7 +33794,32 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " \"parameters\": {\"query\": \"Torchtune documentation\"}}", + "tool_call": "knowledge_search\", \"parameters\": {\"query\": \"Tor", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "chtune documentation\"}}", "type": "tool_call" }, "event_type": { @@ -26767,7 +33848,7 @@ "arguments": { "query": "Torchtune documentation" }, - "call_id": "8b279fd4-0479-48d4-8ef7-882004f994b2", + "call_id": "cf722fb9-6067-46ea-8534-852b7d364278", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -27080,6 +34161,256 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. 
In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. 
code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " attention type used by Llama3-8B", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is grouped-query attention.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 80 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 26 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 106 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": 
\"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. 
Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " attention type used by Llama3-8B", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is grouped-query attention.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 80 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 26 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 106 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": 
{\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { @@ -27128,7 +34459,7 @@ "data": { "event": { "delta": { - "text": " \"type\": \"function\",\n \"name\": \"knowledge", + "text": " \"type\": \"function\",\n \"name\": \"knowledge_search\",\n", "type": "text" }, "event_type": { @@ -27148,7 +34479,7 @@ "data": { "event": { "delta": { - "text": "_search\",\n \"parameters\": {\n \"query", + "text": " \"parameters\": {\n \"query\": \"L", "type": "text" }, "event_type": { @@ -27168,7 +34499,7 @@ "data": { "event": { "delta": { - "text": "\": \"Llama3-8B attention type\"\n }\n}", + "text": "lama3-8B attention type\"\n }\n}", "type": "text" }, "event_type": { @@ -27197,7 +34528,7 @@ "arguments": { "query": "Llama3-8B attention type" }, - "call_id": "34e18a7e-5c52-403b-9dd3-cdb3f7f8cb89", + "call_id": "9106bccf-d0c5-4b0a-9398-0b5972ada295", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -27318,7 +34649,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters", + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\",", "type": "tool_call" }, "event_type": { @@ -27343,7 +34674,32 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\": {\"query\": \"Llama3-8B attention type\"}}", + "tool_call": " \"parameters\": {\"query\": \"Llama3-8", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "B attention type\"}}", "type": "tool_call" }, "event_type": { @@ -27372,7 +34728,7 @@ "arguments": { "query": "Llama3-8B attention type" }, - "call_id": "2c25de18-5466-4f97-bef1-b463abbbcd55", + "call_id": "768fe977-8297-42bd-90c3-b1dc07882ce0", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -28057,6 +35413,111 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", 
\"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the founder of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Meta founder\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"Meta founder\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.81595254, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.70726365, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. 
Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\\\", \\\"score\\\": 0.467308, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta Platforms - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Meta_Platforms\\\", \\\"content\\\": \\\"Following a period of intense scrutiny and damaging whistleblower leaks, news started to emerge on October 21, 2021, about Facebook's plan to rebrand the company and change its name.[15][54] In the Q3 2021 Earnings Call on October 25, Mark Zuckerberg discussed the ongoing criticism of the company's social services and the way it operates, and pointed to the pivoting efforts to building the metaverse \\\\u2013 without mentioning the rebranding and the name change.[55] The metaverse vision and the name change from Facebook, Inc. to Meta Platforms was introduced at Facebook Connect on October 28, 2021.[16] Based on Facebook's PR campaign, the name change reflects the company's shifting long term focus of building the metaverse, a digital extension of the physical world by social media, virtual reality and augmented reality features.[16][56]\\\", \\\"score\\\": 0.14999175, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.03678684, \\\"raw_content\\\": null}]}\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, 
\"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " founder of Meta is Mark Zuckerberg.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 1220 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 18 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 1238 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the founder of Meta is.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": 
\"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { @@ -28115,7 +35576,32 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "brave_search.call(query=\"Meta founder\")", + "tool_call": "brave_search.call(query", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "=\"Meta founder\")", "type": "tool_call" }, "event_type": { @@ -28144,7 +35630,7 @@ "arguments": { "query": "Meta founder" }, - "call_id": "3918bbe3-28b2-411d-be4a-f0eb2aa137e8", + "call_id": "b81c41ae-5eb7-41b7-b466-78eb25a91bb7", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -28686,6 +36172,461 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, 
\"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function `get_boiling_point` is not able to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " find the boiling point of polyjuice as", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " it is a fictional liquid from the Harry Potter series. 
The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function is only able to find the boiling point of real liquids", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 56 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 126 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", 
\"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function `get_boiling_point` is not able to find the boiling point", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " of polyjuice as it is not a real liquid.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 38 + }, + { 
+ "metric": "total_tokens", + "unit": null, + "value": 108 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function `get_boiling_point` is not able to find the", + "type": "text" + }, + "event_type": { + 
"__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice as", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " it is not a real liquid.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 38 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 108 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -28744,7 +36685,7 @@ "__module__": 
"llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"", + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling_point\",", "type": "tool_call" }, "event_type": { @@ -28769,32 +36710,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "get_boiling_point\", \"parameters\": {\"liquid_name\": \"", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "polyjuice\"}}", + "tool_call": " \"parameters\": {\"liquid_name\": \"polyjuice\"}}", "type": "tool_call" }, "event_type": { @@ -28823,7 +36739,7 @@ "arguments": { "liquid_name": "polyjuice" }, - "call_id": "98ea49c2-7788-4fda-8d84-4a584eb1dd27", + "call_id": "b63f9b8c-c514-48bb-8e0f-788b29c1c106", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -28944,7 +36860,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"get", + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", "type": "tool_call" }, "event_type": { @@ -28969,7 +36885,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "_boiling_point\", \"parameters\": {\"liquid_name\": \"polyju", + "tool_call": "_point\", \"parameters\": {\"liquid_name\": \"polyjuice", "type": "tool_call" }, "event_type": { @@ -28994,7 +36910,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "ice\"}}", + "tool_call": "\"}}", "type": "tool_call" }, "event_type": { @@ -29023,7 +36939,7 @@ "arguments": { "liquid_name": "polyjuice" }, - "call_id": "a9d6416d-a739-49b0-b7df-63694412a376", + "call_id": "ec121f44-66e0-47e8-971a-211142998c65", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -29134,7 +37050,7 @@ "data": { "event": { "delta": { - "text": " couldn't find any information on the boiling point of", + "text": " couldn't find any information on the boiling point of Polyjuice", "type": "text" }, "event_type": { @@ -29154,7 +37070,7 @@ "data": { "event": { "delta": { - "text": " Polyjuice. Polyjuice is a magical potion in the", + "text": ". Polyjuice is a magical potion in the Harry Potter series", "type": "text" }, "event_type": { @@ -29174,7 +37090,7 @@ "data": { "event": { "delta": { - "text": " Harry Potter series that allows the drinker to transform into someone else", + "text": " that allows the drinker to transform into someone else.", "type": "text" }, "event_type": { @@ -29194,7 +37110,7 @@ "data": { "event": { "delta": { - "text": ". It's not a physical substance with a boiling point. If", + "text": " It's not a physical substance with a boiling point. 
If", "type": "text" }, "event_type": { @@ -29214,7 +37130,7 @@ "data": { "event": { "delta": { - "text": " you have any other questions, I'd be happy to", + "text": " you have any other questions, I'd", "type": "text" }, "event_type": { @@ -29234,7 +37150,7 @@ "data": { "event": { "delta": { - "text": " help.", + "text": " be happy to help.", "type": "text" }, "event_type": { @@ -29349,7 +37265,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"", + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", "type": "tool_call" }, "event_type": { @@ -29374,7 +37290,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "get_boiling_point\", \"parameters\": {\"liquid_name\":", + "tool_call": "_point\", \"parameters\": {\"liquid_name\": \"polyjuice", "type": "tool_call" }, "event_type": { @@ -29399,7 +37315,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " \"polyjuice\"}}", + "tool_call": "\"}}", "type": "tool_call" }, "event_type": { @@ -29428,7 +37344,7 @@ "arguments": { "liquid_name": "polyjuice" }, - "call_id": "722b4201-b9ac-440f-80c4-0e9a7f0b5369", + "call_id": "1ca40c99-853b-44e3-ab2c-f194e3ed1b45", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -29685,6 +37601,111 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"def is_prime(n):\\n if n <= 1:\\n return False\\n if n <= 3:\\n return True\\n if n % 2 == 0 or n % 3 == 0:\\n return False\\n i = 5\\n while i * i <= n:\\n if n % i == 0 or n % (i + 2) == 0:\\n return False\\n i += 6\\n return True\\n\\ndef get_nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(get_nth_prime(100))\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\n541\\n[/stdout]\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": 
\"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " 100th prime number is 541.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 217 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 20 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 237 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": 
\"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { @@ -29743,7 +37764,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "def is_prime(n):\n if n <= 1:\n return False\n", + "tool_call": "def is_prime(n):\n if n <= 1:\n return False", "type": "tool_call" }, "event_type": { @@ -29768,7 +37789,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " if n <= 3:\n return True\n if n % ", + "tool_call": "\n if n <= 3:\n ", "type": "tool_call" }, "event_type": { @@ -29793,7 +37814,32 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "2 == 0 or n % 3 == 0:\n return False", + "tool_call": " return True\n if n % 2 ==", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 0 or n % 3 == 0:\n return False", "type": "tool_call" }, "event_type": { @@ -29843,7 +37889,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " if n % i == 0 or n % (i + 2)", + "tool_call": " if n % i == 0 or n % (i + 2", "type": "tool_call" }, "event_type": { @@ -29868,7 +37914,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " == 0:\n return False\n i", + "tool_call": ") == 0:\n return False\n i += 6\n ", "type": "tool_call" }, "event_type": { @@ -29893,7 +37939,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " += 6\n return True", + "tool_call": " return True\n\ndef get_nth_prime(n):\n count = 0\n ", "type": "tool_call" }, "event_type": { @@ -29918,7 +37964,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\n\n", + "tool_call": " num = 2\n while True:\n if is_prime(num):\n ", "type": "tool_call" }, "event_type": { @@ -29943,7 +37989,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "def get_nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n ", + "tool_call": " count += 1\n if count == n:\n return num\n", "type": "tool_call" }, "event_type": { @@ -29968,57 +38014,7 @@ "__module__": 
"llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " count += 1\n if count == n:\n return", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " num\n num += 1\n\nprint(get_nth_prime(100", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "))", + "tool_call": " num += 1\n\nprint(get_nth_prime(100))", "type": "tool_call" }, "event_type": { @@ -30047,7 +38043,7 @@ "arguments": { "code": "def is_prime(n):\n if n <= 1:\n return False\n if n <= 3:\n return True\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i + 2) == 0:\n return False\n i += 6\n return True\n\ndef get_nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count == n:\n return num\n num += 1\n\nprint(get_nth_prime(100))" }, - "call_id": "229f89b4-d07d-4748-b8ae-f805cb52209e", + "call_id": "d8ece88b-7b3e-4f72-9555-5a928c27012c", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -30399,6 +38395,291 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Perplexity company founding date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was 
among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Perplexity company founding date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "Per", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "plexity the company was founded in 2022.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 105 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 22 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 127 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Perplexity company founding date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": 
\"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"parameters", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\": {\"query\": \"Perplexity company founding", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " date\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Perplexity company founding date" + }, + "call_id": "7f40db23-2182-4006-9234-4c5b7dac978f", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": 
"end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 67 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 104 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { @@ -30511,7 +38792,7 @@ "arguments": { "query": "Perplexity company founding date" }, - "call_id": "06c95bef-9b2d-4380-bf16-e1338bb7cf2c", + "call_id": "7f65affe-6ecb-4db5-b70f-71e05e28c310", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -30848,6 +39129,171 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was the nba created?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"when was the nba created\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": 
\"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " NBA was created on August 3,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " 1949, with the merger of the Basketball Association of", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " America (BAA) and the National Basketball League (NBL", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ").", + "type": "text" + }, + "event_type": { + "__enum__": 
"ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 65 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 45 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was the nba created?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. 
Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { @@ -30906,7 +39352,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"", + "tool_call": "{\"type\": \"function\", \"name\": \"", "type": "tool_call" }, "event_type": { @@ -30931,7 +39377,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "name\": \"knowledge_search\", \"parameters\": {\"query\": \"", + "tool_call": "knowledge_search\", \"parameters\": {\"query\": \"when", "type": "tool_call" }, "event_type": { @@ -30956,7 +39402,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "when was the nba created\"}}", + "tool_call": " was the nba created\"}}", "type": "tool_call" }, "event_type": { @@ -30985,7 +39431,7 @@ "arguments": { "query": "when was the nba created" }, - "call_id": "b08bb4c0-c0a1-4063-b110-3947559e4061", + "call_id": "0f4d0151-e44c-443a-8101-e0ac92c9d45f", "tool_name": "knowledge_search" }, "type": "tool_call" diff --git a/tests/integration/fixtures/recorded_responses/invoke_tool.json b/tests/integration/fixtures/recorded_responses/invoke_tool.json index 30a132904..8db8ad966 100644 --- a/tests/integration/fixtures/recorded_responses/invoke_tool.json +++ b/tests/integration/fixtures/recorded_responses/invoke_tool.json @@ -167,23 +167,23 @@ "type": "text" }, { - "text": "Result 1:\nDocument_id:cc255\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. 
note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", + "text": "Result 2:\nDocument_id:15b86\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. 
code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", "type": "text" }, { - "text": "Result 3:\nDocument_id:cc255\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 3:\nDocument_id:15b86\nContent: 06% of all params are trainable.\n\n.. 
note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:cc255\nContent: from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. code-block:: python\n\n # Assuming that base_model already has the pretrained Llama2 weights,\n # this will directly load them into your LoRA model without any conversion necessary.\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\n :func:`validate_missing_and_unexpected_for_lora() `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. 
note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 3:\nDocument_id:83901\nContent: ` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:cc255\nContent: 06% of all params are trainable.\n\n.. 
note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 4:\nDocument_id:15b86\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. 
code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 5:\nDocument_id:7a06a\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 5:\nDocument_id:83901\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. 
_glossary_fsdp2:\n\n", "type": "text" }, { @@ -428,11 +428,11 @@ "error_message": null, "metadata": { "document_ids": [ - "16a6ae01-049e-4a44-b305-8248d20a8f7d", - "cc2559a9-2b56-43d8-9ec4-b2181bb96acb", - "7a06a3a9-7e9d-4693-8c07-15343f0654aa", - "cc2559a9-2b56-43d8-9ec4-b2181bb96acb", - "7a06a3a9-7e9d-4693-8c07-15343f0654aa" + "bbddbe62-508d-4c8d-9455-3b60bc2825a5", + "15b8638f-b1b6-4f58-adfa-eb6644c47de3", + "83901b53-33d4-4f5e-8145-b94c783e9f61", + "15b8638f-b1b6-4f58-adfa-eb6644c47de3", + "83901b53-33d4-4f5e-8145-b94c783e9f61" ] } } From d263edbf90f958349c7b9adea8fd4e5181932a69 Mon Sep 17 00:00:00 2001 From: Nathan Weinberg <31703736+nathan-weinberg@users.noreply.github.com> Date: Wed, 12 Mar 2025 23:08:24 -0400 Subject: [PATCH 035/557] build: remove .python-version (#1513) # What does this PR do? the current `.python-version` file forces `uv` to setup the development environment with Python 3.10 this causes an error if a dev system does not have Python 3.10, even though the project officially supports newer versions of Python as well since `uv` can use the `pyproject.toml` to determine python versions, we can safely remove this file from the repo and subsequent git tracking follows up on https://github.com/meta-llama/llama-stack/pull/1172 ## Test Plan N/A --------- Signed-off-by: Nathan Weinberg --- .gitignore | 1 + .python-version | 1 - CONTRIBUTING.md | 6 ++++++ 3 files changed, 7 insertions(+), 1 deletion(-) delete mode 100644 .python-version diff --git a/.gitignore b/.gitignore index 1b15107f3..0ef25cdf1 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,4 @@ pyrightconfig.json venv/ pytest-report.xml .coverage +.python-version diff --git a/.python-version b/.python-version deleted file mode 100644 index c8cfe3959..000000000 --- a/.python-version +++ /dev/null @@ -1 +0,0 @@ -3.10 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7c0b5d94e..71e610064 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -61,6 +61,7 @@ outlined on that page and do not file a public issue. We use [uv](https://github.com/astral-sh/uv) to manage python dependencies and virtual environments. You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting-started/installation/). + You can install the dependencies by running: ```bash @@ -70,6 +71,11 @@ uv pip install -e . source .venv/bin/activate ``` +> [!NOTE] +> You can pin a specific version of Python to use for `uv` by adding a `.python-version` file in the root project directory. +> Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`. +> For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/). 
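+>
+> For example, you could pin an interpreter explicitly; `uv python pin` writes the `.python-version` file for you (3.12 below is only an illustration, any version satisfying the `requires-python` constraint in `pyproject.toml` works):
+>
+> ```bash
+> # writes a .python-version file in the project root, which uv will pick up
+> uv python pin 3.12
+> ```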
+ Note that you can create a dotenv file `.env` that includes necessary environment variables: ``` LLAMA_STACK_BASE_URL=http://localhost:8321 From 0a0d6cb96e9653aa24998fd4a6de01063c14317e Mon Sep 17 00:00:00 2001 From: ehhuang Date: Wed, 12 Mar 2025 21:55:05 -0700 Subject: [PATCH 036/557] fix: openapi spec gen (#1602) Summary: Test Plan: sh docs/openapi_generator/run_openapi_generator.sh --- docs/_static/llama-stack-spec.html | 20 -------------------- docs/_static/llama-stack-spec.yaml | 16 ---------------- 2 files changed, 36 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index c50554092..22fa781ac 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -4654,22 +4654,12 @@ "CompletionResponse": { "type": "object", "properties": { -<<<<<<< dest: ed6caead724a - ehhuang: chore: simplify _get_tool_defs (#1384) "metrics": { "type": "array", "items": { "$ref": "#/components/schemas/MetricInResponse" } }, -||||||| base: 1311faf3f5e7 - ehhuang: fix: logging (#1598) -======= - "metrics": { - "type": "array", - "items": { - "$ref": "#/components/schemas/MetricEvent" - } - }, ->>>>>>> source: ad32270ad0d5 - erichuang: feat(api): remove tool_name from To... "content": { "type": "string", "description": "The generated completion text" @@ -5037,22 +5027,12 @@ "CompletionResponseStreamChunk": { "type": "object", "properties": { -<<<<<<< dest: ed6caead724a - ehhuang: chore: simplify _get_tool_defs (#1384) "metrics": { "type": "array", "items": { "$ref": "#/components/schemas/MetricInResponse" } }, -||||||| base: 1311faf3f5e7 - ehhuang: fix: logging (#1598) -======= - "metrics": { - "type": "array", - "items": { - "$ref": "#/components/schemas/MetricEvent" - } - }, ->>>>>>> source: ad32270ad0d5 - erichuang: feat(api): remove tool_name from To... "delta": { "type": "string", "description": "New content generated since last chunk. This can be one or more tokens." diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 1f9536c2e..1f01351e9 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -3176,18 +3176,10 @@ components: CompletionResponse: type: object properties: -<<<<<<< dest: ed6caead724a - ehhuang: chore: simplify _get_tool_defs (#1384) metrics: type: array items: $ref: '#/components/schemas/MetricInResponse' -||||||| base: 1311faf3f5e7 - ehhuang: fix: logging (#1598) -======= - metrics: - type: array - items: - $ref: '#/components/schemas/MetricEvent' ->>>>>>> source: ad32270ad0d5 - erichuang: feat(api): remove tool_name from To... content: type: string description: The generated completion text @@ -3506,18 +3498,10 @@ components: CompletionResponseStreamChunk: type: object properties: -<<<<<<< dest: ed6caead724a - ehhuang: chore: simplify _get_tool_defs (#1384) metrics: type: array items: $ref: '#/components/schemas/MetricInResponse' -||||||| base: 1311faf3f5e7 - ehhuang: fix: logging (#1598) -======= - metrics: - type: array - items: - $ref: '#/components/schemas/MetricEvent' ->>>>>>> source: ad32270ad0d5 - erichuang: feat(api): remove tool_name from To... 
delta: type: string description: >- From d072b5fa0c2b58dcf27e936bb4fa3a95bc6f41d7 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Wed, 12 Mar 2025 22:29:58 -0700 Subject: [PATCH 037/557] test: add unit test to ensure all config types are instantiable (#1601) --- .../inline/datasetio/localfs/config.py | 16 ++++-- .../inline/eval/meta_reference/config.py | 16 ++++-- .../providers/inline/inference/vllm/config.py | 4 +- .../inline/post_training/torchtune/config.py | 8 ++- .../inline/safety/code_scanner/config.py | 6 ++- .../inline/safety/llama_guard/config.py | 8 ++- .../inline/safety/prompt_guard/config.py | 7 +++ .../providers/inline/scoring/basic/config.py | 7 ++- .../inline/scoring/llm_as_judge/config.py | 7 ++- .../inline/telemetry/sample/__init__.py | 17 ------- .../inline/telemetry/sample/config.py | 12 ----- .../inline/telemetry/sample/sample.py | 17 ------- .../tool_runtime/code_interpreter/config.py | 6 ++- .../inline/tool_runtime/rag/config.py | 6 ++- .../inline/vector_io/chroma/config.py | 4 +- llama_stack/providers/registry/agents.py | 11 ---- llama_stack/providers/registry/inference.py | 9 ---- llama_stack/providers/registry/safety.py | 30 ----------- llama_stack/providers/registry/telemetry.py | 11 ---- llama_stack/providers/registry/vector_io.py | 10 ---- .../remote/agents/sample/__init__.py | 17 ------- .../providers/remote/agents/sample/config.py | 12 ----- .../providers/remote/agents/sample/sample.py | 17 ------- .../remote/datasetio/huggingface/config.py | 16 ++++-- .../remote/inference/databricks/config.py | 13 +++++ .../remote/inference/runpod/__init__.py | 3 +- .../remote/inference/runpod/config.py | 9 +++- .../remote/inference/runpod/runpod.py | 1 - .../remote/inference/sample/__init__.py | 17 ------- .../remote/inference/sample/config.py | 12 ----- .../remote/inference/sample/sample.py | 23 --------- .../remote/safety/sample/__init__.py | 17 ------- .../providers/remote/safety/sample/config.py | 12 ----- .../providers/remote/safety/sample/sample.py | 23 --------- .../remote/tool_runtime/bing_search/config.py | 8 ++- .../model_context_protocol/config.py | 6 ++- .../tool_runtime/wolfram_alpha/config.py | 8 ++- .../remote/vector_io/qdrant/config.py | 8 ++- .../remote/vector_io/sample/__init__.py | 17 ------- .../remote/vector_io/sample/config.py | 12 ----- .../remote/vector_io/sample/sample.py | 26 ---------- .../remote/vector_io/weaviate/config.py | 6 ++- llama_stack/templates/bedrock/run.yaml | 18 +++++-- llama_stack/templates/cerebras/run.yaml | 21 ++++++-- llama_stack/templates/ci-tests/run.yaml | 21 ++++++-- .../templates/dell/run-with-safety.yaml | 21 ++++++-- llama_stack/templates/dell/run.yaml | 21 ++++++-- llama_stack/templates/dev/run.yaml | 21 ++++++-- .../templates/fireworks/run-with-safety.yaml | 21 ++++++-- llama_stack/templates/fireworks/run.yaml | 24 +++++++-- llama_stack/templates/groq/run.yaml | 21 ++++++-- .../hf-endpoint/run-with-safety.yaml | 21 ++++++-- llama_stack/templates/hf-endpoint/run.yaml | 21 ++++++-- .../hf-serverless/run-with-safety.yaml | 21 ++++++-- llama_stack/templates/hf-serverless/run.yaml | 21 ++++++-- .../meta-reference-gpu/run-with-safety.yaml | 21 ++++++-- .../templates/meta-reference-gpu/run.yaml | 21 ++++++-- .../meta-reference-quantized-gpu/run.yaml | 21 ++++++-- llama_stack/templates/nvidia/run.yaml | 21 ++++++-- .../templates/ollama/run-with-safety.yaml | 21 ++++++-- llama_stack/templates/ollama/run.yaml | 24 +++++++-- llama_stack/templates/open-benchmark/run.yaml | 21 ++++++-- .../remote-vllm/run-with-safety.yaml | 24 
+++++++-- llama_stack/templates/remote-vllm/run.yaml | 24 +++++++-- llama_stack/templates/sambanova/run.yaml | 3 +- .../templates/tgi/run-with-safety.yaml | 21 ++++++-- llama_stack/templates/tgi/run.yaml | 21 ++++++-- .../templates/together/run-with-safety.yaml | 21 ++++++-- llama_stack/templates/together/run.yaml | 24 +++++++-- llama_stack/templates/vllm-gpu/run.yaml | 21 ++++++-- tests/unit/providers/test_configs.py | 50 +++++++++++++++++++ 71 files changed, 662 insertions(+), 465 deletions(-) delete mode 100644 llama_stack/providers/inline/telemetry/sample/__init__.py delete mode 100644 llama_stack/providers/inline/telemetry/sample/config.py delete mode 100644 llama_stack/providers/inline/telemetry/sample/sample.py delete mode 100644 llama_stack/providers/remote/agents/sample/__init__.py delete mode 100644 llama_stack/providers/remote/agents/sample/config.py delete mode 100644 llama_stack/providers/remote/agents/sample/sample.py delete mode 100644 llama_stack/providers/remote/inference/sample/__init__.py delete mode 100644 llama_stack/providers/remote/inference/sample/config.py delete mode 100644 llama_stack/providers/remote/inference/sample/sample.py delete mode 100644 llama_stack/providers/remote/safety/sample/__init__.py delete mode 100644 llama_stack/providers/remote/safety/sample/config.py delete mode 100644 llama_stack/providers/remote/safety/sample/sample.py delete mode 100644 llama_stack/providers/remote/vector_io/sample/__init__.py delete mode 100644 llama_stack/providers/remote/vector_io/sample/config.py delete mode 100644 llama_stack/providers/remote/vector_io/sample/sample.py create mode 100644 tests/unit/providers/test_configs.py diff --git a/llama_stack/providers/inline/datasetio/localfs/config.py b/llama_stack/providers/inline/datasetio/localfs/config.py index f4f495b95..d74521f1f 100644 --- a/llama_stack/providers/inline/datasetio/localfs/config.py +++ b/llama_stack/providers/inline/datasetio/localfs/config.py @@ -3,9 +3,10 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any, Dict + from pydantic import BaseModel -from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR from llama_stack.providers.utils.kvstore.config import ( KVStoreConfig, SqliteKVStoreConfig, @@ -13,6 +14,13 @@ from llama_stack.providers.utils.kvstore.config import ( class LocalFSDatasetIOConfig(BaseModel): - kvstore: KVStoreConfig = SqliteKVStoreConfig( - db_path=(RUNTIME_BASE_DIR / "localfs_datasetio.db").as_posix() - ) # Uses SQLite config specific to localfs storage + kvstore: KVStoreConfig + + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return { + "kvstore": SqliteKVStoreConfig.sample_run_config( + __distro_dir__=__distro_dir__, + db_name="localfs_datasetio.db", + ) + } diff --git a/llama_stack/providers/inline/eval/meta_reference/config.py b/llama_stack/providers/inline/eval/meta_reference/config.py index 95b780cca..5b2bec259 100644 --- a/llama_stack/providers/inline/eval/meta_reference/config.py +++ b/llama_stack/providers/inline/eval/meta_reference/config.py @@ -3,9 +3,10 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+from typing import Any, Dict + from pydantic import BaseModel -from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR from llama_stack.providers.utils.kvstore.config import ( KVStoreConfig, SqliteKVStoreConfig, @@ -13,6 +14,13 @@ from llama_stack.providers.utils.kvstore.config import ( class MetaReferenceEvalConfig(BaseModel): - kvstore: KVStoreConfig = SqliteKVStoreConfig( - db_path=(RUNTIME_BASE_DIR / "meta_reference_eval.db").as_posix() - ) # Uses SQLite config specific to Meta Reference Eval storage + kvstore: KVStoreConfig + + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return { + "kvstore": SqliteKVStoreConfig.sample_run_config( + __distro_dir__=__distro_dir__, + db_name="meta_reference_eval.db", + ) + } diff --git a/llama_stack/providers/inline/inference/vllm/config.py b/llama_stack/providers/inline/inference/vllm/config.py index 0e85c9a48..51d48e6d5 100644 --- a/llama_stack/providers/inline/inference/vllm/config.py +++ b/llama_stack/providers/inline/inference/vllm/config.py @@ -4,6 +4,8 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any, Dict + from pydantic import BaseModel, Field from llama_stack.schema_utils import json_schema_type @@ -40,7 +42,7 @@ class VLLMConfig(BaseModel): ) @classmethod - def sample_run_config(cls): + def sample_run_config(cls, **kwargs: Any) -> Dict[str, Any]: return { "tensor_parallel_size": "${env.TENSOR_PARALLEL_SIZE:1}", "max_tokens": "${env.MAX_TOKENS:4096}", diff --git a/llama_stack/providers/inline/post_training/torchtune/config.py b/llama_stack/providers/inline/post_training/torchtune/config.py index 2f48ddfad..ee3504f9e 100644 --- a/llama_stack/providers/inline/post_training/torchtune/config.py +++ b/llama_stack/providers/inline/post_training/torchtune/config.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Literal, Optional +from typing import Any, Dict, Literal, Optional from pydantic import BaseModel @@ -12,3 +12,9 @@ from pydantic import BaseModel class TorchtunePostTrainingConfig(BaseModel): torch_seed: Optional[int] = None checkpoint_format: Optional[Literal["meta", "huggingface"]] = "meta" + + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return { + "checkpoint_format": "meta", + } diff --git a/llama_stack/providers/inline/safety/code_scanner/config.py b/llama_stack/providers/inline/safety/code_scanner/config.py index 75c90d69a..1d880ee9c 100644 --- a/llama_stack/providers/inline/safety/code_scanner/config.py +++ b/llama_stack/providers/inline/safety/code_scanner/config.py @@ -4,8 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+from typing import Any, Dict + from pydantic import BaseModel class CodeScannerConfig(BaseModel): - pass + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return {} diff --git a/llama_stack/providers/inline/safety/llama_guard/config.py b/llama_stack/providers/inline/safety/llama_guard/config.py index 72036fd1c..53849ab33 100644 --- a/llama_stack/providers/inline/safety/llama_guard/config.py +++ b/llama_stack/providers/inline/safety/llama_guard/config.py @@ -4,10 +4,16 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import List +from typing import Any, Dict, List from pydantic import BaseModel class LlamaGuardConfig(BaseModel): excluded_categories: List[str] = [] + + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return { + "excluded_categories": [], + } diff --git a/llama_stack/providers/inline/safety/prompt_guard/config.py b/llama_stack/providers/inline/safety/prompt_guard/config.py index bddd28452..76bd5978d 100644 --- a/llama_stack/providers/inline/safety/prompt_guard/config.py +++ b/llama_stack/providers/inline/safety/prompt_guard/config.py @@ -5,6 +5,7 @@ # the root directory of this source tree. from enum import Enum +from typing import Any, Dict from pydantic import BaseModel, field_validator @@ -23,3 +24,9 @@ class PromptGuardConfig(BaseModel): if v not in [t.value for t in PromptGuardType]: raise ValueError(f"Unknown prompt guard type: {v}") return v + + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return { + "guard_type": "injection", + } diff --git a/llama_stack/providers/inline/scoring/basic/config.py b/llama_stack/providers/inline/scoring/basic/config.py index d9dbe71bc..5866be359 100644 --- a/llama_stack/providers/inline/scoring/basic/config.py +++ b/llama_stack/providers/inline/scoring/basic/config.py @@ -3,7 +3,12 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any, Dict + from pydantic import BaseModel -class BasicScoringConfig(BaseModel): ... +class BasicScoringConfig(BaseModel): + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return {} diff --git a/llama_stack/providers/inline/scoring/llm_as_judge/config.py b/llama_stack/providers/inline/scoring/llm_as_judge/config.py index 1b538420c..ff63fc5e7 100644 --- a/llama_stack/providers/inline/scoring/llm_as_judge/config.py +++ b/llama_stack/providers/inline/scoring/llm_as_judge/config.py @@ -3,7 +3,12 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any, Dict + from pydantic import BaseModel -class LlmAsJudgeScoringConfig(BaseModel): ... +class LlmAsJudgeScoringConfig(BaseModel): + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return {} diff --git a/llama_stack/providers/inline/telemetry/sample/__init__.py b/llama_stack/providers/inline/telemetry/sample/__init__.py deleted file mode 100644 index 4fb27ac27..000000000 --- a/llama_stack/providers/inline/telemetry/sample/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. 
-# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from typing import Any - -from .config import SampleConfig - - -async def get_adapter_impl(config: SampleConfig, _deps) -> Any: - from .sample import SampleTelemetryImpl - - impl = SampleTelemetryImpl(config) - await impl.initialize() - return impl diff --git a/llama_stack/providers/inline/telemetry/sample/config.py b/llama_stack/providers/inline/telemetry/sample/config.py deleted file mode 100644 index 4b7404a26..000000000 --- a/llama_stack/providers/inline/telemetry/sample/config.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from pydantic import BaseModel - - -class SampleConfig(BaseModel): - host: str = "localhost" - port: int = 9999 diff --git a/llama_stack/providers/inline/telemetry/sample/sample.py b/llama_stack/providers/inline/telemetry/sample/sample.py deleted file mode 100644 index a4147a1b2..000000000 --- a/llama_stack/providers/inline/telemetry/sample/sample.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from llama_stack.apis.telemetry import Telemetry - -from .config import SampleConfig - - -class SampleTelemetryImpl(Telemetry): - def __init__(self, config: SampleConfig): - self.config = config - - async def initialize(self): - pass diff --git a/llama_stack/providers/inline/tool_runtime/code_interpreter/config.py b/llama_stack/providers/inline/tool_runtime/code_interpreter/config.py index 167a2c318..7de1ec453 100644 --- a/llama_stack/providers/inline/tool_runtime/code_interpreter/config.py +++ b/llama_stack/providers/inline/tool_runtime/code_interpreter/config.py @@ -4,8 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any, Dict + from pydantic import BaseModel class CodeInterpreterToolConfig(BaseModel): - pass + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return {} diff --git a/llama_stack/providers/inline/tool_runtime/rag/config.py b/llama_stack/providers/inline/tool_runtime/rag/config.py index 2d0d2f595..c75c3fc51 100644 --- a/llama_stack/providers/inline/tool_runtime/rag/config.py +++ b/llama_stack/providers/inline/tool_runtime/rag/config.py @@ -4,8 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+from typing import Any, Dict + from pydantic import BaseModel class RagToolRuntimeConfig(BaseModel): - pass + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return {} diff --git a/llama_stack/providers/inline/vector_io/chroma/config.py b/llama_stack/providers/inline/vector_io/chroma/config.py index a1fb60fa6..1e333fe92 100644 --- a/llama_stack/providers/inline/vector_io/chroma/config.py +++ b/llama_stack/providers/inline/vector_io/chroma/config.py @@ -13,5 +13,5 @@ class ChromaVectorIOConfig(BaseModel): db_path: str @classmethod - def sample_config(cls) -> Dict[str, Any]: - return {"db_path": "{env.CHROMADB_PATH}"} + def sample_run_config(cls, db_path: str = "${env.CHROMADB_PATH}", **kwargs: Any) -> Dict[str, Any]: + return {"db_path": db_path} diff --git a/llama_stack/providers/registry/agents.py b/llama_stack/providers/registry/agents.py index 655303f98..3ed59304d 100644 --- a/llama_stack/providers/registry/agents.py +++ b/llama_stack/providers/registry/agents.py @@ -7,11 +7,9 @@ from typing import List from llama_stack.providers.datatypes import ( - AdapterSpec, Api, InlineProviderSpec, ProviderSpec, - remote_provider_spec, ) from llama_stack.providers.utils.kvstore import kvstore_dependencies @@ -39,13 +37,4 @@ def available_providers() -> List[ProviderSpec]: Api.tool_groups, ], ), - remote_provider_spec( - api=Api.agents, - adapter=AdapterSpec( - adapter_type="sample", - pip_packages=[], - module="llama_stack.providers.remote.agents.sample", - config_class="llama_stack.providers.remote.agents.sample.SampleConfig", - ), - ), ] diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py index d5f095740..ca4dc59f7 100644 --- a/llama_stack/providers/registry/inference.py +++ b/llama_stack/providers/registry/inference.py @@ -68,15 +68,6 @@ def available_providers() -> List[ProviderSpec]: module="llama_stack.providers.inline.inference.sentence_transformers", config_class="llama_stack.providers.inline.inference.sentence_transformers.config.SentenceTransformersInferenceConfig", ), - remote_provider_spec( - api=Api.inference, - adapter=AdapterSpec( - adapter_type="sample", - pip_packages=[], - module="llama_stack.providers.remote.inference.sample", - config_class="llama_stack.providers.remote.inference.sample.SampleConfig", - ), - ), remote_provider_spec( api=Api.inference, adapter=AdapterSpec( diff --git a/llama_stack/providers/registry/safety.py b/llama_stack/providers/registry/safety.py index b9f7b6d78..1364352e6 100644 --- a/llama_stack/providers/registry/safety.py +++ b/llama_stack/providers/registry/safety.py @@ -27,27 +27,6 @@ def available_providers() -> List[ProviderSpec]: module="llama_stack.providers.inline.safety.prompt_guard", config_class="llama_stack.providers.inline.safety.prompt_guard.PromptGuardConfig", ), - InlineProviderSpec( - api=Api.safety, - provider_type="inline::meta-reference", - pip_packages=[ - "transformers", - "torch --index-url https://download.pytorch.org/whl/cpu", - ], - module="llama_stack.providers.inline.safety.meta_reference", - config_class="llama_stack.providers.inline.safety.meta_reference.SafetyConfig", - api_dependencies=[ - Api.inference, - ], - deprecation_error=""" -Provider `inline::meta-reference` for API `safety` does not work with the latest Llama Stack. - -- if you are using Llama Guard v3, please use the `inline::llama-guard` provider instead. -- if you are using Prompt Guard, please use the `inline::prompt-guard` provider instead. 
-- if you are using Code Scanner, please use the `inline::code-scanner` provider instead. - - """, - ), InlineProviderSpec( api=Api.safety, provider_type="inline::llama-guard", @@ -67,15 +46,6 @@ Provider `inline::meta-reference` for API `safety` does not work with the latest module="llama_stack.providers.inline.safety.code_scanner", config_class="llama_stack.providers.inline.safety.code_scanner.CodeScannerConfig", ), - remote_provider_spec( - api=Api.safety, - adapter=AdapterSpec( - adapter_type="sample", - pip_packages=[], - module="llama_stack.providers.remote.safety.sample", - config_class="llama_stack.providers.remote.safety.sample.SampleConfig", - ), - ), remote_provider_spec( api=Api.safety, adapter=AdapterSpec( diff --git a/llama_stack/providers/registry/telemetry.py b/llama_stack/providers/registry/telemetry.py index f3b41374c..fc249f3e2 100644 --- a/llama_stack/providers/registry/telemetry.py +++ b/llama_stack/providers/registry/telemetry.py @@ -7,11 +7,9 @@ from typing import List from llama_stack.providers.datatypes import ( - AdapterSpec, Api, InlineProviderSpec, ProviderSpec, - remote_provider_spec, ) @@ -28,13 +26,4 @@ def available_providers() -> List[ProviderSpec]: module="llama_stack.providers.inline.telemetry.meta_reference", config_class="llama_stack.providers.inline.telemetry.meta_reference.config.TelemetryConfig", ), - remote_provider_spec( - api=Api.telemetry, - adapter=AdapterSpec( - adapter_type="sample", - pip_packages=[], - module="llama_stack.providers.remote.telemetry.sample", - config_class="llama_stack.providers.remote.telemetry.sample.SampleConfig", - ), - ), ] diff --git a/llama_stack/providers/registry/vector_io.py b/llama_stack/providers/registry/vector_io.py index 8471748d8..fbc495d83 100644 --- a/llama_stack/providers/registry/vector_io.py +++ b/llama_stack/providers/registry/vector_io.py @@ -92,16 +92,6 @@ def available_providers() -> List[ProviderSpec]: ), api_dependencies=[Api.inference], ), - remote_provider_spec( - api=Api.vector_io, - adapter=AdapterSpec( - adapter_type="sample", - pip_packages=[], - module="llama_stack.providers.remote.vector_io.sample", - config_class="llama_stack.providers.remote.vector_io.sample.SampleVectorIOConfig", - ), - api_dependencies=[], - ), remote_provider_spec( Api.vector_io, AdapterSpec( diff --git a/llama_stack/providers/remote/agents/sample/__init__.py b/llama_stack/providers/remote/agents/sample/__init__.py deleted file mode 100644 index 94456d98b..000000000 --- a/llama_stack/providers/remote/agents/sample/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from typing import Any - -from .config import SampleConfig - - -async def get_adapter_impl(config: SampleConfig, _deps) -> Any: - from .sample import SampleAgentsImpl - - impl = SampleAgentsImpl(config) - await impl.initialize() - return impl diff --git a/llama_stack/providers/remote/agents/sample/config.py b/llama_stack/providers/remote/agents/sample/config.py deleted file mode 100644 index 4b7404a26..000000000 --- a/llama_stack/providers/remote/agents/sample/config.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -from pydantic import BaseModel - - -class SampleConfig(BaseModel): - host: str = "localhost" - port: int = 9999 diff --git a/llama_stack/providers/remote/agents/sample/sample.py b/llama_stack/providers/remote/agents/sample/sample.py deleted file mode 100644 index 02e889496..000000000 --- a/llama_stack/providers/remote/agents/sample/sample.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from llama_stack.apis.agents import Agents - -from .config import SampleConfig - - -class SampleAgentsImpl(Agents): - def __init__(self, config: SampleConfig): - self.config = config - - async def initialize(self): - pass diff --git a/llama_stack/providers/remote/datasetio/huggingface/config.py b/llama_stack/providers/remote/datasetio/huggingface/config.py index 1cdae0625..c06996b6f 100644 --- a/llama_stack/providers/remote/datasetio/huggingface/config.py +++ b/llama_stack/providers/remote/datasetio/huggingface/config.py @@ -3,9 +3,10 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any, Dict + from pydantic import BaseModel -from llama_stack.distribution.utils.config_dirs import RUNTIME_BASE_DIR from llama_stack.providers.utils.kvstore.config import ( KVStoreConfig, SqliteKVStoreConfig, @@ -13,6 +14,13 @@ from llama_stack.providers.utils.kvstore.config import ( class HuggingfaceDatasetIOConfig(BaseModel): - kvstore: KVStoreConfig = SqliteKVStoreConfig( - db_path=(RUNTIME_BASE_DIR / "huggingface_datasetio.db").as_posix() - ) # Uses SQLite config specific to HF storage + kvstore: KVStoreConfig + + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return { + "kvstore": SqliteKVStoreConfig.sample_run_config( + __distro_dir__=__distro_dir__, + db_name="huggingface_datasetio.db", + ) + } diff --git a/llama_stack/providers/remote/inference/databricks/config.py b/llama_stack/providers/remote/inference/databricks/config.py index 6aaf7e594..1d51125cb 100644 --- a/llama_stack/providers/remote/inference/databricks/config.py +++ b/llama_stack/providers/remote/inference/databricks/config.py @@ -4,6 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any, Dict from pydantic import BaseModel, Field @@ -20,3 +21,15 @@ class DatabricksImplConfig(BaseModel): default=None, description="The Databricks API token", ) + + @classmethod + def sample_run_config( + cls, + url: str = "${env.DATABRICKS_URL}", + api_token: str = "${env.DATABRICKS_API_TOKEN}", + **kwargs: Any, + ) -> Dict[str, Any]: + return { + "url": url, + "api_token": api_token, + } diff --git a/llama_stack/providers/remote/inference/runpod/__init__.py b/llama_stack/providers/remote/inference/runpod/__init__.py index dcdfa9a84..69bf95046 100644 --- a/llama_stack/providers/remote/inference/runpod/__init__.py +++ b/llama_stack/providers/remote/inference/runpod/__init__.py @@ -5,10 +5,11 @@ # the root directory of this source tree. 
from .config import RunpodImplConfig -from .runpod import RunpodInferenceAdapter async def get_adapter_impl(config: RunpodImplConfig, _deps): + from .runpod import RunpodInferenceAdapter + assert isinstance(config, RunpodImplConfig), f"Unexpected config type: {type(config)}" impl = RunpodInferenceAdapter(config) await impl.initialize() diff --git a/llama_stack/providers/remote/inference/runpod/config.py b/llama_stack/providers/remote/inference/runpod/config.py index e59cfe59b..377a7fe6a 100644 --- a/llama_stack/providers/remote/inference/runpod/config.py +++ b/llama_stack/providers/remote/inference/runpod/config.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Optional +from typing import Any, Dict, Optional from pydantic import BaseModel, Field @@ -21,3 +21,10 @@ class RunpodImplConfig(BaseModel): default=None, description="The API token", ) + + @classmethod + def sample_run_config(cls, **kwargs: Any) -> Dict[str, Any]: + return { + "url": "${env.RUNPOD_URL:}", + "api_token": "${env.RUNPOD_API_TOKEN:}", + } diff --git a/llama_stack/providers/remote/inference/runpod/runpod.py b/llama_stack/providers/remote/inference/runpod/runpod.py index 783842f71..72f858cd8 100644 --- a/llama_stack/providers/remote/inference/runpod/runpod.py +++ b/llama_stack/providers/remote/inference/runpod/runpod.py @@ -8,7 +8,6 @@ from typing import AsyncGenerator from openai import OpenAI from llama_stack.apis.inference import * # noqa: F403 -from llama_stack.models.llama.datatypes import Message # from llama_stack.providers.datatypes import ModelsProtocolPrivate from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper diff --git a/llama_stack/providers/remote/inference/sample/__init__.py b/llama_stack/providers/remote/inference/sample/__init__.py deleted file mode 100644 index 13263744e..000000000 --- a/llama_stack/providers/remote/inference/sample/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from typing import Any - -from .config import SampleConfig - - -async def get_adapter_impl(config: SampleConfig, _deps) -> Any: - from .sample import SampleInferenceImpl - - impl = SampleInferenceImpl(config) - await impl.initialize() - return impl diff --git a/llama_stack/providers/remote/inference/sample/config.py b/llama_stack/providers/remote/inference/sample/config.py deleted file mode 100644 index 4b7404a26..000000000 --- a/llama_stack/providers/remote/inference/sample/config.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from pydantic import BaseModel - - -class SampleConfig(BaseModel): - host: str = "localhost" - port: int = 9999 diff --git a/llama_stack/providers/remote/inference/sample/sample.py b/llama_stack/providers/remote/inference/sample/sample.py deleted file mode 100644 index 106381618..000000000 --- a/llama_stack/providers/remote/inference/sample/sample.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. 
-# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from llama_stack.apis.inference import Inference -from llama_stack.apis.models import Model - -from .config import SampleConfig - - -class SampleInferenceImpl(Inference): - def __init__(self, config: SampleConfig): - self.config = config - - async def register_model(self, model: Model) -> None: - # these are the model names the Llama Stack will use to route requests to this provider - # perform validation here if necessary - pass - - async def initialize(self): - pass diff --git a/llama_stack/providers/remote/safety/sample/__init__.py b/llama_stack/providers/remote/safety/sample/__init__.py deleted file mode 100644 index 83a8d0890..000000000 --- a/llama_stack/providers/remote/safety/sample/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from typing import Any - -from .config import SampleConfig - - -async def get_adapter_impl(config: SampleConfig, _deps) -> Any: - from .sample import SampleSafetyImpl - - impl = SampleSafetyImpl(config) - await impl.initialize() - return impl diff --git a/llama_stack/providers/remote/safety/sample/config.py b/llama_stack/providers/remote/safety/sample/config.py deleted file mode 100644 index 4b7404a26..000000000 --- a/llama_stack/providers/remote/safety/sample/config.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from pydantic import BaseModel - - -class SampleConfig(BaseModel): - host: str = "localhost" - port: int = 9999 diff --git a/llama_stack/providers/remote/safety/sample/sample.py b/llama_stack/providers/remote/safety/sample/sample.py deleted file mode 100644 index 7645c69e9..000000000 --- a/llama_stack/providers/remote/safety/sample/sample.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from llama_stack.apis.safety import Safety -from llama_stack.apis.shields import Shield - -from .config import SampleConfig - - -class SampleSafetyImpl(Safety): - def __init__(self, config: SampleConfig): - self.config = config - - async def register_shield(self, shield: Shield) -> None: - # these are the safety shields the Llama Stack will use to route requests to this provider - # perform validation here if necessary - pass - - async def initialize(self): - pass diff --git a/llama_stack/providers/remote/tool_runtime/bing_search/config.py b/llama_stack/providers/remote/tool_runtime/bing_search/config.py index 67283d8d5..4f089439f 100644 --- a/llama_stack/providers/remote/tool_runtime/bing_search/config.py +++ b/llama_stack/providers/remote/tool_runtime/bing_search/config.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from typing import Optional +from typing import Any, Dict, Optional from pydantic import BaseModel @@ -14,3 +14,9 @@ class BingSearchToolConfig(BaseModel): api_key: Optional[str] = None top_k: int = 3 + + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return { + "api_key": "${env.BING_API_KEY:}", + } diff --git a/llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py b/llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py index ffe4c9887..30ac407bc 100644 --- a/llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py +++ b/llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py @@ -4,8 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from typing import Any, Dict + from pydantic import BaseModel class ModelContextProtocolConfig(BaseModel): - pass + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return {} diff --git a/llama_stack/providers/remote/tool_runtime/wolfram_alpha/config.py b/llama_stack/providers/remote/tool_runtime/wolfram_alpha/config.py index 13996b639..8ea49c7b5 100644 --- a/llama_stack/providers/remote/tool_runtime/wolfram_alpha/config.py +++ b/llama_stack/providers/remote/tool_runtime/wolfram_alpha/config.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Optional +from typing import Any, Dict, Optional from pydantic import BaseModel @@ -13,3 +13,9 @@ class WolframAlphaToolConfig(BaseModel): """Configuration for WolframAlpha Tool Runtime""" api_key: Optional[str] = None + + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return { + "api_key": "${env.WOLFRAM_ALPHA_API_KEY:}", + } diff --git a/llama_stack/providers/remote/vector_io/qdrant/config.py b/llama_stack/providers/remote/vector_io/qdrant/config.py index f212882d8..ce68aa492 100644 --- a/llama_stack/providers/remote/vector_io/qdrant/config.py +++ b/llama_stack/providers/remote/vector_io/qdrant/config.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Optional +from typing import Any, Dict, Optional from pydantic import BaseModel @@ -24,3 +24,9 @@ class QdrantVectorIOConfig(BaseModel): timeout: Optional[int] = None host: Optional[str] = None path: Optional[str] = None + + @classmethod + def sample_run_config(cls, **kwargs: Any) -> Dict[str, Any]: + return { + "api_key": "${env.QDRANT_API_KEY}", + } diff --git a/llama_stack/providers/remote/vector_io/sample/__init__.py b/llama_stack/providers/remote/vector_io/sample/__init__.py deleted file mode 100644 index 221f47b1c..000000000 --- a/llama_stack/providers/remote/vector_io/sample/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -from typing import Any - -from .config import SampleVectorIOConfig - - -async def get_adapter_impl(config: SampleVectorIOConfig, _deps) -> Any: - from .sample import SampleVectorIOImpl - - impl = SampleVectorIOImpl(config) - await impl.initialize() - return impl diff --git a/llama_stack/providers/remote/vector_io/sample/config.py b/llama_stack/providers/remote/vector_io/sample/config.py deleted file mode 100644 index 5126e5eff..000000000 --- a/llama_stack/providers/remote/vector_io/sample/config.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from pydantic import BaseModel - - -class SampleVectorIOConfig(BaseModel): - host: str = "localhost" - port: int = 9999 diff --git a/llama_stack/providers/remote/vector_io/sample/sample.py b/llama_stack/providers/remote/vector_io/sample/sample.py deleted file mode 100644 index cb7193cf4..000000000 --- a/llama_stack/providers/remote/vector_io/sample/sample.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from llama_stack.apis.vector_dbs import VectorDB -from llama_stack.apis.vector_io import VectorIO - -from .config import SampleVectorIOConfig - - -class SampleVectorIOImpl(VectorIO): - def __init__(self, config: SampleVectorIOConfig): - self.config = config - - async def register_vector_db(self, vector_db: VectorDB) -> None: - # these are the vector dbs the Llama Stack will use to route requests to this provider - # perform validation here if necessary - pass - - async def initialize(self): - pass - - async def shutdown(self): - pass diff --git a/llama_stack/providers/remote/vector_io/weaviate/config.py b/llama_stack/providers/remote/vector_io/weaviate/config.py index 6aad9a5a6..cc587f252 100644 --- a/llama_stack/providers/remote/vector_io/weaviate/config.py +++ b/llama_stack/providers/remote/vector_io/weaviate/config.py @@ -4,6 +4,8 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+from typing import Any, Dict + from pydantic import BaseModel @@ -13,4 +15,6 @@ class WeaviateRequestProviderData(BaseModel): class WeaviateVectorIOConfig(BaseModel): - pass + @classmethod + def sample_run_config(cls, **kwargs: Any) -> Dict[str, Any]: + return {} diff --git a/llama_stack/templates/bedrock/run.yaml b/llama_stack/templates/bedrock/run.yaml index 00a02e0d5..39ed8cf48 100644 --- a/llama_stack/templates/bedrock/run.yaml +++ b/llama_stack/templates/bedrock/run.yaml @@ -45,14 +45,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/cerebras/run.yaml b/llama_stack/templates/cerebras/run.yaml index 43d3158ba..8315f75d5 100644 --- a/llama_stack/templates/cerebras/run.yaml +++ b/llama_stack/templates/cerebras/run.yaml @@ -23,7 +23,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] vector_io: - provider_id: faiss provider_type: inline::faiss @@ -43,14 +44,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/cerebras}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/cerebras}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/cerebras}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/ci-tests/run.yaml b/llama_stack/templates/ci-tests/run.yaml index 715d7c86d..ae2b3912c 100644 --- a/llama_stack/templates/ci-tests/run.yaml +++ b/llama_stack/templates/ci-tests/run.yaml @@ -28,7 +28,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -47,14 +48,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ci-tests}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ci-tests}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: 
${env.SQLITE_STORE_DIR:~/.llama/distributions/ci-tests}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/dell/run-with-safety.yaml b/llama_stack/templates/dell/run-with-safety.yaml index ddec3a715..8a62a5a42 100644 --- a/llama_stack/templates/dell/run-with-safety.yaml +++ b/llama_stack/templates/dell/run-with-safety.yaml @@ -31,7 +31,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -50,14 +51,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/dell/run.yaml b/llama_stack/templates/dell/run.yaml index 9394c94ef..31c63bd83 100644 --- a/llama_stack/templates/dell/run.yaml +++ b/llama_stack/templates/dell/run.yaml @@ -27,7 +27,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -46,14 +47,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/dev/run.yaml b/llama_stack/templates/dev/run.yaml index f908af8c3..dba13b357 100644 --- a/llama_stack/templates/dev/run.yaml +++ b/llama_stack/templates/dev/run.yaml @@ -57,7 +57,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -76,14 +77,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite 
+ namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/fireworks/run-with-safety.yaml b/llama_stack/templates/fireworks/run-with-safety.yaml index e04141a07..2d79a3548 100644 --- a/llama_stack/templates/fireworks/run-with-safety.yaml +++ b/llama_stack/templates/fireworks/run-with-safety.yaml @@ -56,14 +56,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic @@ -88,7 +100,8 @@ providers: max_results: 3 - provider_id: wolfram-alpha provider_type: remote::wolfram-alpha - config: {} + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} - provider_id: code-interpreter provider_type: inline::code-interpreter config: {} diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index 369b9ae7b..285495ad9 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -31,7 +31,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -50,14 +51,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic @@ -82,7 +95,8 @@ providers: max_results: 3 - provider_id: wolfram-alpha provider_type: remote::wolfram-alpha - config: {} + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} - provider_id: code-interpreter provider_type: inline::code-interpreter config: {} diff --git a/llama_stack/templates/groq/run.yaml b/llama_stack/templates/groq/run.yaml index 78212c8d9..6afea2355 100644 --- a/llama_stack/templates/groq/run.yaml +++ b/llama_stack/templates/groq/run.yaml @@ -31,7 +31,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -50,14 +51,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: 
${env.SQLITE_STORE_DIR:~/.llama/distributions/groq}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/groq}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/groq}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/hf-endpoint/run-with-safety.yaml b/llama_stack/templates/hf-endpoint/run-with-safety.yaml index 867d7a076..f6f23a987 100644 --- a/llama_stack/templates/hf-endpoint/run-with-safety.yaml +++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml @@ -36,7 +36,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -55,14 +56,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/hf-endpoint/run.yaml b/llama_stack/templates/hf-endpoint/run.yaml index d60acdefd..461f97128 100644 --- a/llama_stack/templates/hf-endpoint/run.yaml +++ b/llama_stack/templates/hf-endpoint/run.yaml @@ -31,7 +31,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -50,14 +51,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/hf-serverless/run-with-safety.yaml b/llama_stack/templates/hf-serverless/run-with-safety.yaml index e58ad15b3..7f1724f34 100644 --- a/llama_stack/templates/hf-serverless/run-with-safety.yaml +++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml @@ -36,7 +36,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference 
provider_type: inline::meta-reference @@ -55,14 +56,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/hf-serverless/run.yaml b/llama_stack/templates/hf-serverless/run.yaml index 5045e821a..ac013488b 100644 --- a/llama_stack/templates/hf-serverless/run.yaml +++ b/llama_stack/templates/hf-serverless/run.yaml @@ -31,7 +31,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -50,14 +51,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml index caac65c8c..190c08494 100644 --- a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml +++ b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml @@ -38,7 +38,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -57,14 +58,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/meta-reference-gpu/run.yaml b/llama_stack/templates/meta-reference-gpu/run.yaml index bade9a076..07763a4df 100644 --- 
a/llama_stack/templates/meta-reference-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-gpu/run.yaml @@ -32,7 +32,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -51,14 +52,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml index f131e8ea6..51b9dc250 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml @@ -34,7 +34,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -53,14 +54,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-quantized-gpu}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-quantized-gpu}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-quantized-gpu}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml index 52e78df7b..213e22cb2 100644 --- a/llama_stack/templates/nvidia/run.yaml +++ b/llama_stack/templates/nvidia/run.yaml @@ -28,7 +28,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -47,14 +48,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: 
${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml index a96031272..2b8eb44db 100644 --- a/llama_stack/templates/ollama/run-with-safety.yaml +++ b/llama_stack/templates/ollama/run-with-safety.yaml @@ -49,14 +49,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic @@ -90,7 +102,8 @@ providers: config: {} - provider_id: wolfram-alpha provider_type: remote::wolfram-alpha - config: {} + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} metadata_store: type: sqlite db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/registry.db diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml index 661d880a7..c9531f417 100644 --- a/llama_stack/templates/ollama/run.yaml +++ b/llama_stack/templates/ollama/run.yaml @@ -27,7 +27,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -46,14 +47,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic @@ -87,7 +100,8 @@ providers: config: {} - provider_id: wolfram-alpha provider_type: remote::wolfram-alpha - config: {} + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} metadata_store: type: sqlite db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/registry.db diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml index 97c54e621..5ef25435b 100644 --- a/llama_stack/templates/open-benchmark/run.yaml +++ b/llama_stack/templates/open-benchmark/run.yaml @@ -54,7 +54,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -73,14 +74,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: 
${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml index 45af8427a..9741f5302 100644 --- a/llama_stack/templates/remote-vllm/run-with-safety.yaml +++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml @@ -38,7 +38,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -50,14 +51,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic @@ -98,7 +111,8 @@ providers: config: {} - provider_id: wolfram-alpha provider_type: remote::wolfram-alpha - config: {} + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} metadata_store: type: sqlite db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db diff --git a/llama_stack/templates/remote-vllm/run.yaml b/llama_stack/templates/remote-vllm/run.yaml index 674085045..e26b20e88 100644 --- a/llama_stack/templates/remote-vllm/run.yaml +++ b/llama_stack/templates/remote-vllm/run.yaml @@ -32,7 +32,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -44,14 +45,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic @@ -92,7 +105,8 @@ providers: config: {} - provider_id: wolfram-alpha provider_type: remote::wolfram-alpha - config: {} + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} 
metadata_store: type: sqlite db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db diff --git a/llama_stack/templates/sambanova/run.yaml b/llama_stack/templates/sambanova/run.yaml index cfa0cc194..616d82a61 100644 --- a/llama_stack/templates/sambanova/run.yaml +++ b/llama_stack/templates/sambanova/run.yaml @@ -37,7 +37,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml index e1d85f59a..db54c0393 100644 --- a/llama_stack/templates/tgi/run-with-safety.yaml +++ b/llama_stack/templates/tgi/run-with-safety.yaml @@ -31,7 +31,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -50,14 +51,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml index fc73e0978..dafb59aa9 100644 --- a/llama_stack/templates/tgi/run.yaml +++ b/llama_stack/templates/tgi/run.yaml @@ -30,7 +30,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -49,14 +50,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/llama_stack/templates/together/run-with-safety.yaml b/llama_stack/templates/together/run-with-safety.yaml index 3a7d3dfba..e0bf46c11 100644 --- a/llama_stack/templates/together/run-with-safety.yaml +++ b/llama_stack/templates/together/run-with-safety.yaml @@ -56,14 +56,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface 
- config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic @@ -97,7 +109,8 @@ providers: config: {} - provider_id: wolfram-alpha provider_type: remote::wolfram-alpha - config: {} + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} metadata_store: type: sqlite db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/registry.db diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml index 10668914a..9d0acaf31 100644 --- a/llama_stack/templates/together/run.yaml +++ b/llama_stack/templates/together/run.yaml @@ -31,7 +31,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -50,14 +51,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic @@ -91,7 +104,8 @@ providers: config: {} - provider_id: wolfram-alpha provider_type: remote::wolfram-alpha - config: {} + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} metadata_store: type: sqlite db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/registry.db diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml index 8a15ff016..bf85de0a2 100644 --- a/llama_stack/templates/vllm-gpu/run.yaml +++ b/llama_stack/templates/vllm-gpu/run.yaml @@ -35,7 +35,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -54,14 +55,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic diff --git a/tests/unit/providers/test_configs.py b/tests/unit/providers/test_configs.py new file mode 100644 index 000000000..246470372 --- /dev/null +++ 
b/tests/unit/providers/test_configs.py @@ -0,0 +1,50 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import pytest +from pydantic import BaseModel + +from llama_stack.distribution.distribution import get_provider_registry, providable_apis +from llama_stack.distribution.utils.dynamic import instantiate_class_type + + +class TestProviderConfigurations: + """Test suite for testing provider configurations across all API types.""" + + def test_all_api_providers_exist(self): + provider_registry = get_provider_registry() + for api in providable_apis(): + providers = provider_registry.get(api, {}) + assert providers, f"No providers found for API type: {api}" + + @pytest.mark.parametrize("api", providable_apis()) + def test_api_providers(self, api): + provider_registry = get_provider_registry() + providers = provider_registry.get(api, {}) + assert providers, f"No providers found for API type: {api}" + + failures = [] + for provider_type, provider_spec in providers.items(): + try: + self._verify_provider_config(provider_type, provider_spec) + except Exception as e: + failures.append(f"Failed to verify {provider_type} config: {str(e)}") + + if failures: + pytest.fail("\n".join(failures)) + + def _verify_provider_config(self, provider_type, provider_spec): + """Helper method to verify a single provider configuration.""" + # Get the config class + config_class_name = provider_spec.config_class + config_type = instantiate_class_type(config_class_name) + + assert issubclass(config_type, BaseModel), f"{config_class_name} is not a subclass of BaseModel" + + assert hasattr(config_type, "sample_run_config"), f"{config_class_name} does not have sample_run_config method" + + sample_config = config_type.sample_run_config(__distro_dir__="foobarbaz") + assert isinstance(sample_config, dict), f"{config_class_name}.sample_run_config() did not return a dict" From 9617468d13edfa98d4ae6fa02b645a31d4ad9fe4 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 13 Mar 2025 09:44:26 -0700 Subject: [PATCH 038/557] fix: passthrough provider template + fix (#1612) # What does this PR do? 
- Fix issue w/ passthrough provider [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan llama stack run [//]: # (## Documentation) --- distributions/dependencies.json | 34 +++ .../self_hosted_distro/passthrough.md | 42 ++++ llama_stack/templates/passthrough/__init__.py | 7 + llama_stack/templates/passthrough/build.yaml | 4 +- .../templates/passthrough/doc_template.md | 35 +++ .../templates/passthrough/passthrough.py | 201 ++++++++++++++++++ .../passthrough/run-with-safety.yaml | 154 ++++++++++++++ llama_stack/templates/passthrough/run.yaml | 36 +++- 8 files changed, 506 insertions(+), 7 deletions(-) create mode 100644 docs/source/distributions/self_hosted_distro/passthrough.md create mode 100644 llama_stack/templates/passthrough/__init__.py create mode 100644 llama_stack/templates/passthrough/doc_template.md create mode 100644 llama_stack/templates/passthrough/passthrough.py create mode 100644 llama_stack/templates/passthrough/run-with-safety.yaml diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 82fbcec8d..c3f039247 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -487,6 +487,40 @@ "transformers", "uvicorn" ], + "passthrough": [ + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "chromadb-client", + "datasets", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "mcp", + "nltk", + "numpy", + "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", + "pandas", + "pillow", + "psycopg2-binary", + "pymongo", + "pypdf", + "redis", + "requests", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch torchvision --index-url https://download.pytorch.org/whl/cpu" + ], "remote-vllm": [ "aiosqlite", "autoevals", diff --git a/docs/source/distributions/self_hosted_distro/passthrough.md b/docs/source/distributions/self_hosted_distro/passthrough.md new file mode 100644 index 000000000..558d7ca08 --- /dev/null +++ b/docs/source/distributions/self_hosted_distro/passthrough.md @@ -0,0 +1,42 @@ +--- +orphan: true +--- + +# Passthrough Distribution + +```{toctree} +:maxdepth: 2 +:hidden: + +self +``` + +The `llamastack/distribution-passthrough` distribution consists of the following provider configurations. 
+ +| API | Provider(s) | +|-----|-------------| +| agents | `inline::meta-reference` | +| datasetio | `remote::huggingface`, `inline::localfs` | +| eval | `inline::meta-reference` | +| inference | `remote::passthrough`, `inline::sentence-transformers` | +| safety | `inline::llama-guard` | +| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | +| telemetry | `inline::meta-reference` | +| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `remote::wolfram-alpha`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` | +| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | + + +### Environment Variables + +The following environment variables can be configured: + +- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`) +- `PASSTHROUGH_API_KEY`: Passthrough API Key (default: ``) +- `PASSTHROUGH_URL`: Passthrough URL (default: ``) + +### Models + +The following models are available by default: + +- `llama3.1-8b-instruct ` +- `llama3.2-11b-vision-instruct ` diff --git a/llama_stack/templates/passthrough/__init__.py b/llama_stack/templates/passthrough/__init__.py new file mode 100644 index 000000000..9632c09fb --- /dev/null +++ b/llama_stack/templates/passthrough/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .passthrough import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/passthrough/build.yaml b/llama_stack/templates/passthrough/build.yaml index 5fed5286e..fb1fb1066 100644 --- a/llama_stack/templates/passthrough/build.yaml +++ b/llama_stack/templates/passthrough/build.yaml @@ -1,9 +1,10 @@ version: '2' distribution_spec: - description: Use for running LLM inference with the endpoint that compatible with Llama Stack API + description: Use Passthrough hosted llama-stack endpoint for LLM inference providers: inference: - remote::passthrough + - inline::sentence-transformers vector_io: - inline::faiss - remote::chromadb @@ -26,6 +27,7 @@ distribution_spec: tool_runtime: - remote::brave-search - remote::tavily-search + - remote::wolfram-alpha - inline::code-interpreter - inline::rag-runtime - remote::model-context-protocol diff --git a/llama_stack/templates/passthrough/doc_template.md b/llama_stack/templates/passthrough/doc_template.md new file mode 100644 index 000000000..f9e88873d --- /dev/null +++ b/llama_stack/templates/passthrough/doc_template.md @@ -0,0 +1,35 @@ +--- +orphan: true +--- +# Passthrough Distribution + +```{toctree} +:maxdepth: 2 +:hidden: + +self +``` + +The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations. 
+ +{{ providers_table }} + +{% if run_config_env_vars %} +### Environment Variables + +The following environment variables can be configured: + +{% for var, (default_value, description) in run_config_env_vars.items() %} +- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) +{% endfor %} +{% endif %} + +{% if default_models %} +### Models + +The following models are available by default: + +{% for model in default_models %} +- `{{ model.model_id }} {{ model.doc_string }}` +{% endfor %} +{% endif %} diff --git a/llama_stack/templates/passthrough/passthrough.py b/llama_stack/templates/passthrough/passthrough.py new file mode 100644 index 000000000..cc3f55937 --- /dev/null +++ b/llama_stack/templates/passthrough/passthrough.py @@ -0,0 +1,201 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from pathlib import Path + +from llama_stack.apis.models.models import ModelType +from llama_stack.distribution.datatypes import ( + ModelInput, + Provider, + ShieldInput, + ToolGroupInput, +) +from llama_stack.providers.inline.inference.sentence_transformers import ( + SentenceTransformersInferenceConfig, +) +from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig +from llama_stack.providers.remote.inference.passthrough.config import ( + PassthroughImplConfig, +) +from llama_stack.providers.utils.inference.model_registry import ProviderModelEntry +from llama_stack.templates.template import ( + DistributionTemplate, + RunConfigSettings, +) + + +def get_distribution_template() -> DistributionTemplate: + providers = { + "inference": ["remote::passthrough", "inline::sentence-transformers"], + "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "safety": ["inline::llama-guard"], + "agents": ["inline::meta-reference"], + "telemetry": ["inline::meta-reference"], + "eval": ["inline::meta-reference"], + "datasetio": ["remote::huggingface", "inline::localfs"], + "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], + "tool_runtime": [ + "remote::brave-search", + "remote::tavily-search", + "remote::wolfram-alpha", + "inline::code-interpreter", + "inline::rag-runtime", + "remote::model-context-protocol", + ], + } + + name = "passthrough" + + inference_provider = Provider( + provider_id="passthrough", + provider_type="remote::passthrough", + config=PassthroughImplConfig.sample_run_config(), + ) + embedding_provider = Provider( + provider_id="sentence-transformers", + provider_type="inline::sentence-transformers", + config=SentenceTransformersInferenceConfig.sample_run_config(), + ) + vector_io_provider = Provider( + provider_id="faiss", + provider_type="inline::faiss", + config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), + ) + + default_models = [ + ModelInput( + metadata={}, + model_id="meta-llama/Llama-3.1-8B-Instruct", + provider_id="passthrough", + provider_model_id="llama3.1-8b-instruct", + model_type=ModelType.llm, + ), + ModelInput( + metadata={}, + model_id="meta-llama/Llama-3.2-11B-Vision-Instruct", + provider_id="passthrough", + provider_model_id="llama3.2-11b-vision-instruct", + model_type=ModelType.llm, + ), + ] + + embedding_model = ModelInput( + model_id="all-MiniLM-L6-v2", + provider_id="sentence-transformers", + model_type=ModelType.embedding, + metadata={ + "embedding_dimension": 384, + }, + ) + default_tool_groups = [ + 
ToolGroupInput( + toolgroup_id="builtin::websearch", + provider_id="tavily-search", + ), + ToolGroupInput( + toolgroup_id="builtin::wolfram_alpha", + provider_id="wolfram-alpha", + ), + ToolGroupInput( + toolgroup_id="builtin::rag", + provider_id="rag-runtime", + ), + ToolGroupInput( + toolgroup_id="builtin::code_interpreter", + provider_id="code-interpreter", + ), + ] + + return DistributionTemplate( + name=name, + distro_type="self_hosted", + description="Use Passthrough hosted llama-stack endpoint for LLM inference", + container_image=None, + template_path=Path(__file__).parent / "doc_template.md", + providers=providers, + available_models_by_provider={ + "passthrough": [ + ProviderModelEntry( + provider_model_id="llama3.1-8b-instruct", + model_type=ModelType.llm, + ), + ProviderModelEntry( + provider_model_id="llama3.2-11b-vision-instruct", + model_type=ModelType.llm, + ), + ], + }, + run_configs={ + "run.yaml": RunConfigSettings( + provider_overrides={ + "inference": [inference_provider, embedding_provider], + "vector_io": [vector_io_provider], + }, + default_models=default_models + [embedding_model], + default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")], + default_tool_groups=default_tool_groups, + ), + "run-with-safety.yaml": RunConfigSettings( + provider_overrides={ + "inference": [ + inference_provider, + embedding_provider, + ], + "vector_io": [vector_io_provider], + "safety": [ + Provider( + provider_id="llama-guard", + provider_type="inline::llama-guard", + config={}, + ), + Provider( + provider_id="llama-guard-vision", + provider_type="inline::llama-guard", + config={}, + ), + Provider( + provider_id="code-scanner", + provider_type="inline::code-scanner", + config={}, + ), + ], + }, + default_models=[ + *default_models, + embedding_model, + ], + default_shields=[ + ShieldInput( + shield_id="meta-llama/Llama-Guard-3-8B", + provider_id="llama-guard", + ), + ShieldInput( + shield_id="meta-llama/Llama-Guard-3-11B-Vision", + provider_id="llama-guard-vision", + ), + ShieldInput( + shield_id="CodeScanner", + provider_id="code-scanner", + ), + ], + default_tool_groups=default_tool_groups, + ), + }, + run_config_env_vars={ + "LLAMA_STACK_PORT": ( + "5001", + "Port for the Llama Stack distribution server", + ), + "PASSTHROUGH_API_KEY": ( + "", + "Passthrough API Key", + ), + "PASSTHROUGH_URL": ( + "", + "Passthrough URL", + ), + }, + ) diff --git a/llama_stack/templates/passthrough/run-with-safety.yaml b/llama_stack/templates/passthrough/run-with-safety.yaml new file mode 100644 index 000000000..fbfa4afe7 --- /dev/null +++ b/llama_stack/templates/passthrough/run-with-safety.yaml @@ -0,0 +1,154 @@ +version: '2' +image_name: passthrough +apis: +- agents +- datasetio +- eval +- inference +- safety +- scoring +- telemetry +- tool_runtime +- vector_io +providers: + inference: + - provider_id: passthrough + provider_type: remote::passthrough + config: + url: ${env.PASSTHROUGH_URL} + api_key: ${env.PASSTHROUGH_API_KEY} + - provider_id: sentence-transformers + provider_type: inline::sentence-transformers + config: {} + vector_io: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/faiss_store.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: {} + - provider_id: llama-guard-vision + provider_type: inline::llama-guard + config: {} + - provider_id: code-scanner + provider_type: inline::code-scanner + config: {} + 
agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/agents_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/passthrough/trace_store.db} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/meta_reference_eval.db + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/huggingface_datasetio.db + - provider_id: localfs + provider_type: inline::localfs + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/localfs_datasetio.db + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: + openai_api_key: ${env.OPENAI_API_KEY:} + tool_runtime: + - provider_id: brave-search + provider_type: remote::brave-search + config: + api_key: ${env.BRAVE_SEARCH_API_KEY:} + max_results: 3 + - provider_id: tavily-search + provider_type: remote::tavily-search + config: + api_key: ${env.TAVILY_SEARCH_API_KEY:} + max_results: 3 + - provider_id: wolfram-alpha + provider_type: remote::wolfram-alpha + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} + - provider_id: code-interpreter + provider_type: inline::code-interpreter + config: {} + - provider_id: rag-runtime + provider_type: inline::rag-runtime + config: {} + - provider_id: model-context-protocol + provider_type: remote::model-context-protocol + config: {} +metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/registry.db +models: +- metadata: {} + model_id: meta-llama/Llama-3.1-8B-Instruct + provider_id: passthrough + provider_model_id: llama3.1-8b-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-11B-Vision-Instruct + provider_id: passthrough + provider_model_id: llama3.2-11b-vision-instruct + model_type: llm +- metadata: + embedding_dimension: 384 + model_id: all-MiniLM-L6-v2 + provider_id: sentence-transformers + model_type: embedding +shields: +- shield_id: meta-llama/Llama-Guard-3-8B + provider_id: llama-guard +- shield_id: meta-llama/Llama-Guard-3-11B-Vision + provider_id: llama-guard-vision +- shield_id: CodeScanner + provider_id: code-scanner +vector_dbs: [] +datasets: [] +scoring_fns: [] +benchmarks: [] +tool_groups: +- toolgroup_id: builtin::websearch + provider_id: tavily-search +- toolgroup_id: builtin::wolfram_alpha + provider_id: wolfram-alpha +- toolgroup_id: builtin::rag + provider_id: rag-runtime +- toolgroup_id: builtin::code_interpreter + provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/passthrough/run.yaml b/llama_stack/templates/passthrough/run.yaml index 2548faa5d..6956bc6e3 100644 --- a/llama_stack/templates/passthrough/run.yaml +++ b/llama_stack/templates/passthrough/run.yaml @@ -31,7 +31,8 @@ providers: safety: - 
provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -50,14 +51,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic @@ -80,6 +93,10 @@ providers: config: api_key: ${env.TAVILY_SEARCH_API_KEY:} max_results: 3 + - provider_id: wolfram-alpha + provider_type: remote::wolfram-alpha + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} - provider_id: code-interpreter provider_type: inline::code-interpreter config: {} @@ -91,7 +108,7 @@ providers: config: {} metadata_store: type: sqlite - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-llama}/registry.db + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/registry.db models: - metadata: {} model_id: meta-llama/Llama-3.1-8B-Instruct @@ -103,15 +120,22 @@ models: provider_id: passthrough provider_model_id: llama3.2-11b-vision-instruct model_type: llm +- metadata: + embedding_dimension: 384 + model_id: all-MiniLM-L6-v2 + provider_id: sentence-transformers + model_type: embedding shields: - shield_id: meta-llama/Llama-Guard-3-8B vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search +- toolgroup_id: builtin::wolfram_alpha + provider_id: wolfram-alpha - toolgroup_id: builtin::rag provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter From 5e54113b1930bc92b63f7d025d064fdc4986c95a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Thu, 13 Mar 2025 18:14:01 +0100 Subject: [PATCH 039/557] ci: add dynamic CI job to test templates (#1230) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Introduced a new CI job that dynamically generates a build matrix based on available templates from `llama_stack/templates/*/build.yaml`. This allows automated testing for all templates without manual intervention. The CI currently builds for venv and containers. 
Signed-off-by: Sébastien Han ~Will pass once https://github.com/meta-llama/llama-stack/pull/1228 merges.~ Signed-off-by: Sébastien Han --- .github/workflows/providers-build.yml | 76 +++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 .github/workflows/providers-build.yml diff --git a/.github/workflows/providers-build.yml b/.github/workflows/providers-build.yml new file mode 100644 index 000000000..1bf6591dc --- /dev/null +++ b/.github/workflows/providers-build.yml @@ -0,0 +1,76 @@ +name: Test Llama Stack Build + +on: + push: + branches: + - main + paths: + - 'llama_stack/cli/stack/build.py' + - 'llama_stack/cli/stack/_build.py' + - 'llama_stack/distribution/build.*' + - 'llama_stack/distribution/*.sh' + - '.github/workflows/providers-build.yml' + pull_request: + paths: + - 'llama_stack/cli/stack/build.py' + - 'llama_stack/cli/stack/_build.py' + - 'llama_stack/distribution/build.*' + - 'llama_stack/distribution/*.sh' + - '.github/workflows/providers-build.yml' + +jobs: + generate-matrix: + runs-on: ubuntu-latest + outputs: + templates: ${{ steps.set-matrix.outputs.templates }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Generate Template List + id: set-matrix + run: | + templates=$(ls llama_stack/templates/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]') + echo "templates=$templates" >> "$GITHUB_OUTPUT" + + build: + needs: generate-matrix + runs-on: ubuntu-latest + strategy: + matrix: + template: ${{ fromJson(needs.generate-matrix.outputs.templates) }} + image-type: [venv, container] + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + python-version: "3.10" + + - name: Install LlamaStack + run: | + uv venv + source .venv/bin/activate + uv pip install -e . + + - name: Print build dependencies + run: | + uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only + + - name: Run Llama Stack Build + run: | + uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test + + - name: Print dependencies in the image + if: matrix.image-type == 'venv' + run: | + source test/bin/activate + uv pip list From 98811cc0347b4921e772236a3c5117d73c98600e Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 13 Mar 2025 11:01:52 -0700 Subject: [PATCH 040/557] fix: clean up test imports (#1600) # What does this PR do? 
- Clean up dead SDK code in https://github.com/meta-llama/llama-stack-client-python/pull/198 - Regen for local cache key issue [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan ``` pytest -v -s --nbval-lax ./docs/getting_started.ipynb LLAMA_STACK_CONFIG=fireworks pytest -v tests/integration/ --text-model meta-llama/Llama-3.3-70B-Instruct ``` - CI: https://github.com/meta-llama/llama-stack-ops/actions/runs/13823512113 image [//]: # (## Documentation) --- tests/integration/agents/test_agents.py | 5 ++-- tests/integration/datasetio/test_datasetio.py | 17 ++++++++++-- tests/integration/scoring/test_scoring.py | 26 +++++++++++++------ 3 files changed, 35 insertions(+), 13 deletions(-) diff --git a/tests/integration/agents/test_agents.py b/tests/integration/agents/test_agents.py index f6bde8927..61249ad17 100644 --- a/tests/integration/agents/test_agents.py +++ b/tests/integration/agents/test_agents.py @@ -10,8 +10,7 @@ from uuid import uuid4 import pytest from llama_stack_client.lib.agents.agent import Agent from llama_stack_client.lib.agents.event_logger import EventLogger -from llama_stack_client.types.agents.turn_create_params import Document as AgentDocument -from llama_stack_client.types.memory_insert_params import Document +from llama_stack_client.types.agents.turn_create_params import Document from llama_stack_client.types.shared_params.agent_config import AgentConfig, ToolConfig from llama_stack.apis.agents.agents import ( @@ -242,7 +241,7 @@ def test_code_interpreter_for_attachments(llama_stack_client_with_mocked_inferen codex_agent = Agent(llama_stack_client_with_mocked_inference, **agent_config) session_id = codex_agent.create_session(f"test-session-{uuid4()}") - inflation_doc = AgentDocument( + inflation_doc = Document( content="https://raw.githubusercontent.com/meta-llama/llama-stack-apps/main/examples/resources/inflation.csv", mime_type="text/csv", ) diff --git a/tests/integration/datasetio/test_datasetio.py b/tests/integration/datasetio/test_datasetio.py index f112071a6..459589e7b 100644 --- a/tests/integration/datasetio/test_datasetio.py +++ b/tests/integration/datasetio/test_datasetio.py @@ -9,11 +9,25 @@ import mimetypes import os from pathlib import Path +import pytest + # How to run this test: # # LLAMA_STACK_CONFIG="template-name" pytest -v tests/integration/datasetio +@pytest.fixture +def dataset_for_test(llama_stack_client): + dataset_id = "test_dataset" + register_dataset(llama_stack_client, dataset_id=dataset_id) + yield + # Teardown - this always runs, even if the test fails + try: + llama_stack_client.datasets.unregister(dataset_id) + except Exception as e: + print(f"Warning: Failed to unregister test_dataset: {e}") + + def data_url_from_file(file_path: str) -> str: if not os.path.exists(file_path): raise FileNotFoundError(f"File not found: {file_path}") @@ -80,8 +94,7 @@ def test_register_unregister_dataset(llama_stack_client): assert len(response) == 0 -def test_get_rows_paginated(llama_stack_client): - register_dataset(llama_stack_client) +def test_get_rows_paginated(llama_stack_client, dataset_for_test): response = llama_stack_client.datasetio.get_rows_paginated( dataset_id="test_dataset", rows_in_page=3, diff --git a/tests/integration/scoring/test_scoring.py b/tests/integration/scoring/test_scoring.py index 2fcdf54e2..970a96f40 100644 --- a/tests/integration/scoring/test_scoring.py +++ b/tests/integration/scoring/test_scoring.py @@ -10,6 +10,19 @@ import pytest from ..datasetio.test_datasetio import 
register_dataset +@pytest.fixture +def rag_dataset_for_test(llama_stack_client): + dataset_id = "test_dataset" + register_dataset(llama_stack_client, for_rag=True, dataset_id=dataset_id) + yield # This is where the test function will run + + # Teardown - this always runs, even if the test fails + try: + llama_stack_client.datasets.unregister(dataset_id) + except Exception as e: + print(f"Warning: Failed to unregister test_dataset: {e}") + + @pytest.fixture def sample_judge_prompt_template(): return "Output a number response in the following format: Score: , where is the number between 0 and 9." @@ -79,9 +92,7 @@ def test_scoring_functions_register( # TODO: add unregister api for scoring functions -def test_scoring_score(llama_stack_client): - register_dataset(llama_stack_client, for_rag=True) - +def test_scoring_score(llama_stack_client, rag_dataset_for_test): # scoring individual rows rows = llama_stack_client.datasetio.get_rows_paginated( dataset_id="test_dataset", @@ -115,9 +126,9 @@ def test_scoring_score(llama_stack_client): assert len(response.results[x].score_rows) == 5 -def test_scoring_score_with_params_llm_as_judge(llama_stack_client, sample_judge_prompt_template, judge_model_id): - register_dataset(llama_stack_client, for_rag=True) - +def test_scoring_score_with_params_llm_as_judge( + llama_stack_client, sample_judge_prompt_template, judge_model_id, rag_dataset_for_test +): # scoring individual rows rows = llama_stack_client.datasetio.get_rows_paginated( dataset_id="test_dataset", @@ -167,9 +178,8 @@ def test_scoring_score_with_params_llm_as_judge(llama_stack_client, sample_judge ], ) def test_scoring_score_with_aggregation_functions( - llama_stack_client, sample_judge_prompt_template, judge_model_id, provider_id + llama_stack_client, sample_judge_prompt_template, judge_model_id, provider_id, rag_dataset_for_test ): - register_dataset(llama_stack_client, for_rag=True) rows = llama_stack_client.datasetio.get_rows_paginated( dataset_id="test_dataset", rows_in_page=3, From 42788a9d505950085dfde878fc51620f3b334b5c Mon Sep 17 00:00:00 2001 From: ehhuang Date: Thu, 13 Mar 2025 11:21:10 -0700 Subject: [PATCH 041/557] test: re record responses after client sync (#1615) Summary: Test Plan: LLAMA_STACK_CONFIG=fireworks pytest -s -v tests/integration/agents/test_agents.py --safety-shield meta-llama/Llama-Guard-3-8B --text-model meta-llama/Llama-3.1-8B-Instruct --record-responses --- .../recorded_responses/chat_completion.json | 5010 ++++++++++++++++- .../recorded_responses/invoke_tool.json | 42 +- 2 files changed, 4748 insertions(+), 304 deletions(-) diff --git a/tests/integration/fixtures/recorded_responses/chat_completion.json b/tests/integration/fixtures/recorded_responses/chat_completion.json index 8694cc271..37bb28ac2 100644 --- a/tests/integration/fixtures/recorded_responses/chat_completion.json +++ b/tests/integration/fixtures/recorded_responses/chat_completion.json @@ -12758,6 +12758,131 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"false\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + 
}, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice is -100", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " degrees Fahrenheit.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 139 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 23 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 162 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"false\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -12863,6 +12988,207 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\":", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": 
"ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"get_boiling_point\", \"parameters\": {\"liquid_name", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\": \"polyjuice\", \"celcius\":", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"false\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": "false", + "liquid_name": "polyjuice" + }, + "call_id": "b0413eb2-f446-4e09-910b-7d8ba4375c87", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 91 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 45 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 136 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -13044,6 +13370,207 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"type\": \"function_call\",\n \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "name\": \"get_boiling_point\",\n \"parameters\": {\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"liquid_name\": \"polyjuice\",\n \"celcius", + "type": "text" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\": \"true\"\n }\n}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": "true", + "liquid_name": "polyjuice" + }, + "call_id": "62095a5a-c53c-4850-9f4f-b3a41699a32b", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 43 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 56 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 99 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -13370,6 +13897,131 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": 
\"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice is -100\u00b0C", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 85 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 22 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 107 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", 
\"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -13600,6 +14252,131 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": 
\"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice is -100", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\u00b0C.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 87 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 22 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 109 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and 
answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { "chunks": [ { @@ -13725,6 +14502,458 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", 
\"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\":", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " \"get_boiling_point\",", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " \"parameters\": {\"liquid_name\": \"", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "polyjuice\", \"celcius\": \"true\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": 
"ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": "true", + "liquid_name": "polyjuice" + }, + "call_id": "139fe8b9-7bfc-4dcb-ac0d-da1d97257c6e", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 47 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": 
\"get_boiling_point_with_metadata\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_point_with_metadata\", \"parameters\": {\"liquid", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_name\": \"polyjuice\", \"celcius\": \"", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "true\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": 
"ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": "true", + "liquid_name": "polyjuice" + }, + "call_id": "49ab2b64-cbcb-4e71-b02c-99026116c45e", + "tool_name": "get_boiling_point_with_metadata" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 47 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -14250,7 +15479,7 @@ "data": { "event": { "delta": { - "text": " customer smiled and said \"hello\" to the friendly store", + "text": " customer smiled and said \"hello\" to the", "type": "text" }, "event_type": { @@ -14270,7 +15499,7 
@@ "data": { "event": { "delta": { - "text": " clerk.", + "text": " friendly store clerk.", "type": "text" }, "event_type": { @@ -17388,7 +18617,7 @@ "data": { "event": { "delta": { - "text": " error message indicates that there is an issue with", + "text": " error message indicates that there is an issue with the", "type": "text" }, "event_type": { @@ -17408,7 +18637,7 @@ "data": { "event": { "delta": { - "text": " the import statement. However, the code provided does not contain any", + "text": " import statement. However, the code provided does", "type": "text" }, "event_type": { @@ -17428,7 +18657,7 @@ "data": { "event": { "delta": { - "text": " import statements that would cause this error.\n\nTo provide a more accurate", + "text": " not contain any import statements that would cause this error.\n\nTo provide", "type": "text" }, "event_type": { @@ -17448,7 +18677,7 @@ "data": { "event": { "delta": { - "text": " answer, I would need to know the contents of the CSV file", + "text": " a more accurate answer, I would need to know the contents of the", "type": "text" }, "event_type": { @@ -17468,7 +18697,7 @@ "data": { "event": { "delta": { - "text": " or more information about the error message.\n\nHowever, based on the", + "text": " CSV file or more information about the error message.\n\nHowever, based on", "type": "text" }, "event_type": { @@ -17488,7 +18717,7 @@ "data": { "event": { "delta": { - "text": " code provided, it seems like the code is trying to load a", + "text": " the code provided, it seems like the intention is to load a CSV", "type": "text" }, "event_type": { @@ -17508,7 +18737,7 @@ "data": { "event": { "delta": { - "text": " CSV file and print some basic information about it. If the file", + "text": " file and print some basic information about it. 
If the file is not", "type": "text" }, "event_type": { @@ -17528,7 +18757,7 @@ "data": { "event": { "delta": { - "text": " is not found or there is an issue with the file path,", + "text": " found or there is an issue with the file path, this could cause", "type": "text" }, "event_type": { @@ -17548,7 +18777,7 @@ "data": { "event": { "delta": { - "text": " this could cause an error.\n\nHere is a", + "text": " an error.\n\nHere is an updated version of the code that includes some", "type": "text" }, "event_type": { @@ -17568,7 +18797,7 @@ "data": { "event": { "delta": { - "text": " revised version of the code that includes some error", + "text": " error handling:\n\n```\nimport pandas as pd\n", "type": "text" }, "event_type": { @@ -17588,7 +18817,7 @@ "data": { "event": { "delta": { - "text": " handling:\n\n```\nimport pandas as pd\nimport code_interpreter", + "text": "import code_interpreter\n\ntry:\n #", "type": "text" }, "event_type": { @@ -17608,7 +18837,7 @@ "data": { "event": { "delta": { - "text": "\n\ntry:\n # Load the CSV file", + "text": " Load the CSV file\n df = pd.read_csv(\"/", "type": "text" }, "event_type": { @@ -17628,7 +18857,7 @@ "data": { "event": { "delta": { - "text": "\n df = pd.read_csv(\"/var/folders/cz", + "text": "var/folders/cz/vyh7y1d11", "type": "text" }, "event_type": { @@ -17648,7 +18877,7 @@ "data": { "event": { "delta": { - "text": "/vyh7y1d11xg", + "text": "xg881lsxsshnc5c0000gn/T/tmpmy", "type": "text" }, "event_type": { @@ -17668,7 +18897,7 @@ "data": { "event": { "delta": { - "text": "881lsxsshnc5", + "text": "lybr76/IEQ51fUginflation.csv\")\n\n ", "type": "text" }, "event_type": { @@ -17688,7 +18917,7 @@ "data": { "event": { "delta": { - "text": "c0000gn/T/tmpflpgiagc/", + "text": " # Print the first few rows of the dataframe\n print(df", "type": "text" }, "event_type": { @@ -17708,7 +18937,7 @@ "data": { "event": { "delta": { - "text": "8S20Zj2Oinflation.csv\")\n\n ", + "text": ".head())\n\n # Print the data", "type": "text" }, "event_type": { @@ -17728,7 +18957,7 @@ "data": { "event": { "delta": { - "text": " # Print the first few rows of the dataframe\n print(df.head", + "text": " types of each column\n print(df.dtypes)\n\n #", "type": "text" }, "event_type": { @@ -17748,7 +18977,7 @@ "data": { "event": { "delta": { - "text": "())\n\n # Print the data types of each column\n print", + "text": " Print the summary statistics of the dataframe\n", "type": "text" }, "event_type": { @@ -17768,7 +18997,7 @@ "data": { "event": { "delta": { - "text": "(df.dtypes)\n\n # Print the", + "text": " print(df.describe())\n\nexcept FileNotFoundError:\n print(\"The", "type": "text" }, "event_type": { @@ -17788,7 +19017,7 @@ "data": { "event": { "delta": { - "text": " summary statistics of the dataframe\n ", + "text": " file was not found.\")\nexcept pd.errors.Empty", "type": "text" }, "event_type": { @@ -17808,7 +19037,7 @@ "data": { "event": { "delta": { - "text": " print(df.describe())\n\nexcept FileNotFoundError:\n print(\"The file", + "text": "DataError:\n print(\"The file", "type": "text" }, "event_type": { @@ -17828,7 +19057,7 @@ "data": { "event": { "delta": { - "text": " was not found.\")\nexcept pd.errors.EmptyDataError", + "text": " is empty.\")\nexcept pd.errors.ParserError:\n", "type": "text" }, "event_type": { @@ -17848,7 +19077,7 @@ "data": { "event": { "delta": { - "text": ":\n print(\"The file is empty.\")\nexcept pd.errors.ParserError", + "text": " print(\"An error occurred while parsing the file.\")\n", "type": "text" }, "event_type": { @@ -17868,7 
+19097,7 @@ "data": { "event": { "delta": { - "text": ":\n print(\"An error occurred while parsing the", + "text": "except Exception as e:\n print(\"An error occurred: \",", "type": "text" }, "event_type": { @@ -17888,7 +19117,7 @@ "data": { "event": { "delta": { - "text": " file.\")\nexcept Exception as e:\n print", + "text": " str(e))\n```\n\nThis code will", "type": "text" }, "event_type": { @@ -17908,7 +19137,7 @@ "data": { "event": { "delta": { - "text": "(\"An error occurred: \", str(e))\n``", + "text": " catch specific exceptions that could occur when loading the CSV file and print", "type": "text" }, "event_type": { @@ -17928,47 +19157,7 @@ "data": { "event": { "delta": { - "text": "`\n\nThis code will catch specific exceptions that could occur when loading the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " CSV file and print a more", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " informative error message.", + "text": " a more informative error message.", "type": "text" }, "event_type": { @@ -18007,17 +19196,17 @@ { "metric": "prompt_tokens", "unit": null, - "value": 393 + "value": 389 }, { "metric": "completion_tokens", "unit": null, - "value": 331 + "value": 328 }, { "metric": "total_tokens", "unit": null, - "value": 724 + "value": 717 } ] } @@ -18083,7 +19272,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "import pandas as pd\nimport code_interpreter\n\n", + "tool_call": "import pandas as pd\nimport code_interpreter", "type": "tool_call" }, "event_type": { @@ -18108,7 +19297,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "# Load the CSV file\ndf = pd.read_csv(\"/var/f", + "tool_call": "\n\n# Load the CSV file", "type": "tool_call" }, "event_type": { @@ -18133,7 +19322,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "olders/cz/vyh7y1d11xg881lsx", + "tool_call": "\ndf = pd.read_csv(\"/var/folders/c", "type": "tool_call" }, "event_type": { @@ -18158,7 +19347,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "sshnc5c0000gn/T/tmpfl", + "tool_call": "z/vyh7y1d11xg881lsxsshnc", "type": "tool_call" }, "event_type": { @@ -18183,7 +19372,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "pgiagc/8S20Zj2Oinflation", + "tool_call": "5c0000gn/T/tmpmylybr76/IE", "type": "tool_call" }, "event_type": { @@ -18208,7 +19397,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": ".csv\")\n\n# Print the first few rows of the", + "tool_call": "Q51fUginflation.csv\")\n\n# Print the first few", "type": "tool_call" }, "event_type": { @@ -18233,7 +19422,7 @@ "__module__": 
"llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " dataframe\nprint(df.head())\n\n# Print the data types of each", + "tool_call": " rows of the dataframe\nprint(df.head())\n\n# Print the data", "type": "tool_call" }, "event_type": { @@ -18258,7 +19447,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " column\nprint(df.dtypes)\n\n#", + "tool_call": " types of each column\nprint(df.dtypes)\n\n", "type": "tool_call" }, "event_type": { @@ -18283,7 +19472,32 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " Print the summary statistics of the dataframe\nprint(df.describe())", + "tool_call": "# Print the summary statistics of the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " dataframe\nprint(df.describe())", "type": "tool_call" }, "event_type": { @@ -18310,9 +19524,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpflpgiagc/8S20Zj2Oinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" + "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpmylybr76/IEQ51fUginflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" }, - "call_id": "e999a578-cbd8-4bb8-bc53-deb2fff1ffce", + "call_id": "c4c54781-a26e-427d-aea8-6d4b9829bbcc", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -18361,7 +19575,7 @@ { "metric": "prompt_tokens", "unit": null, - "value": 215 + "value": 213 }, { "metric": "completion_tokens", @@ -18371,7 +19585,7 @@ { "metric": "total_tokens", "unit": null, - "value": 225 + "value": 223 } ] } @@ -18462,7 +19676,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " CSV file\ndf = pd.read", + "tool_call": " CSV file\ndf = pd.read_csv(\"/var/folders/cz/v", "type": "tool_call" }, "event_type": { @@ -18487,7 +19701,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "_csv(\"/var/folders/cz/vyh", + "tool_call": "yh7y1d11xg881lsx", "type": "tool_call" }, "event_type": { @@ -18512,7 +19726,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "7y1d11xg881lsxsshnc5c", + "tool_call": "sshnc5c0000gn/T/tmpmylybr76", "type": "tool_call" }, "event_type": { @@ -18537,7 +19751,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "0000gn/T/tmpflpgiagc/8S", + "tool_call": 
"/IEQ51fUginflation.csv\")\n\n", "type": "tool_call" }, "event_type": { @@ -18562,7 +19776,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "20Zj2Oinflation.csv\")\n\n# Print the first", + "tool_call": "# Print the first few rows of the dataframe", "type": "tool_call" }, "event_type": { @@ -18587,7 +19801,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " few rows of the dataframe\nprint(df.head())\n\n#", + "tool_call": "\nprint(df.head())\n\n# Print the data types of", "type": "tool_call" }, "event_type": { @@ -18612,7 +19826,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " Print the data types of each column\nprint", + "tool_call": " each column\nprint(df.dtypes)\n\n# Print the summary", "type": "tool_call" }, "event_type": { @@ -18637,32 +19851,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "(df.dtypes)\n\n# Print the summary statistics of the dataframe", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\nprint(df.describe())", + "tool_call": " statistics of the dataframe\nprint(df.describe())", "type": "tool_call" }, "event_type": { @@ -18689,9 +19878,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpflpgiagc/8S20Zj2Oinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" + "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpmylybr76/IEQ51fUginflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" }, - "call_id": "ea72d524-2d0f-4220-a898-4c295315235e", + "call_id": "1f1ed34a-bffb-459d-9f64-eb66d13b2aa5", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -21730,7 +22919,7 @@ "data": { "event": { "delta": { - "text": " code will create a line plot of the average yearly inflation over time. The", + "text": " code will create a line plot of the average yearly inflation over", "type": "text" }, "event_type": { @@ -21750,7 +22939,7 @@ "data": { "event": { "delta": { - "text": " x-axis represents the year, and the y-axis represents the average yearly inflation", + "text": " time. The x-axis represents the year, and the y-axis", "type": "text" }, "event_type": { @@ -21770,7 +22959,7 @@ "data": { "event": { "delta": { - "text": ". The plot will show the trend of average yearly inflation over the years", + "text": " represents the average yearly inflation. 
The plot will show the trend of", "type": "text" }, "event_type": { @@ -21790,7 +22979,7 @@ "data": { "event": { "delta": { - "text": ".", + "text": " average yearly inflation over the years.", "type": "text" }, "event_type": { @@ -21829,7 +23018,7 @@ { "metric": "prompt_tokens", "unit": null, - "value": 635 + "value": 631 }, { "metric": "completion_tokens", @@ -21839,7 +23028,7 @@ { "metric": "total_tokens", "unit": null, - "value": 691 + "value": 687 } ] } @@ -21905,7 +23094,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "import pandas as pd\nimport matplotlib.pyplot as", + "tool_call": "import pandas as pd\nimport matplotlib", "type": "tool_call" }, "event_type": { @@ -21930,7 +23119,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " plt\n\n# Load data\ndf = pd.read_csv(\"/var/f", + "tool_call": ".pyplot as plt\n\n# Load data", "type": "tool_call" }, "event_type": { @@ -21955,7 +23144,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "olders/cz/vyh7y1d11xg881lsx", + "tool_call": "\ndf = pd.read_csv(\"/var/folders/c", "type": "tool_call" }, "event_type": { @@ -21980,7 +23169,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "sshnc5c0000gn/T/tmpflpgiagc/", + "tool_call": "z/vyh7y1d11xg881", "type": "tool_call" }, "event_type": { @@ -22005,7 +23194,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "2VkeqrPlinflation.csv\")\n\n# Calculate average yearly inflation", + "tool_call": "lsxsshnc5c0000gn/T/tmpmy", "type": "tool_call" }, "event_type": { @@ -22030,7 +23219,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\ndf['Average'] = df[['Jan', 'Feb', '", + "tool_call": "lybr76/Dhwctgpwinflation.csv\")\n\n#", "type": "tool_call" }, "event_type": { @@ -22055,7 +23244,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "Mar', 'Apr', 'May', 'Jun', 'Jul',", + "tool_call": " Calculate average yearly inflation\ndf['Average'] = df[['", "type": "tool_call" }, "event_type": { @@ -22080,7 +23269,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " 'Aug', 'Sep', 'Oct', 'Nov', 'Dec", + "tool_call": "Jan', 'Feb', 'Mar', 'Apr", "type": "tool_call" }, "event_type": { @@ -22105,7 +23294,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "']].mean(axis=1)\n\n# Plot time series\nplt.figure(figsize", + "tool_call": "', 'May', 'Jun', 'Jul', '", "type": "tool_call" }, "event_type": { @@ -22130,7 +23319,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "=(10,6))\nplt.plot(df['Year'], df['Average", + "tool_call": "Aug', 'Sep', 'Oct', 'Nov', 'Dec", "type": "tool_call" }, "event_type": { @@ -22155,7 +23344,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "'])\nplt.xlabel('Year')\nplt.ylabel('Average Yearly", + "tool_call": "']].mean(axis=1)\n\n# Plot time series\nplt", "type": "tool_call" }, "event_type": { @@ -22180,7 +23369,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " Inflation')\nplt.title('Average Yearly Inflation Over", + "tool_call": ".figure(figsize=(10,6))\n", "type": "tool_call" }, "event_type": { @@ 
-22205,7 +23394,82 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " Time')\nplt.grid(True)\nplt.show()", + "tool_call": "plt.plot(df['Year'], df['Average'])\nplt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".xlabel('Year')\nplt.ylabel('Average Yearly Inflation')\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "plt.title('Average Yearly Inflation Over Time')\nplt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".grid(True)\nplt.show()", "type": "tool_call" }, "event_type": { @@ -22232,9 +23496,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpflpgiagc/2VkeqrPlinflation.csv\")\n\n# Calculate average yearly inflation\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\n\n# Plot time series\nplt.figure(figsize=(10,6))\nplt.plot(df['Year'], df['Average'])\nplt.xlabel('Year')\nplt.ylabel('Average Yearly Inflation')\nplt.title('Average Yearly Inflation Over Time')\nplt.grid(True)\nplt.show()" + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpmylybr76/Dhwctgpwinflation.csv\")\n\n# Calculate average yearly inflation\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\n\n# Plot time series\nplt.figure(figsize=(10,6))\nplt.plot(df['Year'], df['Average'])\nplt.xlabel('Year')\nplt.ylabel('Average Yearly Inflation')\nplt.title('Average Yearly Inflation Over Time')\nplt.grid(True)\nplt.show()" }, - "call_id": "f82fa3fd-e3be-4cb7-9298-8b4625cf709e", + "call_id": "73dbb112-a028-48fd-8664-a6c408d1f13d", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -22283,7 +23547,7 @@ { "metric": "prompt_tokens", "unit": null, - 
"value": 454 + "value": 452 }, { "metric": "completion_tokens", @@ -22293,7 +23557,7 @@ { "metric": "total_tokens", "unit": null, - "value": 464 + "value": 462 } ] } @@ -24036,7 +25300,7 @@ "data": { "event": { "delta": { - "text": " CSV file contains 10 rows and 13 columns. The columns are", + "text": " CSV file contains 10 rows and 13 columns. The columns", "type": "text" }, "event_type": { @@ -24056,7 +25320,7 @@ "data": { "event": { "delta": { - "text": " named 'Year', 'Jan', 'Feb', 'Mar', '", + "text": " are named 'Year', 'Jan', 'Feb', 'Mar", "type": "text" }, "event_type": { @@ -24076,7 +25340,7 @@ "data": { "event": { "delta": { - "text": "Apr', 'May', 'Jun', 'Jul', 'Aug',", + "text": "', 'Apr', 'May', 'Jun', 'Jul',", "type": "text" }, "event_type": { @@ -24096,7 +25360,7 @@ "data": { "event": { "delta": { - "text": " 'Sep', 'Oct', 'Nov', 'Dec'. The data", + "text": " 'Aug', 'Sep', 'Oct',", "type": "text" }, "event_type": { @@ -24116,7 +25380,7 @@ "data": { "event": { "delta": { - "text": " types of these columns are int64 for 'Year", + "text": " 'Nov', 'Dec'. The", "type": "text" }, "event_type": { @@ -24136,7 +25400,7 @@ "data": { "event": { "delta": { - "text": "' and float64 for the rest.\n\nIt appears that this CSV file", + "text": " data types of these columns are int64 for", "type": "text" }, "event_type": { @@ -24156,7 +25420,7 @@ "data": { "event": { "delta": { - "text": " contains monthly inflation rates for different years. The 'Year' column represents", + "text": " 'Year' and float64 for the rest", "type": "text" }, "event_type": { @@ -24176,7 +25440,7 @@ "data": { "event": { "delta": { - "text": " the year, and the rest of the columns represent the inflation rates", + "text": ".\n\nIt appears that this CSV file contains monthly inflation rates for", "type": "text" }, "event_type": { @@ -24196,7 +25460,7 @@ "data": { "event": { "delta": { - "text": " for each month of the", + "text": " different years. 
The 'Year' column represents the year,", "type": "text" }, "event_type": { @@ -24216,7 +25480,27 @@ "data": { "event": { "delta": { - "text": " year.", + "text": " and the rest of the columns represent the inflation rates for each", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " month of the year.", "type": "text" }, "event_type": { @@ -24255,7 +25539,7 @@ { "metric": "prompt_tokens", "unit": null, - "value": 327 + "value": 325 }, { "metric": "completion_tokens", @@ -24265,7 +25549,7 @@ { "metric": "total_tokens", "unit": null, - "value": 452 + "value": 450 } ] } @@ -24356,7 +25640,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "_csv(\"/var/folders/cz/vyh7", + "tool_call": "_csv(\"/var/folders/cz/vyh7y1", "type": "tool_call" }, "event_type": { @@ -24381,7 +25665,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "y1d11xg881lsxsshnc5c000", + "tool_call": "d11xg881lsxsshnc", "type": "tool_call" }, "event_type": { @@ -24406,7 +25690,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "0gn/T/tmpflpgiagc/2VkeqrPlinflation", + "tool_call": "5c0000gn/T/tmpmyly", "type": "tool_call" }, "event_type": { @@ -24431,7 +25715,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": ".csv\")\n# Rows\nprint(\"Number of rows and columns in", + "tool_call": "br76/Dhwctgpwinflation.csv\")\n#", "type": "tool_call" }, "event_type": { @@ -24456,7 +25740,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are", + "tool_call": " Rows\nprint(\"Number of rows and columns in the data:\",", "type": "tool_call" }, "event_type": { @@ -24481,7 +25765,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": ":\", len(df.columns))\n# Column names\nprint(\"Columns of the data", + "tool_call": " df.shape)\n# Columns\nprint(\"Columns of the data are", "type": "tool_call" }, "event_type": { @@ -24506,7 +25790,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of", + "tool_call": ":\", len(df.columns))\n# Column names\nprint(\"Columns of", "type": "tool_call" }, "event_type": { @@ -24531,7 +25815,57 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " the columns are:\", df.dtypes)", + "tool_call": " the data are:\", df.columns)\n# Column dt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": 
"in_progress" + }, + "tool_call": "ypes\nprint(\"Datatype of the columns", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " are:\", df.dtypes)", "type": "tool_call" }, "event_type": { @@ -24558,9 +25892,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpflpgiagc/2VkeqrPlinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)" + "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpmylybr76/Dhwctgpwinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)" }, - "call_id": "b8aab119-7997-428e-81ab-e6aa163f7acc", + "call_id": "f1d86c1d-75bd-43f3-9117-a906e41598f8", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -29279,6 +30613,1666 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8c735\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. 
Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:ef2c1\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. 
Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:4857b\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:ef2c1\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:ef2c1\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:ef2c1\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8c735\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:ef2c1\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:4857b\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"knowledge", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "_search\", \"parameters\": {\"query\":", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"How to use LoRA in Tor", + "type": "text" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "chtune\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "8414f84a-98b1-41eb-90bd-bce084da79eb", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 117 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 157 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8c735\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. 
For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. 
.. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:ef2c1\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:4857b\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. 
-----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about Tor", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "chtune based on the documentation you provided.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " What's your first question?", + "type": "text" + }, + "event_type": { + "__enum__": 
"ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 35 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:9050a\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. 
This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:c4e00\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:15efa\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:c4e00\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:c4e00\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:c4e00\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:9050a\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:c4e00\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:15efa\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "knowledge_search\", \"parameters\": {\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "query\": \"How to use LoRA in", + "type": "text" + }, + "event_type": 
{ + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Torchtune\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "0784780b-c3dc-4f4a-a37f-e75e83e9be61", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 117 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 157 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:9050a\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. 
For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. 
.. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:c4e00\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:15efa\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. 
-----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about Torchtune based on the documentation", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " you provided. 
What's your first question?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 35 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:a4c57\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. 
note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:46132\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. 
This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:392a8\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:46132\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:46132\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:46132\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"Meta founder\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.81595254, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. 
Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\\\", \\\"score\\\": 0.46759978, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Meta leadership: images of senior executives for download to use in articles about the company. ... Mark Zuckerberg, Founder, Chairman and Chief Executive Officer. Nick Clegg, President, Global Affairs. Joel Kaplan, Chief Global Affairs Officer. Susan Li, Chief Financial Officer.\\\", \\\"score\\\": 0.46482924, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta Platforms - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Meta_Platforms\\\", \\\"content\\\": \\\"Following a period of intense scrutiny and damaging whistleblower leaks, news started to emerge on October 21, 2021, about Facebook's plan to rebrand the company and change its name.[15][54] In the Q3 2021 Earnings Call on October 25, Mark Zuckerberg discussed the ongoing criticism of the company's social services and the way it operates, and pointed to the pivoting efforts to building the metaverse \\\\u2013 without mentioning the rebranding and the name change.[55] The metaverse vision and the name change from Facebook, Inc. to Meta Platforms was introduced at Facebook Connect on October 28, 2021.[16] Based on Facebook's PR campaign, the name change reflects the company's shifting long term focus of building the metaverse, a digital extension of the physical world by social media, virtual reality and augmented reality features.[16][56]\\\", \\\"score\\\": 0.14999175, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.036911618, \\\"raw_content\\\": null}]}\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": 
{\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " founder of Meta is Mark Zuckerberg.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 1101 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 18 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 1119 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the founder of Meta is.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, 
\"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { @@ -35576,32 +38645,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "brave_search.call(query", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "=\"Meta founder\")", + "tool_call": "brave_search.call(query=\"Meta founder\")", "type": "tool_call" }, "event_type": { @@ -35630,7 +38674,7 @@ "arguments": { "query": "Meta founder" }, - "call_id": "b81c41ae-5eb7-41b7-b466-78eb25a91bb7", + "call_id": "a9a452ac-a1a1-4414-a107-4cdc283f4129", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -36172,6 +39216,191 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": 
\"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function `get_boiling_point`", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is not able to find the boiling point", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " of polyjuice as it is a", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " fictional liquid from the Harry Potter series. 
The function is only able", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to find the boiling point of real liquids.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 56 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 126 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, 
\"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -36357,6 +39586,151 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + 
"data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function `get_boiling_point` is not able to find the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice as it is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not a real liquid.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 38 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 108 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": 
\"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { @@ -36482,6 +39856,151 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", 
\"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function `get_boiling_point` is not able to find", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the boiling point of polyjuice as it is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not a real liquid.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 38 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 108 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": 
\"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -36627,6 +40146,206 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a 
liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_point\", \"parameters\": {\"liquid_name\": \"polyjuice", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "liquid_name": "polyjuice" + }, + "call_id": 
"918c5630-abc9-4500-ac0b-b630e0743561", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 40 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -36802,6 +40521,206 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, 
\"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"get_bo", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "iling_point\", \"parameters\": {\"liquid_name\": \"polyjuice", + "type": "tool_call" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "liquid_name": "polyjuice" + }, + "call_id": "364ad4a8-2e6e-4afb-8c81-1cf98774758a", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 40 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": 
null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { @@ -37002,6 +40921,231 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"none\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" 
+ }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " couldn't find any information on", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the boiling point of Polyjuice.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Polyjuice is a magical potion in the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Harry Potter series that allows the drinker to transform into", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " someone else. It's not a physical substance", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " with a boiling point. 
If you have any other questions, I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'d be happy to help.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 73 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 103 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"none\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -37207,6 +41351,206 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": 
\"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\": \"get_boiling_point\", \"parameters\": {\"liquid", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + 
"__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_name\": \"polyjuice\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "liquid_name": "polyjuice" + }, + "call_id": "b41fafca-4559-4a0a-b49b-f4edf893d08a", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 40 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": 
\"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -37764,7 +42108,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "def is_prime(n):\n if n <= 1:\n return False", + "tool_call": "def is_prime(n):\n if n <= 1:\n", "type": "tool_call" }, "event_type": { @@ -37789,7 +42133,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\n if n <= 3:\n ", + "tool_call": " return False\n if n <= 3:\n return", "type": "tool_call" }, "event_type": { @@ -37814,7 +42158,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " return True\n if n % 2 ==", + "tool_call": " True\n if n % 2 == 0 or", "type": "tool_call" }, "event_type": { @@ -37839,7 +42183,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " 0 or n % 3 == 0:\n return False", + "tool_call": " n % 3 == 0", "type": "tool_call" }, "event_type": { @@ -37864,7 +42208,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\n i = 5\n while i * i <= n:\n ", + "tool_call": ":\n return False\n i = 5\n while", "type": "tool_call" }, "event_type": { @@ -37889,7 +42233,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " if n % i == 0 or n % (i + 2", + "tool_call": " i * i <= n:\n if n % i == ", "type": "tool_call" }, "event_type": { @@ -37914,7 +42258,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": ") == 0:\n return False\n i += 6\n ", + "tool_call": "0 or n % (i + 2) == ", "type": "tool_call" }, "event_type": { @@ -37939,7 +42283,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " return True\n\ndef get_nth_prime(n):\n count = 0\n ", + "tool_call": "0:\n return False\n i", "type": "tool_call" }, "event_type": { @@ -37964,7 +42308,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " num = 2\n while True:\n if is_prime(num):\n ", + "tool_call": " += 6\n return True\n\ndef get_nth_prime(n", "type": "tool_call" }, "event_type": { @@ -37989,7 +42333,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " count += 1\n if count == n:\n return num\n", + "tool_call": "):\n count = 0\n", "type": "tool_call" }, "event_type": { @@ -38014,7 +42358,107 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " num += 1\n\nprint(get_nth_prime(100))", + "tool_call": " num = 2\n while", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + 
"delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " True:\n if is_prime(num):\n count += 1", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\n if count == n:\n return num\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " num += 1\n\nprint(get_nth_prime", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(100))", "type": "tool_call" }, "event_type": { @@ -38043,7 +42487,7 @@ "arguments": { "code": "def is_prime(n):\n if n <= 1:\n return False\n if n <= 3:\n return True\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i + 2) == 0:\n return False\n i += 6\n return True\n\ndef get_nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count == n:\n return num\n num += 1\n\nprint(get_nth_prime(100))" }, - "call_id": "d8ece88b-7b3e-4f72-9555-5a928c27012c", + "call_id": "a1296d7e-6ca3-4056-b43f-19a9663e8bcb", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -38548,7 +42992,7 @@ "data": { "event": { "delta": { - "text": "type\": \"function\", \"name\": \"knowledge_search\", \"parameters", + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"", "type": "text" }, "event_type": { @@ -38568,7 +43012,7 @@ "data": { "event": { "delta": { - "text": "\": {\"query\": \"Perplexity company founding", + "text": "parameters\": {\"query\": \"Perplexity", "type": "text" }, "event_type": { @@ -38588,7 +43032,7 @@ "data": { "event": { "delta": { - "text": " date\"}}", + "text": " company founding date\"}}", "type": "text" }, "event_type": { @@ -38617,7 +43061,7 @@ "arguments": { "query": "Perplexity company founding date" }, - "call_id": "7f40db23-2182-4006-9234-4c5b7dac978f", + "call_id": "75b712aa-fdeb-48bb-be40-c7fcd06242b6", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -38738,7 +43182,7 @@ "__module__": 
"llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters", + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\",", "type": "tool_call" }, "event_type": { @@ -38763,7 +43207,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\": {\"query\": \"Perplexity company founding date\"}}", + "tool_call": " \"parameters\": {\"query\": \"Perplexity company founding date\"}}", "type": "tool_call" }, "event_type": { @@ -38792,7 +43236,7 @@ "arguments": { "query": "Perplexity company founding date" }, - "call_id": "7f65affe-6ecb-4db5-b70f-71e05e28c310", + "call_id": "3d505e8e-fe35-486e-9661-27f67702621d", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -39177,7 +43621,7 @@ "data": { "event": { "delta": { - "text": " NBA was created on August 3,", + "text": " NBA was created on August 3, 1949, with", "type": "text" }, "event_type": { @@ -39197,7 +43641,7 @@ "data": { "event": { "delta": { - "text": " 1949, with the merger of the Basketball Association of", + "text": " the merger of the Basketball Association of America (", "type": "text" }, "event_type": { @@ -39217,7 +43661,7 @@ "data": { "event": { "delta": { - "text": " America (BAA) and the National Basketball League (NBL", + "text": "BAA) and the National Basketball League (NBL", "type": "text" }, "event_type": { @@ -39352,7 +43796,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"", + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search", "type": "tool_call" }, "event_type": { @@ -39377,7 +43821,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "knowledge_search\", \"parameters\": {\"query\": \"when", + "tool_call": "\", \"parameters\": {\"query\": \"when was the n", "type": "tool_call" }, "event_type": { @@ -39402,7 +43846,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " was the nba created\"}}", + "tool_call": "ba created\"}}", "type": "tool_call" }, "event_type": { @@ -39431,7 +43875,7 @@ "arguments": { "query": "when was the nba created" }, - "call_id": "0f4d0151-e44c-443a-8101-e0ac92c9d45f", + "call_id": "03ce919a-d1b5-4120-896e-433e79910757", "tool_name": "knowledge_search" }, "type": "tool_call" diff --git a/tests/integration/fixtures/recorded_responses/invoke_tool.json b/tests/integration/fixtures/recorded_responses/invoke_tool.json index 8db8ad966..f3a2cfbcb 100644 --- a/tests/integration/fixtures/recorded_responses/invoke_tool.json +++ b/tests/integration/fixtures/recorded_responses/invoke_tool.json @@ -167,23 +167,23 @@ "type": "text" }, { - "text": "Result 1:\nDocument_id:15b86\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. 
code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", + "text": "Result 2:\nDocument_id:c4e00\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", "type": "text" }, { - "text": "Result 3:\nDocument_id:15b86\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. 
This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 3:\nDocument_id:c4e00\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:15b86\nContent: from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. code-block:: python\n\n # Assuming that base_model already has the pretrained Llama2 weights,\n # this will directly load them into your LoRA model without any conversion necessary.\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\n :func:`validate_missing_and_unexpected_for_lora() `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. 
_setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_ into all our\n Dataset classes and they will honor them. 
This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. 
note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 3:\nDocument_id:15efa\nContent: ` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:15b86\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 4:\nDocument_id:c4e00\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. 
code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 5:\nDocument_id:83901\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 5:\nDocument_id:15efa\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. 
code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { @@ -428,11 +428,11 @@ "error_message": null, "metadata": { "document_ids": [ - "bbddbe62-508d-4c8d-9455-3b60bc2825a5", - "15b8638f-b1b6-4f58-adfa-eb6644c47de3", - "83901b53-33d4-4f5e-8145-b94c783e9f61", - "15b8638f-b1b6-4f58-adfa-eb6644c47de3", - "83901b53-33d4-4f5e-8145-b94c783e9f61" + "9050ae1c-eba1-4846-b550-2db1957fee7d", + "c4e00391-aeb8-4d32-ac41-ae3242f38a19", + "15efa3d7-f804-4d31-ab05-a5524d82b96a", + "c4e00391-aeb8-4d32-ac41-ae3242f38a19", + "15efa3d7-f804-4d31-ab05-a5524d82b96a" ] } } From edfcb02a0e723931dcdb464d6e9bf6856108e502 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Thu, 13 Mar 2025 20:04:53 +0100 Subject: [PATCH 042/557] ci(ollama): add GitHub Actions workflow for integration tests (#1546) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Added a GitHub Action to run inference tests for the Ollama provider. This ensures we have coverage for Ollama integration. --------- Signed-off-by: Sébastien Han Co-authored-by: Ashwin Bharambe --- .github/workflows/integration-tests.yml | 80 +++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 .github/workflows/integration-tests.yml diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml new file mode 100644 index 000000000..af268e728 --- /dev/null +++ b/.github/workflows/integration-tests.yml @@ -0,0 +1,80 @@ +name: Integration tests + +on: + pull_request: + push: + branches: [main] + +jobs: + ollama: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + python-version: "3.10" + + - name: Install Ollama + run: | + curl -fsSL https://ollama.com/install.sh | sh + + - name: Pull Ollama image + run: | + ollama pull llama3.2:3b-instruct-fp16 + + - name: Start Ollama in background + run: | + nohup ollama run llama3.2:3b-instruct-fp16 > ollama.log 2>&1 & + + - name: Set Up Environment and Install Dependencies + run: | + uv sync --extra dev --extra test + uv pip install ollama faiss-cpu + uv pip install -e . + + - name: Wait for Ollama to start + run: | + echo "Waiting for Ollama..." + for i in {1..30}; do + if curl -s http://localhost:11434 | grep -q "Ollama is running"; then + echo "Ollama is running!" 
+ exit 0 + fi + sleep 1 + done + echo "Ollama failed to start" + ollama ps + ollama.log + exit 1 + + - name: Start Llama Stack server in background + env: + INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" + run: | + source .venv/bin/activate + # TODO: use "llama stack run" + nohup uv run python -m llama_stack.distribution.server.server --yaml-config ./llama_stack/templates/ollama/run.yaml > server.log 2>&1 & + + - name: Wait for Llama Stack server to be ready + run: | + echo "Waiting for Llama Stack server..." + for i in {1..30}; do + if curl -s http://localhost:8321/v1/health | grep -q "OK"; then + echo " Llama Stack server is up!" + exit 0 + fi + sleep 1 + done + echo " Llama Stack server failed to start" + cat server.log + exit 1 + + - name: Run Inference Integration Tests + env: + INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" + run: | + uv run pytest -v tests/integration/inference --stack-config=ollama --text-model="meta-llama/Llama-3.2-3B-Instruct" --embedding-model=all-MiniLM-L6-v2 From 28aade9a277b039024015914d7890b9c2cad1ce1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Thu, 13 Mar 2025 20:09:04 +0100 Subject: [PATCH 043/557] ci: add GitHub Action to close stale issues and PRs (#1613) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? - Issues/PRs inactive for 60 days are marked as stale - Stale items are closed after 30 additional days of inactivity - Adds appropriate warning and closing messages - Sets daily schedule for stale checks Signed-off-by: Sébastien Han --- .github/workflows/stale_bot.yml | 45 +++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 .github/workflows/stale_bot.yml diff --git a/.github/workflows/stale_bot.yml b/.github/workflows/stale_bot.yml new file mode 100644 index 000000000..2039fcbb4 --- /dev/null +++ b/.github/workflows/stale_bot.yml @@ -0,0 +1,45 @@ +name: Close stale issues and PRs + +on: + schedule: + - cron: '0 0 * * *' # every day at midnight + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + stale: + permissions: + issues: write + pull-requests: write + runs-on: ubuntu-latest + steps: + - name: Stale Action + uses: actions/stale@v9 + with: + stale-issue-label: 'stale' + stale-issue-message: > + This issue has been automatically marked as stale because it has not had activity within 60 days. + It will be automatically closed if no further activity occurs within 30 days. + close-issue-message: > + This issue has been automatically closed due to inactivity. + Please feel free to reopen if you feel it is still relevant! + days-before-issue-stale: 60 + days-before-issue-close: 30 + stale-pr-label: 'stale' + stale-pr-message: > + This pull request has been automatically marked as stale because it has not had activity within 60 days. + It will be automatically closed if no further activity occurs within 30 days. + close-pr-message: > + This pull request has been automatically closed due to inactivity. + Please feel free to reopen if you intend to continue working on it! + days-before-pr-stale: 60 + days-before-pr-close: 30 + operations-per-run: 300 From a1bb7c8d82ff7575f9861bf1ab0f4933822b6fe9 Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Thu, 13 Mar 2025 15:47:58 -0400 Subject: [PATCH 044/557] docs: Add OpenAI, Anthropic, Gemini to API providers table (#1617) # What does this PR do? These are supported via https://github.com/meta-llama/llama-stack/pull/1267. 
cc @ashwinb Signed-off-by: Yuan Tang --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 6e1fd088e..aade9c15f 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,10 @@ Here is a list of the various API providers and available distributions that can | PG Vector | Single Node | | | ✅ | | | | PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | | | vLLM | Hosted and Single Node | | ✅ | | | | +| OpenAI | Hosted | | ✅ | | | | +| Anthropic | Hosted | | ✅ | | | | +| Gemini | Hosted | | ✅ | | | | + ### Distributions From ed841380dc34eb7974be1fd8edad821d407c53ab Mon Sep 17 00:00:00 2001 From: ehhuang Date: Thu, 13 Mar 2025 13:18:08 -0700 Subject: [PATCH 045/557] test: turn off recordable mock for now (#1616) Summary: will figure out how to do this best, turning it off for now. Test Plan: test_agents.py --- [//]: # (BEGIN SAPLING FOOTER) Stack created with [Sapling](https://sapling-scm.com). Best reviewed with [ReviewStack](https://reviewstack.dev/meta-llama/llama-stack/pull/1616). * __->__ #1616 * #1615 --- tests/integration/fixtures/common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration/fixtures/common.py b/tests/integration/fixtures/common.py index e410039e7..1878c9e88 100644 --- a/tests/integration/fixtures/common.py +++ b/tests/integration/fixtures/common.py @@ -52,6 +52,8 @@ def llama_stack_client_with_mocked_inference(llama_stack_client, request): If --record-responses is passed, it will call the real APIs and record the responses. """ + # TODO: will rework this to be more stable + return llama_stack_client if not isinstance(llama_stack_client, LlamaStackAsLibraryClient): logging.warning( "llama_stack_client_with_mocked_inference is not supported for this client, returning original client without mocking" From a3d710e59cf469ac8eba136d98a6afd15db4e50e Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Thu, 13 Mar 2025 16:19:44 -0400 Subject: [PATCH 046/557] chore: Always check that git merge conflict markers are not present (#1610) # What does this PR do? Before the change, it was only doing it during the merge. [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan ``` $ git checkout d263edbf90f958349c7b9adea8fd4e5181932a69 $ pre-commit run --all-files check for merge conflicts................................................Failed - hook id: check-merge-conflict - exit code: 1 docs/_static/llama-stack-spec.yaml:3179: Merge conflict string '<<<<<<<' found docs/_static/llama-stack-spec.yaml:3185: Merge conflict string '=======' found docs/_static/llama-stack-spec.yaml:3190: Merge conflict string '>>>>>>>' found [...] 
``` [//]: # (## Documentation) Signed-off-by: Ihar Hrachyshka --- .pre-commit-config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 926ae21cc..80a303b09 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,6 +8,7 @@ repos: rev: v5.0.0 # Latest stable version hooks: - id: check-merge-conflict + args: ['--assume-in-merge'] - id: trailing-whitespace exclude: '\.py$' # Exclude Python files as Ruff already handles them - id: check-added-large-files From e101d15f12dba3a31f35762a4da66f8cecbc7579 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 13 Mar 2025 16:40:15 -0400 Subject: [PATCH 047/557] build(deps): bump astral-sh/setup-uv from 4 to 5 (#1620) --- .github/workflows/integration-tests.yml | 2 +- .github/workflows/providers-build.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index af268e728..bae5188fa 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -14,7 +14,7 @@ jobs: uses: actions/checkout@v4 - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v5 with: python-version: "3.10" diff --git a/.github/workflows/providers-build.yml b/.github/workflows/providers-build.yml index 1bf6591dc..be4298a98 100644 --- a/.github/workflows/providers-build.yml +++ b/.github/workflows/providers-build.yml @@ -51,7 +51,7 @@ jobs: python-version: '3.10' - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v5 with: python-version: "3.10" From a062723d031fa715dd2f1e98ef299e1845ad0a41 Mon Sep 17 00:00:00 2001 From: Charlie Doern Date: Thu, 13 Mar 2025 18:07:21 -0400 Subject: [PATCH 048/557] feat: add provider API for listing and inspecting provider info (#1429) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? currently the `inspect` API for providers is really a `list` API. Create a new `providers` API which has a GET `providers/{provider_id}` inspect API which returns "user friendly" configuration to the end user. Also add a GET `/providers` endpoint which returns the list of providers as `inspect/providers` does today. This API follows CRUD and is more intuitive/RESTful. 
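To make the new surface concrete, here is a rough usage sketch of the two routes over plain HTTP. This snippet is illustrative only and is not part of the patch; the base URL/port and the `ollama` provider id are assumptions, and plain `requests` is used instead of the official client.

```python
# Illustrative only -- routes come from this patch; server address and provider id are assumed.
import requests

BASE_URL = "http://localhost:8321"

# GET /v1/providers -> ListProvidersResponse: {"data": [{api, provider_id, provider_type}, ...]}
providers = requests.get(f"{BASE_URL}/v1/providers").json()["data"]
for p in providers:
    print(p["api"], p["provider_id"], p["provider_type"])

# GET /v1/providers/{provider_id} -> the provider's redacted configuration,
# wrapped in a "data" field by GetProviderResponse in this version of the API.
detail = requests.get(f"{BASE_URL}/v1/providers/ollama").json()
print(detail["data"])
```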
This work is part of the RFC at https://github.com/meta-llama/llama-stack/pull/1359 sensitive fields are redacted using `redact_sensetive_fields` on the server side before returning a response: Screenshot 2025-03-13 at 4 40 21 PM ## Test Plan using https://github.com/meta-llama/llama-stack-client-python/pull/181 a user is able to to run the following: `llama stack build --template ollama --image-type venv` `llama stack run --image-type venv ~/.llama/distributions/ollama/ollama-run.yaml` `llama-stack-client providers inspect ollama` Screenshot 2025-03-13 at 4 39 35 PM also, was able to run the new test_list integration test locally with ollama: Screenshot 2025-03-13 at 11 03 40 AM Signed-off-by: Charlie Doern --- docs/_static/llama-stack-spec.html | 76 ++++++++++++++++++- docs/_static/llama-stack-spec.yaml | 51 +++++++++++++ llama_stack/apis/datatypes.py | 1 + llama_stack/apis/inspect/inspect.py | 18 ++--- llama_stack/apis/providers/__init__.py | 7 ++ llama_stack/apis/providers/providers.py | 40 ++++++++++ llama_stack/distribution/configure.py | 2 +- llama_stack/distribution/distribution.py | 2 +- llama_stack/distribution/providers.py | 59 ++++++++++++++ llama_stack/distribution/resolver.py | 21 +++++ llama_stack/distribution/server/server.py | 1 + llama_stack/distribution/stack.py | 2 + pyproject.toml | 1 + tests/integration/providers/__init__.py | 5 ++ tests/integration/providers/test_providers.py | 17 +++++ 15 files changed, 291 insertions(+), 12 deletions(-) create mode 100644 llama_stack/apis/providers/__init__.py create mode 100644 llama_stack/apis/providers/providers.py create mode 100644 llama_stack/distribution/providers.py create mode 100644 tests/integration/providers/__init__.py create mode 100644 tests/integration/providers/test_providers.py diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 22fa781ac..e62f66bd6 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -2642,7 +2642,81 @@ } } }, - "/v1/inspect/providers": { + "/v1/providers": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ListProvidersResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Providers" + ], + "description": "", + "parameters": [] + } + }, + "/v1/providers/{provider_id}": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/GetProviderResponse" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Providers" + ], + "description": "", + "parameters": [ + { + "name": "provider_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ] + }, + "/v1/inspect/providers": { "get": { "responses": { "200": { diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 1f01351e9..cb31848ee 100644 --- a/docs/_static/llama-stack-spec.yaml +++ 
b/docs/_static/llama-stack-spec.yaml @@ -1782,6 +1782,57 @@ paths: schema: $ref: '#/components/schemas/RegisterModelRequest' required: true + /v1/providers: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/ListProvidersResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Providers + description: '' + parameters: [] + /v1/providers/{provider_id}: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/GetProviderResponse' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Providers + description: '' + parameters: + - name: provider_id + in: path + required: true + schema: + type: string /v1/inspect/providers: get: responses: diff --git a/llama_stack/apis/datatypes.py b/llama_stack/apis/datatypes.py index 842a2b63d..f644e5137 100644 --- a/llama_stack/apis/datatypes.py +++ b/llama_stack/apis/datatypes.py @@ -14,6 +14,7 @@ from llama_stack.schema_utils import json_schema_type @json_schema_type class Api(Enum): + providers = "providers" inference = "inference" safety = "safety" agents = "agents" diff --git a/llama_stack/apis/inspect/inspect.py b/llama_stack/apis/inspect/inspect.py index 4a647a2d9..25937bb61 100644 --- a/llama_stack/apis/inspect/inspect.py +++ b/llama_stack/apis/inspect/inspect.py @@ -11,13 +11,6 @@ from pydantic import BaseModel from llama_stack.schema_utils import json_schema_type, webmethod -@json_schema_type -class ProviderInfo(BaseModel): - api: str - provider_id: str - provider_type: str - - @json_schema_type class RouteInfo(BaseModel): route: str @@ -32,14 +25,21 @@ class HealthInfo(BaseModel): @json_schema_type -class VersionInfo(BaseModel): - version: str +class ProviderInfo(BaseModel): + api: str + provider_id: str + provider_type: str class ListProvidersResponse(BaseModel): data: List[ProviderInfo] +@json_schema_type +class VersionInfo(BaseModel): + version: str + + class ListRoutesResponse(BaseModel): data: List[RouteInfo] diff --git a/llama_stack/apis/providers/__init__.py b/llama_stack/apis/providers/__init__.py new file mode 100644 index 000000000..b554a5d23 --- /dev/null +++ b/llama_stack/apis/providers/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .providers import * # noqa: F401 F403 diff --git a/llama_stack/apis/providers/providers.py b/llama_stack/apis/providers/providers.py new file mode 100644 index 000000000..fd37bd500 --- /dev/null +++ b/llama_stack/apis/providers/providers.py @@ -0,0 +1,40 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from typing import List, Protocol, runtime_checkable + +from pydantic import BaseModel + +from llama_stack.distribution.datatypes import Provider +from llama_stack.schema_utils import json_schema_type, webmethod + + +@json_schema_type +class ProviderInfo(BaseModel): + api: str + provider_id: str + provider_type: str + + +class GetProviderResponse(BaseModel): + data: Provider | None + + +class ListProvidersResponse(BaseModel): + data: List[ProviderInfo] + + +@runtime_checkable +class Providers(Protocol): + """ + Providers API for inspecting, listing, and modifying providers and their configurations. + """ + + @webmethod(route="/providers", method="GET") + async def list_providers(self) -> ListProvidersResponse: ... + + @webmethod(route="/providers/{provider_id}", method="GET") + async def inspect_provider(self, provider_id: str) -> GetProviderResponse: ... diff --git a/llama_stack/distribution/configure.py b/llama_stack/distribution/configure.py index 715bb5db4..2a3bf7053 100644 --- a/llama_stack/distribution/configure.py +++ b/llama_stack/distribution/configure.py @@ -62,7 +62,7 @@ def configure_api_providers(config: StackRunConfig, build_spec: DistributionSpec if config.apis: apis_to_serve = config.apis else: - apis_to_serve = [a.value for a in Api if a not in (Api.telemetry, Api.inspect)] + apis_to_serve = [a.value for a in Api if a not in (Api.telemetry, Api.inspect, Api.providers)] for api_str in apis_to_serve: api = Api(api_str) diff --git a/llama_stack/distribution/distribution.py b/llama_stack/distribution/distribution.py index 308081415..ddb727663 100644 --- a/llama_stack/distribution/distribution.py +++ b/llama_stack/distribution/distribution.py @@ -56,7 +56,7 @@ def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]: def providable_apis() -> List[Api]: routing_table_apis = {x.routing_table_api for x in builtin_automatically_routed_apis()} - return [api for api in Api if api not in routing_table_apis and api != Api.inspect] + return [api for api in Api if api not in routing_table_apis and api != Api.inspect and api != Api.providers] def get_provider_registry() -> Dict[Api, Dict[str, ProviderSpec]]: diff --git a/llama_stack/distribution/providers.py b/llama_stack/distribution/providers.py new file mode 100644 index 000000000..219384900 --- /dev/null +++ b/llama_stack/distribution/providers.py @@ -0,0 +1,59 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from pydantic import BaseModel + +from llama_stack.apis.providers import GetProviderResponse, ListProvidersResponse, ProviderInfo, Providers + +from .datatypes import StackRunConfig +from .stack import redact_sensitive_fields + + +class ProviderImplConfig(BaseModel): + run_config: StackRunConfig + + +async def get_provider_impl(config, deps): + impl = ProviderImpl(config, deps) + await impl.initialize() + return impl + + +class ProviderImpl(Providers): + def __init__(self, config, deps): + self.config = config + self.deps = deps + + async def initialize(self) -> None: + pass + + async def list_providers(self) -> ListProvidersResponse: + run_config = self.config.run_config + ret = [] + for api, providers in run_config.providers.items(): + ret.extend( + [ + ProviderInfo( + api=api, + provider_id=p.provider_id, + provider_type=p.provider_type, + ) + for p in providers + ] + ) + + return ListProvidersResponse(data=ret) + + async def inspect_provider(self, provider_id: str) -> GetProviderResponse: + run_config = self.config.run_config + safe_config = StackRunConfig(**redact_sensitive_fields(run_config.model_dump())) + ret = None + for _, providers in safe_config.providers.items(): + for p in providers: + if p.provider_id == provider_id: + ret = p + + return GetProviderResponse(data=ret) diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py index ab075f399..e9e406699 100644 --- a/llama_stack/distribution/resolver.py +++ b/llama_stack/distribution/resolver.py @@ -16,6 +16,7 @@ from llama_stack.apis.inference import Inference from llama_stack.apis.inspect import Inspect from llama_stack.apis.models import Models from llama_stack.apis.post_training import PostTraining +from llama_stack.apis.providers import Providers as ProvidersAPI from llama_stack.apis.safety import Safety from llama_stack.apis.scoring import Scoring from llama_stack.apis.scoring_functions import ScoringFunctions @@ -59,6 +60,7 @@ class InvalidProviderError(Exception): def api_protocol_map() -> Dict[Api, Any]: return { + Api.providers: ProvidersAPI, Api.agents: Agents, Api.inference: Inference, Api.inspect: Inspect, @@ -247,6 +249,25 @@ def sort_providers_by_deps( ) ) + sorted_providers.append( + ( + "providers", + ProviderWithSpec( + provider_id="__builtin__", + provider_type="__builtin__", + config={"run_config": run_config.model_dump()}, + spec=InlineProviderSpec( + api=Api.providers, + provider_type="__builtin__", + config_class="llama_stack.distribution.providers.ProviderImplConfig", + module="llama_stack.distribution.providers", + api_dependencies=apis, + deps__=[x.value for x in apis], + ), + ), + ) + ) + logger.debug(f"Resolved {len(sorted_providers)} providers") for api_str, provider in sorted_providers: logger.debug(f" {api_str} => {provider.provider_id}") diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index 7ca009b13..8f9500ae9 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -368,6 +368,7 @@ def main(): apis_to_serve.add(inf.routing_table_api.value) apis_to_serve.add("inspect") + apis_to_serve.add("providers") for api_str in apis_to_serve: api = Api(api_str) diff --git a/llama_stack/distribution/stack.py b/llama_stack/distribution/stack.py index 2b974739a..9c9289a77 100644 --- a/llama_stack/distribution/stack.py +++ b/llama_stack/distribution/stack.py @@ -23,6 +23,7 @@ from llama_stack.apis.inference import Inference from llama_stack.apis.inspect import Inspect from 
llama_stack.apis.models import Models from llama_stack.apis.post_training import PostTraining +from llama_stack.apis.providers import Providers from llama_stack.apis.safety import Safety from llama_stack.apis.scoring import Scoring from llama_stack.apis.scoring_functions import ScoringFunctions @@ -44,6 +45,7 @@ logger = get_logger(name=__name__, category="core") class LlamaStack( + Providers, VectorDBs, Inference, BatchInference, diff --git a/pyproject.toml b/pyproject.toml index 055fa7a55..aaea4f7c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -170,6 +170,7 @@ exclude = [ "^llama_stack/apis/inspect/inspect\\.py$", "^llama_stack/apis/models/models\\.py$", "^llama_stack/apis/post_training/post_training\\.py$", + "^llama_stack/apis/providers/providers\\.py$", "^llama_stack/apis/resource\\.py$", "^llama_stack/apis/safety/safety\\.py$", "^llama_stack/apis/scoring/scoring\\.py$", diff --git a/tests/integration/providers/__init__.py b/tests/integration/providers/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/tests/integration/providers/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/tests/integration/providers/test_providers.py b/tests/integration/providers/test_providers.py new file mode 100644 index 000000000..174d01b5c --- /dev/null +++ b/tests/integration/providers/test_providers.py @@ -0,0 +1,17 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import pytest +from llama_stack_client import LlamaStackClient + +from llama_stack import LlamaStackAsLibraryClient + + +class TestProviders: + @pytest.mark.asyncio + def test_list(self, llama_stack_client: LlamaStackAsLibraryClient | LlamaStackClient): + provider_list = llama_stack_client.providers.list() + assert provider_list is not None From b906bad23803bf947573e6dcc6557b9cb95625ce Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Thu, 13 Mar 2025 18:28:52 -0400 Subject: [PATCH 049/557] docs: Add OpenAI, Anthropic, Gemini to inference API providers table (#1622) # What does this PR do? Forgot to update this page as well as part of https://github.com/meta-llama/llama-stack/pull/1617. Signed-off-by: Yuan Tang --- docs/source/index.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/index.md b/docs/source/index.md index 0d0508466..af381f9c9 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -61,6 +61,10 @@ A number of "adapters" are available for some popular Inference and Vector Store | Groq | Hosted | | SambaNova | Hosted | | PyTorch ExecuTorch | On-device iOS, Android | +| OpenAI | Hosted | +| Anthropic | Hosted | +| Gemini | Hosted | + **Vector IO API** | **Provider** | **Environments** | From 98b1b15e0fbff1cbf53496c2e28227a1511289c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Thu, 13 Mar 2025 23:34:53 +0100 Subject: [PATCH 050/557] refactor: move all datetime.now() calls to UTC (#1589) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Updated all instances of datetime.now() to use timezone.utc for consistency in handling time across different systems. 
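For illustration, the difference between the old and new calls looks like this (a minimal sketch, not code from this patch; variable names are made up):

```python
from datetime import datetime, timezone

naive = datetime.now()              # local wall-clock time, tzinfo is None
aware = datetime.now(timezone.utc)  # timezone-aware timestamp, always UTC

print(naive.tzinfo)       # None -> ambiguous when compared across machines
print(aware.isoformat())  # e.g. "2025-03-13T22:34:53.123456+00:00"
```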
This ensures that timestamps are always in Coordinated Universal Time (UTC), avoiding issues with time zone discrepancies and promoting uniformity in time-related data. Signed-off-by: Sébastien Han --- llama_stack/cli/download.py | 4 ++-- .../llama3/prompt_templates/system_prompts.py | 4 +++- .../agents/meta_reference/agent_instance.py | 24 +++++++++---------- .../agents/meta_reference/persistence.py | 4 ++-- .../post_training/torchtune/post_training.py | 8 +++---- .../recipes/lora_finetuning_single_device.py | 4 ++-- .../meta_reference/console_span_processor.py | 8 +++---- .../meta_reference/sqlite_span_processor.py | 12 +++++----- .../code_interpreter/code_execution.py | 2 +- .../providers/utils/telemetry/tracing.py | 6 ++--- pyproject.toml | 21 +++++++++------- 11 files changed, 52 insertions(+), 45 deletions(-) diff --git a/llama_stack/cli/download.py b/llama_stack/cli/download.py index b43d50217..f1b722183 100644 --- a/llama_stack/cli/download.py +++ b/llama_stack/cli/download.py @@ -10,7 +10,7 @@ import json import os import shutil from dataclasses import dataclass -from datetime import datetime +from datetime import datetime, timezone from functools import partial from pathlib import Path from typing import Dict, List, Optional @@ -404,7 +404,7 @@ def _download_from_manifest(manifest_file: str, max_concurrent_downloads: int): d = json.load(f) manifest = Manifest(**d) - if datetime.now() > manifest.expires_on: + if datetime.now(timezone.utc) > manifest.expires_on: raise ValueError(f"Manifest URLs have expired on {manifest.expires_on}") console = Console() diff --git a/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py b/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py index 02e4814bc..b835d0ec0 100644 --- a/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py +++ b/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py @@ -34,7 +34,9 @@ class SystemDefaultGenerator(PromptTemplateGeneratorBase): ) return PromptTemplate( template_str.lstrip("\n"), - {"today": datetime.now().strftime("%d %B %Y")}, + { + "today": datetime.now().strftime("%d %B %Y") # noqa: DTZ005 - we don't care about timezones here since we are displaying the date + }, ) def data_examples(self) -> List[Any]: diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 0ae1996cc..03692bcc7 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -11,7 +11,7 @@ import re import secrets import string import uuid -from datetime import datetime +from datetime import datetime, timezone from typing import AsyncGenerator, List, Optional, Union from urllib.parse import urlparse @@ -239,7 +239,7 @@ class ChatAgent(ShieldRunnerMixin): in_progress_tool_call_step = await self.storage.get_in_progress_tool_call_step( request.session_id, request.turn_id ) - now = datetime.now().astimezone().isoformat() + now = datetime.now(timezone.utc).isoformat() tool_execution_step = ToolExecutionStep( step_id=(in_progress_tool_call_step.step_id if in_progress_tool_call_step else str(uuid.uuid4())), turn_id=request.turn_id, @@ -264,7 +264,7 @@ class ChatAgent(ShieldRunnerMixin): start_time = last_turn.started_at else: messages.extend(request.messages) - start_time = datetime.now().astimezone().isoformat() + start_time = datetime.now(timezone.utc).isoformat() input_messages = request.messages 
output_message = None @@ -295,7 +295,7 @@ class ChatAgent(ShieldRunnerMixin): input_messages=input_messages, output_message=output_message, started_at=start_time, - completed_at=datetime.now().astimezone().isoformat(), + completed_at=datetime.now(timezone.utc).isoformat(), steps=steps, ) await self.storage.add_turn_to_session(request.session_id, turn) @@ -386,7 +386,7 @@ class ChatAgent(ShieldRunnerMixin): return step_id = str(uuid.uuid4()) - shield_call_start_time = datetime.now().astimezone().isoformat() + shield_call_start_time = datetime.now(timezone.utc).isoformat() try: yield AgentTurnResponseStreamChunk( event=AgentTurnResponseEvent( @@ -410,7 +410,7 @@ class ChatAgent(ShieldRunnerMixin): turn_id=turn_id, violation=e.violation, started_at=shield_call_start_time, - completed_at=datetime.now().astimezone().isoformat(), + completed_at=datetime.now(timezone.utc).isoformat(), ), ) ) @@ -433,7 +433,7 @@ class ChatAgent(ShieldRunnerMixin): turn_id=turn_id, violation=None, started_at=shield_call_start_time, - completed_at=datetime.now().astimezone().isoformat(), + completed_at=datetime.now(timezone.utc).isoformat(), ), ) ) @@ -472,7 +472,7 @@ class ChatAgent(ShieldRunnerMixin): client_tools[tool.name] = tool while True: step_id = str(uuid.uuid4()) - inference_start_time = datetime.now().astimezone().isoformat() + inference_start_time = datetime.now(timezone.utc).isoformat() yield AgentTurnResponseStreamChunk( event=AgentTurnResponseEvent( payload=AgentTurnResponseStepStartPayload( @@ -582,7 +582,7 @@ class ChatAgent(ShieldRunnerMixin): turn_id=turn_id, model_response=copy.deepcopy(message), started_at=inference_start_time, - completed_at=datetime.now().astimezone().isoformat(), + completed_at=datetime.now(timezone.utc).isoformat(), ), ) ) @@ -653,7 +653,7 @@ class ChatAgent(ShieldRunnerMixin): turn_id=turn_id, tool_calls=[tool_call], tool_responses=[], - started_at=datetime.now().astimezone().isoformat(), + started_at=datetime.now(timezone.utc).isoformat(), ), ) yield message @@ -670,7 +670,7 @@ class ChatAgent(ShieldRunnerMixin): "input": message.model_dump_json(), }, ) as span: - tool_execution_start_time = datetime.now().astimezone().isoformat() + tool_execution_start_time = datetime.now(timezone.utc).isoformat() tool_call = message.tool_calls[0] tool_result = await self.execute_tool_call_maybe( session_id, @@ -708,7 +708,7 @@ class ChatAgent(ShieldRunnerMixin): ) ], started_at=tool_execution_start_time, - completed_at=datetime.now().astimezone().isoformat(), + completed_at=datetime.now(timezone.utc).isoformat(), ), ) ) diff --git a/llama_stack/providers/inline/agents/meta_reference/persistence.py b/llama_stack/providers/inline/agents/meta_reference/persistence.py index 2c04305fd..e7d7d1828 100644 --- a/llama_stack/providers/inline/agents/meta_reference/persistence.py +++ b/llama_stack/providers/inline/agents/meta_reference/persistence.py @@ -7,7 +7,7 @@ import json import logging import uuid -from datetime import datetime +from datetime import datetime, timezone from typing import List, Optional from pydantic import BaseModel @@ -36,7 +36,7 @@ class AgentPersistence: session_info = AgentSessionInfo( session_id=session_id, session_name=name, - started_at=datetime.now(), + started_at=datetime.now(timezone.utc), ) await self.kvstore.set( key=f"session:{self.agent_id}:{session_id}", diff --git a/llama_stack/providers/inline/post_training/torchtune/post_training.py b/llama_stack/providers/inline/post_training/torchtune/post_training.py index 3a1affc91..2c129ef41 100644 --- 
a/llama_stack/providers/inline/post_training/torchtune/post_training.py +++ b/llama_stack/providers/inline/post_training/torchtune/post_training.py @@ -3,7 +3,7 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from datetime import datetime +from datetime import datetime, timezone from typing import Any, Dict, Optional from llama_stack.apis.datasetio import DatasetIO @@ -64,7 +64,7 @@ class TorchtunePostTrainingImpl: job_status_response = PostTrainingJobStatusResponse( job_uuid=job_uuid, status=JobStatus.scheduled, - scheduled_at=datetime.now(), + scheduled_at=datetime.now(timezone.utc), ) self.jobs[job_uuid] = job_status_response @@ -84,7 +84,7 @@ class TorchtunePostTrainingImpl: ) job_status_response.status = JobStatus.in_progress - job_status_response.started_at = datetime.now() + job_status_response.started_at = datetime.now(timezone.utc) await recipe.setup() resources_allocated, checkpoints = await recipe.train() @@ -93,7 +93,7 @@ class TorchtunePostTrainingImpl: job_status_response.resources_allocated = resources_allocated job_status_response.checkpoints = checkpoints job_status_response.status = JobStatus.completed - job_status_response.completed_at = datetime.now() + job_status_response.completed_at = datetime.now(timezone.utc) except Exception: job_status_response.status = JobStatus.failed diff --git a/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py b/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py index c88787f18..941c629e3 100644 --- a/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +++ b/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py @@ -8,7 +8,7 @@ import gc import logging import os import time -from datetime import datetime +from datetime import datetime, timezone from functools import partial from pathlib import Path from typing import Any, Dict, List, Optional, Tuple @@ -532,7 +532,7 @@ class LoraFinetuningSingleDevice: checkpoint_path = await self.save_checkpoint(epoch=curr_epoch) checkpoint = Checkpoint( identifier=f"{self.model_id}-sft-{curr_epoch}", - created_at=datetime.now(), + created_at=datetime.now(timezone.utc), epoch=curr_epoch, post_training_job_id=self.job_uuid, path=checkpoint_path, diff --git a/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py b/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py index 2e3bd4d3a..42b538876 100644 --- a/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py +++ b/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py @@ -5,7 +5,7 @@ # the root directory of this source tree. 
import json -from datetime import datetime +from datetime import datetime, timezone from opentelemetry.sdk.trace import ReadableSpan from opentelemetry.sdk.trace.export import SpanProcessor @@ -34,7 +34,7 @@ class ConsoleSpanProcessor(SpanProcessor): if span.attributes and span.attributes.get("__autotraced__"): return - timestamp = datetime.utcfromtimestamp(span.start_time / 1e9).strftime("%H:%M:%S.%f")[:-3] + timestamp = datetime.fromtimestamp(span.start_time / 1e9, tz=timezone.utc).strftime("%H:%M:%S.%f")[:-3] print( f"{COLORS['dim']}{timestamp}{COLORS['reset']} " @@ -46,7 +46,7 @@ class ConsoleSpanProcessor(SpanProcessor): if span.attributes and span.attributes.get("__autotraced__"): return - timestamp = datetime.utcfromtimestamp(span.end_time / 1e9).strftime("%H:%M:%S.%f")[:-3] + timestamp = datetime.fromtimestamp(span.end_time / 1e9, tz=timezone.utc).strftime("%H:%M:%S.%f")[:-3] span_context = ( f"{COLORS['dim']}{timestamp}{COLORS['reset']} " @@ -74,7 +74,7 @@ class ConsoleSpanProcessor(SpanProcessor): print(f" {COLORS['dim']}{key}: {str_value}{COLORS['reset']}") for event in span.events: - event_time = datetime.utcfromtimestamp(event.timestamp / 1e9).strftime("%H:%M:%S.%f")[:-3] + event_time = datetime.fromtimestamp(event.timestamp / 1e9, tz=timezone.utc).strftime("%H:%M:%S.%f")[:-3] severity = event.attributes.get("severity", "info") message = event.attributes.get("message", event.name) diff --git a/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py b/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py index b39ee7716..5ed586fce 100644 --- a/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py +++ b/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py @@ -8,7 +8,7 @@ import json import os import sqlite3 import threading -from datetime import datetime +from datetime import datetime, timezone from opentelemetry.sdk.trace import SpanProcessor from opentelemetry.trace import Span @@ -124,8 +124,8 @@ class SQLiteSpanProcessor(SpanProcessor): trace_id, service_name, (span_id if not parent_span_id else None), - datetime.fromtimestamp(span.start_time / 1e9).isoformat(), - datetime.fromtimestamp(span.end_time / 1e9).isoformat(), + datetime.fromtimestamp(span.start_time / 1e9, timezone.utc).isoformat(), + datetime.fromtimestamp(span.end_time / 1e9, timezone.utc).isoformat(), ), ) @@ -143,8 +143,8 @@ class SQLiteSpanProcessor(SpanProcessor): trace_id, parent_span_id, span.name, - datetime.fromtimestamp(span.start_time / 1e9).isoformat(), - datetime.fromtimestamp(span.end_time / 1e9).isoformat(), + datetime.fromtimestamp(span.start_time / 1e9, timezone.utc).isoformat(), + datetime.fromtimestamp(span.end_time / 1e9, timezone.utc).isoformat(), json.dumps(dict(span.attributes)), span.status.status_code.name, span.kind.name, @@ -161,7 +161,7 @@ class SQLiteSpanProcessor(SpanProcessor): ( span_id, event.name, - datetime.fromtimestamp(event.timestamp / 1e9).isoformat(), + datetime.fromtimestamp(event.timestamp / 1e9, timezone.utc).isoformat(), json.dumps(dict(event.attributes)), ), ) diff --git a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py index d7b2dbdef..810591c1c 100644 --- a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py +++ b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py @@ -168,7 +168,7 @@ def 
process_matplotlib_response(response, matplotlib_dump_dir: str): image_paths = [] for i, img in enumerate(images): # create new directory for each day to better organize data: - dump_dname = datetime.today().strftime("%Y-%m-%d") + dump_dname = datetime.today().strftime("%Y-%m-%d") # noqa: DTZ002 - we don't care about timezones here since we are displaying the date dump_dpath = Path(matplotlib_dump_dir, dump_dname) dump_dpath.mkdir(parents=True, exist_ok=True) # save image into a file diff --git a/llama_stack/providers/utils/telemetry/tracing.py b/llama_stack/providers/utils/telemetry/tracing.py index bef229080..607d1a918 100644 --- a/llama_stack/providers/utils/telemetry/tracing.py +++ b/llama_stack/providers/utils/telemetry/tracing.py @@ -11,7 +11,7 @@ import logging import queue import threading import uuid -from datetime import datetime +from datetime import datetime, timezone from functools import wraps from typing import Any, Callable, Dict, List, Optional @@ -86,7 +86,7 @@ class TraceContext: span_id=generate_short_uuid(), trace_id=self.trace_id, name=name, - start_time=datetime.now(), + start_time=datetime.now(timezone.utc), parent_span_id=current_span.span_id if current_span else None, attributes=attributes, ) @@ -203,7 +203,7 @@ class TelemetryHandler(logging.Handler): UnstructuredLogEvent( trace_id=span.trace_id, span_id=span.span_id, - timestamp=datetime.now(), + timestamp=datetime.now(timezone.utc), message=self.format(record), severity=severity(record.levelname), ) diff --git a/pyproject.toml b/pyproject.toml index aaea4f7c4..ff7f46f77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -124,14 +124,15 @@ exclude = [ [tool.ruff.lint] select = [ - "B", # flake8-bugbear - "B9", # flake8-bugbear subset - "C", # comprehensions - "E", # pycodestyle - "F", # Pyflakes - "N", # Naming - "W", # Warnings - "I", # isort + "B", # flake8-bugbear + "B9", # flake8-bugbear subset + "C", # comprehensions + "E", # pycodestyle + "F", # Pyflakes + "N", # Naming + "W", # Warnings + "I", # isort + "DTZ", # datetime rules ] ignore = [ # The following ignores are desired by the project maintainers. @@ -145,6 +146,10 @@ ignore = [ "C901", # Complexity of the function is too high ] +# Ignore the following errors for the following files +[tool.ruff.lint.per-file-ignores] +"tests/**/*.py" = ["DTZ"] # Ignore datetime rules for tests + [tool.mypy] mypy_path = ["llama_stack"] packages = ["llama_stack"] From c02464b6356006744e80c1e8fb96a4d22b1392ba Mon Sep 17 00:00:00 2001 From: Alina Ryan <66272285+alinaryan@users.noreply.github.com> Date: Thu, 13 Mar 2025 20:47:09 -0400 Subject: [PATCH 051/557] fix: Clarify `llama model prompt-format` help text (#1010) # What does this PR do? Updates the help text for the `llama model prompt-format` command to clarify that users should provide a specific model name (e.g., Llama3.1-8B, Llama3.2-11B-Vision), not a model family. Removes the default value and field for `--model-name` to prevent users from mistakenly thinking a model family name is acceptable. Adds guidance to run `llama model list` to view valid model names. [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan Output of `llama model prompt-format -h` Before: ``` (venv) alina@fedora:~/dev/llama/llama-stack$ llama model prompt-format -h usage: llama model prompt-format [-h] [-m MODEL_NAME] Show llama model message formats options: -h, --help show this help message and exit -m MODEL_NAME, --model-name MODEL_NAME Model Family (llama3_1, llama3_X, etc.) 
Example: llama model prompt-format (venv) alina@fedora:~/dev/llama/llama-stack$ llama model prompt-format --model-name llama3_1 usage: llama model prompt-format [-h] [-m MODEL_NAME] llama model prompt-format: error: llama3_1 is not a valid Model. Choose one from -- Llama3.1-8B Llama3.1-70B Llama3.1-405B Llama3.1-8B-Instruct Llama3.1-70B-Instruct Llama3.1-405B-Instruct Llama3.2-1B Llama3.2-3B Llama3.2-1B-Instruct Llama3.2-3B-Instruct Llama3.2-11B-Vision Llama3.2-90B-Vision Llama3.2-11B-Vision-Instruct Llama3.2-90B-Vision-Instruct ``` Output of `llama model prompt-format -h` After: ``` (venv) alina@fedora:~/dev/llama/llama-stack$ llama model prompt-format -h usage: llama model prompt-format [-h] [-m MODEL_NAME] Show llama model message formats options: -h, --help show this help message and exit -m MODEL_NAME, --model-name MODEL_NAME Example: Llama3.1-8B or Llama3.2-11B-Vision, etc (Run `llama model list` to see a list of valid model names) Example: llama model prompt-format ``` Signed-off-by: Alina Ryan --- llama_stack/cli/model/prompt_format.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/llama_stack/cli/model/prompt_format.py b/llama_stack/cli/model/prompt_format.py index 8058db461..0d62dcc11 100644 --- a/llama_stack/cli/model/prompt_format.py +++ b/llama_stack/cli/model/prompt_format.py @@ -41,8 +41,8 @@ class ModelPromptFormat(Subcommand): "-m", "--model-name", type=str, - default="llama3_1", - help="Model Family (llama3_1, llama3_X, etc.)", + help="Example: Llama3.1-8B or Llama3.2-11B-Vision, etc\n" + "(Run `llama model list` to see a list of valid model names)", ) self.parser.add_argument( "-l", @@ -81,10 +81,16 @@ class ModelPromptFormat(Subcommand): try: model_id = CoreModelId(args.model_name) except ValueError: - self.parser.error(f"{args.model_name} is not a valid Model. Choose one from --\n{model_str}") + self.parser.error( + f"{args.model_name} is not a valid Model. Choose one from the list of valid models. " + f"Run `llama model list` to see the valid model names." + ) if model_id not in supported_model_ids: - self.parser.error(f"{model_id} is not a valid Model. Choose one from --\n {model_str}") + self.parser.error( + f"{model_id} is not a valid Model. Choose one from the list of valid models. " + f"Run `llama model list` to see the valid model names." + ) llama_3_1_file = ROOT_DIR / "models" / "llama" / "llama3_1" / "prompt_format.md" llama_3_2_text_file = ROOT_DIR / "models" / "llama" / "llama3_2" / "text_prompt_format.md" From ca0cbf4338957c347848a5274fec752bd4255d48 Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Thu, 13 Mar 2025 21:57:42 -0400 Subject: [PATCH 052/557] fix: Fix pre-commit check (#1628) # What does this PR do? 
Fixes pre-commit check failure after merging https://github.com/meta-llama/llama-stack/pull/1010: https://github.com/meta-llama/llama-stack/actions/runs/13847489719/job/38748770971 Signed-off-by: Yuan Tang --- llama_stack/cli/model/prompt_format.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_stack/cli/model/prompt_format.py b/llama_stack/cli/model/prompt_format.py index 0d62dcc11..3ce77655b 100644 --- a/llama_stack/cli/model/prompt_format.py +++ b/llama_stack/cli/model/prompt_format.py @@ -60,7 +60,6 @@ class ModelPromptFormat(Subcommand): ] model_list = [m.value for m in supported_model_ids] - model_str = "\n".join(model_list) if args.list: headers = ["Model(s)"] From 9e73341008eb7cab053af5b58b26ed669d2c4bd1 Mon Sep 17 00:00:00 2001 From: Kai Wu Date: Thu, 13 Mar 2025 18:58:12 -0700 Subject: [PATCH 053/557] fix: change dog.jpg path in test_vision_inference.py (#1624) # What does this PR do? quick fix as the vision_inference test dog.jpg path has been changed. [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan [Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*] [//]: # (## Documentation) --- tests/integration/inference/test_vision_inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/inference/test_vision_inference.py b/tests/integration/inference/test_vision_inference.py index 984e563d7..9f6fb0478 100644 --- a/tests/integration/inference/test_vision_inference.py +++ b/tests/integration/inference/test_vision_inference.py @@ -36,7 +36,7 @@ def test_image_chat_completion_non_streaming(client_with_models, vision_model_id "type": "image", "image": { "url": { - "uri": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/api/inference/dog.png" + "uri": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png" }, }, }, @@ -65,7 +65,7 @@ def test_image_chat_completion_streaming(client_with_models, vision_model_id): "type": "image", "image": { "url": { - "uri": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/api/inference/dog.png" + "uri": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png" }, }, }, From 33b096cc21e48910cf05f0c3e513032adb99fa84 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Thu, 13 Mar 2025 19:56:32 -0700 Subject: [PATCH 054/557] fix: OpenAPI with provider get (#1627) # What does this PR do? - https://github.com/meta-llama/llama-stack/pull/1429 introduces GetProviderResponse in OpenAPI, which is not needed, and not correctly defined. 
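With this change, `GET /v1/providers/{provider_id}` returns the `ProviderInfo` object directly rather than a wrapper type. Roughly, a caller would see something like the following (a hedged sketch only; the server address, provider id, and concrete field values are invented for illustration):

```python
import requests

# Assumed local server and provider id; the response values below are illustrative.
info = requests.get("http://localhost:8321/v1/providers/ollama").json()
# e.g. {"api": "inference", "provider_id": "ollama", "provider_type": "remote::ollama", "config": {...}}
print(info["provider_type"], info["config"])
```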
cc @cdoern [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan ``` llama-stack-client providers list ``` image [//]: # (## Documentation) --- docs/_static/llama-stack-spec.html | 189 ++++++++++++------------ docs/_static/llama-stack-spec.yaml | 124 ++++++++-------- llama_stack/apis/providers/providers.py | 10 +- llama_stack/distribution/providers.py | 22 +-- 4 files changed, 166 insertions(+), 179 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index e62f66bd6..b5e4097d9 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -2151,6 +2151,48 @@ } } }, + "/v1/providers/{provider_id}": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ProviderInfo" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Providers" + ], + "description": "", + "parameters": [ + { + "name": "provider_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ] + } + }, "/v1/tool-runtime/invoke": { "post": { "responses": { @@ -2643,80 +2685,6 @@ } }, "/v1/providers": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListProvidersResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Providers" - ], - "description": "", - "parameters": [] - } - }, - "/v1/providers/{provider_id}": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/GetProviderResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Providers" - ], - "description": "", - "parameters": [ - { - "name": "provider_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - } - ] - }, - "/v1/inspect/providers": { "get": { "responses": { "200": { @@ -7986,6 +7954,53 @@ ], "title": "InsertChunksRequest" }, + "ProviderInfo": { + "type": "object", + "properties": { + "api": { + "type": "string" + }, + "provider_id": { + "type": "string" + }, + "provider_type": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "api", + "provider_id", + "provider_type", + "config" + ], + "title": "ProviderInfo" + }, "InvokeToolRequest": { "type": "object", "properties": { @@ -8198,27 +8213,6 @@ ], "title": "ListModelsResponse" }, - 
"ProviderInfo": { - "type": "object", - "properties": { - "api": { - "type": "string" - }, - "provider_id": { - "type": "string" - }, - "provider_type": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "api", - "provider_id", - "provider_type" - ], - "title": "ProviderInfo" - }, "ListProvidersResponse": { "type": "object", "properties": { @@ -10219,6 +10213,10 @@ { "name": "PostTraining (Coming Soon)" }, + { + "name": "Providers", + "x-displayName": "Providers API for inspecting, listing, and modifying providers and their configurations." + }, { "name": "Safety" }, @@ -10265,6 +10263,7 @@ "Inspect", "Models", "PostTraining (Coming Soon)", + "Providers", "Safety", "Scoring", "ScoringFunctions", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index cb31848ee..bf2343ede 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -1444,6 +1444,34 @@ paths: schema: $ref: '#/components/schemas/InsertChunksRequest' required: true + /v1/providers/{provider_id}: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/ProviderInfo' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Providers + description: '' + parameters: + - name: provider_id + in: path + required: true + schema: + type: string /v1/tool-runtime/invoke: post: responses: @@ -1783,57 +1811,6 @@ paths: $ref: '#/components/schemas/RegisterModelRequest' required: true /v1/providers: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListProvidersResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Providers - description: '' - parameters: [] - /v1/providers/{provider_id}: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/GetProviderResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Providers - description: '' - parameters: - - name: provider_id - in: path - required: true - schema: - type: string - /v1/inspect/providers: get: responses: '200': @@ -5460,6 +5437,32 @@ components: - vector_db_id - chunks title: InsertChunksRequest + ProviderInfo: + type: object + properties: + api: + type: string + provider_id: + type: string + provider_type: + type: string + config: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - api + - provider_id + - provider_type + - config + title: ProviderInfo InvokeToolRequest: type: object properties: @@ -5595,21 +5598,6 @@ components: required: - data title: ListModelsResponse - ProviderInfo: - type: object - properties: - api: - type: string - provider_id: - type: string - provider_type: - type: string - additionalProperties: false - required: - - api 
- - provider_id - - provider_type - title: ProviderInfo ListProvidersResponse: type: object properties: @@ -6883,6 +6871,9 @@ tags: - name: Inspect - name: Models - name: PostTraining (Coming Soon) + - name: Providers + x-displayName: >- + Providers API for inspecting, listing, and modifying providers and their configurations. - name: Safety - name: Scoring - name: ScoringFunctions @@ -6907,6 +6898,7 @@ x-tagGroups: - Inspect - Models - PostTraining (Coming Soon) + - Providers - Safety - Scoring - ScoringFunctions diff --git a/llama_stack/apis/providers/providers.py b/llama_stack/apis/providers/providers.py index fd37bd500..83d03d7c1 100644 --- a/llama_stack/apis/providers/providers.py +++ b/llama_stack/apis/providers/providers.py @@ -4,11 +4,10 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import List, Protocol, runtime_checkable +from typing import Any, Dict, List, Protocol, runtime_checkable from pydantic import BaseModel -from llama_stack.distribution.datatypes import Provider from llama_stack.schema_utils import json_schema_type, webmethod @@ -17,10 +16,7 @@ class ProviderInfo(BaseModel): api: str provider_id: str provider_type: str - - -class GetProviderResponse(BaseModel): - data: Provider | None + config: Dict[str, Any] class ListProvidersResponse(BaseModel): @@ -37,4 +33,4 @@ class Providers(Protocol): async def list_providers(self) -> ListProvidersResponse: ... @webmethod(route="/providers/{provider_id}", method="GET") - async def inspect_provider(self, provider_id: str) -> GetProviderResponse: ... + async def inspect_provider(self, provider_id: str) -> ProviderInfo: ... diff --git a/llama_stack/distribution/providers.py b/llama_stack/distribution/providers.py index 219384900..fb2476767 100644 --- a/llama_stack/distribution/providers.py +++ b/llama_stack/distribution/providers.py @@ -4,9 +4,10 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+ from pydantic import BaseModel -from llama_stack.apis.providers import GetProviderResponse, ListProvidersResponse, ProviderInfo, Providers +from llama_stack.apis.providers import ListProvidersResponse, ProviderInfo, Providers from .datatypes import StackRunConfig from .stack import redact_sensitive_fields @@ -32,14 +33,16 @@ class ProviderImpl(Providers): async def list_providers(self) -> ListProvidersResponse: run_config = self.config.run_config + safe_config = StackRunConfig(**redact_sensitive_fields(run_config.model_dump())) ret = [] - for api, providers in run_config.providers.items(): + for api, providers in safe_config.providers.items(): ret.extend( [ ProviderInfo( api=api, provider_id=p.provider_id, provider_type=p.provider_type, + config=p.config, ) for p in providers ] @@ -47,13 +50,10 @@ class ProviderImpl(Providers): return ListProvidersResponse(data=ret) - async def inspect_provider(self, provider_id: str) -> GetProviderResponse: - run_config = self.config.run_config - safe_config = StackRunConfig(**redact_sensitive_fields(run_config.model_dump())) - ret = None - for _, providers in safe_config.providers.items(): - for p in providers: - if p.provider_id == provider_id: - ret = p + async def inspect_provider(self, provider_id: str) -> ProviderInfo: + all_providers = await self.list_providers() + for p in all_providers.data: + if p.provider_id == provider_id: + return p - return GetProviderResponse(data=ret) + raise ValueError(f"Provider {provider_id} not found") From bfc79217a8837b6615209fd0e530b3fe004921ef Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Thu, 13 Mar 2025 23:25:15 -0400 Subject: [PATCH 055/557] chore: Add ./scripts/unit-tests.sh (#1515) # What does this PR do? Useful for local development. Now you can just trigger the script and not care about specific arguments to pass to run unit tests. [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan ``` $ . ./venv/bin/activate $ ./scripts/run_tests.sh $ echo $? 0 ``` [//]: # (## Documentation) Signed-off-by: Ihar Hrachyshka Co-authored-by: Nathan Weinberg <31703736+nathan-weinberg@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 2 +- CONTRIBUTING.md | 16 ++++++++++++++++ scripts/unit-tests.sh | 19 +++++++++++++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) create mode 100755 scripts/unit-tests.sh diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 517b5c39a..c7a30e9b8 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -33,7 +33,7 @@ jobs: - name: Run unit tests run: | - uv run --python ${{ matrix.python }} --with-editable . --with-editable ".[dev]" --with-editable ".[unit]" pytest --cov=llama_stack -s -v tests/unit/ --junitxml=pytest-report-${{ matrix.python }}.xml --cov-report=html:htmlcov-${{ matrix.python }} + PYTHON_VERSION=${{ matrix.python }} ./scripts/unit-tests.sh --cov=llama_stack --junitxml=pytest-report-${{ matrix.python }}.xml --cov-report=html:htmlcov-${{ matrix.python }} - name: Upload test results if: always() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 71e610064..1f188f259 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -108,6 +108,22 @@ uv run pre-commit run --all-files > [!CAUTION] > Before pushing your changes, make sure that the pre-commit hooks have passed successfully. 
+## Running unit tests + +You can run the unit tests by running: + +```bash +source .venv/bin/activate +./scripts/unit-tests.sh +``` + +If you'd like to run for a non-default version of Python (currently 3.10), pass `PYTHON_VERSION` variable as follows: + +``` +source .venv/bin/activate +PYTHON_VERSION=3.13 ./scripts/unit-tests.sh +``` + ## Adding a new dependency to the project To add a new dependency to the project, you can use the `uv` command. For example, to add `foo` to the project, you can run: diff --git a/scripts/unit-tests.sh b/scripts/unit-tests.sh new file mode 100755 index 000000000..dbc25e06b --- /dev/null +++ b/scripts/unit-tests.sh @@ -0,0 +1,19 @@ +#!/bin/sh + +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +PYTHON_VERSION=${PYTHON_VERSION:-3.10} + +command -v uv >/dev/null 2>&1 || { echo >&2 "uv is required but it's not installed. Exiting."; exit 1; } + +uv python find $PYTHON_VERSION +FOUND_PYTHON=$? +if [ $FOUND_PYTHON -ne 0 ]; then + uv python install $PYTHON_VERSION +fi + +uv run --python $PYTHON_VERSION --with-editable . --with-editable ".[dev]" --with-editable ".[unit]" pytest -s -v tests/unit/ $@ From e3e7013ac8e64af382de2f6885f6094aedcffc11 Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Fri, 14 Mar 2025 12:20:49 -0400 Subject: [PATCH 056/557] chore: Add pre-commit check to sync api spec docs (#1609) # What does this PR do? It will fail if the newly generated spec docs are different. [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan ``` $ pre-commit run --all-files check for merge conflicts................................................Passed trim trailing whitespace.................................................Passed check for added large files..............................................Passed fix end of files.........................................................Passed Insert license in comments...............................................Passed ruff.....................................................................Passed ruff-format..............................................................Passed blacken-docs.............................................................Passed uv-lock..................................................................Passed uv-export................................................................Passed mypy.....................................................................Passed Distribution Template Codegen............................................Passed API Spec Codegen.........................................................Passed ``` Now add a field to existing API. 
Repeat: ``` $ pre-commit run --all-files check for merge conflicts................................................Passed trim trailing whitespace.................................................Passed check for added large files..............................................Passed fix end of files.........................................................Passed Insert license in comments...............................................Passed ruff.....................................................................Passed ruff-format..............................................................Passed blacken-docs.............................................................Passed uv-lock..................................................................Passed uv-export................................................................Passed mypy.....................................................................Passed Distribution Template Codegen............................................Passed API Spec Codegen.........................................................Failed - hook id: openapi-codegen - files were modified by this hook ``` [//]: # (## Documentation) Signed-off-by: Ihar Hrachyshka --- .pre-commit-config.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 80a303b09..072fa21e2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -83,6 +83,17 @@ repos: require_serial: true files: ^llama_stack/templates/.*$|^llama_stack/providers/.*/inference/.*/models\.py$ +- repo: local + hooks: + - id: openapi-codegen + name: API Spec Codegen + additional_dependencies: + - uv==0.6.2 + entry: sh -c 'uv run --with ".[dev]" ./docs/openapi_generator/run_openapi_generator.sh > /dev/null 2>&1' + language: python + pass_filenames: false + require_serial: true + ci: autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks autoupdate_commit_msg: ⬆ [pre-commit.ci] pre-commit autoupdate From 78d4872c0cdb39643b4109735b667df1568c8443 Mon Sep 17 00:00:00 2001 From: Charlie Doern Date: Fri, 14 Mar 2025 15:36:25 -0400 Subject: [PATCH 057/557] feat: add support for logging config in the run.yaml (#1408) # What does this PR do? a user should be able to store a static logging configuration outside of their environment. This would make sense to store in the run yaml given that we store other things like server configuration in there. The environment variable settings override the config settings if both are available. The format in the config looks like this: ``` logging_config: category_levels: VALID_CATEGORY: VALID_STRING_LOG_LEVEL ``` any specified category out of the following: `core | server | router | inference | agents | safety | eval | tools | client` combined with any of the following log levels: `debug | info | warning | error | critical` can be placed in the category_levels list in order to achieve the desired log level ## Test Plan Test locally with a run config like the following: ``` version: '2' image_name: ollama logging_config: category_levels: server: debug apis: ... 
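# Illustrative only (not part of the original test run): any of the categories
# core, server, router, inference, agents, safety, eval, tools, client may be
# listed under category_levels, each set to one of debug, info, warning,
# error or critical, e.g.:
#   category_levels:
#     inference: warning
#     router: error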
``` Signed-off-by: Charlie Doern --- llama_stack/distribution/datatypes.py | 10 +++ llama_stack/distribution/server/server.py | 36 ++++++---- llama_stack/log.py | 82 +++++++++++++++++------ 3 files changed, 93 insertions(+), 35 deletions(-) diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index f62996081..7e1d8c016 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -117,6 +117,14 @@ class Provider(BaseModel): config: Dict[str, Any] +class LoggingConfig(BaseModel): + category_levels: Dict[str, str] = Field( + default_factory=Dict, + description=""" + Dictionary of different logging configurations for different portions (ex: core, server) of llama stack""", + ) + + class ServerConfig(BaseModel): port: int = Field( default=8321, @@ -176,6 +184,8 @@ a default SQLite store will be used.""", benchmarks: List[BenchmarkInput] = Field(default_factory=list) tool_groups: List[ToolGroupInput] = Field(default_factory=list) + logging: Optional[LoggingConfig] = Field(default=None, description="Configuration for Llama Stack Logging") + server: ServerConfig = Field( default_factory=ServerConfig, description="Configuration for the HTTP(S) server", diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index 8f9500ae9..b37b3a007 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -25,7 +25,7 @@ from fastapi.responses import JSONResponse, StreamingResponse from pydantic import BaseModel, ValidationError from typing_extensions import Annotated -from llama_stack.distribution.datatypes import StackRunConfig +from llama_stack.distribution.datatypes import LoggingConfig, StackRunConfig from llama_stack.distribution.distribution import builtin_automatically_routed_apis from llama_stack.distribution.request_headers import ( PROVIDER_DATA_VAR, @@ -306,34 +306,42 @@ def main(): args = parser.parse_args() - if args.env: - for env_pair in args.env: - try: - key, value = validate_env_pair(env_pair) - logger.info(f"Setting CLI environment variable {key} => {value}") - os.environ[key] = value - except ValueError as e: - logger.error(f"Error: {str(e)}") - sys.exit(1) - + log_line = "" if args.yaml_config: # if the user provided a config file, use it, even if template was specified config_file = Path(args.yaml_config) if not config_file.exists(): raise ValueError(f"Config file {config_file} does not exist") - logger.info(f"Using config file: {config_file}") + log_line = f"Using config file: {config_file}" elif args.template: config_file = Path(REPO_ROOT) / "llama_stack" / "templates" / args.template / "run.yaml" if not config_file.exists(): raise ValueError(f"Template {args.template} does not exist") - logger.info(f"Using template {args.template} config file: {config_file}") + log_line = f"Using template {args.template} config file: {config_file}" else: raise ValueError("Either --yaml-config or --template must be provided") + logger_config = None with open(config_file, "r") as fp: - config = replace_env_vars(yaml.safe_load(fp)) + config_contents = yaml.safe_load(fp) + if isinstance(config_contents, dict) and (cfg := config_contents.get("logging_config")): + logger_config = LoggingConfig(**cfg) + logger = get_logger(name=__name__, category="server", config=logger_config) + if args.env: + for env_pair in args.env: + try: + key, value = validate_env_pair(env_pair) + logger.info(f"Setting CLI environment variable {key} => {value}") + 
os.environ[key] = value + except ValueError as e: + logger.error(f"Error: {str(e)}") + sys.exit(1) + config = replace_env_vars(config_contents) config = StackRunConfig(**config) + # now that the logger is initialized, print the line about which type of config we are using. + logger.info(log_line) + logger.info("Run configuration:") safe_config = redact_sensitive_fields(config.model_dump()) logger.info(yaml.dump(safe_config, indent=2)) diff --git a/llama_stack/log.py b/llama_stack/log.py index 572dea234..0ba95d547 100644 --- a/llama_stack/log.py +++ b/llama_stack/log.py @@ -7,13 +7,15 @@ import logging import os from logging.config import dictConfig -from typing import Dict +from typing import Dict, Optional from rich.console import Console from rich.errors import MarkupError from rich.logging import RichHandler from termcolor import cprint +from .distribution.datatypes import LoggingConfig + # Default log level DEFAULT_LOG_LEVEL = logging.INFO @@ -34,6 +36,56 @@ CATEGORIES = [ _category_levels: Dict[str, int] = {category: DEFAULT_LOG_LEVEL for category in CATEGORIES} +def config_to_category_levels(category: str, level: str): + """ + Helper function to be called either by environment parsing or yaml parsing to go from a list of categories and levels to a dictionary ready to be + used by the logger dictConfig. + + Parameters: + category (str): logging category to apply the level to + level (str): logging level to be used in the category + + Returns: + Dict[str, int]: A dictionary mapping categories to their log levels. + """ + + category_levels: Dict[str, int] = {} + level_value = logging._nameToLevel.get(str(level).upper()) + if level_value is None: + logging.warning(f"Unknown log level '{level}' for category '{category}'. Falling back to default 'INFO'.") + return category_levels + + if category == "all": + # Apply the log level to all categories and the root logger + for cat in CATEGORIES: + category_levels[cat] = level_value + # Set the root logger's level to the specified level + category_levels["root"] = level_value + elif category in CATEGORIES: + category_levels[category] = level_value + logging.info(f"Setting '{category}' category to level '{level}'.") + else: + logging.warning(f"Unknown logging category: {category}. No changes made.") + return category_levels + + +def parse_yaml_config(yaml_config: LoggingConfig) -> Dict[str, int]: + """ + Helper function to parse a yaml logging configuration found in the run.yaml + + Parameters: + yaml_config (Logging): the logger config object found in the run.yaml + + Returns: + Dict[str, int]: A dictionary mapping categories to their log levels. + """ + category_levels = {} + for category, level in yaml_config.category_levels.items(): + category_levels.update(config_to_category_levels(category=category, level=level)) + + return category_levels + + def parse_environment_config(env_config: str) -> Dict[str, int]: """ Parse the LLAMA_STACK_LOGGING environment variable and return a dictionary of category log levels. @@ -53,25 +105,7 @@ def parse_environment_config(env_config: str) -> Dict[str, int]: category, level = pair.split("=", 1) category = category.strip().lower() level = level.strip().upper() # Convert to uppercase for logging._nameToLevel - - level_value = logging._nameToLevel.get(level) - if level_value is None: - logging.warning( - f"Unknown log level '{level}' for category '{category}'. Falling back to default 'INFO'." 
- ) - continue - - if category == "all": - # Apply the log level to all categories and the root logger - for cat in CATEGORIES: - category_levels[cat] = level_value - # Set the root logger's level to the specified level - category_levels["root"] = level_value - elif category in CATEGORIES: - category_levels[category] = level_value - logging.info(f"Setting '{category}' category to level '{level}'.") - else: - logging.warning(f"Unknown logging category: {category}. No changes made.") + category_levels.update(config_to_category_levels(category=category, level=level)) except ValueError: logging.warning(f"Invalid logging configuration: '{pair}'. Expected format: 'category=level'.") @@ -176,7 +210,9 @@ def setup_logging(category_levels: Dict[str, int], log_file: str | None) -> None logger.setLevel(root_level) -def get_logger(name: str, category: str = "uncategorized") -> logging.LoggerAdapter: +def get_logger( + name: str, category: str = "uncategorized", config: Optional[LoggingConfig] | None = None +) -> logging.LoggerAdapter: """ Returns a logger with the specified name and category. If no category is provided, defaults to 'uncategorized'. @@ -184,10 +220,14 @@ def get_logger(name: str, category: str = "uncategorized") -> logging.LoggerAdap Parameters: name (str): The name of the logger (e.g., module or filename). category (str): The category of the logger (default 'uncategorized'). + config (Logging): optional yaml config to override the existing logger configuration Returns: logging.LoggerAdapter: Configured logger with category support. """ + if config: + _category_levels.update(parse_yaml_config(config)) + logger = logging.getLogger(name) logger.setLevel(_category_levels.get(category, DEFAULT_LOG_LEVEL)) return logging.LoggerAdapter(logger, {"category": category}) From a626b7bce37089913d528b7f343a49e0d7350051 Mon Sep 17 00:00:00 2001 From: yyymeta <123776235+yyymeta@users.noreply.github.com> Date: Fri, 14 Mar 2025 12:50:49 -0700 Subject: [PATCH 058/557] feat: [new open benchmark] BFCL_v3 (#1578) # What does this PR do? create a new dataset BFCL_v3 from https://gorilla.cs.berkeley.edu/blogs/13_bfcl_v3_multi_turn.html overall each question asks the model to perform a task described in natural language, and additionally a set of available functions and their schema are given for the model to choose from. the model is required to write the function call form including function name and parameters , to achieve the stated purpose. the results are validated against provided ground truth, to make sure that the generated function call and the ground truth function call are syntactically and semantically equivalent, by checking their AST . 
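The core of the AST-based comparison can be illustrated with a small, self-contained sketch. This is not the checker added in this PR (which lives under `utils/bfcl/` and also handles Java/JavaScript parsing, type checking against the function schema, and multiple acceptable ground-truth values per parameter); the helper names below are invented for illustration and only cover single Python-style calls with keyword arguments:

```python
# Minimal sketch: decide whether two Python-style function calls are equivalent
# by parsing both into ASTs and comparing function name plus keyword arguments.
import ast


def parse_call(call_str: str) -> tuple[str, dict]:
    """Parse a call like 'get_weather(city="Paris", unit="C")' into (name, kwargs)."""
    node = ast.parse(call_str.strip(), mode="eval").body
    if not isinstance(node, ast.Call) or not isinstance(node.func, ast.Name):
        raise ValueError(f"not a simple function call: {call_str!r}")
    kwargs = {kw.arg: ast.literal_eval(kw.value) for kw in node.keywords}
    return node.func.id, kwargs


def calls_equivalent(generated: str, expected: str) -> bool:
    """Equivalent when the function name and the keyword arguments match."""
    return parse_call(generated) == parse_call(expected)


print(calls_equivalent('get_weather(city="Paris", unit="C")',
                       'get_weather(unit="C", city="Paris")'))  # True: argument order does not matter
```

In the benchmark itself the comparison additionally consults the provided function schema, so a generated value only counts as a match if it also has the declared parameter type and appears among the allowed ground-truth values.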
## Test Plan start server by ``` llama stack run ./llama_stack/templates/ollama/run.yaml ``` then send traffic ``` llama-stack-client eval run-benchmark "bfcl" --model-id meta-llama/Llama-3.2-3B-Instruct --output-dir /tmp/gpqa --num-examples 2 ``` [//]: # (## Documentation) --- distributions/dependencies.json | 19 + .../inline/eval/meta_reference/eval.py | 7 +- .../providers/inline/scoring/basic/scoring.py | 9 +- .../basic/scoring_fn/bfcl_scoring_fn.py | 93 ++ .../scoring/basic/scoring_fn/fn_defs/bfcl.py | 21 + .../scoring/basic/utils/bfcl/__init__.py | 5 + .../scoring/basic/utils/bfcl/ast_parser.py | 296 ++++++ .../scoring/basic/utils/bfcl/checker.py | 989 ++++++++++++++++++ .../scoring/basic/utils/bfcl/tree_sitter.py | 40 + llama_stack/providers/registry/eval.py | 2 +- .../utils/common/data_schema_validator.py | 22 + .../open-benchmark/open_benchmark.py | 21 + llama_stack/templates/open-benchmark/run.yaml | 23 + requirements.txt | 2 +- uv.lock | 6 +- 15 files changed, 1546 insertions(+), 9 deletions(-) create mode 100644 llama_stack/providers/inline/scoring/basic/scoring_fn/bfcl_scoring_fn.py create mode 100644 llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/bfcl.py create mode 100644 llama_stack/providers/inline/scoring/basic/utils/bfcl/__init__.py create mode 100644 llama_stack/providers/inline/scoring/basic/utils/bfcl/ast_parser.py create mode 100644 llama_stack/providers/inline/scoring/basic/utils/bfcl/checker.py create mode 100644 llama_stack/providers/inline/scoring/basic/utils/bfcl/tree_sitter.py diff --git a/distributions/dependencies.json b/distributions/dependencies.json index c3f039247..d2ed12d3a 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -30,6 +30,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn" ], "cerebras": [ @@ -62,6 +63,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -97,6 +99,7 @@ "sqlite-vec", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -132,6 +135,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -168,6 +172,7 @@ "sqlite-vec", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -203,6 +208,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -236,6 +242,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn" ], "hf-endpoint": [ @@ -270,6 +277,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn" ], "hf-serverless": [ @@ -304,6 +312,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -344,6 +353,7 @@ "torchvision", "tqdm", "transformers", + "tree_sitter", "uvicorn", "zmq" ], @@ -385,6 +395,7 @@ "torchvision", "tqdm", "transformers", + "tree_sitter", "uvicorn", "zmq" ], @@ -417,6 +428,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn" ], "ollama": [ @@ -451,6 +463,7 @@ "sentencepiece", "tqdm", 
"transformers", + "tree_sitter", "uvicorn" ], "open-benchmark": [ @@ -485,6 +498,7 @@ "together", "tqdm", "transformers", + "tree_sitter", "uvicorn" ], "passthrough": [ @@ -517,6 +531,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -551,6 +566,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -616,6 +632,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -651,6 +668,7 @@ "together", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -685,6 +703,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn", "vllm", "sentence-transformers --no-deps", diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index a1bebaa4c..85b351262 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -12,7 +12,7 @@ from llama_stack.apis.agents import Agents, StepType from llama_stack.apis.benchmarks import Benchmark from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets -from llama_stack.apis.inference import Inference, UserMessage +from llama_stack.apis.inference import Inference, SystemMessage, UserMessage from llama_stack.apis.scoring import Scoring from llama_stack.distribution.datatypes import Api from llama_stack.providers.datatypes import BenchmarksProtocolPrivate @@ -118,7 +118,7 @@ class MetaReferenceEvalImpl( for i, x in tqdm(enumerate(input_rows)): assert ColumnName.chat_completion_input.value in x, "Invalid input row" input_messages = json.loads(x[ColumnName.chat_completion_input.value]) - input_messages = [UserMessage(**x) for x in input_messages] + input_messages = [UserMessage(**x) for x in input_messages if x["role"] == "user"] # NOTE: only single-turn agent generation is supported. 
Create a new session for each input row session_create_response = await self.agents_api.create_agent_session(agent_id, f"session-{i}") @@ -168,10 +168,11 @@ class MetaReferenceEvalImpl( generations.append({ColumnName.generated_answer.value: response.completion_message.content}) elif ColumnName.chat_completion_input.value in x: chat_completion_input_json = json.loads(x[ColumnName.chat_completion_input.value]) - input_messages = [UserMessage(**x) for x in chat_completion_input_json] + input_messages = [UserMessage(**x) for x in chat_completion_input_json if x["role"] == "user"] messages = [] if candidate.system_message: messages.append(candidate.system_message) + messages += [SystemMessage(**x) for x in chat_completion_input_json if x["role"] == "system"] messages += input_messages response = await self.inference_api.chat_completion( model_id=candidate.model, diff --git a/llama_stack/providers/inline/scoring/basic/scoring.py b/llama_stack/providers/inline/scoring/basic/scoring.py index 00945b99d..599f5f98c 100644 --- a/llama_stack/providers/inline/scoring/basic/scoring.py +++ b/llama_stack/providers/inline/scoring/basic/scoring.py @@ -22,12 +22,19 @@ from llama_stack.providers.utils.common.data_schema_validator import ( ) from .config import BasicScoringConfig +from .scoring_fn.bfcl_scoring_fn import BFCLScoringFn from .scoring_fn.equality_scoring_fn import EqualityScoringFn from .scoring_fn.regex_parser_math_response_scoring_fn import RegexParserMathResponseScoringFn from .scoring_fn.regex_parser_scoring_fn import RegexParserScoringFn from .scoring_fn.subset_of_scoring_fn import SubsetOfScoringFn -FIXED_FNS = [EqualityScoringFn, SubsetOfScoringFn, RegexParserScoringFn, RegexParserMathResponseScoringFn] +FIXED_FNS = [ + EqualityScoringFn, + SubsetOfScoringFn, + RegexParserScoringFn, + RegexParserMathResponseScoringFn, + BFCLScoringFn, +] class BasicScoringImpl( diff --git a/llama_stack/providers/inline/scoring/basic/scoring_fn/bfcl_scoring_fn.py b/llama_stack/providers/inline/scoring/basic/scoring_fn/bfcl_scoring_fn.py new file mode 100644 index 000000000..f37780f3e --- /dev/null +++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/bfcl_scoring_fn.py @@ -0,0 +1,93 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import json +import re +from typing import Any, Dict, Optional + +from llama_stack.apis.scoring import ScoringResultRow +from llama_stack.apis.scoring_functions import ScoringFnParams +from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn + +from ..utils.bfcl.ast_parser import decode_ast +from ..utils.bfcl.checker import ast_checker, is_empty_output +from .fn_defs.bfcl import bfcl + + +def postprocess(x: Dict[str, Any], test_category: str) -> Dict[str, Any]: + contain_func_call = False + error = None + error_type = None + checker_result = {} + try: + prediction = decode_ast(x["generated_answer"], x["language"]) or "" + contain_func_call = True + # if not is_function_calling_format_output(prediction): + if is_empty_output(prediction): + contain_func_call = False + error = "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." 
+ error_type = "ast_decoder:decoder_wrong_output_format" + else: + checker_result = ast_checker( + json.loads(x["function"]), + prediction, + json.loads(x["ground_truth"]), + x["language"], + test_category=test_category, + model_name="", + ) + except Exception as e: + prediction = "" + error = f"Invalid syntax. Failed to decode AST. {str(e)}" + error_type = "ast_decoder:decoder_failed" + return { + "prediction": prediction, + "contain_func_call": contain_func_call, + "valid": checker_result.get("valid", False), + "error": error or checker_result.get("error", ""), + "error_type": error_type or checker_result.get("error_type", ""), + } + + +def gen_valid(x: Dict[str, Any]) -> Dict[str, float]: + return {"valid": x["valid"]} + + +def gen_relevance_acc(x: Dict[str, Any]) -> Dict[str, float]: + # This function serves for both relevance and irrelevance tests, which share the exact opposite logic. + # If `test_category` is "irrelevance", the model is expected to output no function call. + # No function call means either the AST decoding fails (a error message is generated) or the decoded AST does not contain any function call (such as a empty list, `[]`). + # If `test_category` is "relevance", the model is expected to output to a function call, and empty list doesn't count as a function call. + acc = not x["contain_func_call"] if "irrelevance" in x["id"] else x["contain_func_call"] + return {"valid": float(acc)} + + +class BFCLScoringFn(RegisteredBaseScoringFn): + """ + A scoring_fn for BFCL + """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.supported_fn_defs_registry = { + bfcl.identifier: bfcl, + } + + async def score_row( + self, + input_row: Dict[str, Any], + scoring_fn_identifier: Optional[str] = "bfcl", + scoring_params: Optional[ScoringFnParams] = None, + ) -> ScoringResultRow: + test_category = re.sub(r"_[0-9_-]+$", "", input_row["id"]) + score_result = postprocess(input_row, test_category) + if test_category in {"irrelevance", "live_relevance", "live_irrelevance"}: + score = gen_relevance_acc(score_result)["valid"] + else: + score = gen_valid(score_result)["valid"] + return { + "score": float(score), + } diff --git a/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/bfcl.py b/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/bfcl.py new file mode 100644 index 000000000..392d92c86 --- /dev/null +++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/bfcl.py @@ -0,0 +1,21 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from llama_stack.apis.common.type_system import NumberType +from llama_stack.apis.scoring_functions import ( + AggregationFunctionType, + BasicScoringFnParams, + ScoringFn, +) + +bfcl = ScoringFn( + identifier="basic::bfcl", + description="BFCL complex scoring", + return_type=NumberType(), + provider_id="basic", + provider_resource_id="bfcl", + params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.accuracy]), +) diff --git a/llama_stack/providers/inline/scoring/basic/utils/bfcl/__init__.py b/llama_stack/providers/inline/scoring/basic/utils/bfcl/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/llama_stack/providers/inline/scoring/basic/utils/bfcl/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/llama_stack/providers/inline/scoring/basic/utils/bfcl/ast_parser.py b/llama_stack/providers/inline/scoring/basic/utils/bfcl/ast_parser.py new file mode 100644 index 000000000..445cdfc77 --- /dev/null +++ b/llama_stack/providers/inline/scoring/basic/utils/bfcl/ast_parser.py @@ -0,0 +1,296 @@ +# ruff: noqa +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +import ast + +from .tree_sitter import get_parser + + +def parse_java_function_call(source_code): + if not source_code.endswith(";"): + source_code += ";" # Necessary for the parser not to register an error + parser = get_parser("java") + tree = parser.parse(bytes(source_code, "utf8")) + root_node = tree.root_node + + if root_node.has_error: + raise Exception("Error parsing java the source code.") + + def get_text(node): + """Returns the text represented by the node.""" + return source_code[node.start_byte : node.end_byte] + + def traverse_node(node, nested=False): + if node.type == "string_literal": + if nested: + return get_text(node) + # Strip surrounding quotes from string literals + return get_text(node)[1:-1] + elif node.type == "character_literal": + if nested: + return get_text(node) + # Strip surrounding single quotes from character literals + return get_text(node)[1:-1] + """Traverse the node to collect texts for complex structures.""" + if node.type in [ + "identifier", + "class_literal", + "type_identifier", + "method_invocation", + ]: + return get_text(node) + elif node.type == "array_creation_expression": + # Handle array creation expression specifically + type_node = node.child_by_field_name("type") + value_node = node.child_by_field_name("value") + type_text = traverse_node(type_node, True) + value_text = traverse_node(value_node, True) + return f"new {type_text}[]{value_text}" + elif node.type == "object_creation_expression": + # Handle object creation expression specifically + type_node = node.child_by_field_name("type") + arguments_node = node.child_by_field_name("arguments") + type_text = traverse_node(type_node, True) + if arguments_node: + # Process each argument carefully, avoiding unnecessary punctuation + argument_texts = [] + for child in arguments_node.children: + if child.type not in [ + ",", + "(", + ")", + ]: # Exclude commas and parentheses + argument_text = traverse_node(child, True) + argument_texts.append(argument_text) + arguments_text = ", ".join(argument_texts) + return f"new {type_text}({arguments_text})" + else: + return f"new {type_text}()" + elif node.type == "set": + # Handling sets specifically + items = [traverse_node(n, True) for n in node.children if n.type not in [",", "set"]] + return "{" + ", ".join(items) + "}" + + elif node.child_count > 0: + return "".join(traverse_node(child, True) for child in node.children) + else: + return get_text(node) + + def extract_arguments(args_node): + arguments = {} + for child in args_node.children: + if child.type == "assignment_expression": + # For named parameters + name_node, value_node = child.children[0], child.children[2] + name = get_text(name_node) + value = traverse_node(value_node) + if name in arguments: + if not isinstance(arguments[name], list): + arguments[name] = [arguments[name]] + arguments[name].append(value) + else: + arguments[name] = value + # 
arguments.append({'name': name, 'value': value}) + elif child.type in ["identifier", "class_literal", "set"]: + # For unnamed parameters and handling sets + value = traverse_node(child) + if None in arguments: + if not isinstance(arguments[None], list): + arguments[None] = [arguments[None]] + arguments[None].append(value) + else: + arguments[None] = value + return arguments + + def traverse(node): + if node.type == "method_invocation": + # Extract the function name and its arguments + method_name = get_text(node.child_by_field_name("name")) + class_name_node = node.child_by_field_name("object") + if class_name_node: + class_name = get_text(class_name_node) + function_name = f"{class_name}.{method_name}" + else: + function_name = method_name + arguments_node = node.child_by_field_name("arguments") + if arguments_node: + arguments = extract_arguments(arguments_node) + for key, value in arguments.items(): + if isinstance(value, list): + raise Exception("Error: Multiple arguments with the same name are not supported.") + return [{function_name: arguments}] + + else: + for child in node.children: + result = traverse(child) + if result: + return result + + result = traverse(root_node) + return result if result else {} + + +def parse_javascript_function_call(source_code): + if not source_code.endswith(";"): + source_code += ";" # Necessary for the parser not to register an error + parser = get_parser("javascript") + # Parse the source code + tree = parser.parse(bytes(source_code, "utf8")) + root_node = tree.root_node + if root_node.has_error: + raise Exception("Error js parsing the source code.") + + # Function to recursively extract argument details + def extract_arguments(node): + args = {} + for child in node.children: + if child.type == "assignment_expression": + # Extract left (name) and right (value) parts of the assignment + name = child.children[0].text.decode("utf-8") + value = child.children[2].text.decode("utf-8") + if (value.startswith('"') and value.endswith('"')) or (value.startswith("'") and value.endswith("'")): + value = value[1:-1] # Trim the quotation marks + if name in args: + if not isinstance(args[name], list): + args[name] = [args[name]] + args[name].append(value) + else: + args[name] = value + + elif child.type == "identifier" or child.type == "true": + # Handle non-named arguments and boolean values + value = child.text.decode("utf-8") + if None in args: + if not isinstance(args[None], list): + args[None] = [args[None]] + args[None].append(value) + else: + args[None] = value + return args + + # Find the function call and extract its name and arguments + if root_node.type == "program": + for child in root_node.children: + if child.type == "expression_statement": + for sub_child in child.children: + if sub_child.type == "call_expression": + function_name = sub_child.children[0].text.decode("utf8") + arguments_node = sub_child.children[1] + parameters = extract_arguments(arguments_node) + for key, value in parameters.items(): + if isinstance(value, list): + raise Exception("Error: Multiple arguments with the same name are not supported.") + result = [{function_name: parameters}] + return result + + +def ast_parse(input_str, language="Python"): + if language == "Python": + cleaned_input = input_str.strip("[]'") + parsed = ast.parse(cleaned_input, mode="eval") + extracted = [] + if isinstance(parsed.body, ast.Call): + extracted.append(resolve_ast_call(parsed.body)) + else: + for elem in parsed.body.elts: + extracted.append(resolve_ast_call(elem)) + return extracted + elif 
language == "Java": + return parse_java_function_call(input_str[1:-1]) # Remove the [ and ] from the string + elif language == "JavaScript": + return parse_javascript_function_call(input_str[1:-1]) + else: + raise NotImplementedError(f"Unsupported language: {language}") + + +def resolve_ast_call(elem): + # Handle nested attributes for deeply nested module paths + func_parts = [] + func_part = elem.func + while isinstance(func_part, ast.Attribute): + func_parts.append(func_part.attr) + func_part = func_part.value + if isinstance(func_part, ast.Name): + func_parts.append(func_part.id) + func_name = ".".join(reversed(func_parts)) + args_dict = {} + # Parse when args are simply passed as an unnamed dictionary arg + for arg in elem.args: + if isinstance(arg, ast.Dict): + for key, value in zip(arg.keys, arg.values): + if isinstance(key, ast.Constant): + arg_name = key.value + output = resolve_ast_by_type(value) + args_dict[arg_name] = output + for arg in elem.keywords: + output = resolve_ast_by_type(arg.value) + args_dict[arg.arg] = output + return {func_name: args_dict} + + +def resolve_ast_by_type(value): + if isinstance(value, ast.Constant): + if value.value is Ellipsis: + output = "..." + else: + output = value.value + elif isinstance(value, ast.UnaryOp): + output = -value.operand.value + elif isinstance(value, ast.List): + output = [resolve_ast_by_type(v) for v in value.elts] + elif isinstance(value, ast.Dict): + output = {resolve_ast_by_type(k): resolve_ast_by_type(v) for k, v in zip(value.keys, value.values)} + elif isinstance(value, ast.NameConstant): # Added this condition to handle boolean values + output = value.value + elif isinstance(value, ast.BinOp): # Added this condition to handle function calls as arguments + output = eval(ast.unparse(value)) + elif isinstance(value, ast.Name): + output = value.id + elif isinstance(value, ast.Call): + if len(value.keywords) == 0: + output = ast.unparse(value) + else: + output = resolve_ast_call(value) + elif isinstance(value, ast.Tuple): + output = tuple(resolve_ast_by_type(v) for v in value.elts) + elif isinstance(value, ast.Lambda): + output = eval(ast.unparse(value.body[0].value)) + elif isinstance(value, ast.Ellipsis): + output = "..." 
+ elif isinstance(value, ast.Subscript): + try: + output = ast.unparse(value.body[0].value) + except: + output = ast.unparse(value.value) + "[" + ast.unparse(value.slice) + "]" + else: + raise Exception(f"Unsupported AST type: {type(value)}") + return output + + +def decode_ast(result, language="Python"): + func = result + func = func.replace("\n", "") # remove new line characters + if not func.startswith("["): + func = "[" + func + if not func.endswith("]"): + func = func + "]" + decoded_output = ast_parse(func, language) + return decoded_output + + +def decode_execute(result): + func = result + func = func.replace("\n", "") # remove new line characters + if not func.startswith("["): + func = "[" + func + if not func.endswith("]"): + func = func + "]" + decode_output = ast_parse(func) + execution_list = [] + for function_call in decode_output: + for key, value in function_call.items(): + execution_list.append(f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})") + return execution_list diff --git a/llama_stack/providers/inline/scoring/basic/utils/bfcl/checker.py b/llama_stack/providers/inline/scoring/basic/utils/bfcl/checker.py new file mode 100644 index 000000000..f6aab123c --- /dev/null +++ b/llama_stack/providers/inline/scoring/basic/utils/bfcl/checker.py @@ -0,0 +1,989 @@ +# ruff: noqa +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +import json +import re +import time +from typing import Any + +# Comment out for now until we actually use the rest checker in evals +# import requests # Do not remove this import even though it seems to be unused. It's used in the executable_checker_rest function. + + +class NoAPIKeyError(Exception): + def __init__(self): + self.message = "❗️Please fill in the API keys in the function_credential_config.json file. If you do not provide the API keys, the executable test category results will be inaccurate." + super().__init__(self.message) + + +REAL_TIME_MATCH_ALLOWED_DIFFERENCE = 0.2 + + +JAVA_TYPE_CONVERSION = { + "byte": int, + "short": int, + "integer": int, + "float": float, + "double": float, + "long": int, + "boolean": bool, + "char": str, + "Array": list, + "ArrayList": list, + "Set": set, + "HashMap": dict, + "Hashtable": dict, + "Queue": list, # this can be `queue.Queue` as well, for simplicity we check with list + "Stack": list, + "String": str, + "any": str, +} + +JS_TYPE_CONVERSION = { + "String": str, + "integer": int, + "float": float, + "Bigint": int, + "Boolean": bool, + "dict": dict, + "array": list, + "any": str, +} + +# We switch to conditional import for the following two imports to avoid unnecessary installations. +# User doesn't need to setup the tree-sitter packages if they are not running the test for that language. 
+# from js_type_converter import js_type_converter +# from java_type_converter import java_type_converter + +PYTHON_TYPE_MAPPING = { + "string": str, + "integer": int, + "float": float, + "boolean": bool, + "array": list, + "tuple": list, + "dict": dict, + "any": str, +} + +# This is the list of types that we need to recursively check its values +PYTHON_NESTED_TYPE_CHECK_LIST = ["array", "tuple"] + + +NESTED_CONVERSION_TYPE_LIST = ["Array", "ArrayList", "array"] + + +#### Helper functions for AST #### +def find_description(func_descriptions, name): + if type(func_descriptions) == list: + for func_description in func_descriptions: + if func_description["name"] == name: + return func_description + return None + else: + # it is a dict, there is only one function + return func_descriptions + + +def get_possible_answer_type(possible_answer: list): + for answer in possible_answer: + if answer != "": # Optional parameter + return type(answer) + return None + + +def type_checker( + param: str, + value, + possible_answer: list, + expected_type_description: str, + expected_type_converted, + nested_type_converted, +): + # NOTE: This type checker only supports nested type checking for one level deep. + # We didn't implement recursive type checking for nested types, as it's not needed for the current use case and it's very complex. + + result: Any = { + "valid": True, + "error": [], + "is_variable": False, + "error_type": "type_error:simple", + } + + is_variable = False + # check for the case where a variable is used instead of a actual value. + # use the type in possible_answer as the expected type + possible_answer_type = get_possible_answer_type(possible_answer) + # if possible_answer only contains optional parameters, we can't determine the type + if possible_answer_type != None: + # we are being precise here. + # in fact, possible_answer_type should always be string, as that's how we treat varibale in possible_answer + if possible_answer_type != expected_type_converted: + is_variable = True + + # value is the same type as in function description + if type(value) == expected_type_converted: + # We don't need to do recursive check for simple types + if nested_type_converted == None: + result["is_variable"] = is_variable + return result + else: + for possible_answer_item in possible_answer: + flag = True # Each parameter should match to at least one possible answer type. + # Here, we assume that each item should be the same type. We could also relax it. + if type(possible_answer_item) == list: + for value_item in value: + checker_result = type_checker( + param, + value_item, + possible_answer_item, + str(nested_type_converted), + nested_type_converted, + None, + ) + if not checker_result["valid"]: + flag = False + break + + if flag: + return {"valid": True, "error": [], "is_variable": is_variable} + + result["valid"] = False + result["error"] = [ + f"Nested type checking failed for parameter {repr(param)}. Expected outer type {expected_type_description} with inner type {str(nested_type_converted)}. Parameter value: {repr(value)}." + ] + result["error_type"] = "type_error:nested" + + # value is not as expected, check for the case where a variable is used instead of a actual value + # use the type in possible_answer as the expected type + possible_answer_type = get_possible_answer_type(possible_answer) + # if possible_answer only contains optional parameters, we can't determine the type + if possible_answer_type != None: + # we are being precise here. 
+ # in fact, possible_answer_type should always be string, as that's how we treat varibale in possible_answer + if type(value) == possible_answer_type: + result["is_variable"] = True + return result + + result["valid"] = False + result["error"].append( + f"Incorrect type for parameter {repr(param)}. Expected type {expected_type_description}, got {type(value).__name__}. Parameter value: {repr(value)}." + ) + result["error_type"] = "type_error:simple" + return result + + +def standardize_string(input_string: str): + # This function standardizes the string by removing all the spaces, ",./-_*^" punctuation, and converting it to lowercase + # It will also convert all the single quotes to double quotes + # This is used to compare the model output with the possible answers + # We don't want to punish model for answer like April 1, 2024 vs April 1,2024, vs April 1 2024 + regex_string = r"[ \,\.\/\-\_\*\^]" + return re.sub(regex_string, "", input_string).lower().replace("'", '"') + + +def string_checker(param: str, model_output: str, possible_answer: list): + standardize_possible_answer = [] + standardize_model_output = standardize_string(model_output) + for i in range(len(possible_answer)): + if type(possible_answer[i]) == str: + standardize_possible_answer.append(standardize_string(possible_answer[i])) + + if standardize_model_output not in standardize_possible_answer: + return { + "valid": False, + "error": [ + f"Invalid value for parameter {repr(param)}: {repr(model_output)}. Expected one of {possible_answer}. Case insensitive." + ], + "error_type": "value_error:string", + } + + return {"valid": True, "error": []} + + +def list_checker(param: str, model_output: list, possible_answer: list): + # Convert the tuple to a list + + standardize_model_output = list(model_output) + + # If the element in the list is a string, we need to standardize it + for i in range(len(standardize_model_output)): + if type(standardize_model_output[i]) == str: + standardize_model_output[i] = standardize_string(model_output[i]) + + standardize_possible_answer: Any = [] + # We also need to standardize the possible answers + for i in range(len(possible_answer)): + standardize_possible_answer.append([]) + for j in range(len(possible_answer[i])): + if type(possible_answer[i][j]) == str: + standardize_possible_answer[i].append(standardize_string(possible_answer[i][j])) + else: + standardize_possible_answer[i].append(possible_answer[i][j]) + + if standardize_model_output not in standardize_possible_answer: + return { + "valid": False, + "error": [ + f"Invalid value for parameter {repr(param)}: {repr(model_output)}. Expected one of {possible_answer}." + ], + "error_type": "value_error:list/tuple", + } + + return {"valid": True, "error": []} + + +def dict_checker(param: str, model_output: dict, possible_answers: list): + # This function works for simple dictionaries, but not dictionaries with nested dictionaries. + # The current dataset only contains simple dictionaries, so this is sufficient. 
+ + result = {"valid": False, "error": [], "error_type": "dict_checker:unclear"} + for i in range(len(possible_answers)): + if possible_answers[i] == "": + continue + + result = {"valid": False, "error": [], "error_type": "dict_checker:unclear"} + + flag = True + + possible_answer = possible_answers[i] + # possible_anwer is a single dictionary + + for key, value in model_output.items(): + if key not in possible_answer: + result["valid"] = False + result["error"].append(f"Unexpected dict key parameter: '{key}'.") # type: ignore[attr-defined] + result["error_type"] = "value_error:dict_key" + flag = False + break + + standardize_value = value + # If the value is a string, we need to standardize it + if type(value) == str: + standardize_value = standardize_string(value) + + # We also need to standardize the possible answers if they are string + standardize_possible_answer = [] + for i in range(len(possible_answer[key])): + if type(possible_answer[key][i]) == str: + standardize_possible_answer.append(standardize_string(possible_answer[key][i])) + else: + standardize_possible_answer.append(possible_answer[key][i]) + + if standardize_value not in standardize_possible_answer: + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Invalid value for parameter {repr(key)}: {repr(value)}. Expected one of {standardize_possible_answer}." + ) + result["error_type"] = "value_error:dict_value" + flag = False + break + + for key, value in possible_answer.items(): + if key not in model_output and "" not in value: + result["valid"] = False + result["error"].append(f"Missing dict key parameter: '{key}'.") # type: ignore[attr-defined] + result["error_type"] = "value_error:dict_key" + flag = False + break + + if flag: + return {"valid": True, "error": []} + + return result + + +def list_dict_checker(param: str, model_output: list, possible_answers: list): + # This function takes in a list of dictionaries and checks if each dictionary is valid + # The order of the dictionaries in the list must match the order of the possible answers + + result = {"valid": False, "error": [], "error_type": "list_dict_checker:unclear"} + + for answer_index in range(len(possible_answers)): + flag = True # True means so far, all dictionaries are valid + + # Only proceed if the number of dictionaries in the list matches the number of dictionaries in the possible answers + if len(model_output) != len(possible_answers[answer_index]): + result["valid"] = False + result["error"] = ["Wrong number of dictionaries in the list."] + result["error_type"] = "value_error:list_dict_count" + flag = False + continue + + for dict_index in range(len(model_output)): + result = dict_checker( + param, + model_output[dict_index], + [possible_answers[answer_index][dict_index]], + ) + if not result["valid"]: + flag = False + break + if flag: + return {"valid": True, "error": []} + + return result + + +def simple_function_checker( + func_description: dict, + model_output: dict, + possible_answer: dict, + language: str, + model_name: str, +): + possible_answer = list(possible_answer.values())[0] + # Extract function name and parameters details + func_name = func_description["name"] + param_details = func_description["parameters"]["properties"] + required_params = func_description["parameters"]["required"] + + # Initialize a result dictionary + result = { + "valid": True, + "error": [], + "error_type": "simple_function_checker:unclear", + } + + # Check if function name matches + if func_name not in model_output: + result["valid"] 
= False + result["error"].append( # type: ignore[attr-defined] + f"Function name {repr(func_name)} not found in model output." + ) + result["error_type"] = "simple_function_checker:wrong_func_name" + return result + + model_params = model_output[func_name] + + # Check for required parameters in model output + for param in required_params: + if param not in model_params: + result["valid"] = False + result["error"].append(f"Missing required parameter: {repr(param)}.") # type: ignore[attr-defined] + result["error_type"] = "simple_function_checker:missing_required" + return result + + # Validate types and values for each parameter in model output + for param, value in model_params.items(): + if param not in param_details or param not in possible_answer: + result["valid"] = False + result["error"].append(f"Unexpected parameter: {repr(param)}.") # type: ignore[attr-defined] + result["error_type"] = "simple_function_checker:unexpected_param" + return result + + full_param_details = param_details[param] + expected_type_description = full_param_details["type"] # This is a string + is_variable = False + nested_type_converted = None + + if language == "Java": + from evals.utils.bfcl.java_type_converter import java_type_converter + + expected_type_converted = JAVA_TYPE_CONVERSION[expected_type_description] + + if expected_type_description in JAVA_TYPE_CONVERSION: + if type(value) != str: + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}." + ) + result["error_type"] = "type_error:java" + return result + + if expected_type_description in NESTED_CONVERSION_TYPE_LIST: + nested_type = param_details[param]["items"]["type"] + nested_type_converted = JAVA_TYPE_CONVERSION[nested_type] + value = java_type_converter(value, expected_type_description, nested_type) + else: + value = java_type_converter(value, expected_type_description) + + elif language == "JavaScript": + from evals.utils.bfcl.js_type_converter import js_type_converter + + expected_type_converted = JS_TYPE_CONVERSION[expected_type_description] + + if expected_type_description in JS_TYPE_CONVERSION: + if type(value) != str: + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}." + ) + result["error_type"] = "type_error:js" + return result + + if expected_type_description in NESTED_CONVERSION_TYPE_LIST: + nested_type = param_details[param]["items"]["type"] + nested_type_converted = JS_TYPE_CONVERSION[nested_type] + value = js_type_converter(value, expected_type_description, nested_type) + else: + value = js_type_converter(value, expected_type_description) + + elif language == "Python": + expected_type_converted = PYTHON_TYPE_MAPPING[expected_type_description] + if expected_type_description in PYTHON_NESTED_TYPE_CHECK_LIST: + nested_type = param_details[param]["items"]["type"] + nested_type_converted = PYTHON_TYPE_MAPPING[nested_type] + + # We convert all tuple value to list when the expected type is tuple. + # The conversion is necessary because any tuple in the possible answer would become a list after being processed through json.dump() and json.load(). + # This does introduce some false positive (eg, when the model provides a list value instead of tuple). We hope to find a better solution in the future. 
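# A minimal sketch of the JSON round-trip described above (values are hypothetical,
# not taken from the benchmark data): a tuple in the possible answer always comes back
# as a list after json.dumps()/json.loads(), which is why tuple values from the model
# are normalized to lists before comparison.
#
#     import json
#     json.loads(json.dumps({"point": (1, 2)}))   # -> {"point": [1, 2]}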
+ if expected_type_description == "tuple" and type(value) == tuple: + value = list(value) + + # Allow python auto conversion from int to float + if language == "Python" and expected_type_description == "float" and type(value) == int: + value = float(value) + + # Type checking + # In fact, we only check for Python here. + # Type check for other languages are handled by the type converter, and so their value (after conversion) is always correct. + type_check_result = type_checker( + param, + value, + possible_answer[param], + expected_type_description, + expected_type_converted, + nested_type_converted, + ) + is_variable = type_check_result["is_variable"] + if not type_check_result["valid"]: + return type_check_result + + # It doesn't make sense to special handle dictionaries and list of dictionaries if the value is a variable. + # We can just treat the variable as a string and use the normal flow. + if not is_variable: + # Special handle for dictionaries + if expected_type_converted == dict: + result = dict_checker(param, value, possible_answer[param]) + if not result["valid"]: + return result + continue + + # Special handle for list of dictionaries + elif expected_type_converted == list and nested_type_converted == dict: + result = list_dict_checker(param, value, possible_answer[param]) + if not result["valid"]: + return result + continue + + # Special handle for strings + elif expected_type_converted == str: + # We don't check for case sensitivity for string, as long as it's not a variable + result = string_checker(param, value, possible_answer[param]) + if not result["valid"]: + return result + continue + + elif expected_type_converted == list: + result = list_checker(param, value, possible_answer[param]) + if not result["valid"]: + return result + continue + + # Check if the value is within the possible answers + if value not in possible_answer[param]: + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Invalid value for parameter {repr(param)}: {repr(value)}. Expected one of {possible_answer[param]}." + ) + result["error_type"] = "value_error:others" + return result + + # Check for optional parameters not provided but allowed + for param in possible_answer: + if param not in model_params and "" not in possible_answer[param]: + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Optional parameter {repr(param)} not provided and not marked as optional." 
+ ) + result["error_type"] = "simple_function_checker:missing_optional" + return result + + return result + + +def parallel_function_checker_enforce_order( + func_descriptions: list, + model_output: list, + possible_answers: dict, + language: str, + model_name: str, +): + if len(model_output) != len(possible_answers): + return { + "valid": False, + "error": ["Wrong number of functions."], + "error_type": "parallel_function_checker_enforce_order:wrong_count", + } + + func_name_list = list(possible_answers.keys()) + possible_answers_list = [] + + for key, value in possible_answers.items(): + possible_answers_list.append({key: value}) + + for i in range(len(possible_answers_list)): + func_description = find_description(func_descriptions, func_name_list[i]) + + result = simple_function_checker( + func_description, + model_output[i], + possible_answers_list[i], + language, + model_name, + ) + if not result["valid"]: + return result + + return {"valid": True, "error": []} + + +def parallel_function_checker_no_order( + func_descriptions: list, + model_output: list, + possible_answers: list, + language: str, + model_name: str, +): + if len(model_output) != len(possible_answers): + return { + "valid": False, + "error": ["Wrong number of functions."], + "error_type": "parallel_function_checker_no_order:wrong_count", + } + + matched_indices = [] + + # We go throught the possible answers one by one, and eliminate the model output that matches the possible answer + # It must be this way because we need ground truth to fetch the correct function description + for i in range(len(possible_answers)): + # possible_answers[i] is a dictionary with only one key + func_name_expected = list(possible_answers[i].keys())[0] + func_description = find_description(func_descriptions, func_name_expected) + + all_errors = [] + + for index in range(len(model_output)): + if index in matched_indices: + continue + + result = simple_function_checker( + func_description, + model_output[index], + possible_answers[i], + language, + model_name, + ) + + if result["valid"]: + matched_indices.append(index) + break + else: + all_errors.append( + { + f"Model Result Index {index}": { + "sub_error": result["error"], + "sub_error_type": result["error_type"], + "model_output_item": model_output[index], + "possible_answer_item": possible_answers[i], + } + } + ) + + if not result["valid"]: + considered_indices = [i for i in range(len(model_output)) if i not in matched_indices] + all_errors.insert( + 0, + f"Could not find a matching function among index {considered_indices} of model output for index {i} of possible answers.", # type: ignore[arg-type] + ) + return { + "valid": False, + "error": all_errors, + "error_type": "parallel_function_checker_no_order:cannot_find_match", + } + + return {"valid": True, "error": []} + + +def multiple_function_checker( + func_descriptions: list, + model_output: list, + possible_answers: list, + language: str, + model_name: str, +): + if len(model_output) != len(possible_answers): + return { + "valid": False, + "error": ["Wrong number of functions."], + "error_type": "multiple_function_checker:wrong_count", + } + + # possible_answers is a list of only one dictionary with only one key + func_name_expected = list(possible_answers[0].keys())[0] + func_description = find_description(func_descriptions, func_name_expected) + return simple_function_checker( + func_description, + model_output[0], + possible_answers[0], + language, + model_name, + ) + + +def patten_matcher(exec_output, expected_result, 
function_call, is_sanity_check): + result = {"valid": True, "error": [], "error_type": "executable_checker:unclear"} + + if type(exec_output) != type(expected_result): + return { + "valid": False, + "error": [ + f"Wrong execution result type for {repr(function_call)}. Expected type: {type(expected_result)}, but got: {type(exec_output)}." + ], + "error_type": "executable_checker:wrong_result_type", + "model_executed_output": exec_output, + } + if type(exec_output) == dict: + # We loose the requirement for the sanity check as the expected result used in the sanity check might not be the most up-to-date one. + # This happens when the key is a timestamp or a random number. + if is_sanity_check: + if len(exec_output) != len(expected_result): + return { + "valid": False, + "error": [ + f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but wrong number of elements in the output. Expected length: {len(expected_result)}, but got: {len(exec_output)}." + ], + "error_type": "executable_checker:wrong_result_type:dict_length", + "model_executed_output": exec_output, + } + else: + return result + + for key, value in expected_result.items(): + if key not in exec_output: + return { + "valid": False, + "error": [ + f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not found in the model output." + ], + "error_type": "executable_checker:wrong_result_type:dict_key_not_found", + "model_executed_output": exec_output, + } + for key, value in exec_output.items(): + if key not in expected_result: + return { + "valid": False, + "error": [ + f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not expected in the model output." + ], + "error_type": "executable_checker:wrong_result_type:dict_extra_key", + "model_executed_output": exec_output, + } + if type(exec_output) == list: + if len(exec_output) != len(expected_result): + return { + "valid": False, + "error": [ + f"Wrong execution result pattern for {repr(function_call)}. Expect type list, but wrong number of elements in the output. Expected length: {len(expected_result)}, but got: {len(exec_output)}." + ], + "error_type": "executable_checker:wrong_result_type:list_length", + "model_executed_output": exec_output, + } + return result + + +#### Helper functions for Exec #### +def executable_checker_simple( + function_call: str, + expected_result, + expected_result_type: str, + is_sanity_check=False, +): + result = {"valid": True, "error": [], "error_type": "executable_checker:unclear"} + + exec_dict: Any = {} + + try: + exec( + "from executable_python_function import *" + "\nresult=" + function_call, + exec_dict, + ) + exec_output = exec_dict["result"] + except NoAPIKeyError as e: + raise e + except Exception as e: + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Error in execution: {repr(function_call)}. Error: {str(e)}" + ) + result["error_type"] = "executable_checker:execution_error" + return result + + # We need to special handle the case where the execution result is a tuple and convert it to a list + # Because when json is stored, the tuple is converted to a list, and so the expected result is a list when loaded from json + if isinstance(exec_output, tuple): + exec_output = list(exec_output) + + if expected_result_type == "exact_match": + if exec_output != expected_result: + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Wrong execution result for {repr(function_call)}. 
Expected: {expected_result}, but got: {exec_output}." + ) + result["error_type"] = "executable_checker:wrong_result" + result["model_executed_output"] = exec_output + return result + + elif expected_result_type == "real_time_match": + # Allow for 5% difference + if (type(expected_result) == float or type(expected_result) == int) and ( + type(exec_output) == float or type(exec_output) == int + ): + if not ( + expected_result * (1 - REAL_TIME_MATCH_ALLOWED_DIFFERENCE) + <= exec_output + <= expected_result * (1 + REAL_TIME_MATCH_ALLOWED_DIFFERENCE) + ): + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}. {REAL_TIME_MATCH_ALLOWED_DIFFERENCE * 100}% difference allowed." + ) + result["error_type"] = "executable_checker:wrong_result_real_time" + result["model_executed_output"] = exec_output + return result + else: + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}. Type needs to be float or int for real time match criteria." + ) + result["error_type"] = "executable_checker:wrong_result_real_time" + result["model_executed_output"] = exec_output + return result + + else: + # structural match + pattern_match_result = patten_matcher(exec_output, expected_result, function_call, is_sanity_check) + if not pattern_match_result["valid"]: + return pattern_match_result + + return result + + +def executable_checker_parallel_no_order( + decoded_result: list, expected_exec_result: list, expected_exec_result_type: list +): + if len(decoded_result) != len(expected_exec_result): + return { + "valid": False, + "error": [ + f"Wrong number of functions provided. Expected {len(expected_exec_result)}, but got {len(decoded_result)}." + ], + "error_type": "value_error:exec_result_count", + } + + matched_indices = [] + for i in range(len(expected_exec_result)): + all_errors = [] + for index in range(len(decoded_result)): + if index in matched_indices: + continue + + result = executable_checker_simple( + decoded_result[index], + expected_exec_result[i], + expected_exec_result_type[i], + False, + ) + + if result["valid"]: + matched_indices.append(index) + break + else: + all_errors.append( + { + f"Model Result Index {index}": { + "sub_error": result["error"], + "sub_error_type": result["error_type"], + "model_executed_output": ( + result["model_executed_output"] if "model_executed_output" in result else None + ), + } + } + ) + + if not result["valid"]: + considered_indices = [i for i in range(len(decoded_result)) if i not in matched_indices] + all_errors.insert( + 0, + f"Could not find a matching function among index {considered_indices} of model output for index {i} of possible answers.", # type: ignore[arg-type] + ) + return { + "valid": False, + "error": all_errors, + "error_type": "executable_checker:cannot_find_match", + } + + return {"valid": True, "error": [], "error_type": "executable_checker:unclear"} + + +#### Main function #### +def executable_checker_rest(func_call, idx): + # Move this here for now to avoid needing to read this file / fix paths to be relative to dataset_dir. Fix when it's actually needed / used. 
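# A minimal sketch of the structural matcher defined above (inputs are hypothetical):
# patten_matcher compares only types, dictionary keys and list lengths, never the
# concrete values, so an execution result with the right shape passes.
#
#     patten_matcher({"temp": 21.5, "unit": "C"}, {"temp": 0, "unit": ""},
#                    "get_weather('SF')", is_sanity_check=False)
#     # -> {"valid": True, "error": [], "error_type": "executable_checker:unclear"}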
+ EVAL_GROUND_TRUTH_PATH = "/mnt/wsfuse/fair_llm_v2/datasets/eval/bfcl/rest-eval-response_v5.jsonl" # Ground truth file for v5 for rest execution + with open(EVAL_GROUND_TRUTH_PATH, "r") as f: + EVAL_GROUND_TRUTH = f.readlines() + if "https://geocode.maps.co" in func_call: + time.sleep(2) + if "requests_get" in func_call: + func_call = func_call.replace("requests_get", "requests.get") + try: + response = eval(func_call) + except Exception as e: + return { + "valid": False, + "error": [f"Execution failed. {str(e)}"], + "error_type": "executable_checker_rest:execution_error", + } + + try: + if response.status_code == 200: + eval_GT_json = json.loads(EVAL_GROUND_TRUTH[idx]) + try: + if isinstance(eval_GT_json, dict): + if isinstance(response.json(), dict): + if set(eval_GT_json.keys()) == set(response.json().keys()): + return {"valid": True, "error": [], "error_type": ""} + return { + "valid": False, + "error": ["Key inconsistency"], + "error_type": "executable_checker_rest:wrong_key", + } + return { + "valid": False, + "error": [f"Expected dictionary, but got {type(response.json())}"], + "error_type": "executable_checker_rest:wrong_type", + } + + elif isinstance(eval_GT_json, list): + if isinstance(response.json(), list): + if len(eval_GT_json) != len(response.json()): + return { + "valid": False, + "error": [f"Response list length inconsistency."], + "error_type": "value_error:exec_result_rest_count", + } + + else: + for i in range(len(eval_GT_json)): + if set(eval_GT_json[i].keys()) != set(response.json()[i].keys()): + return { + "valid": False, + "error": [f"Key inconsistency"], + "error_type": "executable_checker_rest:wrong_key", + } + + return {"valid": True, "error": []} + else: + return { + "valid": False, + "error": [f"Expected list, but got {type(response.json())}"], + "error_type": "executable_checker_rest:wrong_type", + } + return { + "valid": False, + "error": [f"Expected dict or list, but got {type(response.json())}"], + "error_type": "executable_checker_rest:wrong_type", + } + except Exception as e: + return { + "valid": False, + "error": [ + f"Error in execution and type checking. Status code: {response.status_code}. Error: {str(e)}" + ], + "error_type": "executable_checker_rest:response_format_error", + } + else: + return { + "valid": False, + "error": [f"Execution result status code is not 200, got {response.status_code}"], + "error_type": "executable_checker_rest:wrong_status_code", + } + except Exception as e: + return { + "valid": False, + "error": [f"Cannot get status code of the response. 
Error: {str(e)}"], + "error_type": "executable_checker_rest:cannot_get_status_code", + } + + +def ast_checker(func_description, model_output, possible_answer, language, test_category, model_name): + if "parallel" in test_category: + return parallel_function_checker_no_order(func_description, model_output, possible_answer, language, model_name) + + elif "multiple" in test_category: + return multiple_function_checker(func_description, model_output, possible_answer, language, model_name) + + else: + if len(model_output) != 1: + return { + "valid": False, + "error": ["Wrong number of functions."], + "error_type": "simple_function_checker:wrong_count", + } + + return simple_function_checker( + func_description[0], + model_output[0], + possible_answer[0], + language, + model_name, + ) + + +def exec_checker(decoded_result: list, func_description: dict, test_category: str): + if "multiple" in test_category or "parallel" in test_category: + return executable_checker_parallel_no_order( + decoded_result, + func_description["execution_result"], + func_description["execution_result_type"], + ) + + else: + if len(decoded_result) != 1: + return { + "valid": False, + "error": ["Wrong number of functions."], + "error_type": "simple_exec_checker:wrong_count", + } + return executable_checker_simple( + decoded_result[0], + func_description["execution_result"][0], + func_description["execution_result_type"][0], + False, + ) + + +def is_empty_output(decoded_output): + # This function is a patch to the ast decoder for relevance detection + # Sometimes the ast decoder will parse successfully, but the input doens't really have a function call + # [], [{}], and anything that is not in function calling format is considered empty (and thus should be marked as correct) + if not is_function_calling_format_output(decoded_output): + return True + if len(decoded_output) == 0: + return True + if len(decoded_output) == 1 and len(decoded_output[0]) == 0: + return True + + +def is_function_calling_format_output(decoded_output): + # Ensure the output is a list of dictionaries + if type(decoded_output) == list: + for item in decoded_output: + if type(item) != dict: + return False + return True + return False diff --git a/llama_stack/providers/inline/scoring/basic/utils/bfcl/tree_sitter.py b/llama_stack/providers/inline/scoring/basic/utils/bfcl/tree_sitter.py new file mode 100644 index 000000000..ed97ee360 --- /dev/null +++ b/llama_stack/providers/inline/scoring/basic/utils/bfcl/tree_sitter.py @@ -0,0 +1,40 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +""" +Tree-sitter changes its API with unfortunate frequency. Modules that need it should +import it from here so that we can centrally manage things as necessary. +""" + +# These currently work with tree-sitter 0.23.0 +# NOTE: Don't import tree-sitter or any of the language modules in the main module +# because not all environments have them. Import lazily inside functions where needed. + +import importlib +import typing + +if typing.TYPE_CHECKING: + import tree_sitter + + +def get_language(language: str) -> "tree_sitter.Language": + import tree_sitter + + language_module_name = f"tree_sitter_{language}" + try: + language_module = importlib.import_module(language_module_name) + except ModuleNotFoundError as exc: + raise ValueError( + f"Language {language} is not found. 
Please install the tree-sitter-{language} package." + ) from exc + return tree_sitter.Language(language_module.language()) + + +def get_parser(language: str, **kwargs) -> "tree_sitter.Parser": + import tree_sitter + + lang = get_language(language) + return tree_sitter.Parser(lang, **kwargs) diff --git a/llama_stack/providers/registry/eval.py b/llama_stack/providers/registry/eval.py index 6901c3741..755d30382 100644 --- a/llama_stack/providers/registry/eval.py +++ b/llama_stack/providers/registry/eval.py @@ -14,7 +14,7 @@ def available_providers() -> List[ProviderSpec]: InlineProviderSpec( api=Api.eval, provider_type="inline::meta-reference", - pip_packages=[], + pip_packages=["tree_sitter"], module="llama_stack.providers.inline.eval.meta_reference", config_class="llama_stack.providers.inline.eval.meta_reference.MetaReferenceEvalConfig", api_dependencies=[ diff --git a/llama_stack/providers/utils/common/data_schema_validator.py b/llama_stack/providers/utils/common/data_schema_validator.py index 3d14c4148..eb9d9dd60 100644 --- a/llama_stack/providers/utils/common/data_schema_validator.py +++ b/llama_stack/providers/utils/common/data_schema_validator.py @@ -23,6 +23,10 @@ class ColumnName(Enum): generated_answer = "generated_answer" context = "context" dialog = "dialog" + function = "function" + language = "language" + id = "id" + ground_truth = "ground_truth" VALID_SCHEMAS_FOR_SCORING = [ @@ -37,6 +41,15 @@ VALID_SCHEMAS_FOR_SCORING = [ ColumnName.generated_answer.value: StringType(), ColumnName.context.value: StringType(), }, + { + ColumnName.input_query.value: StringType(), + ColumnName.expected_answer.value: StringType(), + ColumnName.generated_answer.value: StringType(), + ColumnName.function.value: StringType(), + ColumnName.language.value: StringType(), + ColumnName.id.value: StringType(), + ColumnName.ground_truth.value: StringType(), + }, ] VALID_SCHEMAS_FOR_EVAL = [ @@ -50,6 +63,15 @@ VALID_SCHEMAS_FOR_EVAL = [ ColumnName.expected_answer.value: StringType(), ColumnName.completion_input.value: CompletionInputType(), }, + { + ColumnName.input_query.value: StringType(), + ColumnName.expected_answer.value: StringType(), + ColumnName.generated_answer.value: StringType(), + ColumnName.function.value: StringType(), + ColumnName.language.value: StringType(), + ColumnName.id.value: StringType(), + ColumnName.ground_truth.value: StringType(), + }, ] diff --git a/llama_stack/templates/open-benchmark/open_benchmark.py b/llama_stack/templates/open-benchmark/open_benchmark.py index 2b40797f9..17f5b8ee7 100644 --- a/llama_stack/templates/open-benchmark/open_benchmark.py +++ b/llama_stack/templates/open-benchmark/open_benchmark.py @@ -226,6 +226,22 @@ def get_distribution_template() -> DistributionTemplate: "chat_completion_input": {"type": "string"}, }, ), + DatasetInput( + dataset_id="bfcl", + provider_id="huggingface", + url=URL(uri="https://huggingface.co/datasets/llamastack/bfcl_v3"), + metadata={ + "path": "llamastack/bfcl_v3", + "split": "train", + }, + dataset_schema={ + "function": {"type": "string"}, + "language": {"type": "string"}, + "ground_truth": {"type": "string"}, + "id": {"type": "string"}, + "chat_completion_input": {"type": "string"}, + }, + ), ] default_benchmarks = [ @@ -249,6 +265,11 @@ def get_distribution_template() -> DistributionTemplate: dataset_id="math_500", scoring_functions=["basic::regex_parser_math_response"], ), + BenchmarkInput( + benchmark_id="meta-reference-bfcl", + dataset_id="bfcl", + scoring_functions=["basic::bfcl"], + ), ] return DistributionTemplate( 
name=name, diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml index 5ef25435b..6961f8022 100644 --- a/llama_stack/templates/open-benchmark/run.yaml +++ b/llama_stack/templates/open-benchmark/run.yaml @@ -216,6 +216,24 @@ datasets: split: test dataset_id: math_500 provider_id: huggingface +- dataset_schema: + function: + type: string + language: + type: string + ground_truth: + type: string + id: + type: string + chat_completion_input: + type: string + url: + uri: https://huggingface.co/datasets/llamastack/bfcl_v3 + metadata: + path: llamastack/bfcl_v3 + split: train + dataset_id: bfcl + provider_id: huggingface scoring_fns: [] benchmarks: - dataset_id: simpleqa @@ -238,6 +256,11 @@ benchmarks: - basic::regex_parser_math_response metadata: {} benchmark_id: meta-reference-math-500 +- dataset_id: bfcl + scoring_functions: + - basic::bfcl + metadata: {} + benchmark_id: meta-reference-bfcl tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/requirements.txt b/requirements.txt index ae8a0af9f..3c382ad84 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,7 @@ distro==1.9.0 exceptiongroup==1.2.2 ; python_full_version < '3.11' filelock==3.17.0 fire==0.7.0 -fsspec==2025.2.0 +fsspec==2024.12.0 h11==0.14.0 httpcore==1.0.7 httpx==0.28.1 diff --git a/uv.lock b/uv.lock index 9ec3680f8..207f5981f 100644 --- a/uv.lock +++ b/uv.lock @@ -769,11 +769,11 @@ wheels = [ [[package]] name = "fsspec" -version = "2025.2.0" +version = "2024.12.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b5/79/68612ed99700e6413de42895aa725463e821a6b3be75c87fcce1b4af4c70/fsspec-2025.2.0.tar.gz", hash = "sha256:1c24b16eaa0a1798afa0337aa0db9b256718ab2a89c425371f5628d22c3b6afd", size = 292283 } +sdist = { url = "https://files.pythonhosted.org/packages/ee/11/de70dee31455c546fbc88301971ec03c328f3d1138cfba14263f651e9551/fsspec-2024.12.0.tar.gz", hash = "sha256:670700c977ed2fb51e0d9f9253177ed20cbde4a3e5c0283cc5385b5870c8533f", size = 291600 } wheels = [ - { url = "https://files.pythonhosted.org/packages/e2/94/758680531a00d06e471ef649e4ec2ed6bf185356a7f9fbfbb7368a40bd49/fsspec-2025.2.0-py3-none-any.whl", hash = "sha256:9de2ad9ce1f85e1931858535bc882543171d197001a0a5eb2ddc04f1781ab95b", size = 184484 }, + { url = "https://files.pythonhosted.org/packages/de/86/5486b0188d08aa643e127774a99bac51ffa6cf343e3deb0583956dca5b22/fsspec-2024.12.0-py3-none-any.whl", hash = "sha256:b520aed47ad9804237ff878b504267a3b0b441e97508bd6d2d8774e3db85cee2", size = 183862 }, ] [package.optional-dependencies] From c5857a9b50253ac2d2dbe0d270a3ea75fc0bf6f6 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Fri, 14 Mar 2025 14:45:37 -0700 Subject: [PATCH 059/557] fix: sleep between tests oof --- tests/integration/conftest.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index bf1092c4a..22290b519 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -8,6 +8,7 @@ import itertools import os import platform import textwrap +import time from dotenv import load_dotenv @@ -18,6 +19,12 @@ from .report import Report logger = get_logger(__name__, category="tests") +def pytest_runtest_teardown(item): + interval_seconds = os.getenv("LLAMA_STACK_TEST_INTERVAL_SECONDS") + if interval_seconds: + time.sleep(float(interval_seconds)) + + def pytest_configure(config): config.option.tbstyle = "short" 
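# Example of driving the teardown hook added above (the value shown is hypothetical):
# set the environment variable before running the suite to pause between tests, e.g.
# when a remote provider rate-limits requests.
#
#     LLAMA_STACK_TEST_INTERVAL_SECONDS=0.5 pytest tests/integration
#
# Any value accepted by float() works; leaving the variable unset skips the sleep.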
config.option.disable_warnings = True From 93cfade8c9929a9cb5c7082d1824c96550af20e3 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Fri, 14 Mar 2025 15:21:26 -0700 Subject: [PATCH 060/557] ci: Bump version to 0.1.7 --- pyproject.toml | 4 ++-- requirements.txt | 2 +- uv.lock | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ff7f46f77..4a5befbd0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "llama_stack" -version = "0.1.6" +version = "0.1.7" authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }] description = "Llama Stack" readme = "README.md" @@ -27,7 +27,7 @@ dependencies = [ "huggingface-hub", "jinja2>=3.1.6", "jsonschema", - "llama-stack-client>=0.1.6", + "llama-stack-client>=0.1.7", "prompt-toolkit", "python-dotenv", "pydantic>=2", diff --git a/requirements.txt b/requirements.txt index 3c382ad84..cb75e2c43 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,7 +21,7 @@ idna==3.10 jinja2==3.1.6 jsonschema==4.23.0 jsonschema-specifications==2024.10.1 -llama-stack-client==0.1.6 +llama-stack-client==0.1.7 lxml==5.3.1 markdown-it-py==3.0.0 markupsafe==3.0.2 diff --git a/uv.lock b/uv.lock index 207f5981f..860b29241 100644 --- a/uv.lock +++ b/uv.lock @@ -1176,7 +1176,7 @@ wheels = [ [[package]] name = "llama-stack" -version = "0.1.6" +version = "0.1.7" source = { editable = "." } dependencies = [ { name = "blobfile" }, @@ -1270,7 +1270,7 @@ requires-dist = [ { name = "jinja2", specifier = ">=3.1.6" }, { name = "jinja2", marker = "extra == 'codegen'", specifier = ">=3.1.6" }, { name = "jsonschema" }, - { name = "llama-stack-client", specifier = ">=0.1.6" }, + { name = "llama-stack-client", specifier = ">=0.1.7" }, { name = "mcp", marker = "extra == 'test'" }, { name = "myst-parser", marker = "extra == 'docs'" }, { name = "nbval", marker = "extra == 'dev'" }, @@ -1318,7 +1318,7 @@ provides-extras = ["dev", "unit", "test", "docs", "codegen"] [[package]] name = "llama-stack-client" -version = "0.1.6" +version = "0.1.7" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -1335,9 +1335,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/b5/48/70ffdc7ab655234794e9559de9b1776b39610c09aaee8d3bc74bfbd570b4/llama_stack_client-0.1.6.tar.gz", hash = "sha256:92c6c55c3281839e690df7bfc289c36a5dde0f491574bbdb6b8b665dc3d5a16c", size = 264874 } +sdist = { url = "https://files.pythonhosted.org/packages/12/82/cd3ac4cddfeb2c63dc25b2469ded66e39101a01586bb2535bfae4293dc49/llama_stack_client-0.1.7.tar.gz", hash = "sha256:8ed51d73a62848a72e1c862d8881e64785be1a9118bcea94aa910025571b34d0", size = 242071 } wheels = [ - { url = "https://files.pythonhosted.org/packages/38/51/1102914f819cf4412a5c9fd3f7dcc28175608e5f01ee164885972c3ec30b/llama_stack_client-0.1.6-py3-none-any.whl", hash = "sha256:708e20630d4e97a1cb03a19b933f4da6748cc857fe170998c392cf0f30f0f4c7", size = 373941 }, + { url = "https://files.pythonhosted.org/packages/dc/ee/055282258899f25f61d248a335e6bcdceb76e3f6ed76ad3a28240ebfa0fe/llama_stack_client-0.1.7-py3-none-any.whl", hash = "sha256:db89a21811310f9249a951a25f96298affa3ca1d3610a518093c94e984105292", size = 273882 }, ] [[package]] From 7b81761a56a2adb16327c5a9aea76d89a332b79d Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Fri, 14 Mar 2025 15:46:45 -0700 Subject: [PATCH 061/557] fix: update CDN url for stoplight --- 
docs/_static/llama-stack-spec.html | 4 ++-- docs/openapi_generator/pyopenapi/template.html | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index b5e4097d9..4f1b09140 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -6,8 +6,8 @@ OpenAPI specification - - + +