From 0b0be70605f4497f817433a1484bde0b202efb18 Mon Sep 17 00:00:00 2001 From: Botao Chen Date: Wed, 12 Mar 2025 11:12:08 -0700 Subject: [PATCH] feat: Add open benchmark template codegen (#1579) ## What does this PR do? As the title says, this adds codegen for the open-benchmark template. ## Test Checked the newly generated run.yaml file; it is identical before and after the change. Also adds a small improvement to the together template so that a missing TOGETHER_API_KEY won't crash the server, which is consistent with the user experience of other remote providers. --- distributions/dependencies.json | 34 ++ .../remote/inference/together/config.py | 2 +- .../templates/open-benchmark/__init__.py | 7 + .../open-benchmark/open_benchmark.py | 293 ++++++++++++++++++ llama_stack/templates/open-benchmark/run.yaml | 168 +++++----- llama_stack/templates/template.py | 6 + .../templates/together/run-with-safety.yaml | 2 +- llama_stack/templates/together/run.yaml | 2 +- 8 files changed, 430 insertions(+), 84 deletions(-) create mode 100644 llama_stack/templates/open-benchmark/__init__.py create mode 100644 llama_stack/templates/open-benchmark/open_benchmark.py diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 97aecc719..82fbcec8d 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -453,6 +453,40 @@ "transformers", "uvicorn" ], + "open-benchmark": [ + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "chromadb-client", + "datasets", + "fastapi", + "fire", + "httpx", + "litellm", + "matplotlib", + "mcp", + "nltk", + "numpy", + "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", + "pandas", + "pillow", + "psycopg2-binary", + "pymongo", + "pypdf", + "redis", + "requests", + "scikit-learn", + "scipy", + "sentencepiece", + "sqlite-vec", + "together", + "tqdm", + "transformers", + "uvicorn" + ], "remote-vllm": [ "aiosqlite", "autoevals", diff --git a/llama_stack/providers/remote/inference/together/config.py 
b/llama_stack/providers/remote/inference/together/config.py index fda3b8f43..fa7c45c9f 100644 --- a/llama_stack/providers/remote/inference/together/config.py +++ b/llama_stack/providers/remote/inference/together/config.py @@ -26,5 +26,5 @@ class TogetherImplConfig(BaseModel): def sample_run_config(cls, **kwargs) -> Dict[str, Any]: return { "url": "https://api.together.xyz/v1", - "api_key": "${env.TOGETHER_API_KEY}", + "api_key": "${env.TOGETHER_API_KEY:}", } diff --git a/llama_stack/templates/open-benchmark/__init__.py b/llama_stack/templates/open-benchmark/__init__.py new file mode 100644 index 000000000..14d0a28f5 --- /dev/null +++ b/llama_stack/templates/open-benchmark/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .open_benchmark import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/open-benchmark/open_benchmark.py b/llama_stack/templates/open-benchmark/open_benchmark.py new file mode 100644 index 000000000..7df33a715 --- /dev/null +++ b/llama_stack/templates/open-benchmark/open_benchmark.py @@ -0,0 +1,293 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+
+from typing import Dict, List, Tuple
+
+from llama_stack.apis.models.models import ModelType
+from llama_stack.distribution.datatypes import (
+    BenchmarkInput,
+    DatasetInput,
+    ModelInput,
+    Provider,
+    ShieldInput,
+    ToolGroupInput,
+)
+from llama_stack.providers.inline.vector_io.sqlite_vec.config import SQLiteVectorIOConfig
+from llama_stack.providers.remote.inference.anthropic.config import AnthropicConfig
+from llama_stack.providers.remote.inference.gemini.config import GeminiConfig
+from llama_stack.providers.remote.inference.groq.config import GroqConfig
+from llama_stack.providers.remote.inference.openai.config import OpenAIConfig
+from llama_stack.providers.remote.inference.together.config import TogetherImplConfig
+from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig
+from llama_stack.providers.remote.vector_io.pgvector.config import PGVectorVectorIOConfig
+from llama_stack.providers.utils.inference.model_registry import ProviderModelEntry
+from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry
+
+
+def get_inference_providers() -> Tuple[List[Provider], Dict[str, List[ProviderModelEntry]]]:
+    """Return the remote inference providers and each provider's model entries, keyed by provider_id."""
+    # in this template, we allow each API key to be optional
+    providers = [
+        (
+            "openai",
+            [
+                ProviderModelEntry(
+                    provider_model_id="openai/gpt-4o",
+                    model_type=ModelType.llm,
+                )
+            ],
+            OpenAIConfig.sample_run_config(api_key="${env.OPENAI_API_KEY:}"),
+        ),
+        (
+            "anthropic",
+            [
+                ProviderModelEntry(
+                    provider_model_id="anthropic/claude-3-5-sonnet-latest",
+                    model_type=ModelType.llm,
+                )
+            ],
+            AnthropicConfig.sample_run_config(api_key="${env.ANTHROPIC_API_KEY:}"),
+        ),
+        (
+            "gemini",
+            [
+                ProviderModelEntry(
+                    provider_model_id="gemini/gemini-1.5-flash",
+                    model_type=ModelType.llm,
+                )
+            ],
+            GeminiConfig.sample_run_config(api_key="${env.GEMINI_API_KEY:}"),
+        ),
+        (
+            "groq",
+            [],  # its model is added explicitly to default_models in get_distribution_template()
+            GroqConfig.sample_run_config(api_key="${env.GROQ_API_KEY:}"),
+        ),
+        (
+            "together",
+            [],  # its model is added explicitly to default_models in get_distribution_template()
+            TogetherImplConfig.sample_run_config(api_key="${env.TOGETHER_API_KEY:}"),
+        ),
+    ]
+    inference_providers: List[Provider] = []
+    available_models: Dict[str, List[ProviderModelEntry]] = {}
+    for provider_id, model_entries, config in providers:
+        inference_providers.append(
+            Provider(
+                provider_id=provider_id,
+                provider_type=f"remote::{provider_id}",  # the provider id doubles as the remote type suffix
+                config=config,
+            )
+        )
+        available_models[provider_id] = model_entries  # a mapping, not a flat list of ModelInput
+    return inference_providers, available_models
+
+
+def get_distribution_template() -> DistributionTemplate:
+    """Build the open-benchmark distribution template, including its "run.yaml" run-config settings."""
+    inference_providers, available_models = get_inference_providers()
+    providers = {
+        "inference": [p.provider_type for p in inference_providers],
+        "vector_io": ["inline::sqlite-vec", "remote::chromadb", "remote::pgvector"],
+        "safety": ["inline::llama-guard"],
+        "agents": ["inline::meta-reference"],
+        "telemetry": ["inline::meta-reference"],
+        "eval": ["inline::meta-reference"],
+        "datasetio": ["remote::huggingface", "inline::localfs"],
+        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
+        "tool_runtime": [
+            "remote::brave-search",
+            "remote::tavily-search",
+            "inline::code-interpreter",
+            "inline::rag-runtime",
+            "remote::model-context-protocol",
+        ],
+    }
+    name = "open-benchmark"
+
+    vector_io_providers = [
+        Provider(
+            provider_id="sqlite-vec",
+            provider_type="inline::sqlite-vec",
+            config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
+        ),
+        Provider(
+            provider_id="${env.ENABLE_CHROMADB+chromadb}",  # NOTE(review): presumably opt-in via env; confirm
+            provider_type="remote::chromadb",
+            config=ChromaVectorIOConfig.sample_run_config(url="${env.CHROMADB_URL:}"),
+        ),
+        Provider(
+            provider_id="${env.ENABLE_PGVECTOR+pgvector}",  # NOTE(review): presumably opt-in via env; confirm
+            provider_type="remote::pgvector",
+            config=PGVectorVectorIOConfig.sample_run_config(
+                db="${env.PGVECTOR_DB:}",
+                user="${env.PGVECTOR_USER:}",
+                password="${env.PGVECTOR_PASSWORD:}",
+            ),
+        ),
+    ]
+
+    default_tool_groups = [
+        ToolGroupInput(
+            toolgroup_id="builtin::websearch",
+            provider_id="tavily-search",
+        ),
+        ToolGroupInput(
+            toolgroup_id="builtin::rag",
+            provider_id="rag-runtime",
+        ),
+        ToolGroupInput(
+            toolgroup_id="builtin::code_interpreter",
+            provider_id="code-interpreter",
+        ),
+    ]
+
+    default_models = get_model_registry(available_models) + [
+        ModelInput(
+            model_id="meta-llama/Llama-3.3-70B-Instruct",
+            provider_id="groq",
+            provider_model_id="groq/llama-3.3-70b-versatile",
+            model_type=ModelType.llm,
+        ),
+        ModelInput(
+            model_id="meta-llama/Llama-3.1-405B-Instruct",
+            provider_id="together",
+            provider_model_id="meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+            model_type=ModelType.llm,
+        ),
+    ]
+
+    default_datasets = [
+        DatasetInput(
+            dataset_id="simpleqa",
+            provider_id="huggingface",
+            url={"uri": "https://huggingface.co/datasets/llamastack/simpleqa"},
+            metadata={
+                "path": "llamastack/simpleqa",
+                "split": "train",
+            },
+            dataset_schema={
+                "input_query": {"type": "string"},
+                "expected_answer": {"type": "string"},
+                "chat_completion_input": {"type": "string"},
+            },
+        ),
+        DatasetInput(
+            dataset_id="mmlu_cot",
+            provider_id="huggingface",
+            url={"uri": "https://huggingface.co/datasets/llamastack/mmlu_cot"},
+            metadata={
+                "path": "llamastack/mmlu_cot",
+                "name": "all",
+                "split": "test",
+            },
+            dataset_schema={
+                "input_query": {"type": "string"},
+                "expected_answer": {"type": "string"},
+                "chat_completion_input": {"type": "string"},
+            },
+        ),
+        DatasetInput(
+            dataset_id="gpqa_cot",
+            provider_id="huggingface",
+            url={"uri": "https://huggingface.co/datasets/llamastack/gpqa_0shot_cot"},
+            metadata={
+                "path": "llamastack/gpqa_0shot_cot",
+                "name": "gpqa_main",
+                "split": "train",
+            },
+            dataset_schema={
+                "input_query": {"type": "string"},
+                "expected_answer": {"type": "string"},
+                "chat_completion_input": {"type": "string"},
+            },
+        ),
+        DatasetInput(
+            dataset_id="math_500",
+            provider_id="huggingface",
+            url={"uri": "https://huggingface.co/datasets/llamastack/math_500"},
+            metadata={
+                "path": "llamastack/math_500",
+                "split": "test",
+            },
+            dataset_schema={
+                "input_query": {"type": "string"},
+                "expected_answer": {"type": "string"},
+                "chat_completion_input": {"type": "string"},
+            },
+        ),
+    ]
+
+    default_benchmarks = [
+        BenchmarkInput(
+            benchmark_id="meta-reference-simpleqa",
+            dataset_id="simpleqa",
+            scoring_functions=["llm-as-judge::405b-simpleqa"],
+        ),
+        BenchmarkInput(
+            benchmark_id="meta-reference-mmlu-cot",
+            dataset_id="mmlu_cot",
+            scoring_functions=["basic::regex_parser_multiple_choice_answer"],
+        ),
+        BenchmarkInput(
+            benchmark_id="meta-reference-gpqa-cot",
+            dataset_id="gpqa_cot",
+            scoring_functions=["basic::regex_parser_multiple_choice_answer"],
+        ),
+        BenchmarkInput(
+            benchmark_id="meta-reference-math-500",
+            dataset_id="math_500",
+            scoring_functions=["basic::regex_parser_math_response"],
+        ),
+    ]
+    return DistributionTemplate(
+        name=name,
+        distro_type="self_hosted",
+        description="Distribution for running open benchmarks",
+        container_image=None,
+        template_path=None,
+        providers=providers,
+        available_models_by_provider=available_models,
+        run_configs={
+            "run.yaml": RunConfigSettings(
+                provider_overrides={
+                    "inference": inference_providers,
+                    "vector_io": vector_io_providers,
+                },
+                default_models=default_models,
+                default_tool_groups=default_tool_groups,
+                default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
+                default_datasets=default_datasets,
+                default_benchmarks=default_benchmarks,
+            ),
+        },
+        run_config_env_vars={
+            "LLAMA_STACK_PORT": (
+                "5001",
+                "Port for the Llama Stack distribution server",
+            ),
+            "TOGETHER_API_KEY": (
+                "",
+                "Together API Key",
+            ),
+            "OPENAI_API_KEY": (
+                "",
+                "OpenAI API Key",
+            ),
+            "GEMINI_API_KEY": (
+                "",
+                "Gemini API Key",
+            ),
+            "ANTHROPIC_API_KEY": (
+                "",
+                "Anthropic API Key",
+            ),
+            "GROQ_API_KEY": (
+                "",
+                "Groq API Key",
+            ),
+        },
+    )
diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml
index 736b47746..97c54e621 100644
--- a/llama_stack/templates/open-benchmark/run.yaml
+++ b/llama_stack/templates/open-benchmark/run.yaml @@ -38,7 +38,7 @@ providers: - provider_id: sqlite-vec provider_type: inline::sqlite-vec config: - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/sqlite_vec.db + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/sqlite_vec.db - provider_id: ${env.ENABLE_CHROMADB+chromadb} provider_type: remote::chromadb config: @@ -62,14 +62,14 @@ providers: persistence_store: type: sqlite namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/agents_store.db + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/agents_store.db telemetry: - provider_id: meta-reference provider_type: inline::meta-reference config: service_name: ${env.OTEL_SERVICE_NAME:llama-stack} sinks: ${env.TELEMETRY_SINKS:console,sqlite} - sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/dev/trace_store.db} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/open-benchmark/trace_store.db} eval: - provider_id: meta-reference provider_type: inline::meta-reference @@ -114,18 +114,13 @@ providers: config: {} metadata_store: type: sqlite - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/registry.db + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/registry.db models: - metadata: {} model_id: openai/gpt-4o provider_id: openai provider_model_id: openai/gpt-4o model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-405B-Instruct - provider_id: together - provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo - model_type: llm - metadata: {} model_id: anthropic/claude-3-5-sonnet-latest provider_id: anthropic @@ -141,84 +136,95 @@ models: provider_id: groq provider_model_id: groq/llama-3.3-70b-versatile model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.1-405B-Instruct + provider_id: together + provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo + model_type: llm shields: - shield_id: 
meta-llama/Llama-Guard-3-8B vector_dbs: [] datasets: - - dataset_id: simpleqa - provider_id: huggingface - url: - uri: https://huggingface.co/datasets/llamastack/simpleqa - metadata: - path: llamastack/simpleqa - name: - split: train - dataset_schema: - input_query: - type: string - expected_answer: - type: string - chat_completion_input: - type: string - - dataset_id: mmlu_cot - provider_id: huggingface - url: - uri: https://huggingface.co/datasets/llamastack/mmlu_cot - metadata: - path: llamastack/mmlu_cot - name: all - split: test - dataset_schema: - input_query: - type: string - expected_answer: - type: string - chat_completion_input: - type: string - - dataset_id: gpqa_cot - provider_id: huggingface - url: - uri: https://huggingface.co/datasets/llamastack/gpqa_0shot_cot - metadata: - path: llamastack/gpqa_0shot_cot - name: gpqa_main - split: train - dataset_schema: - input_query: - type: string - expected_answer: - type: string - chat_completion_input: - type: string - - dataset_id: math_500 - provider_id: huggingface - url: - uri: https://huggingface.co/datasets/llamastack/math_500 - metadata: - path: llamastack/math_500 - name: - split: test - dataset_schema: - input_query: - type: string - expected_answer: - type: string - chat_completion_input: - type: string +- dataset_schema: + input_query: + type: string + expected_answer: + type: string + chat_completion_input: + type: string + url: + uri: https://huggingface.co/datasets/llamastack/simpleqa + metadata: + path: llamastack/simpleqa + split: train + dataset_id: simpleqa + provider_id: huggingface +- dataset_schema: + input_query: + type: string + expected_answer: + type: string + chat_completion_input: + type: string + url: + uri: https://huggingface.co/datasets/llamastack/mmlu_cot + metadata: + path: llamastack/mmlu_cot + name: all + split: test + dataset_id: mmlu_cot + provider_id: huggingface +- dataset_schema: + input_query: + type: string + expected_answer: + type: string + chat_completion_input: + 
type: string + url: + uri: https://huggingface.co/datasets/llamastack/gpqa_0shot_cot + metadata: + path: llamastack/gpqa_0shot_cot + name: gpqa_main + split: train + dataset_id: gpqa_cot + provider_id: huggingface +- dataset_schema: + input_query: + type: string + expected_answer: + type: string + chat_completion_input: + type: string + url: + uri: https://huggingface.co/datasets/llamastack/math_500 + metadata: + path: llamastack/math_500 + split: test + dataset_id: math_500 + provider_id: huggingface scoring_fns: [] benchmarks: - - benchmark_id: meta-reference-simpleqa - dataset_id: simpleqa - scoring_functions: ["llm-as-judge::405b-simpleqa"] - - benchmark_id: meta-reference-mmlu-cot - dataset_id: mmlu_cot - scoring_functions: ["basic::regex_parser_multiple_choice_answer"] - - benchmark_id: meta-reference-gpqa-cot - dataset_id: gpqa_cot - scoring_functions: ["basic::regex_parser_multiple_choice_answer"] - - benchmark_id: meta-reference-math-500 - dataset_id: math_500 - scoring_functions: ["basic::regex_parser_math_response"] +- dataset_id: simpleqa + scoring_functions: + - llm-as-judge::405b-simpleqa + metadata: {} + benchmark_id: meta-reference-simpleqa +- dataset_id: mmlu_cot + scoring_functions: + - basic::regex_parser_multiple_choice_answer + metadata: {} + benchmark_id: meta-reference-mmlu-cot +- dataset_id: gpqa_cot + scoring_functions: + - basic::regex_parser_multiple_choice_answer + metadata: {} + benchmark_id: meta-reference-gpqa-cot +- dataset_id: math_500 + scoring_functions: + - basic::regex_parser_math_response + metadata: {} + benchmark_id: meta-reference-math-500 tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/template.py b/llama_stack/templates/template.py index a7b862396..aa1ce144f 100644 --- a/llama_stack/templates/template.py +++ b/llama_stack/templates/template.py @@ -14,7 +14,9 @@ from pydantic import BaseModel, Field from llama_stack.apis.models.models import ModelType from 
llama_stack.distribution.datatypes import ( Api, + BenchmarkInput, BuildConfig, + DatasetInput, DistributionSpec, ModelInput, Provider, @@ -56,6 +58,8 @@ class RunConfigSettings(BaseModel): default_models: Optional[List[ModelInput]] = None default_shields: Optional[List[ShieldInput]] = None default_tool_groups: Optional[List[ToolGroupInput]] = None + default_datasets: Optional[List[DatasetInput]] = None + default_benchmarks: Optional[List[BenchmarkInput]] = None def run_config( self, @@ -113,6 +117,8 @@ class RunConfigSettings(BaseModel): models=self.default_models or [], shields=self.default_shields or [], tool_groups=self.default_tool_groups or [], + datasets=self.default_datasets or [], + benchmarks=self.default_benchmarks or [], ) diff --git a/llama_stack/templates/together/run-with-safety.yaml b/llama_stack/templates/together/run-with-safety.yaml index fd74f80c3..3a7d3dfba 100644 --- a/llama_stack/templates/together/run-with-safety.yaml +++ b/llama_stack/templates/together/run-with-safety.yaml @@ -16,7 +16,7 @@ providers: provider_type: remote::together config: url: https://api.together.xyz/v1 - api_key: ${env.TOGETHER_API_KEY} + api_key: ${env.TOGETHER_API_KEY:} - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml index 9a717290a..10668914a 100644 --- a/llama_stack/templates/together/run.yaml +++ b/llama_stack/templates/together/run.yaml @@ -16,7 +16,7 @@ providers: provider_type: remote::together config: url: https://api.together.xyz/v1 - api_key: ${env.TOGETHER_API_KEY} + api_key: ${env.TOGETHER_API_KEY:} - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {}