From 27d5892dfaadaeabb8a530ef60e6dcf8af59583a Mon Sep 17 00:00:00 2001 From: Botao Chen Date: Tue, 11 Mar 2025 17:11:26 -0700 Subject: [PATCH] temp commit --- distributions/dependencies.json | 34 ++++ .../remote/inference/together/config.py | 2 +- .../templates/open-benchmark/__init__.py | 7 + .../{open-benchmark.py => open_benchmark.py} | 40 ++-- llama_stack/templates/open-benchmark/run.yaml | 171 +++++++++--------- .../templates/together/run-with-safety.yaml | 2 +- llama_stack/templates/together/run.yaml | 2 +- 7 files changed, 149 insertions(+), 109 deletions(-) create mode 100644 llama_stack/templates/open-benchmark/__init__.py rename llama_stack/templates/open-benchmark/{open-benchmark.py => open_benchmark.py} (91%) diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 97aecc719..82fbcec8d 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -453,6 +453,40 @@ "transformers", "uvicorn" ], + "open-benchmark": [ + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "chromadb-client", + "datasets", + "fastapi", + "fire", + "httpx", + "litellm", + "matplotlib", + "mcp", + "nltk", + "numpy", + "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", + "pandas", + "pillow", + "psycopg2-binary", + "pymongo", + "pypdf", + "redis", + "requests", + "scikit-learn", + "scipy", + "sentencepiece", + "sqlite-vec", + "together", + "tqdm", + "transformers", + "uvicorn" + ], "remote-vllm": [ "aiosqlite", "autoevals", diff --git a/llama_stack/providers/remote/inference/together/config.py b/llama_stack/providers/remote/inference/together/config.py index fda3b8f43..fa7c45c9f 100644 --- a/llama_stack/providers/remote/inference/together/config.py +++ b/llama_stack/providers/remote/inference/together/config.py @@ -26,5 +26,5 @@ class TogetherImplConfig(BaseModel): def sample_run_config(cls, **kwargs) -> Dict[str, Any]: return { "url": "https://api.together.xyz/v1", - "api_key": "${env.TOGETHER_API_KEY}", + "api_key": "${env.TOGETHER_API_KEY:}", } diff --git a/llama_stack/templates/open-benchmark/__init__.py b/llama_stack/templates/open-benchmark/__init__.py new file mode 100644 index 000000000..14d0a28f5 --- /dev/null +++ b/llama_stack/templates/open-benchmark/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from .open_benchmark import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/open-benchmark/open-benchmark.py b/llama_stack/templates/open-benchmark/open_benchmark.py similarity index 91% rename from llama_stack/templates/open-benchmark/open-benchmark.py rename to llama_stack/templates/open-benchmark/open_benchmark.py index 6621a0553..fd2b07dce 100644 --- a/llama_stack/templates/open-benchmark/open-benchmark.py +++ b/llama_stack/templates/open-benchmark/open_benchmark.py @@ -36,7 +36,7 @@ def get_inference_providers() -> Tuple[List[Provider], List[ModelInput]]: "openai", [ ProviderModelEntry( - provider_model_id="penai/gpt-4o", + provider_model_id="openai/gpt-4o", model_type=ModelType.llm, ) ], @@ -62,26 +62,6 @@ def get_inference_providers() -> Tuple[List[Provider], List[ModelInput]]: ], GeminiConfig.sample_run_config(api_key="${env.GEMINI_API_KEY:}"), ), - ( - "groq", - [ - ProviderModelEntry( - provider_model_id="groq/llama-3.3-70b-versatile", - model_type=ModelType.llm, - ) - ], - GroqConfig.sample_run_config(api_key="${env.GROQ_API_KEY:}"), - ), - ( - "together", - [ - ProviderModelEntry( - provider_model_id="meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo", - model_type=ModelType.llm, - ) - ], - TogetherImplConfig.sample_run_config(api_key="${env.TOGETHER_API_KEY:}"), - ), ] inference_providers = [] available_models = {} @@ -243,7 +223,7 @@ def get_distribution_template() -> DistributionTemplate: return DistributionTemplate( name=name, distro_type="self_hosted", - description="Distribution for running e2e tests in CI", + description="Distribution for running open benchmarks", container_image=None, template_path=None, providers=providers, @@ -266,13 +246,25 @@ def get_distribution_template() -> DistributionTemplate: "5001", "Port for the Llama Stack distribution server", ), - "FIREWORKS_API_KEY": ( + "TOGETHER_API_KEY": ( "", - "Fireworks API Key", + "Together API Key", ), "OPENAI_API_KEY": ( "", "OpenAI API Key", ), + "GEMINI_API_KEY": ( + "", + "Gemini API Key", + ), + "ANTHROPIC_API_KEY": ( + "", + "Anthropic API Key", + ), + "GROQ_API_KEY": ( + "", + "Groq API Key", + ), }, ) diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml index 736b47746..f15dee920 100644 --- a/llama_stack/templates/open-benchmark/run.yaml +++ b/llama_stack/templates/open-benchmark/run.yaml @@ -38,7 +38,7 @@ providers: - provider_id: sqlite-vec provider_type: inline::sqlite-vec config: - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/sqlite_vec.db + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/sqlite_vec.db - provider_id: ${env.ENABLE_CHROMADB+chromadb} provider_type: remote::chromadb config: @@ -62,14 +62,14 @@ providers: persistence_store: type: sqlite namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/agents_store.db + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/agents_store.db telemetry: - provider_id: meta-reference provider_type: inline::meta-reference config: service_name: ${env.OTEL_SERVICE_NAME:llama-stack} sinks: ${env.TELEMETRY_SINKS:console,sqlite} - sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/dev/trace_store.db} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/open-benchmark/trace_store.db} eval: - provider_id: meta-reference provider_type: inline::meta-reference @@ -114,18 +114,13 @@ providers: config: {} metadata_store: type: sqlite - db_path: 
${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/registry.db + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/registry.db models: - metadata: {} model_id: openai/gpt-4o provider_id: openai provider_model_id: openai/gpt-4o model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-405B-Instruct - provider_id: together - provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo - model_type: llm - metadata: {} model_id: anthropic/claude-3-5-sonnet-latest provider_id: anthropic @@ -137,88 +132,100 @@ models: provider_model_id: gemini/gemini-1.5-flash model_type: llm - metadata: {} - model_id: meta-llama/Llama-3.3-70B-Instruct + model_id: groq/llama-3.3-70b-versatile provider_id: groq provider_model_id: groq/llama-3.3-70b-versatile model_type: llm +- metadata: {} + model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo + provider_id: together + provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo + model_type: llm shields: - shield_id: meta-llama/Llama-Guard-3-8B vector_dbs: [] datasets: - - dataset_id: simpleqa - provider_id: huggingface - url: - uri: https://huggingface.co/datasets/llamastack/simpleqa - metadata: - path: llamastack/simpleqa - name: - split: train - dataset_schema: - input_query: - type: string - expected_answer: - type: string - chat_completion_input: - type: string - - dataset_id: mmlu_cot - provider_id: huggingface - url: - uri: https://huggingface.co/datasets/llamastack/mmlu_cot - metadata: - path: llamastack/mmlu_cot - name: all - split: test - dataset_schema: - input_query: - type: string - expected_answer: - type: string - chat_completion_input: - type: string - - dataset_id: gpqa_cot - provider_id: huggingface - url: - uri: https://huggingface.co/datasets/llamastack/gpqa_0shot_cot - metadata: - path: llamastack/gpqa_0shot_cot - name: gpqa_main - split: train - dataset_schema: - input_query: - type: string - expected_answer: - type: string - chat_completion_input: - type: string - - dataset_id: math_500 - provider_id: huggingface - url: - uri: https://huggingface.co/datasets/llamastack/math_500 - metadata: - path: llamastack/math_500 - name: - split: test - dataset_schema: - input_query: - type: string - expected_answer: - type: string - chat_completion_input: - type: string +- dataset_schema: + input_query: + type: string + expected_answer: + type: string + chat_completion_input: + type: string + url: + uri: https://huggingface.co/datasets/llamastack/simpleqa + metadata: + path: llamastack/simpleqa + name: null + split: train + dataset_id: simpleqa + provider_id: huggingface +- dataset_schema: + input_query: + type: string + expected_answer: + type: string + chat_completion_input: + type: string + url: + uri: https://huggingface.co/datasets/llamastack/mmlu_cot + metadata: + path: llamastack/mmlu_cot + name: all + split: test + dataset_id: mmlu_cot + provider_id: huggingface +- dataset_schema: + input_query: + type: string + expected_answer: + type: string + chat_completion_input: + type: string + url: + uri: https://huggingface.co/datasets/llamastack/gpqa_0shot_cot + metadata: + path: llamastack/gpqa_0shot_cot + name: main + split: train + dataset_id: gpqa_cot + provider_id: huggingface +- dataset_schema: + input_query: + type: string + expected_answer: + type: string + chat_completion_input: + type: string + url: + uri: https://huggingface.co/datasets/llamastack/math_500 + metadata: + path: llamastack/math_500 + split: test + dataset_id: math_500 + provider_id: huggingface scoring_fns: [] benchmarks: - - 
benchmark_id: meta-reference-simpleqa - dataset_id: simpleqa - scoring_functions: ["llm-as-judge::405b-simpleqa"] - - benchmark_id: meta-reference-mmlu-cot - dataset_id: mmlu_cot - scoring_functions: ["basic::regex_parser_multiple_choice_answer"] - - benchmark_id: meta-reference-gpqa-cot - dataset_id: gpqa_cot - scoring_functions: ["basic::regex_parser_multiple_choice_answer"] - - benchmark_id: meta-reference-math-500 - dataset_id: math_500 - scoring_functions: ["basic::regex_parser_math_response"] +- dataset_id: simpleqa + scoring_functions: + - llm-as-judge::405b-simpleqa + metadata: {} + benchmark_id: meta-reference-simpleqa +- dataset_id: mmlu_cot + scoring_functions: + - basic::regex_parser_multiple_choice_answer + metadata: {} + benchmark_id: meta-reference-mmlu-cot +- dataset_id: gpqa_cot + scoring_functions: + - basic::regex_parser_multiple_choice_answer + metadata: {} + benchmark_id: meta-reference-gpqa-cot +- dataset_id: math_500 + scoring_functions: + - basic::regex_parser_math_response + metadata: {} + benchmark_id: meta-reference-math-500 tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/together/run-with-safety.yaml b/llama_stack/templates/together/run-with-safety.yaml index fd74f80c3..3a7d3dfba 100644 --- a/llama_stack/templates/together/run-with-safety.yaml +++ b/llama_stack/templates/together/run-with-safety.yaml @@ -16,7 +16,7 @@ providers: provider_type: remote::together config: url: https://api.together.xyz/v1 - api_key: ${env.TOGETHER_API_KEY} + api_key: ${env.TOGETHER_API_KEY:} - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml index 9a717290a..10668914a 100644 --- a/llama_stack/templates/together/run.yaml +++ b/llama_stack/templates/together/run.yaml @@ -16,7 +16,7 @@ providers: provider_type: remote::together config: url: https://api.together.xyz/v1 - api_key: ${env.TOGETHER_API_KEY} + api_key: ${env.TOGETHER_API_KEY:} - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {}
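
Reviewer note on the recurring config change: `${env.TOGETHER_API_KEY}` becomes `${env.TOGETHER_API_KEY:}` in TogetherImplConfig.sample_run_config and in both together run.yaml files, and the open_benchmark template follows the same pattern (`${env.GEMINI_API_KEY:}`, the new GEMINI/ANTHROPIC/GROQ/TOGETHER entries in run_config_env_vars, and `${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}` in run.yaml). As used elsewhere in these templates, the trailing colon supplies a default value (empty here), so the variable no longer has to be set for the stack to start. The sketch below only illustrates that substitution convention as I read it from the templates; substitute_env and the regex are hypothetical helpers written for this note, not llama-stack's actual implementation.

import os
import re

# Hypothetical sketch of the "${env.NAME}" / "${env.NAME:default}" convention
# used in the run.yaml templates above -- not llama-stack's real code.
_PLACEHOLDER = re.compile(r"\$\{env\.([A-Za-z_][A-Za-z0-9_]*)(:([^}]*))?\}")

def substitute_env(value: str) -> str:
    def repl(match: re.Match) -> str:
        name, default_part, default = match.group(1), match.group(2), match.group(3)
        if name in os.environ:
            return os.environ[name]
        if default_part is not None:
            # "${env.TOGETHER_API_KEY:}" -> "" when unset (the key is optional);
            # "${env.SQLITE_STORE_DIR:~/.llama/...}" -> the literal default path.
            return default or ""
        # "${env.TOGETHER_API_KEY}" with no colon -> required, as before this patch.
        raise KeyError(f"environment variable {name} is not set")
    return _PLACEHOLDER.sub(repl, value)

# With TOGETHER_API_KEY unset, the together provider config still resolves:
print(repr(substitute_env("${env.TOGETHER_API_KEY:}")))  # ''
print(substitute_env("${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/registry.db"))

Under that reading, making the Together key optional is what lets the open-benchmark distribution, which registers together models alongside openai, anthropic, gemini, and groq, come up with only the API keys the user actually exports.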