Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-11 11:50:41 +00:00)

Commit 575e51eb76: Merge branch 'evals_6' into evals_7

51 changed files with 448 additions and 420 deletions

@@ -1,4 +1,4 @@
-exclude: 'build'
+exclude: 'build/'
 default_language_version:
   python: python3

@@ -1,4 +1,4 @@
 include requirements.txt
 include llama_stack/distribution/*.sh
 include llama_stack/cli/scripts/*.sh
-include distributions/*/build.yaml
+include llama_stack/templates/*/build.yaml

@ -1,10 +0,0 @@
|
||||||
name: bedrock
|
|
||||||
distribution_spec:
|
|
||||||
description: Use Amazon Bedrock APIs.
|
|
||||||
providers:
|
|
||||||
inference: remote::bedrock
|
|
||||||
memory: meta-reference
|
|
||||||
safety: meta-reference
|
|
||||||
agents: meta-reference
|
|
||||||
telemetry: meta-reference
|
|
||||||
image_type: conda
|
|
||||||
1
distributions/bedrock/build.yaml
Symbolic link
1
distributions/bedrock/build.yaml
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../llama_stack/templates/bedrock/build.yaml
|
||||||
|
|
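The same build.yaml-to-symlink relocation repeats for the other distributions below. As an aside, a migration like this could be scripted roughly as follows; the commands and the `bedrock` placeholder are illustrative assumptions, not commands recorded in this commit.

```bash
# Illustrative sketch (not part of this commit): move a distribution's build.yaml
# under llama_stack/templates/ and leave a relative symlink at the old path.
set -euo pipefail
name="bedrock"   # placeholder distribution name
mkdir -p "llama_stack/templates/${name}"
git mv "distributions/${name}/build.yaml" "llama_stack/templates/${name}/build.yaml"
ln -s "../../llama_stack/templates/${name}/build.yaml" "distributions/${name}/build.yaml"
git add "distributions/${name}/build.yaml"
```
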
distributions/databricks/build.yaml (regular file replaced by a 1-line symbolic link)
@@ -1,10 +0,0 @@
-name: databricks
-distribution_spec:
-  description: Use Databricks for running LLM inference
-  providers:
-    inference: remote::databricks
-    memory: meta-reference
-    safety: meta-reference
-    agents: meta-reference
-    telemetry: meta-reference
-image_type: conda
@@ -0,0 +1 @@
+../../llama_stack/templates/databricks/build.yaml

@@ -49,7 +49,7 @@ inference:
 **Via Conda**

 ```bash
-llama stack build --config ./build.yaml
+llama stack build --template fireworks --image-type conda
 # -- modify run.yaml to a valid Fireworks server endpoint
 llama stack run ./run.yaml
 ```

@ -1,10 +0,0 @@
|
||||||
name: fireworks
|
|
||||||
distribution_spec:
|
|
||||||
description: Use Fireworks.ai for running LLM inference
|
|
||||||
providers:
|
|
||||||
inference: remote::fireworks
|
|
||||||
memory: meta-reference
|
|
||||||
safety: meta-reference
|
|
||||||
agents: meta-reference
|
|
||||||
telemetry: meta-reference
|
|
||||||
image_type: docker
|
|
||||||
1
distributions/fireworks/build.yaml
Symbolic link
1
distributions/fireworks/build.yaml
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../llama_stack/templates/fireworks/build.yaml
|
||||||
|
|
@ -1,10 +0,0 @@
|
||||||
name: hf-endpoint
|
|
||||||
distribution_spec:
|
|
||||||
description: "Like local, but use Hugging Face Inference Endpoints for running LLM inference.\nSee https://hf.co/docs/api-endpoints."
|
|
||||||
providers:
|
|
||||||
inference: remote::hf::endpoint
|
|
||||||
memory: meta-reference
|
|
||||||
safety: meta-reference
|
|
||||||
agents: meta-reference
|
|
||||||
telemetry: meta-reference
|
|
||||||
image_type: conda
|
|
||||||
1
distributions/hf-endpoint/build.yaml
Symbolic link
1
distributions/hf-endpoint/build.yaml
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../llama_stack/templates/hf-endpoint/build.yaml
|
||||||
|
|
@ -1,10 +0,0 @@
|
||||||
name: hf-serverless
|
|
||||||
distribution_spec:
|
|
||||||
description: "Like local, but use Hugging Face Inference API (serverless) for running LLM inference.\nSee https://hf.co/docs/api-inference."
|
|
||||||
providers:
|
|
||||||
inference: remote::hf::serverless
|
|
||||||
memory: meta-reference
|
|
||||||
safety: meta-reference
|
|
||||||
agents: meta-reference
|
|
||||||
telemetry: meta-reference
|
|
||||||
image_type: conda
|
|
||||||
1
distributions/hf-serverless/build.yaml
Symbolic link
1
distributions/hf-serverless/build.yaml
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../llama_stack/templates/hf-serverless/build.yaml
|
||||||
|
|
@ -1,14 +0,0 @@
|
||||||
name: meta-reference-gpu
|
|
||||||
distribution_spec:
|
|
||||||
docker_image: pytorch/pytorch
|
|
||||||
description: Use code from `llama_stack` itself to serve all llama stack APIs
|
|
||||||
providers:
|
|
||||||
inference: meta-reference
|
|
||||||
memory:
|
|
||||||
- meta-reference
|
|
||||||
- remote::chromadb
|
|
||||||
- remote::pgvector
|
|
||||||
safety: meta-reference
|
|
||||||
agents: meta-reference
|
|
||||||
telemetry: meta-reference
|
|
||||||
image_type: docker
|
|
||||||
1
distributions/meta-reference-gpu/build.yaml
Symbolic link
1
distributions/meta-reference-gpu/build.yaml
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../llama_stack/templates/meta-reference-gpu/build.yaml
|
||||||
|
|
@ -1,14 +0,0 @@
|
||||||
name: meta-reference-quantized-gpu
|
|
||||||
distribution_spec:
|
|
||||||
docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
|
|
||||||
description: Use code from `llama_stack` itself to serve all llama stack APIs
|
|
||||||
providers:
|
|
||||||
inference: meta-reference-quantized
|
|
||||||
memory:
|
|
||||||
- meta-reference
|
|
||||||
- remote::chromadb
|
|
||||||
- remote::pgvector
|
|
||||||
safety: meta-reference
|
|
||||||
agents: meta-reference
|
|
||||||
telemetry: meta-reference
|
|
||||||
image_type: docker
|
|
||||||
1
distributions/meta-reference-quantized-gpu/build.yaml
Symbolic link
1
distributions/meta-reference-quantized-gpu/build.yaml
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../llama_stack/templates/meta-reference-quantized-gpu/build.yaml
|
||||||
|
|
@@ -86,6 +86,6 @@ inference:
 **Via Conda**

 ```
-llama stack build --config ./build.yaml
+llama stack build --template ollama --image-type conda
 llama stack run ./gpu/run.yaml
 ```

@ -1,13 +0,0 @@
|
||||||
name: ollama
|
|
||||||
distribution_spec:
|
|
||||||
description: Use ollama for running LLM inference
|
|
||||||
providers:
|
|
||||||
inference: remote::ollama
|
|
||||||
memory:
|
|
||||||
- meta-reference
|
|
||||||
- remote::chromadb
|
|
||||||
- remote::pgvector
|
|
||||||
safety: meta-reference
|
|
||||||
agents: meta-reference
|
|
||||||
telemetry: meta-reference
|
|
||||||
image_type: docker
|
|
||||||
1
distributions/ollama/build.yaml
Symbolic link
1
distributions/ollama/build.yaml
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../llama_stack/templates/ollama/build.yaml
|
||||||
|
|
@@ -88,7 +88,7 @@ inference:
 **Via Conda**

 ```bash
-llama stack build --config ./build.yaml
+llama stack build --template tgi --image-type conda
 # -- start a TGI server endpoint
 llama stack run ./gpu/run.yaml
 ```

@ -1,13 +0,0 @@
|
||||||
name: tgi
|
|
||||||
distribution_spec:
|
|
||||||
description: Use TGI for running LLM inference
|
|
||||||
providers:
|
|
||||||
inference: remote::tgi
|
|
||||||
memory:
|
|
||||||
- meta-reference
|
|
||||||
- remote::chromadb
|
|
||||||
- remote::pgvector
|
|
||||||
safety: meta-reference
|
|
||||||
agents: meta-reference
|
|
||||||
telemetry: meta-reference
|
|
||||||
image_type: docker
|
|
||||||
1
distributions/tgi/build.yaml
Symbolic link
1
distributions/tgi/build.yaml
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../llama_stack/templates/tgi/build.yaml
|
||||||
|
|
@@ -62,7 +62,7 @@ memory:
 **Via Conda**

 ```bash
-llama stack build --config ./build.yaml
+llama stack build --template together --image-type conda
 # -- modify run.yaml to a valid Together server endpoint
 llama stack run ./run.yaml
 ```

@ -1,10 +0,0 @@
|
||||||
name: together
|
|
||||||
distribution_spec:
|
|
||||||
description: Use Together.ai for running LLM inference
|
|
||||||
providers:
|
|
||||||
inference: remote::together
|
|
||||||
memory: remote::weaviate
|
|
||||||
safety: remote::together
|
|
||||||
agents: meta-reference
|
|
||||||
telemetry: meta-reference
|
|
||||||
image_type: docker
|
|
||||||
1
distributions/together/build.yaml
Symbolic link
1
distributions/together/build.yaml
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../llama_stack/templates/together/build.yaml
|
||||||
|
|
@ -1,10 +0,0 @@
|
||||||
name: vllm
|
|
||||||
distribution_spec:
|
|
||||||
description: Like local, but use vLLM for running LLM inference
|
|
||||||
providers:
|
|
||||||
inference: vllm
|
|
||||||
memory: meta-reference
|
|
||||||
safety: meta-reference
|
|
||||||
agents: meta-reference
|
|
||||||
telemetry: meta-reference
|
|
||||||
image_type: conda
|
|
||||||
1
distributions/vllm/build.yaml
Symbolic link
1
distributions/vllm/build.yaml
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
../../llama_stack/templates/vllm/build.yaml
|
||||||
|
|
@@ -279,11 +279,11 @@ llama stack build --list-templates
 You may then pick a template to build your distribution with providers fitted to your liking.

 ```
-llama stack build --template local-tgi --name my-tgi-stack
+llama stack build --template local-tgi --name my-tgi-stack --image-type conda
 ```

 ```
-$ llama stack build --template local-tgi --name my-tgi-stack
+$ llama stack build --template local-tgi --name my-tgi-stack --image-type conda
 ...
 ...
 Build spec configuration saved at ~/.conda/envs/llamastack-my-tgi-stack/my-tgi-stack-build.yaml
@@ -293,10 +293,10 @@ You may now run `llama stack configure my-tgi-stack` or `llama stack configure ~
 #### Building from config file
 - In addition to templates, you may customize the build to your liking through editing config files and build from config files with the following command.

-- The config file will be of contents like the ones in `llama_stack/distributions/templates/`.
+- The config file will be of contents like the ones in `llama_stack/templates/`.

 ```
-$ cat llama_stack/distribution/templates/local-ollama-build.yaml
+$ cat build.yaml

 name: local-ollama
 distribution_spec:
@@ -311,7 +311,7 @@ image_type: conda
 ```

 ```
-llama stack build --config llama_stack/distribution/templates/local-ollama-build.yaml
+llama stack build --config build.yaml
 ```

 #### How to build distribution with Docker image

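As a worked example of the documented flow, the sketch below writes a config like the snippet above and builds from it; the provider choices are assumptions borrowed from the ollama template elsewhere in this diff, not a file shipped by this commit.

```bash
# Illustrative only: create a build config and build from it.
cat > build.yaml <<'EOF'
name: local-ollama
distribution_spec:
  description: Use ollama for running LLM inference
  providers:
    inference: remote::ollama
    memory: meta-reference
    safety: meta-reference
    agents: meta-reference
    telemetry: meta-reference
image_type: conda
EOF

llama stack build --config build.yaml
```
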
@@ -35,11 +35,7 @@ You have two ways to start up Llama stack server:

 1. **Starting up server via docker**:

-We provide 2 pre-built Docker image of Llama Stack distribution, which can be found in the following links.
-- [llamastack-local-gpu](https://hub.docker.com/repository/docker/llamastack/llamastack-local-gpu/general)
-  - This is a packaged version with our local meta-reference implementations, where you will be running inference locally with downloaded Llama model checkpoints.
-- [llamastack-local-cpu](https://hub.docker.com/repository/docker/llamastack/llamastack-local-cpu/general)
-  - This is a lite version with remote inference where you can hook up to your favourite remote inference framework (e.g. ollama, fireworks, together, tgi) for running inference without GPU.
+We provide pre-built Docker image of Llama Stack distribution, which can be found in the following links in the [distributions](../distributions/) folder.

 > [!NOTE]
 > For GPU inference, you need to set these environment variables for specifying local directory containing your model checkpoints, and enable GPU inference to start running docker container.

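For the docker path described above, a run invocation would look roughly like the sketch below; the image name, port, and checkpoint mount are assumptions (the port matches the `docker run -it -p 5000:5000` hint printed by the CLI later in this diff), so defer to the compose files under distributions/.

```bash
# Rough sketch, not an exact command from this commit.
export LLAMA_CHECKPOINT_DIR=~/.llama             # assumed location of downloaded checkpoints
docker run -it \
  -p 5000:5000 \
  -v "$LLAMA_CHECKPOINT_DIR:/root/.llama" \
  llamastack/llamastack-local-gpu                # assumed image name from the old docs text
```
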
@@ -37,7 +37,7 @@ class ScoreResponse(BaseModel):


 class ScoringFunctionStore(Protocol):
-    def get_scoring_function(self, name: str) -> ScoringFunctionDefWithProvider: ...
+    def get_scoring_function(self, name: str) -> ScoringFnDefWithProvider: ...


 @runtime_checkable

@@ -29,7 +29,7 @@ class LLMAsJudgeContext(BaseModel):


 @json_schema_type
-class ScoringFunctionDef(BaseModel):
+class ScoringFnDef(BaseModel):
     identifier: str
     description: Optional[str] = None
     metadata: Dict[str, Any] = Field(
@@ -48,7 +48,7 @@ class ScoringFunctionDef(BaseModel):


 @json_schema_type
-class ScoringFunctionDefWithProvider(ScoringFunctionDef):
+class ScoringFnDefWithProvider(ScoringFnDef):
     provider_id: str = Field(
         description="ID of the provider which serves this dataset",
     )
@@ -57,14 +57,14 @@ class ScoringFunctionDefWithProvider(ScoringFunctionDef):
 @runtime_checkable
 class ScoringFunctions(Protocol):
     @webmethod(route="/scoring_functions/list", method="GET")
-    async def list_scoring_functions(self) -> List[ScoringFunctionDefWithProvider]: ...
+    async def list_scoring_functions(self) -> List[ScoringFnDefWithProvider]: ...

     @webmethod(route="/scoring_functions/get", method="GET")
     async def get_scoring_function(
         self, name: str
-    ) -> Optional[ScoringFunctionDefWithProvider]: ...
+    ) -> Optional[ScoringFnDefWithProvider]: ...

     @webmethod(route="/scoring_functions/register", method="POST")
     async def register_scoring_function(
-        self, function_def: ScoringFunctionDefWithProvider
+        self, function_def: ScoringFnDefWithProvider
     ) -> None: ...

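A minimal sketch of the renamed API in use, assuming the constructor fields shown in this diff (`identifier`, `description`, `parameters`) are sufficient and that both classes are exported from `llama_stack.apis.scoring_functions`; any other fields are omitted here.

```python
# Sketch only: definitions mirror snippets from this diff; values are examples.
from llama_stack.apis.scoring_functions import ScoringFnDef, ScoringFnDefWithProvider

equality = ScoringFnDef(
    identifier="equality",
    description="Returns 1.0 if the input is equal to the target, 0.0 otherwise.",
    parameters=[],
)

# The routing table attaches a provider id to produce the *WithProvider variant.
registered = ScoringFnDefWithProvider(**equality.dict(), provider_id="meta-reference")
```
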
@@ -12,9 +12,7 @@ import os
 from functools import lru_cache
 from pathlib import Path

-TEMPLATES_PATH = (
-    Path(os.path.relpath(__file__)).parent.parent.parent.parent / "distributions"
-)
+TEMPLATES_PATH = Path(os.path.relpath(__file__)).parent.parent.parent / "templates"


 @lru_cache()
@@ -26,7 +24,6 @@ def available_templates_specs() -> List[BuildConfig]:
         with open(p, "r") as f:
             build_config = BuildConfig(**yaml.safe_load(f))
             template_specs.append(build_config)

     return template_specs

-

@@ -78,112 +75,17 @@ class StackBuild(Subcommand):
             choices=["conda", "docker"],
         )

-    def _get_build_config_from_name(self, args: argparse.Namespace) -> Optional[Path]:
-        if os.getenv("CONDA_PREFIX", ""):
-            conda_dir = (
-                Path(os.getenv("CONDA_PREFIX")).parent / f"llamastack-{args.name}"
-            )
-        else:
-            cprint(
-                "Cannot find CONDA_PREFIX. Trying default conda path ~/.conda/envs...",
-                color="green",
-            )
-            conda_dir = (
-                Path(os.path.expanduser("~/.conda/envs")) / f"llamastack-{args.name}"
-            )
-        build_config_file = Path(conda_dir) / f"{args.name}-build.yaml"
-        if build_config_file.exists():
-            return build_config_file
-
-        return None
-
-    def _run_stack_build_command_from_build_config(
-        self, build_config: BuildConfig
-    ) -> None:
-        import json
-        import os
-
-        import yaml
-
-        from llama_stack.distribution.build import build_image, ImageType
-        from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
-        from llama_stack.distribution.utils.serialize import EnumEncoder
-        from termcolor import cprint
-
-        # save build.yaml spec for building same distribution again
-        if build_config.image_type == ImageType.docker.value:
-            # docker needs build file to be in the llama-stack repo dir to be able to copy over to the image
-            llama_stack_path = Path(
-                os.path.abspath(__file__)
-            ).parent.parent.parent.parent
-            build_dir = llama_stack_path / "tmp/configs/"
-        else:
-            build_dir = DISTRIBS_BASE_DIR / f"llamastack-{build_config.name}"
-
-        os.makedirs(build_dir, exist_ok=True)
-        build_file_path = build_dir / f"{build_config.name}-build.yaml"
-
-        with open(build_file_path, "w") as f:
-            to_write = json.loads(json.dumps(build_config.dict(), cls=EnumEncoder))
-            f.write(yaml.dump(to_write, sort_keys=False))
-
-        return_code = build_image(build_config, build_file_path)
-        if return_code != 0:
-            return
-
-        configure_name = (
-            build_config.name
-            if build_config.image_type == "conda"
-            else (f"llamastack-{build_config.name}")
-        )
-        if build_config.image_type == "conda":
-            cprint(
-                f"You can now run `llama stack configure {configure_name}`",
-                color="green",
-            )
-        else:
-            cprint(
-                f"You can now run `llama stack run {build_config.name}`",
-                color="green",
-            )
-
-    def _run_template_list_cmd(self, args: argparse.Namespace) -> None:
-        import json
-
-        from llama_stack.cli.table import print_table
-
-        # eventually, this should query a registry at llama.meta.com/llamastack/distributions
-        headers = [
-            "Template Name",
-            "Providers",
-            "Description",
-        ]
-
-        rows = []
-        for spec in available_templates_specs():
-            rows.append(
-                [
-                    spec.name,
-                    json.dumps(spec.distribution_spec.providers, indent=2),
-                    spec.distribution_spec.description,
-                ]
-            )
-        print_table(
-            rows,
-            headers,
-            separate_rows=True,
-        )
-
     def _run_stack_build_command(self, args: argparse.Namespace) -> None:
         import textwrap

         import yaml
-        from llama_stack.distribution.distribution import get_provider_registry
         from prompt_toolkit import prompt
         from prompt_toolkit.completion import WordCompleter
         from prompt_toolkit.validation import Validator
         from termcolor import cprint

+        from llama_stack.distribution.distribution import get_provider_registry

         if args.list_templates:
             self._run_template_list_cmd(args)
             return
@@ -194,19 +96,22 @@ class StackBuild(Subcommand):
                     "You must specify a name for the build using --name when using a template"
                 )
                 return
-            build_path = TEMPLATES_PATH / f"{args.template}-build.yaml"
-            if not build_path.exists():
-                self.parser.error(
-                    f"Could not find template {args.template}. Please run `llama stack build --list-templates` to check out the available templates"
-                )
-                return
-            with open(build_path, "r") as f:
-                build_config = BuildConfig(**yaml.safe_load(f))
-                build_config.name = args.name
-                if args.image_type:
-                    build_config.image_type = args.image_type
-                self._run_stack_build_command_from_build_config(build_config)
+            available_templates = available_templates_specs()
+            for build_config in available_templates:
+                if build_config.name == args.template:
+                    build_config.name = args.name
+                    if args.image_type:
+                        build_config.image_type = args.image_type
+                    else:
+                        self.parser.error(
+                            f"Please specify a image-type (docker | conda) for {args.template}"
+                        )
+                    self._run_stack_build_command_from_build_config(build_config)
+                    return
+
+            self.parser.error(
+                f"Could not find template {args.template}. Please run `llama stack build --list-templates` to check out the available templates"
+            )
             return

         # try to see if we can find a pre-existing build config file through name

@@ -297,3 +202,99 @@ class StackBuild(Subcommand):
                 self.parser.error(f"Could not parse config file {args.config}: {e}")
                 return
         self._run_stack_build_command_from_build_config(build_config)
+
+    def _get_build_config_from_name(self, args: argparse.Namespace) -> Optional[Path]:
+        if os.getenv("CONDA_PREFIX", ""):
+            conda_dir = (
+                Path(os.getenv("CONDA_PREFIX")).parent / f"llamastack-{args.name}"
+            )
+        else:
+            cprint(
+                "Cannot find CONDA_PREFIX. Trying default conda path ~/.conda/envs...",
+                color="green",
+            )
+            conda_dir = (
+                Path(os.path.expanduser("~/.conda/envs")) / f"llamastack-{args.name}"
+            )
+        build_config_file = Path(conda_dir) / f"{args.name}-build.yaml"
+        if build_config_file.exists():
+            return build_config_file
+
+        return None
+
+    def _run_stack_build_command_from_build_config(
+        self, build_config: BuildConfig
+    ) -> None:
+        import json
+        import os
+
+        import yaml
+        from termcolor import cprint
+
+        from llama_stack.distribution.build import build_image, ImageType
+        from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
+        from llama_stack.distribution.utils.serialize import EnumEncoder
+
+        # save build.yaml spec for building same distribution again
+        if build_config.image_type == ImageType.docker.value:
+            # docker needs build file to be in the llama-stack repo dir to be able to copy over to the image
+            llama_stack_path = Path(
+                os.path.abspath(__file__)
+            ).parent.parent.parent.parent
+            build_dir = llama_stack_path / "tmp/configs/"
+        else:
+            build_dir = DISTRIBS_BASE_DIR / f"llamastack-{build_config.name}"
+
+        os.makedirs(build_dir, exist_ok=True)
+        build_file_path = build_dir / f"{build_config.name}-build.yaml"
+
+        with open(build_file_path, "w") as f:
+            to_write = json.loads(json.dumps(build_config.dict(), cls=EnumEncoder))
+            f.write(yaml.dump(to_write, sort_keys=False))
+
+        return_code = build_image(build_config, build_file_path)
+        if return_code != 0:
+            return
+
+        configure_name = (
+            build_config.name
+            if build_config.image_type == "conda"
+            else (f"llamastack-{build_config.name}")
+        )
+        if build_config.image_type == "conda":
+            cprint(
+                f"You can now run `llama stack configure {configure_name}`",
+                color="green",
+            )
+        else:
+            cprint(
+                f"You can now edit your run.yaml file and run `docker run -it -p 5000:5000 {build_config.name}`. See full command in llama-stack/distributions/",
+                color="green",
+            )
+
+    def _run_template_list_cmd(self, args: argparse.Namespace) -> None:
+        import json
+
+        from llama_stack.cli.table import print_table
+
+        # eventually, this should query a registry at llama.meta.com/llamastack/distributions
+        headers = [
+            "Template Name",
+            "Providers",
+            "Description",
+        ]
+
+        rows = []
+        for spec in available_templates_specs():
+            rows.append(
+                [
+                    spec.name,
+                    json.dumps(spec.distribution_spec.providers, indent=2),
+                    spec.distribution_spec.description,
+                ]
+            )
+        print_table(
+            rows,
+            headers,
+            separate_rows=True,
+        )

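Putting the refactored command together, an end-to-end invocation now looks roughly like the sketch below; the template and stack names are placeholders, and the final run step depends on where `llama stack configure` writes your run.yaml.

```bash
# Illustrative flow with the new --template resolution (names are placeholders).
llama stack build --list-templates
llama stack build --template tgi --name my-tgi-stack --image-type conda
llama stack configure my-tgi-stack
llama stack run ./run.yaml
```
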
@@ -8,18 +8,19 @@ from enum import Enum
 from typing import List, Optional

 import pkg_resources

-from llama_stack.distribution.utils.exec import run_with_pty
 from pydantic import BaseModel

 from termcolor import cprint

+from llama_stack.distribution.utils.exec import run_with_pty

 from llama_stack.distribution.datatypes import *  # noqa: F403
 from pathlib import Path

-from llama_stack.distribution.utils.config_dirs import BUILDS_BASE_DIR
 from llama_stack.distribution.distribution import get_provider_registry

+from llama_stack.distribution.utils.config_dirs import BUILDS_BASE_DIR


 # These are the dependencies needed by the distribution server.
 # `llama-stack` is automatically installed by the installation script.

@@ -1,5 +1,11 @@
 #!/bin/bash

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
 LLAMA_MODELS_DIR=${LLAMA_MODELS_DIR:-}
 LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
 TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
@@ -34,9 +40,6 @@ REPO_CONFIGS_DIR="$REPO_DIR/tmp/configs"

 TEMP_DIR=$(mktemp -d)

-llama stack configure $build_file_path
-cp $host_build_dir/$build_name-run.yaml $REPO_CONFIGS_DIR
-
 add_to_docker() {
   local input
   output_file="$TEMP_DIR/Dockerfile"
@@ -113,7 +116,6 @@ ENTRYPOINT ["python", "-m", "llama_stack.distribution.server.server"]
 EOF

 add_to_docker "ADD tmp/configs/$(basename "$build_file_path") ./llamastack-build.yaml"
-add_to_docker "ADD tmp/configs/$build_name-run.yaml ./llamastack-run.yaml"

 printf "Dockerfile created successfully in $TEMP_DIR/Dockerfile"
 cat $TEMP_DIR/Dockerfile

@@ -34,7 +34,7 @@ RoutableObject = Union[
     ShieldDef,
     MemoryBankDef,
     DatasetDef,
-    ScoringFunctionDef,
+    ScoringFnDef,
 ]

 RoutableObjectWithProvider = Union[
@@ -42,7 +42,7 @@ RoutableObjectWithProvider = Union[
     ShieldDefWithProvider,
     MemoryBankDefWithProvider,
     DatasetDefWithProvider,
-    ScoringFunctionDefWithProvider,
+    ScoringFnDefWithProvider,
 ]

 RoutedProtocol = Union[

@@ -100,7 +100,7 @@ class CommonRoutingTableImpl(RoutingTable):
             scoring_functions = await p.list_scoring_functions()
             add_objects(
                 [
-                    ScoringFunctionDefWithProvider(**s.dict(), provider_id=pid)
+                    ScoringFnDefWithProvider(**s.dict(), provider_id=pid)
                     for s in scoring_functions
                 ]
             )
@@ -239,7 +239,7 @@ class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):


 class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, Scoring):
-    async def list_scoring_functions(self) -> List[ScoringFunctionDefWithProvider]:
+    async def list_scoring_functions(self) -> List[ScoringFnDefWithProvider]:
         objects = []
         for objs in self.registry.values():
             objects.extend(objs)
@@ -247,10 +247,10 @@ class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, Scoring):

     async def get_scoring_function(
         self, name: str
-    ) -> Optional[ScoringFunctionDefWithProvider]:
+    ) -> Optional[ScoringFnDefWithProvider]:
         return self.get_object_by_identifier(name)

     async def register_scoring_function(
-        self, function_def: ScoringFunctionDefWithProvider
+        self, function_def: ScoringFnDefWithProvider
     ) -> None:
         await self.register_object(function_def)

@@ -29,7 +29,7 @@ if [ $# -lt 3 ]; then
 fi

 build_name="$1"
-docker_image="llamastack-$build_name"
+docker_image="distribution-$build_name"
 shift

 yaml_config="$1"

@@ -13,7 +13,7 @@ from pydantic import BaseModel, Field
 from llama_stack.apis.datasets import DatasetDef
 from llama_stack.apis.memory_banks import MemoryBankDef
 from llama_stack.apis.models import ModelDef
-from llama_stack.apis.scoring_functions import ScoringFunctionDef
+from llama_stack.apis.scoring_functions import ScoringFnDef
 from llama_stack.apis.shields import ShieldDef


@@ -64,11 +64,9 @@ class DatasetsProtocolPrivate(Protocol):


 class ScoringFunctionsProtocolPrivate(Protocol):
-    async def list_scoring_functions(self) -> List[ScoringFunctionDef]: ...
+    async def list_scoring_functions(self) -> List[ScoringFnDef]: ...

-    async def register_scoring_function(
-        self, function_def: ScoringFunctionDef
-    ) -> None: ...
+    async def register_scoring_function(self, function_def: ScoringFnDef) -> None: ...


 @json_schema_type

@@ -169,7 +169,7 @@ class MetaReferenceAgentsImpl(Agents):
         turn_ids: Optional[List[str]] = None,
     ) -> Session:
         session = await self.persistence_store.get(f"session:{agent_id}:{session_id}")
-        session = Session(**json.loads(session))
+        session = Session(**json.loads(session), turns=[])
         turns = []
         if turn_ids:
             for turn_id in turn_ids:

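A self-contained sketch of why `turns=[]` is supplied explicitly: the persisted session JSON carries no turns, while the model requires the field. The stand-in models below are simplified assumptions, not the real llama_stack definitions.

```python
# Simplified stand-ins to illustrate the fix; field names beyond "turns" are assumed.
import json
from typing import List

from pydantic import BaseModel


class Turn(BaseModel):
    turn_id: str


class Session(BaseModel):
    session_id: str
    session_name: str
    turns: List[Turn]


stored = '{"session_id": "abc", "session_name": "demo"}'  # no "turns" key on disk
session = Session(**json.loads(stored), turns=[])          # mirrors the change above
print(session.turns)  # []
```
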
@@ -3,6 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+from enum import Enum
 from llama_models.llama3.api.datatypes import *  # noqa: F403

 from llama_stack.apis.common.type_system import *  # noqa: F403
@@ -16,6 +17,13 @@ from llama_stack.apis.scoring import Scoring
 from .config import MetaReferenceEvalConfig


+class ColumnName(Enum):
+    expected_answer = "expected_answer"
+    chat_completion_input = "chat_completion_input"
+    completion_input = "completion_input"
+    generated_answer = "generated_answer"
+
+
 class MetaReferenceEvalImpl(Eval):
     def __init__(
         self,
@@ -41,18 +49,16 @@ class MetaReferenceEvalImpl(Eval):
     async def validate_eval_input_dataset_schema(self, dataset_id: str) -> None:
         dataset_def = await self.datasets_api.get_dataset(dataset_identifier=dataset_id)
         if not dataset_def.dataset_schema or len(dataset_def.dataset_schema) == 0:
-            raise ValueError(
-                f"Dataset {dataset_id} does not have a schema defined. Please define a schema for the dataset."
-            )
+            raise ValueError(f"Dataset {dataset_id} does not have a schema defined.")

         expected_schemas = [
             {
-                "expected_answer": StringType(),
-                "chat_completion_input": ChatCompletionInputType(),
+                ColumnName.expected_answer.value: StringType(),
+                ColumnName.chat_completion_input.value: ChatCompletionInputType(),
             },
             {
-                "expected_answer": StringType(),
-                "chat_completion_input": CompletionInputType(),
+                ColumnName.expected_answer.value: StringType(),
+                ColumnName.completion_input.value: CompletionInputType(),
             },
         ]
@@ -94,27 +100,43 @@ class MetaReferenceEvalImpl(Eval):
             raise NotImplementedError(
                 "Evaluation with generation has not been implemented for agents"
             )
+        assert (
+            candidate.sampling_params.max_tokens is not None
+        ), "SamplingParams.max_tokens must be provided"

         generations = []
         for x in input_rows:
-            if "completion_input" in x:
-                raise NotImplementedError(
-                    "Evaluation with completion API has not been implemented"
-                )
-            input_messages = eval(str(x["chat_completion_input"]))
-            input_messages = [UserMessage(**x) for x in input_messages]
-            messages = []
-            if candidate.system_message:
-                messages.append(candidate.system_message)
-            messages += input_messages
-            response = await self.inference_api.chat_completion(
-                model=candidate.model,
-                messages=messages,
-                sampling_params=candidate.sampling_params,
-            )
-            generations.append(
-                {"generated_answer": response.completion_message.content}
-            )
+            if ColumnName.completion_input.value in x:
+                input_content = eval(str(x[ColumnName.completion_input.value]))
+                response = await self.inference_api.completion(
+                    model=candidate.model,
+                    content=input_content,
+                    sampling_params=candidate.sampling_params,
+                )
+                generations.append(
+                    {
+                        ColumnName.generated_answer.value: response.completion_message.content
+                    }
+                )
+            elif ColumnName.chat_completion_input.value in x:
+                input_messages = eval(str(x[ColumnName.chat_completion_input.value]))
+                input_messages = [UserMessage(**x) for x in input_messages]
+                messages = []
+                if candidate.system_message:
+                    messages.append(candidate.system_message)
+                messages += input_messages
+                response = await self.inference_api.chat_completion(
+                    model=candidate.model,
+                    messages=messages,
+                    sampling_params=candidate.sampling_params,
+                )
+                generations.append(
+                    {
+                        ColumnName.generated_answer.value: response.completion_message.content
+                    }
+                )
+            else:
+                raise ValueError("Invalid input row")

         # scoring with generated_answer
         score_input_rows = [
@@ -132,6 +154,8 @@ class MetaReferenceEvalImpl(Eval):
         if job_id in self.jobs:
             return JobStatus.completed

+        return None
+
     async def job_cancel(self, job_id: str) -> None:
         raise NotImplementedError("Job cancel is not implemented yet")

|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# This source code is licensed under the terms described in the LICENSE file in
|
||||||
|
# the root directory of this source tree.
|
||||||
|
|
||||||
if [[ $# -ne 1 ]]; then
|
if [[ $# -ne 1 ]]; then
|
||||||
echo "Error: Please provide the name of CONDA environment you wish to create"
|
echo "Error: Please provide the name of CONDA environment you wish to create"
|
||||||
exit 1
|
exit 1
|
||||||
|
|
|
||||||
|
|
@@ -13,22 +13,22 @@ from llama_stack.apis.datasetio import *  # noqa: F403
 from llama_stack.apis.datasets import *  # noqa: F403

 from llama_stack.providers.datatypes import ScoringFunctionsProtocolPrivate
-from llama_stack.providers.impls.meta_reference.scoring.scorer.equality_scorer import (
-    EqualityScorer,
+from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.equality_scoring_fn import (
+    EqualityScoringFn,
 )

-from llama_stack.providers.impls.meta_reference.scoring.scorer.subset_of_scorer import (
-    SubsetOfScorer,
+from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.subset_of_scoring_fn import (
+    SubsetOfScoringFn,
 )

 from .config import MetaReferenceScoringConfig

-SUPPORTED_SCORERS = [
-    EqualityScorer,
-    SubsetOfScorer,
+SUPPORTED_SCORING_FNS = [
+    EqualityScoringFn,
+    SubsetOfScoringFn,
 ]

-SCORER_REGISTRY = {x.scoring_function_def.identifier: x for x in SUPPORTED_SCORERS}
+SCORER_REGISTRY = {x.scoring_function_def.identifier: x for x in SUPPORTED_SCORING_FNS}


 class MetaReferenceScoringImpl(Scoring, ScoringFunctionsProtocolPrivate):
@@ -46,10 +46,10 @@ class MetaReferenceScoringImpl(Scoring, ScoringFunctionsProtocolPrivate):

     async def shutdown(self) -> None: ...

-    async def list_scoring_functions(self) -> List[ScoringFunctionDef]:
-        return [x.scoring_function_def for x in SUPPORTED_SCORERS]
+    async def list_scoring_functions(self) -> List[ScoringFnDef]:
+        return [x.scoring_function_def for x in SUPPORTED_SCORING_FNS]

-    async def register_scoring_function(self, function_def: ScoringFunctionDef) -> None:
+    async def register_scoring_function(self, function_def: ScoringFnDef) -> None:
         raise NotImplementedError(
             "Dynamically registering scoring functions is not supported"
         )
@@ -101,9 +101,9 @@ class MetaReferenceScoringImpl(Scoring, ScoringFunctionsProtocolPrivate):
         for scoring_fn_id in scoring_functions:
             if scoring_fn_id not in SCORER_REGISTRY:
                 raise ValueError(f"Scoring function {scoring_fn_id} is not supported.")
-            scorer = SCORER_REGISTRY[scoring_fn_id]()
-            score_results = scorer.score(input_rows)
-            agg_results = scorer.aggregate(score_results)
+            scoring_fn = SCORER_REGISTRY[scoring_fn_id]()
+            score_results = scoring_fn.score(input_rows)
+            agg_results = scoring_fn.aggregate(score_results)
             res[scoring_fn_id] = ScoringResult(
                 score_rows=score_results,
                 aggregated_results=agg_results,

@@ -9,15 +9,15 @@ from llama_stack.apis.scoring_functions import *  # noqa: F401, F403
 from llama_stack.apis.scoring import *  # noqa: F401, F403


-class BaseScorer(ABC):
+class BaseScoringFn(ABC):
     """
-    Base interface class for all meta-reference scorers.
-    Each scorer needs to implement the following methods:
+    Base interface class for all meta-reference scoring_fns.
+    Each scoring_fn needs to implement the following methods:
     - score_row(self, row)
-    - aggregate(self, scorer_results)
+    - aggregate(self, scoring_fn_results)
     """

-    scoring_function_def: ScoringFunctionDef
+    scoring_function_def: ScoringFnDef

     def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)

@@ -4,23 +4,23 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.providers.impls.meta_reference.scoring.scorer.base_scorer import (
-    BaseScorer,
+from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.base_scoring_fn import (
+    BaseScoringFn,
 )
 from llama_stack.apis.scoring_functions import *  # noqa: F401, F403
 from llama_stack.apis.scoring import *  # noqa: F401, F403
 from llama_stack.apis.common.type_system import *  # noqa: F403
-from llama_stack.providers.impls.meta_reference.scoring.scorer.common import (
+from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.common import (
     aggregate_accuracy,
 )


-class EqualityScorer(BaseScorer):
+class EqualityScoringFn(BaseScoringFn):
     """
-    A scorer that assigns a score of 1.0 if the input string matches the target string, and 0.0 otherwise.
+    A scoring_fn that assigns a score of 1.0 if the input string matches the target string, and 0.0 otherwise.
     """

-    scoring_function_def = ScoringFunctionDef(
+    scoring_function_def = ScoringFnDef(
         identifier="equality",
         description="Returns 1.0 if the input is equal to the target, 0.0 otherwise.",
         parameters=[],

@@ -4,23 +4,23 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.providers.impls.meta_reference.scoring.scorer.base_scorer import (
-    BaseScorer,
+from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.base_scoring_fn import (
+    BaseScoringFn,
 )
 from llama_stack.apis.scoring_functions import *  # noqa: F401, F403
 from llama_stack.apis.scoring import *  # noqa: F401, F403
 from llama_stack.apis.common.type_system import *  # noqa: F403
-from llama_stack.providers.impls.meta_reference.scoring.scorer.common import (
+from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.common import (
     aggregate_accuracy,
 )


-class SubsetOfScorer(BaseScorer):
+class SubsetOfScoringFn(BaseScoringFn):
     """
-    A scorer that assigns a score of 1.0 if the expected string is included in the generated string, and 0.0 otherwise.
+    A scoring_fn that assigns a score of 1.0 if the expected string is included in the generated string, and 0.0 otherwise.
     """

-    scoring_function_def = ScoringFunctionDef(
+    scoring_function_def = ScoringFnDef(
         identifier="subset_of",
         description="Returns 1.0 if the expected is included in generated, 0.0 otherwise.",
         parameters=[],

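A hypothetical third scoring function following the same shape as the two above; the identifier, row keys, and per-row result format are assumptions for illustration and are not part of this commit.

```python
from llama_stack.apis.scoring_functions import *  # noqa: F401, F403
from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.base_scoring_fn import (
    BaseScoringFn,
)
from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.common import (
    aggregate_accuracy,
)


class PrefixMatchScoringFn(BaseScoringFn):
    """Sketch: scores 1.0 when the generated answer starts with the expected answer."""

    scoring_function_def = ScoringFnDef(
        identifier="prefix_match",  # hypothetical identifier
        description="Returns 1.0 if generated_answer starts with expected_answer.",
        parameters=[],
    )

    def score_row(self, input_row: dict) -> dict:
        expected = str(input_row["expected_answer"]).strip()
        generated = str(input_row["generated_answer"]).strip()
        # assumed result shape: a per-row dict with a numeric "score"
        return {"score": 1.0 if generated.startswith(expected) else 0.0}

    def aggregate(self, scoring_results: list) -> dict:
        return aggregate_accuracy(scoring_results)
```
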
@@ -15,13 +15,24 @@ class VLLMConfig(BaseModel):
     """Configuration for the vLLM inference provider."""

     model: str = Field(
-        default="Llama3.1-8B-Instruct",
+        default="Llama3.2-3B-Instruct",
         description="Model descriptor from `llama model list`",
     )
     tensor_parallel_size: int = Field(
         default=1,
         description="Number of tensor parallel replicas (number of GPUs to use).",
     )
+    max_tokens: int = Field(
+        default=4096,
+        description="Maximum number of tokens to generate.",
+    )
+    enforce_eager: bool = Field(
+        default=False,
+        description="Whether to use eager mode for inference (otherwise cuda graphs are used).",
+    )
+    gpu_memory_utilization: float = Field(
+        default=0.3,
+    )

     @field_validator("model")
     @classmethod

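Constructing the expanded config in code would look roughly like this; the module path is an assumption and the values simply restate the defaults added above.

```python
from llama_stack.providers.impls.vllm.config import VLLMConfig  # assumed module path

config = VLLMConfig(
    model="Llama3.2-3B-Instruct",
    tensor_parallel_size=1,       # number of GPUs
    max_tokens=4096,              # new field added in this diff
    enforce_eager=False,          # new field: skip CUDA graphs when True
    gpu_memory_utilization=0.3,   # new field
)
print(config.model)
```
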
@ -7,11 +7,12 @@
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import uuid
|
import uuid
|
||||||
from typing import Any, AsyncGenerator
|
from typing import AsyncGenerator, Optional
|
||||||
|
|
||||||
from llama_models.llama3.api.chat_format import ChatFormat
|
from llama_models.llama3.api.chat_format import ChatFormat
|
||||||
from llama_models.llama3.api.datatypes import * # noqa: F403
|
from llama_models.llama3.api.datatypes import * # noqa: F403
|
||||||
from llama_models.llama3.api.tokenizer import Tokenizer
|
from llama_models.llama3.api.tokenizer import Tokenizer
|
||||||
|
from llama_models.sku_list import resolve_model
|
||||||
|
|
||||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||||
|
|
@ -19,7 +20,7 @@ from vllm.sampling_params import SamplingParams as VLLMSamplingParams
|
||||||
|
|
||||||
from llama_stack.apis.inference import * # noqa: F403
|
from llama_stack.apis.inference import * # noqa: F403
|
||||||
|
|
||||||
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
|
from llama_stack.providers.datatypes import ModelDef, ModelsProtocolPrivate
|
||||||
from llama_stack.providers.utils.inference.openai_compat import (
|
from llama_stack.providers.utils.inference.openai_compat import (
|
||||||
OpenAICompatCompletionChoice,
|
OpenAICompatCompletionChoice,
|
||||||
OpenAICompatCompletionResponse,
|
OpenAICompatCompletionResponse,
|
||||||
|
|
@ -40,74 +41,15 @@ def _random_uuid() -> str:
|
||||||
return str(uuid.uuid4().hex)
|
return str(uuid.uuid4().hex)
|
||||||
|
|
||||||
|
|
||||||
def _vllm_sampling_params(sampling_params: Any) -> VLLMSamplingParams:
|
class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
|
||||||
"""Convert sampling params to vLLM sampling params."""
|
|
||||||
if sampling_params is None:
|
|
||||||
return VLLMSamplingParams()
|
|
||||||
|
|
||||||
# TODO convert what I saw in my first test ... but surely there's more to do here
|
|
||||||
kwargs = {
|
|
||||||
"temperature": sampling_params.temperature,
|
|
||||||
}
|
|
||||||
if sampling_params.top_k >= 1:
|
|
||||||
kwargs["top_k"] = sampling_params.top_k
|
|
||||||
if sampling_params.top_p:
|
|
||||||
kwargs["top_p"] = sampling_params.top_p
|
|
||||||
if sampling_params.max_tokens >= 1:
|
|
||||||
kwargs["max_tokens"] = sampling_params.max_tokens
|
|
||||||
if sampling_params.repetition_penalty > 0:
|
|
||||||
kwargs["repetition_penalty"] = sampling_params.repetition_penalty
|
|
||||||
|
|
||||||
return VLLMSamplingParams(**kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class VLLMInferenceImpl(ModelRegistryHelper, Inference):
|
|
||||||
"""Inference implementation for vLLM."""
|
"""Inference implementation for vLLM."""
|
||||||
|
|
||||||
HF_MODEL_MAPPINGS = {
|
|
||||||
# TODO: seems like we should be able to build this table dynamically ...
|
|
||||||
"Llama3.1-8B": "meta-llama/Llama-3.1-8B",
|
|
||||||
"Llama3.1-70B": "meta-llama/Llama-3.1-70B",
|
|
||||||
"Llama3.1-405B:bf16-mp8": "meta-llama/Llama-3.1-405B",
|
|
||||||
"Llama3.1-405B": "meta-llama/Llama-3.1-405B-FP8",
|
|
||||||
"Llama3.1-405B:bf16-mp16": "meta-llama/Llama-3.1-405B",
|
|
||||||
"Llama3.1-8B-Instruct": "meta-llama/Llama-3.1-8B-Instruct",
|
|
||||||
"Llama3.1-70B-Instruct": "meta-llama/Llama-3.1-70B-Instruct",
|
|
||||||
"Llama3.1-405B-Instruct:bf16-mp8": "meta-llama/Llama-3.1-405B-Instruct",
|
|
||||||
"Llama3.1-405B-Instruct": "meta-llama/Llama-3.1-405B-Instruct-FP8",
|
|
||||||
"Llama3.1-405B-Instruct:bf16-mp16": "meta-llama/Llama-3.1-405B-Instruct",
|
|
||||||
"Llama3.2-1B": "meta-llama/Llama-3.2-1B",
|
|
||||||
"Llama3.2-3B": "meta-llama/Llama-3.2-3B",
|
|
||||||
"Llama3.2-11B-Vision": "meta-llama/Llama-3.2-11B-Vision",
|
|
||||||
"Llama3.2-90B-Vision": "meta-llama/Llama-3.2-90B-Vision",
|
|
||||||
"Llama3.2-1B-Instruct": "meta-llama/Llama-3.2-1B-Instruct",
|
|
||||||
"Llama3.2-3B-Instruct": "meta-llama/Llama-3.2-3B-Instruct",
|
|
||||||
"Llama3.2-11B-Vision-Instruct": "meta-llama/Llama-3.2-11B-Vision-Instruct",
|
|
||||||
"Llama3.2-90B-Vision-Instruct": "meta-llama/Llama-3.2-90B-Vision-Instruct",
|
|
||||||
"Llama-Guard-3-11B-Vision": "meta-llama/Llama-Guard-3-11B-Vision",
|
|
||||||
"Llama-Guard-3-1B:int4-mp1": "meta-llama/Llama-Guard-3-1B-INT4",
|
|
||||||
"Llama-Guard-3-1B": "meta-llama/Llama-Guard-3-1B",
|
|
||||||
"Llama-Guard-3-8B": "meta-llama/Llama-Guard-3-8B",
|
|
||||||
"Llama-Guard-3-8B:int8-mp1": "meta-llama/Llama-Guard-3-8B-INT8",
|
|
||||||
"Prompt-Guard-86M": "meta-llama/Prompt-Guard-86M",
|
|
||||||
"Llama-Guard-2-8B": "meta-llama/Llama-Guard-2-8B",
|
|
||||||
}
|
|
||||||
|
|
||||||
def __init__(self, config: VLLMConfig):
|
def __init__(self, config: VLLMConfig):
|
||||||
Inference.__init__(self)
|
|
||||||
ModelRegistryHelper.__init__(
|
|
||||||
self,
|
|
||||||
stack_to_provider_models_map=self.HF_MODEL_MAPPINGS,
|
|
||||||
)
|
|
||||||
self.config = config
|
self.config = config
|
||||||
self.engine = None
|
self.engine = None
|
||||||
|
self.formatter = ChatFormat(Tokenizer.get_instance())
|
||||||
tokenizer = Tokenizer.get_instance()
|
|
||||||
self.formatter = ChatFormat(tokenizer)
|
|
||||||
|
|
||||||
async def initialize(self):
|
async def initialize(self):
|
||||||
"""Initialize the vLLM inference adapter."""
|
|
||||||
|
|
||||||
log.info("Initializing vLLM inference adapter")
|
log.info("Initializing vLLM inference adapter")
|
||||||
|
|
||||||
# Disable usage stats reporting. This would be a surprising thing for most
|
# Disable usage stats reporting. This would be a surprising thing for most
|
||||||
|
|
@@ -116,15 +58,22 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
         if "VLLM_NO_USAGE_STATS" not in os.environ:
             os.environ["VLLM_NO_USAGE_STATS"] = "1"

-        hf_model = self.HF_MODEL_MAPPINGS.get(self.config.model)
+        model = resolve_model(self.config.model)
+        if model is None:
+            raise ValueError(f"Unknown model {self.config.model}")
+
+        if model.huggingface_repo is None:
+            raise ValueError(f"Model {self.config.model} needs a huggingface repo")

         # TODO -- there are a ton of options supported here ...
-        engine_args = AsyncEngineArgs()
-        engine_args.model = hf_model
-        # We will need a new config item for this in the future if model support is more broad
-        # than it is today (llama only)
-        engine_args.tokenizer = hf_model
-        engine_args.tensor_parallel_size = self.config.tensor_parallel_size
+        engine_args = AsyncEngineArgs(
+            model=model.huggingface_repo,
+            tokenizer=model.huggingface_repo,
+            tensor_parallel_size=self.config.tensor_parallel_size,
+            enforce_eager=self.config.enforce_eager,
+            gpu_memory_utilization=self.config.gpu_memory_utilization,
+            guided_decoding_backend="lm-format-enforcer",
+        )

         self.engine = AsyncLLMEngine.from_engine_args(engine_args)
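
The hunks above drop the adapter's hand-maintained Llama-to-HuggingFace table in favor of `resolve_model` (from the llama-models SKU list, as the import path below assumes), and switch from mutating a bare `AsyncEngineArgs()` to passing the options through its constructor. A minimal sketch of the same resolve-then-configure pattern in isolation; the standalone function and its name are illustrative, not part of this change:

```python
# Sketch only: mirrors the pattern in the diff above, outside the provider class.
# Assumes the `llama-models` and `vllm` packages are installed.
from llama_models.sku_list import resolve_model
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine


def build_engine(stack_model: str, tensor_parallel_size: int = 1) -> AsyncLLMEngine:
    """Map a Llama Stack model name (e.g. "Llama3.2-1B-Instruct") to its HF repo and boot vLLM."""
    model = resolve_model(stack_model)
    if model is None or model.huggingface_repo is None:
        raise ValueError(f"Cannot serve {stack_model}: no HuggingFace repo is known for it")

    engine_args = AsyncEngineArgs(
        model=model.huggingface_repo,
        tokenizer=model.huggingface_repo,
        tensor_parallel_size=tensor_parallel_size,
    )
    return AsyncLLMEngine.from_engine_args(engine_args)
```

Keeping the mapping in the shared SKU list means new model SKUs become servable without editing this provider.
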
@@ -134,13 +83,47 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
         if self.engine:
             self.engine.shutdown_background_loop()

+    async def register_model(self, model: ModelDef) -> None:
+        raise ValueError(
+            "You cannot dynamically add a model to a running vllm instance"
+        )
+
+    async def list_models(self) -> List[ModelDef]:
+        return [
+            ModelDef(
+                identifier=self.config.model,
+                llama_model=self.config.model,
+            )
+        ]
+
+    def _sampling_params(self, sampling_params: SamplingParams) -> VLLMSamplingParams:
+        if sampling_params is None:
+            return VLLMSamplingParams(max_tokens=self.config.max_tokens)
+
+        # TODO convert what I saw in my first test ... but surely there's more to do here
+        kwargs = {
+            "temperature": sampling_params.temperature,
+            "max_tokens": self.config.max_tokens,
+        }
+        if sampling_params.top_k:
+            kwargs["top_k"] = sampling_params.top_k
+        if sampling_params.top_p:
+            kwargs["top_p"] = sampling_params.top_p
+        if sampling_params.max_tokens:
+            kwargs["max_tokens"] = sampling_params.max_tokens
+        if sampling_params.repetition_penalty > 0:
+            kwargs["repetition_penalty"] = sampling_params.repetition_penalty
+
+        return VLLMSamplingParams(**kwargs)
+
     async def completion(
         self,
         model: str,
         content: InterleavedTextMedia,
-        sampling_params: Any | None = ...,
-        stream: bool | None = False,
-        logprobs: LogProbConfig | None = None,
+        sampling_params: Optional[SamplingParams] = SamplingParams(),
+        response_format: Optional[ResponseFormat] = None,
+        stream: Optional[bool] = False,
+        logprobs: Optional[LogProbConfig] = None,
     ) -> CompletionResponse | CompletionResponseStreamChunk:
         log.info("vLLM completion")
         messages = [UserMessage(content=content)]
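
The new `_sampling_params` helper translates the stack's request-level sampling options into vLLM's `SamplingParams`, with the provider's configured `max_tokens` acting as the default cap unless the request sets its own limit. The same translation as a free function, as a sketch (the function name and the attribute names on the request object follow this diff, not a published API):

```python
# Sketch: request-to-vLLM sampling translation, matching the helper added above.
from vllm.sampling_params import SamplingParams as VLLMSamplingParams


def to_vllm_sampling_params(request_params, default_max_tokens: int) -> VLLMSamplingParams:
    if request_params is None:
        return VLLMSamplingParams(max_tokens=default_max_tokens)

    kwargs = {
        "temperature": request_params.temperature,
        # Provider-level cap; overridden below when the request sets its own limit.
        "max_tokens": default_max_tokens,
    }
    if request_params.top_k:
        kwargs["top_k"] = request_params.top_k
    if request_params.top_p:
        kwargs["top_p"] = request_params.top_p
    if request_params.max_tokens:
        kwargs["max_tokens"] = request_params.max_tokens
    if request_params.repetition_penalty > 0:
        kwargs["repetition_penalty"] = request_params.repetition_penalty
    return VLLMSamplingParams(**kwargs)
```
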
@@ -155,13 +138,14 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
     async def chat_completion(
         self,
         model: str,
-        messages: list[Message],
-        sampling_params: Any | None = ...,
-        tools: list[ToolDefinition] | None = ...,
-        tool_choice: ToolChoice | None = ...,
-        tool_prompt_format: ToolPromptFormat | None = ...,
-        stream: bool | None = False,
-        logprobs: LogProbConfig | None = None,
+        messages: List[Message],
+        sampling_params: Optional[SamplingParams] = SamplingParams(),
+        tools: Optional[List[ToolDefinition]] = None,
+        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
+        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
+        response_format: Optional[ResponseFormat] = None,
+        stream: Optional[bool] = False,
+        logprobs: Optional[LogProbConfig] = None,
     ) -> ChatCompletionResponse | ChatCompletionResponseStreamChunk:
         log.info("vLLM chat completion")
@@ -182,7 +166,7 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
         request_id = _random_uuid()

         prompt = chat_completion_request_to_prompt(request, self.formatter)
-        vllm_sampling_params = _vllm_sampling_params(request.sampling_params)
+        vllm_sampling_params = self._sampling_params(request.sampling_params)
         results_generator = self.engine.generate(
             prompt, vllm_sampling_params, request_id
         )
@@ -213,14 +197,19 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
         self, request: ChatCompletionRequest, results_generator: AsyncGenerator
     ) -> AsyncGenerator:
         async def _generate_and_convert_to_openai_compat():
+            cur = []
            async for chunk in results_generator:
                 if not chunk.outputs:
                     log.warning("Empty chunk received")
                     continue

-                text = "".join([output.text for output in chunk.outputs])
+                output = chunk.outputs[-1]
+
+                new_tokens = output.token_ids[len(cur) :]
+                text = self.formatter.tokenizer.decode(new_tokens)
+                cur.extend(new_tokens)
                 choice = OpenAICompatCompletionChoice(
-                    finish_reason=chunk.outputs[-1].stop_reason,
+                    finish_reason=output.finish_reason,
                     text=text,
                 )
                 yield OpenAICompatCompletionResponse(
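
The streaming hunk above changes what each chunk carries: rather than re-joining `output.text`, which accumulates across chunks, the generator keeps the token ids it has already emitted in `cur`, decodes only the new ids with the Llama tokenizer, and yields just that delta. The bookkeeping in isolation, with a toy decoder standing in for `self.formatter.tokenizer.decode` so it runs without vLLM:

```python
# Stand-alone illustration of the delta-decoding bookkeeping; the token stream and
# decoder here are toys, not vLLM objects.
from typing import Callable, Iterable, List


def stream_deltas(
    cumulative_token_chunks: Iterable[List[int]], decode: Callable[[List[int]], str]
) -> Iterable[str]:
    """Each input element is the cumulative token_ids so far, like RequestOutput.outputs[-1].token_ids."""
    seen: List[int] = []
    for token_ids in cumulative_token_chunks:
        new_tokens = token_ids[len(seen):]
        seen.extend(new_tokens)
        yield decode(new_tokens)


# Toy usage: "tokens" are byte values, so decoding is just bytes().decode().
chunks = [[72, 105], [72, 105, 32, 116], [72, 105, 32, 116, 104, 101, 114, 101]]
assert "".join(stream_deltas(chunks, lambda ids: bytes(ids).decode())) == "Hi there"
```
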
@@ -62,7 +62,7 @@ async def test_eval(eval_settings):
     response = await eval_impl.evaluate_batch(
         dataset_id=response[0].identifier,
         candidate=ModelCandidate(
-            model="Llama3.1-8B-Instruct",
+            model="Llama3.2-1B-Instruct",
             sampling_params=SamplingParams(),
         ),
         scoring_functions=["subset_of"],
llama_stack/templates/bedrock/build.yaml (new file)
@@ -0,0 +1,9 @@
+name: bedrock
+distribution_spec:
+  description: Use Amazon Bedrock APIs.
+  providers:
+    inference: remote::bedrock
+    memory: meta-reference
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference

llama_stack/templates/databricks/build.yaml (new file)
@@ -0,0 +1,9 @@
+name: databricks
+distribution_spec:
+  description: Use Databricks for running LLM inference
+  providers:
+    inference: remote::databricks
+    memory: meta-reference
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference

llama_stack/templates/fireworks/build.yaml (new file)
@@ -0,0 +1,9 @@
+name: fireworks
+distribution_spec:
+  description: Use Fireworks.ai for running LLM inference
+  providers:
+    inference: remote::fireworks
+    memory: meta-reference
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference

llama_stack/templates/hf-endpoint/build.yaml (new file)
@@ -0,0 +1,9 @@
+name: hf-endpoint
+distribution_spec:
+  description: "Like local, but use Hugging Face Inference Endpoints for running LLM inference.\nSee https://hf.co/docs/api-endpoints."
+  providers:
+    inference: remote::hf::endpoint
+    memory: meta-reference
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference

llama_stack/templates/hf-serverless/build.yaml (new file)
@@ -0,0 +1,9 @@
+name: hf-serverless
+distribution_spec:
+  description: "Like local, but use Hugging Face Inference API (serverless) for running LLM inference.\nSee https://hf.co/docs/api-inference."
+  providers:
+    inference: remote::hf::serverless
+    memory: meta-reference
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
llama_stack/templates/meta-reference-gpu/build.yaml (new file)
@@ -0,0 +1,13 @@
+name: meta-reference-gpu
+distribution_spec:
+  docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
+  description: Use code from `llama_stack` itself to serve all llama stack APIs
+  providers:
+    inference: meta-reference
+    memory:
+      - meta-reference
+      - remote::chromadb
+      - remote::pgvector
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference

llama_stack/templates/meta-reference-quantized-gpu/build.yaml (new file)
@@ -0,0 +1,13 @@
+name: meta-reference-quantized-gpu
+distribution_spec:
+  docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
+  description: Use code from `llama_stack` itself to serve all llama stack APIs
+  providers:
+    inference: meta-reference-quantized
+    memory:
+      - meta-reference
+      - remote::chromadb
+      - remote::pgvector
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
llama_stack/templates/ollama/build.yaml (new file)
@@ -0,0 +1,12 @@
+name: ollama
+distribution_spec:
+  description: Use ollama for running LLM inference
+  providers:
+    inference: remote::ollama
+    memory:
+      - meta-reference
+      - remote::chromadb
+      - remote::pgvector
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference

llama_stack/templates/tgi/build.yaml (new file)
@@ -0,0 +1,12 @@
+name: tgi
+distribution_spec:
+  description: Use TGI for running LLM inference
+  providers:
+    inference: remote::tgi
+    memory:
+      - meta-reference
+      - remote::chromadb
+      - remote::pgvector
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
llama_stack/templates/together/build.yaml (new file)
@@ -0,0 +1,9 @@
+name: together
+distribution_spec:
+  description: Use Together.ai for running LLM inference
+  providers:
+    inference: remote::together
+    memory: remote::weaviate
+    safety: remote::together
+    agents: meta-reference
+    telemetry: meta-reference

llama_stack/templates/vllm/build.yaml (new file)
@@ -0,0 +1,9 @@
+name: vllm
+distribution_spec:
+  description: Like local, but use vLLM for running LLM inference
+  providers:
+    inference: vllm
+    memory: meta-reference
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
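
The build.yaml templates added under `llama_stack/templates/` all share the same small shape: a `name` plus a `distribution_spec` whose `providers` map each API to a provider, or to a list of providers in the case of `memory`. A quick way to inspect one, assuming PyYAML is available and the indentation reconstructed above; the script is illustrative, not part of this change:

```python
# Illustrative check of a template's shape; assumes PyYAML is installed and the
# repo layout shown above.
import yaml

with open("llama_stack/templates/ollama/build.yaml") as f:
    template = yaml.safe_load(f)

spec = template["distribution_spec"]
print(f"{template['name']}: {spec['description']}")
for api, provider in spec["providers"].items():
    # memory may list several providers; the other APIs name exactly one.
    names = provider if isinstance(provider, list) else [provider]
    print(f"  {api} -> {', '.join(names)}")
```
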