Merge-related changes.

ilya-kolchinsky 2025-04-02 19:56:44 +02:00
commit 60e9f46856
456 changed files with 38636 additions and 10892 deletions


@@ -36,6 +36,7 @@ export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export CUDA_VISIBLE_DEVICES=0
docker run \
--pull always \
--runtime nvidia \
--gpus $CUDA_VISIBLE_DEVICES \
-v ~/.cache/huggingface:/root/.cache/huggingface \
@@ -48,6 +49,8 @@ docker run \
--port $INFERENCE_PORT
```
Note that you'll also need to set `--enable-auto-tool-choice` and `--tool-call-parser` to [enable tool calling in vLLM](https://docs.vllm.ai/en/latest/features/tool_calling.html).
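As a concrete illustration, here is roughly how those flags would look when launching vLLM directly with `vllm serve`; the parser name `llama3_json` is an example value, so check the linked vLLM documentation for the parser that matches your model:
```bash
# Illustrative sketch: enable tool calling when serving a Llama 3.x model with vLLM.
# "llama3_json" is an example parser name; pick the one that matches your model.
vllm serve $INFERENCE_MODEL \
  --port $INFERENCE_PORT \
  --enable-auto-tool-choice \
  --tool-call-parser llama3_json
```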
If you are using Llama Stack Safety / Shield APIs, you will also need to run a second vLLM instance serving a corresponding safety model such as `meta-llama/Llama-Guard-3-1B`, using a script like:
```bash
@@ -56,6 +59,7 @@ export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
export CUDA_VISIBLE_DEVICES=1
docker run \
--pull always \
--runtime nvidia \
--gpus $CUDA_VISIBLE_DEVICES \
-v ~/.cache/huggingface:/root/.cache/huggingface \
@@ -79,12 +83,16 @@ This method allows you to get started quickly without having to build the distri
```bash
export INFERENCE_PORT=8000
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export LLAMA_STACK_PORT=5001
export LLAMA_STACK_PORT=8321
# You need a local checkout of llama-stack to run this, get it using
# git clone https://github.com/meta-llama/llama-stack.git
cd /path/to/llama-stack
docker run \
-it \
--pull always \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
-v ./llama_stack/templates/remote-vllm/run.yaml:/root/my-run.yaml \
llamastack/distribution-{{ name }} \
--yaml-config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
@@ -103,7 +111,7 @@ export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
cd /path/to/llama-stack
docker run \
-it \
--pull always \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
-v ./llama_stack/templates/remote-vllm/run-with-safety.yaml:/root/my-run.yaml \
@@ -124,7 +132,7 @@ Make sure you have done `uv pip install llama-stack` and have the Llama Stack CL
```bash
export INFERENCE_PORT=8000
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export LLAMA_STACK_PORT=5001
export LLAMA_STACK_PORT=8321
cd distributions/remote-vllm
llama stack build --template remote-vllm --image-type conda

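Once the template is built with conda, the distribution is normally started with `llama stack run`, passing the same variables through `--env`; a minimal sketch under that assumption (the config path mirrors the one used in the Docker examples above):
```bash
# Sketch: run the conda-built remote-vllm distribution and point it at the local vLLM server.
# The config path and URL are illustrative; adjust them to your checkout and endpoint.
llama stack run ./llama_stack/templates/remote-vllm/run.yaml \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env VLLM_URL=http://localhost:$INFERENCE_PORT/v1
```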

@@ -16,15 +16,17 @@ providers:
- provider_id: vllm-inference
provider_type: remote::vllm
config:
url: ${env.VLLM_URL}
url: ${env.VLLM_URL:http://localhost:8000/v1}
max_tokens: ${env.VLLM_MAX_TOKENS:4096}
api_token: ${env.VLLM_API_TOKEN:fake}
tls_verify: ${env.VLLM_TLS_VERIFY:true}
- provider_id: vllm-safety
provider_type: remote::vllm
config:
url: ${env.SAFETY_VLLM_URL}
max_tokens: ${env.VLLM_MAX_TOKENS:4096}
api_token: ${env.VLLM_API_TOKEN:fake}
tls_verify: ${env.VLLM_TLS_VERIFY:true}
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
config: {}
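Each `${env.VAR:default}` entry above is resolved from the environment at startup, falling back to the value after the colon; a minimal sketch of overriding the vLLM connection settings from the shell (hosts and token are placeholders):
```bash
# Placeholders: point the stack at your own inference and safety vLLM endpoints.
# VLLM_TLS_VERIFY=false disables TLS certificate verification (use only for testing).
export VLLM_URL=https://my-vllm-host:8000/v1
export SAFETY_VLLM_URL=https://my-safety-vllm-host:8001/v1
export VLLM_API_TOKEN=my-token
export VLLM_MAX_TOKENS=4096
export VLLM_TLS_VERIFY=false
```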
@@ -39,7 +41,8 @@ providers:
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
config:
excluded_categories: []
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
@@ -51,14 +54,26 @@ providers:
eval:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/meta_reference_eval.db
datasetio:
- provider_id: huggingface
provider_type: remote::huggingface
config: {}
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/huggingface_datasetio.db
- provider_id: localfs
provider_type: inline::localfs
config: {}
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/localfs_datasetio.db
scoring:
- provider_id: basic
provider_type: inline::basic
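The sqlite-backed stores above all derive their locations from `SQLITE_STORE_DIR`, defaulting to `~/.llama/distributions/remote-vllm`; a short sketch of relocating them (the directory is an arbitrary example):
```bash
# Arbitrary example directory: keep the eval and datasetio databases on a larger volume.
export SQLITE_STORE_DIR=/data/llama-stack/remote-vllm
mkdir -p "$SQLITE_STORE_DIR"
```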
@@ -74,7 +89,7 @@ providers:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
sinks: ${env.TELEMETRY_SINKS:console,sqlite}
sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/remote-vllm/trace_store.db}
tool_runtime:
@@ -99,7 +114,8 @@ providers:
config: {}
- provider_id: wolfram-alpha
provider_type: remote::wolfram-alpha
config: {}
config:
api_key: ${env.WOLFRAM_ALPHA_API_KEY:}
preprocessing:
- provider_id: basic
provider_type: inline::basic
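The wolfram-alpha tool runtime now reads its key from `WOLFRAM_ALPHA_API_KEY` and leaves it empty by default; a one-line sketch with a placeholder value:
```bash
# Placeholder: substitute a real Wolfram|Alpha API key if you use this tool runtime.
export WOLFRAM_ALPHA_API_KEY=your-wolfram-alpha-api-key
```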


@@ -16,9 +16,10 @@ providers:
- provider_id: vllm-inference
provider_type: remote::vllm
config:
url: ${env.VLLM_URL}
url: ${env.VLLM_URL:http://localhost:8000/v1}
max_tokens: ${env.VLLM_MAX_TOKENS:4096}
api_token: ${env.VLLM_API_TOKEN:fake}
tls_verify: ${env.VLLM_TLS_VERIFY:true}
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
config: {}
@@ -33,7 +34,8 @@ providers:
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
config:
excluded_categories: []
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
@@ -45,14 +47,26 @@ providers:
eval:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/meta_reference_eval.db
datasetio:
- provider_id: huggingface
provider_type: remote::huggingface
config: {}
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/huggingface_datasetio.db
- provider_id: localfs
provider_type: inline::localfs
config: {}
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/localfs_datasetio.db
scoring:
- provider_id: basic
provider_type: inline::basic
@@ -68,7 +82,7 @@ providers:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
service_name: "${env.OTEL_SERVICE_NAME:\u200B}"
sinks: ${env.TELEMETRY_SINKS:console,sqlite}
sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/remote-vllm/trace_store.db}
tool_runtime:
@@ -93,7 +107,8 @@ providers:
config: {}
- provider_id: wolfram-alpha
provider_type: remote::wolfram-alpha
config: {}
config:
api_key: ${env.WOLFRAM_ALPHA_API_KEY:}
preprocessing:
- provider_id: basic
provider_type: inline::basic


@@ -47,7 +47,7 @@ def get_distribution_template() -> DistributionTemplate:
provider_id="vllm-inference",
provider_type="remote::vllm",
config=VLLMInferenceAdapterConfig.sample_run_config(
url="${env.VLLM_URL}",
url="${env.VLLM_URL:http://localhost:8000/v1}",
),
)
embedding_provider = Provider(
@@ -149,7 +149,7 @@ def get_distribution_template() -> DistributionTemplate:
},
run_config_env_vars={
"LLAMA_STACK_PORT": (
"5001",
"8321",
"Port for the Llama Stack distribution server",
),
"INFERENCE_MODEL": (