Run the script to produce vllm outputs

Ashwin Bharambe 2024-11-17 14:09:36 -08:00
parent 0218e68849
commit 9bb07ce298
10 changed files with 109 additions and 71 deletions

View file

@@ -1,68 +1,71 @@
 version: '2'
-built_at: '2024-11-11T20:09:45.988375'
+built_at: 2024-11-17 14:07:24.568750
 image_name: remote-vllm
-docker_image: remote-vllm
+docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
 conda_env: null
 apis:
-- inference
-- memory
-- safety
 - agents
 - telemetry
+- safety
+- inference
+- memory
 providers:
   inference:
-  # serves main inference model
   - provider_id: vllm-inference
     provider_type: remote::vllm
     config:
-      # NOTE: replace with "localhost" if you are running in "host" network mode
       url: ${env.VLLM_URL}
-      max_tokens: ${env.MAX_TOKENS:4096}
-      api_token: fake
-  # serves safety llama_guard model
+      max_tokens: ${env.VLLM_MAX_TOKENS:4096}
+      api_token: ${env.VLLM_API_TOKEN:fake}
   - provider_id: vllm-safety
     provider_type: remote::vllm
     config:
-      # NOTE: replace with "localhost" if you are running in "host" network mode
       url: ${env.SAFETY_VLLM_URL}
-      max_tokens: ${env.MAX_TOKENS:4096}
-      api_token: fake
+      max_tokens: ${env.VLLM_MAX_TOKENS:4096}
+      api_token: ${env.VLLM_API_TOKEN:fake}
   memory:
-  - provider_id: faiss-0
+  - provider_id: faiss
     provider_type: inline::faiss
     config:
      kvstore:
-        namespace: null
         type: sqlite
-        db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db"
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
     config: {}
-  memory:
-  - provider_id: meta0
-    provider_type: inline::faiss
-    config: {}
   agents:
-  - provider_id: meta0
+  - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
       persistence_store:
-        namespace: null
         type: sqlite
-        db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db"
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db
   telemetry:
-  - provider_id: meta0
+  - provider_id: meta-reference
     provider_type: inline::meta-reference
     config: {}
 metadata_store:
   namespace: null
   type: sqlite
-  db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db"
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db
 models:
-- model_id: ${env.INFERENCE_MODEL}
-  provider_id: vllm-inference
-- model_id: ${env.SAFETY_MODEL}
-  provider_id: vllm-safety
+- metadata: {}
+  model_id: ${env.INFERENCE_MODEL}
+  provider_id: vllm-inference
+  provider_model_id: null
+- metadata: {}
+  model_id: ${env.SAFETY_MODEL}
+  provider_id: vllm-safety
+  provider_model_id: null
 shields:
-- shield_id: ${env.SAFETY_MODEL}
+- params: null
+  shield_id: ${env.SAFETY_MODEL}
+  provider_id: null
+  provider_shield_id: null
+memory_banks: []
+datasets: []
+scoring_fns: []
+eval_tasks: []
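The regenerated run config replaces hard-coded values with `${env.VAR:default}` placeholders such as `${env.VLLM_MAX_TOKENS:4096}` and `${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}`. As a rough illustration of that syntax only (a minimal sketch, not Llama Stack's actual resolver), a placeholder takes the value of the named environment variable and falls back to the text after the colon when the variable is unset:

```python
import os
import re

# Matches "${env.VAR}" or "${env.VAR:default}" as written in the run configs above.
# Illustrative only -- not the resolver that ships with Llama Stack.
_ENV_PLACEHOLDER = re.compile(r"\$\{env\.([A-Za-z_][A-Za-z0-9_]*)(?::([^}]*))?\}")


def expand_env_placeholders(value: str) -> str:
    def _substitute(match: re.Match) -> str:
        var, default = match.group(1), match.group(2)
        resolved = os.environ.get(var, default)
        if resolved is None:
            raise ValueError(f"{var} is not set and has no default")
        return resolved

    return _ENV_PLACEHOLDER.sub(_substitute, value)


# With VLLM_MAX_TOKENS unset, the default after the colon is used.
print(expand_env_placeholders("${env.VLLM_MAX_TOKENS:4096}"))
# -> 4096
print(expand_env_placeholders(
    "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db"
))
# -> ~/.llama/distributions/remote-vllm/faiss_store.db
```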

View file

@@ -1,50 +1,57 @@
 version: '2'
-built_at: '2024-11-11T20:09:45.988375'
+built_at: 2024-11-17 14:07:24.563541
 image_name: remote-vllm
-docker_image: remote-vllm
+docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
 conda_env: null
 apis:
-- inference
-- memory
 - agents
 - telemetry
+- safety
+- inference
+- memory
 providers:
   inference:
-  # serves main inference model
   - provider_id: vllm-inference
     provider_type: remote::vllm
     config:
       url: ${env.VLLM_URL}
-      max_tokens: ${env.MAX_TOKENS:4096}
-      api_token: fake
+      max_tokens: ${env.VLLM_MAX_TOKENS:4096}
+      api_token: ${env.VLLM_API_TOKEN:fake}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
     config:
       kvstore:
-        namespace: null
         type: sqlite
-        db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db"
-  memory:
-  - provider_id: meta0
-    provider_type: inline::faiss
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
     config: {}
   agents:
-  - provider_id: meta0
+  - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
       persistence_store:
-        namespace: null
         type: sqlite
-        db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db"
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db
   telemetry:
-  - provider_id: meta0
+  - provider_id: meta-reference
     provider_type: inline::meta-reference
     config: {}
 metadata_store:
   namespace: null
   type: sqlite
-  db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db"
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db
 models:
-- model_id: ${env.INFERENCE_MODEL}
-  provider_id: vllm-inference
+- metadata: {}
+  model_id: ${env.INFERENCE_MODEL}
+  provider_id: vllm-inference
+  provider_model_id: null
+shields: []
+memory_banks: []
+datasets: []
+scoring_fns: []
+eval_tasks: []

View file

@@ -1,20 +1,39 @@
 # Remote vLLM Distribution
 
-The `llamastack/distribution-remote-vllm` distribution consists of the following provider configurations.
+The `llamastack/distribution-remote-vllm` distribution consists of the following provider configurations:
 
-| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
-|----------------- |---------------- |---------------- |------------------------------------ |---------------- |---------------- |
-| **Provider(s)** | remote::vllm | meta-reference | remote::pgvector, remote::chromadb | meta-reference | meta-reference |
+                        Provider Configuration
+┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
+┃ API       ┃ Provider(s)                                             ┃
+┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
+│ agents    │ `inline::meta-reference`                                │
+│ inference │ `remote::vllm`                                          │
+│ memory    │ `inline::faiss`, `remote::chromadb`, `remote::pgvector` │
+│ safety    │ `inline::llama-guard`                                   │
+│ telemetry │ `inline::meta-reference`                                │
+└───────────┴─────────────────────────────────────────────────────────┘
 
 You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference.
 
+### Environment Variables
+
+The following environment variables can be configured:
+
+- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `INFERENCE_MODEL`: Inference model loaded into the vLLM server (default: `meta-llama/Llama-3.2-3B-Instruct`)
+- `VLLM_URL`: URL of the vLLM server with the main inference model (default: `http://host.docker.internal:5100}/v1`)
+- `MAX_TOKENS`: Maximum number of tokens for generation (default: `4096`)
+- `SAFETY_VLLM_URL`: URL of the vLLM server with the safety model (default: `http://host.docker.internal:5101/v1`)
+- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
+
+### Models
+
+The following models are configured by default:
+- `${env.INFERENCE_MODEL}`
+- `${env.SAFETY_MODEL}`
+
 ## Using Docker Compose
 
 You can use `docker compose` to start a vLLM container and Llama Stack server container together.
 
-> [!NOTE]
-> This assumes you have access to GPU to start a vLLM server with access to your GPU.
-
 ```bash
 $ cd distributions/remote-vllm; docker compose up
 ```

@@ -31,8 +50,7 @@ docker compose down
 
 ## Starting vLLM and Llama Stack separately
 
-You may want to start a vLLM server and connect with Llama Stack manually. There are two ways to start a vLLM server and connect with Llama Stack.
+You can also decide to start a vLLM server and connect with Llama Stack manually. There are two ways to start a vLLM server and connect with Llama Stack.
 
 #### Start vLLM server.

@@ -43,7 +61,7 @@ docker run --runtime nvidia --gpus all \
   -p 8000:8000 \
   --ipc=host \
   vllm/vllm-openai:latest \
-  --model meta-llama/Llama-3.1-8B-Instruct
+  --model meta-llama/Llama-3.2-3B-Instruct
 ```
 
 Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) for more details.

@@ -66,7 +84,7 @@ inference:
 If you are using Conda, you can build and run the Llama Stack server with the following commands:
 
 ```bash
 cd distributions/remote-vllm
-llama stack build --template remote_vllm --image-type conda
+llama stack build --template remote-vllm --image-type conda
 llama stack run run.yaml
 ```

View file

@@ -21,5 +21,5 @@ class MetaReferenceAgentsImplConfig(BaseModel):
             "persistence_store": SqliteKVStoreConfig.sample_run_config(
                 __distro_dir__=__distro_dir__,
                 db_name="agents_store.db",
-            ).model_dump(),
+            )
         }

View file

@@ -25,5 +25,5 @@ class FaissImplConfig(BaseModel):
             "kvstore": SqliteKVStoreConfig.sample_run_config(
                 __distro_dir__=__distro_dir__,
                 db_name="faiss_store.db",
-            ).model_dump(),
+            )
         }

View file

@@ -61,8 +61,9 @@ class SqliteKVStoreConfig(CommonConfig):
             "type": "sqlite",
             "namespace": None,
             "db_path": "${env.SQLITE_STORE_DIR:~/.llama/"
-            + f"{__distro_dir__}/{db_name}"
-            + "}",
+            + __distro_dir__
+            + "}/"
+            + db_name,
         }
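The change above moves `db_name` outside the `${env.SQLITE_STORE_DIR:...}` default, so overriding `SQLITE_STORE_DIR` now replaces only the directory while the database filename is always appended. A quick sketch that simply replays the two concatenations from the diff, with sample arguments matching the `template.py` hunk later in this commit:

```python
# Replays the string-building shown in the diff with sample arguments
# (f"distributions/{name}" and "registry.db", as passed from template.py).
__distro_dir__ = "distributions/remote-vllm"
db_name = "registry.db"

# Old expression: the filename ends up inside the env-var default, so setting
# SQLITE_STORE_DIR would drop "registry.db" from the resolved path.
old_db_path = "${env.SQLITE_STORE_DIR:~/.llama/" + f"{__distro_dir__}/{db_name}" + "}"
print(old_db_path)
# -> ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm/registry.db}

# New expression: only the directory is substitutable; the filename stays outside.
new_db_path = "${env.SQLITE_STORE_DIR:~/.llama/" + __distro_dir__ + "}/" + db_name
print(new_db_path)
# -> ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db
```

The second form matches the `db_path` values that appear in the regenerated run configs earlier in this commit.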

View file

@@ -50,6 +50,7 @@ def process_template(template_dir: Path, progress) -> None:
     except Exception as e:
         progress.print(f"[red]Error processing {template_dir.name}: {str(e)}")
+        raise e


 def main():

View file

@@ -1,12 +1,19 @@
+version: '2'
 name: remote-vllm
 distribution_spec:
   description: Use (an external) vLLM server for running LLM inference
+  docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
   providers:
-    inference: remote::vllm
+    inference:
+    - remote::vllm
     memory:
     - inline::faiss
     - remote::chromadb
     - remote::pgvector
-    safety: inline::llama-guard
-    agents: inline::meta-reference
-    telemetry: inline::meta-reference
+    safety:
+    - inline::llama-guard
+    agents:
+    - inline::meta-reference
+    telemetry:
+    - inline::meta-reference
+image_type: conda

View file

@@ -41,6 +41,7 @@ def get_distribution_template() -> DistributionTemplate:
         name="remote-vllm",
         distro_type="self_hosted",
         description="Use (an external) vLLM server for running LLM inference",
+        docker_image="llamastack/distribution-remote-vllm:test-0.0.52rc3",
         template_path=Path(__file__).parent / "doc_template.md",
         providers=providers,
         default_models=[inference_model, safety_model],

View file

@@ -91,7 +91,7 @@ class RunConfigSettings(BaseModel):
             apis=list(apis),
             providers=provider_configs,
             metadata_store=SqliteKVStoreConfig.sample_run_config(
-                dir=f"distributions/{name}",
+                __distro_dir__=f"distributions/{name}",
                 db_name="registry.db",
             ),
             models=self.default_models,