Run the script to produce vllm outputs

Ashwin Bharambe 2024-11-17 14:09:36 -08:00
parent 0218e68849
commit 9bb07ce298
10 changed files with 109 additions and 71 deletions

View file

@@ -1,68 +1,71 @@
version: '2'
built_at: '2024-11-11T20:09:45.988375'
built_at: 2024-11-17 14:07:24.568750
image_name: remote-vllm
docker_image: remote-vllm
docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
conda_env: null
apis:
- inference
- memory
- safety
- agents
- telemetry
- safety
- inference
- memory
providers:
inference:
# serves main inference model
- provider_id: vllm-inference
provider_type: remote::vllm
config:
# NOTE: replace with "localhost" if you are running in "host" network mode
url: ${env.VLLM_URL}
max_tokens: ${env.MAX_TOKENS:4096}
api_token: fake
# serves safety llama_guard model
max_tokens: ${env.VLLM_MAX_TOKENS:4096}
api_token: ${env.VLLM_API_TOKEN:fake}
- provider_id: vllm-safety
provider_type: remote::vllm
config:
# NOTE: replace with "localhost" if you are running in "host" network mode
url: ${env.SAFETY_VLLM_URL}
max_tokens: ${env.MAX_TOKENS:4096}
api_token: fake
max_tokens: ${env.VLLM_MAX_TOKENS:4096}
api_token: ${env.VLLM_API_TOKEN:fake}
memory:
- provider_id: faiss-0
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
namespace: null
type: sqlite
db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db"
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
memory:
- provider_id: meta0
provider_type: inline::faiss
config: {}
agents:
- provider_id: meta0
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db"
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db
telemetry:
- provider_id: meta0
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db"
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db
models:
- model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
- model_id: ${env.SAFETY_MODEL}
provider_id: vllm-safety
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
provider_model_id: null
- metadata: {}
model_id: ${env.SAFETY_MODEL}
provider_id: vllm-safety
provider_model_id: null
shields:
- shield_id: ${env.SAFETY_MODEL}
- params: null
shield_id: ${env.SAFETY_MODEL}
provider_id: null
provider_shield_id: null
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@@ -1,50 +1,57 @@
version: '2'
built_at: '2024-11-11T20:09:45.988375'
built_at: 2024-11-17 14:07:24.563541
image_name: remote-vllm
docker_image: remote-vllm
docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
conda_env: null
apis:
- inference
- memory
- agents
- telemetry
- safety
- inference
- memory
providers:
inference:
# serves main inference model
- provider_id: vllm-inference
provider_type: remote::vllm
config:
url: ${env.VLLM_URL}
max_tokens: ${env.MAX_TOKENS:4096}
api_token: fake
max_tokens: ${env.VLLM_MAX_TOKENS:4096}
api_token: ${env.VLLM_API_TOKEN:fake}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
namespace: null
type: sqlite
db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db"
memory:
- provider_id: meta0
provider_type: inline::faiss
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta0
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db"
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db
telemetry:
- provider_id: meta0
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db"
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db
models:
- model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
provider_model_id: null
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@@ -1,20 +1,39 @@
# Remote vLLM Distribution
The `llamastack/distribution-remote-vllm` distribution consists of the following provider configurations.
The `llamastack/distribution-remote-vllm` distribution consists of the following provider configurations:
| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|----------------- |---------------- |---------------- |------------------------------------ |---------------- |---------------- |
| **Provider(s)** | remote::vllm | meta-reference | remote::pgvector, remote::chromadb | meta-reference | meta-reference |
Provider Configuration
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ API       ┃ Provider(s)                                             ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ agents    │ `inline::meta-reference`                                │
│ inference │ `remote::vllm`                                          │
│ memory    │ `inline::faiss`, `remote::chromadb`, `remote::pgvector` │
│ safety    │ `inline::llama-guard`                                   │
│ telemetry │ `inline::meta-reference`                                │
└───────────┴─────────────────────────────────────────────────────────┘
You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference.
You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference.

### Environment Variables
The following environment variables can be configured:
- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
- `INFERENCE_MODEL`: Inference model loaded into the vLLM server (default: `meta-llama/Llama-3.2-3B-Instruct`)
- `VLLM_URL`: URL of the vLLM server with the main inference model (default: `http://host.docker.internal:5100/v1`)
- `VLLM_MAX_TOKENS`: Maximum number of tokens for generation (default: `4096`)
- `SAFETY_VLLM_URL`: URL of the vLLM server with the safety model (default: `http://host.docker.internal:5101/v1`)
- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
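For example, you could export these variables in your shell before starting the stack so the `${env.VAR:default}` references in `run.yaml` resolve to your deployment (a sketch that simply reuses the default values listed above; adjust them for your setup):

```bash
# Sketch only: the values below are the documented defaults, not requirements.
export LLAMASTACK_PORT=5001
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export VLLM_URL=http://host.docker.internal:5100/v1
export VLLM_MAX_TOKENS=4096
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
export SAFETY_VLLM_URL=http://host.docker.internal:5101/v1
```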
### Models
The following models are configured by default:
- `${env.INFERENCE_MODEL}`
- `${env.SAFETY_MODEL}`
## Using Docker Compose
You can use `docker compose` to start a vLLM container and Llama Stack server container together.
> [!NOTE]
> This assumes you have access to a machine with GPUs for running the vLLM server.
```bash
$ cd distributions/remote-vllm; docker compose up
```
@@ -31,8 +50,7 @@ docker compose down
## Starting vLLM and Llama Stack separately
You may want to start a vLLM server and connect with Llama Stack manually. There are two ways to start a vLLM server and connect with Llama Stack.
You can also decide to start a vLLM server and connect with Llama Stack manually. There are two ways to start a vLLM server and connect with Llama Stack.
#### Start the vLLM server
@@ -43,7 +61,7 @@ docker run --runtime nvidia --gpus all \
-p 8000:8000 \
--ipc=host \
vllm/vllm-openai:latest \
--model meta-llama/Llama-3.1-8B-Instruct
--model meta-llama/Llama-3.2-3B-Instruct
```
Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) for more details.
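Once the container is up, a quick way to confirm the server is reachable (assuming the `-p 8000:8000` port mapping from the command above) is to query its OpenAI-compatible models endpoint:

```bash
# Should return a JSON response listing the model passed via --model above.
curl http://localhost:8000/v1/models
```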
@@ -66,7 +84,7 @@ inference:
If you are using Conda, you can build and run the Llama Stack server with the following commands:
```bash
cd distributions/remote-vllm
llama stack build --template remote_vllm --image-type conda
llama stack build --template remote-vllm --image-type conda
llama stack run run.yaml
```
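Since `run.yaml` reads its settings from `${env.VAR:default}` placeholders, one way to point the Conda-built server at your own vLLM instance is to export the variables before running it (a sketch, assuming vLLM is serving at `http://localhost:8000/v1` as in the Docker example above):

```bash
# Sketch: these exports feed the ${env.VAR:default} placeholders in run.yaml.
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export VLLM_URL=http://localhost:8000/v1
llama stack run run.yaml
```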

View file

@@ -21,5 +21,5 @@ class MetaReferenceAgentsImplConfig(BaseModel):
"persistence_store": SqliteKVStoreConfig.sample_run_config(
__distro_dir__=__distro_dir__,
db_name="agents_store.db",
).model_dump(),
)
}

View file

@@ -25,5 +25,5 @@ class FaissImplConfig(BaseModel):
"kvstore": SqliteKVStoreConfig.sample_run_config(
__distro_dir__=__distro_dir__,
db_name="faiss_store.db",
).model_dump(),
)
}

View file

@@ -61,8 +61,9 @@ class SqliteKVStoreConfig(CommonConfig):
"type": "sqlite",
"namespace": None,
"db_path": "${env.SQLITE_STORE_DIR:~/.llama/"
+ f"{__distro_dir__}/{db_name}"
+ "}",
+ __distro_dir__
+ "}/"
+ db_name,
}

View file

@@ -50,6 +50,7 @@ def process_template(template_dir: Path, progress) -> None:
except Exception as e:
progress.print(f"[red]Error processing {template_dir.name}: {str(e)}")
raise e
def main():

View file

@@ -1,12 +1,19 @@
version: '2'
name: remote-vllm
distribution_spec:
description: Use (an external) vLLM server for running LLM inference
docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
providers:
inference: remote::vllm
inference:
- remote::vllm
memory:
- inline::faiss
- remote::chromadb
- remote::pgvector
safety: inline::llama-guard
agents: inline::meta-reference
telemetry: inline::meta-reference
safety:
- inline::llama-guard
agents:
- inline::meta-reference
telemetry:
- inline::meta-reference
image_type: conda

View file

@@ -41,6 +41,7 @@ def get_distribution_template() -> DistributionTemplate:
name="remote-vllm",
distro_type="self_hosted",
description="Use (an external) vLLM server for running LLM inference",
docker_image="llamastack/distribution-remote-vllm:test-0.0.52rc3",
template_path=Path(__file__).parent / "doc_template.md",
providers=providers,
default_models=[inference_model, safety_model],

View file

@@ -91,7 +91,7 @@ class RunConfigSettings(BaseModel):
apis=list(apis),
providers=provider_configs,
metadata_store=SqliteKVStoreConfig.sample_run_config(
dir=f"distributions/{name}",
__distro_dir__=f"distributions/{name}",
db_name="registry.db",
),
models=self.default_models,