mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-12-31 05:00:00 +00:00
feat: add batch inference API to llama stack inference
This commit is contained in:
parent
ed58a94b30
commit
0cfb2e2473
24 changed files with 1041 additions and 377 deletions
|
|
@ -16,11 +16,12 @@ providers:
|
|||
provider_type: inline::meta-reference
|
||||
config:
|
||||
model: ${env.INFERENCE_MODEL}
|
||||
max_seq_len: 4096
|
||||
checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:null}
|
||||
quantization:
|
||||
type: ${env.QUANTIZATION_TYPE:bf16}
|
||||
model_parallel_size: ${env.MODEL_PARALLEL_SIZE:0}
|
||||
max_batch_size: ${env.MAX_BATCH_SIZE:1}
|
||||
max_seq_len: ${env.MAX_SEQ_LEN:4096}
|
||||
- provider_id: sentence-transformers
|
||||
provider_type: inline::sentence-transformers
|
||||
config: {}
|
||||
|
|
@ -28,11 +29,12 @@ providers:
|
|||
provider_type: inline::meta-reference
|
||||
config:
|
||||
model: ${env.SAFETY_MODEL}
|
||||
max_seq_len: 4096
|
||||
checkpoint_dir: ${env.SAFETY_CHECKPOINT_DIR:null}
|
||||
quantization:
|
||||
type: ${env.QUANTIZATION_TYPE:bf16}
|
||||
model_parallel_size: ${env.MODEL_PARALLEL_SIZE:0}
|
||||
max_batch_size: ${env.MAX_BATCH_SIZE:1}
|
||||
max_seq_len: ${env.MAX_SEQ_LEN:4096}
|
||||
vector_io:
|
||||
- provider_id: faiss
|
||||
provider_type: inline::faiss
|
||||
|
|
|
|||
|
|
@ -16,11 +16,12 @@ providers:
|
|||
provider_type: inline::meta-reference
|
||||
config:
|
||||
model: ${env.INFERENCE_MODEL}
|
||||
max_seq_len: 4096
|
||||
checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:null}
|
||||
quantization:
|
||||
type: ${env.QUANTIZATION_TYPE:bf16}
|
||||
model_parallel_size: ${env.MODEL_PARALLEL_SIZE:0}
|
||||
max_batch_size: ${env.MAX_BATCH_SIZE:1}
|
||||
max_seq_len: ${env.MAX_SEQ_LEN:4096}
|
||||
- provider_id: sentence-transformers
|
||||
provider_type: inline::sentence-transformers
|
||||
config: {}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue