Merge branch 'main' into add-watsonx-inference-adapter

Sajikumar JS 2025-04-15 11:47:56 +05:30
commit ebf994475d
126 changed files with 18440 additions and 10199 deletions


@@ -381,7 +381,7 @@
 "sentence-transformers",
 "sentencepiece",
 "torch",
-"torchao==0.5.0",
+"torchao==0.8.0",
 "torchvision",
 "tqdm",
 "transformers",


@@ -386,6 +386,16 @@ models:
   provider_id: groq
   provider_model_id: groq/llama-4-scout-17b-16e-instruct
   model_type: llm
+- metadata: {}
+  model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct
+  provider_id: groq
+  provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct
+  provider_id: groq
+  provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct
+  model_type: llm
 - metadata: {}
   model_id: groq/llama-4-maverick-17b-128e-instruct
   provider_id: groq
@@ -396,6 +406,16 @@ models:
   provider_id: groq
   provider_model_id: groq/llama-4-maverick-17b-128e-instruct
   model_type: llm
+- metadata: {}
+  model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
+  provider_id: groq
+  provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct
+  provider_id: groq
+  provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
+  model_type: llm
 - metadata:
     embedding_dimension: 384
   model_id: all-MiniLM-L6-v2
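The added entries register extra IDs for the Groq-hosted Llama 4 Scout and Maverick models, so a client can refer to them either by the Groq-prefixed name or by the canonical `meta-llama/...` name; all of them resolve to the same `provider_model_id`. Below is a minimal client sketch, not part of this diff, assuming a Llama Stack server running locally on port 8321 and the `llama-stack-client` Python package (the exact method signature may vary by client version):

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed local server URL

# Any of the registered IDs should resolve to the same Groq model, e.g.:
#   groq/llama-4-scout-17b-16e-instruct
#   groq/meta-llama/llama-4-scout-17b-16e-instruct
#   meta-llama/Llama-4-Scout-17B-16E-Instruct
response = client.inference.chat_completion(
    model_id="meta-llama/Llama-4-Scout-17B-16E-Instruct",
    messages=[{"role": "user", "content": "Say hello in one short sentence."}],
)
print(response.completion_message.content)
```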


@@ -158,6 +158,16 @@ models:
   provider_id: groq
   provider_model_id: groq/llama-4-scout-17b-16e-instruct
   model_type: llm
+- metadata: {}
+  model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct
+  provider_id: groq
+  provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct
+  provider_id: groq
+  provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct
+  model_type: llm
 - metadata: {}
   model_id: groq/llama-4-maverick-17b-128e-instruct
   provider_id: groq
@@ -168,6 +178,16 @@ models:
   provider_id: groq
   provider_model_id: groq/llama-4-maverick-17b-128e-instruct
   model_type: llm
+- metadata: {}
+  model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
+  provider_id: groq
+  provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct
+  provider_id: groq
+  provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
+  model_type: llm
 - metadata:
     embedding_dimension: 384
   model_id: all-MiniLM-L6-v2
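The same alias set is registered in this distribution's run config. One way to confirm the registrations once the server is up is to list models and print the mapping, again as a hedged sketch (the `identifier` and `provider_resource_id` field names are assumptions about the client's model objects):

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed local server URL

# Print every registered Llama 4 alias and the provider-side model it maps to.
for model in client.models.list():
    if "llama-4" in model.identifier.lower():
        print(f"{model.identifier} -> {model.provider_resource_id}")
```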


@@ -16,11 +16,12 @@ providers:
     provider_type: inline::meta-reference
     config:
       model: ${env.INFERENCE_MODEL}
-      max_seq_len: 4096
       checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:null}
       quantization:
         type: ${env.QUANTIZATION_TYPE:bf16}
       model_parallel_size: ${env.MODEL_PARALLEL_SIZE:0}
+      max_batch_size: ${env.MAX_BATCH_SIZE:1}
+      max_seq_len: ${env.MAX_SEQ_LEN:4096}
   - provider_id: sentence-transformers
     provider_type: inline::sentence-transformers
     config: {}
@@ -28,11 +29,12 @@ providers:
     provider_type: inline::meta-reference
     config:
       model: ${env.SAFETY_MODEL}
-      max_seq_len: 4096
       checkpoint_dir: ${env.SAFETY_CHECKPOINT_DIR:null}
       quantization:
         type: ${env.QUANTIZATION_TYPE:bf16}
       model_parallel_size: ${env.MODEL_PARALLEL_SIZE:0}
+      max_batch_size: ${env.MAX_BATCH_SIZE:1}
+      max_seq_len: ${env.MAX_SEQ_LEN:4096}
   vector_io:
   - provider_id: faiss
     provider_type: inline::faiss
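These provider settings use the `${env.VAR:default}` placeholder form, so batch size and sequence length (like quantization type and model parallel size) can be overridden through environment variables at launch time and otherwise fall back to the value after the colon. The snippet below is an illustrative re-implementation of that lookup rule only, not Llama Stack's actual resolver:

```python
import os
import re

# Matches placeholders such as ${env.MAX_SEQ_LEN:4096} or ${env.INFERENCE_MODEL}.
_PLACEHOLDER = re.compile(r"\$\{env\.([A-Za-z_][A-Za-z0-9_]*)(?::([^}]*))?\}")

def resolve_env_placeholders(value: str) -> str:
    """Replace ${env.NAME:default} tokens with the environment value or the default."""
    def _substitute(match: re.Match) -> str:
        name, default = match.group(1), match.group(2)
        return os.environ.get(name, default if default is not None else "")
    return _PLACEHOLDER.sub(_substitute, value)

print(resolve_env_placeholders("${env.MAX_SEQ_LEN:4096}"))        # -> "4096" unless MAX_SEQ_LEN is set
print(resolve_env_placeholders("${env.QUANTIZATION_TYPE:bf16}"))  # -> "bf16" unless overridden
```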


@@ -16,11 +16,12 @@ providers:
     provider_type: inline::meta-reference
     config:
       model: ${env.INFERENCE_MODEL}
-      max_seq_len: 4096
       checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:null}
       quantization:
         type: ${env.QUANTIZATION_TYPE:bf16}
       model_parallel_size: ${env.MODEL_PARALLEL_SIZE:0}
+      max_batch_size: ${env.MAX_BATCH_SIZE:1}
+      max_seq_len: ${env.MAX_SEQ_LEN:4096}
   - provider_id: sentence-transformers
     provider_type: inline::sentence-transformers
     config: {}


@@ -13,7 +13,7 @@ The `llamastack/distribution-{{ name }}` distribution consists of the following
 {{ providers_table }}
-You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference.
+You can use this distribution if you want to run an independent vLLM server for inference.
 {% if run_config_env_vars %}
 ### Environment Variables
@@ -28,7 +28,10 @@ The following environment variables can be configured:
 ## Setting up vLLM server
-Both AMD and NVIDIA GPUs can serve as accelerators for the vLLM server, which acts as both the LLM inference provider and the safety provider.
+In the following sections, we'll use either AMD and NVIDIA GPUs to serve as hardware accelerators for the vLLM
+server, which acts as both the LLM inference provider and the safety provider. Note that vLLM also
+[supports many other hardware accelerators](https://docs.vllm.ai/en/latest/getting_started/installation.html) and
+that we only use GPUs here for demonstration purposes.
 ### Setting up vLLM server on AMD GPU
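Since the vLLM server in this distribution is reached over its OpenAI-compatible API, a quick smoke test with the `openai` client can confirm the server is reachable before pointing Llama Stack at it. This is a sketch under assumptions (vLLM serving at `http://localhost:8000/v1`, no server-side API key configured), not part of the documented setup:

```python
from openai import OpenAI

# vLLM expects some api_key string even when none is configured server-side.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")

models = client.models.list()
print([m.id for m in models.data])  # the model the vLLM server was launched with

reply = client.chat.completions.create(
    model=models.data[0].id,
    messages=[{"role": "user", "content": "ping"}],
)
print(reply.choices[0].message.content)
```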


@@ -474,6 +474,16 @@ models:
   provider_id: groq-openai-compat
   provider_model_id: groq/llama-4-scout-17b-16e-instruct
   model_type: llm
+- metadata: {}
+  model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct
+  provider_id: groq-openai-compat
+  provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct
+  provider_id: groq-openai-compat
+  provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct
+  model_type: llm
 - metadata: {}
   model_id: groq/llama-4-maverick-17b-128e-instruct
   provider_id: groq-openai-compat
@@ -484,6 +494,16 @@ models:
   provider_id: groq-openai-compat
   provider_model_id: groq/llama-4-maverick-17b-128e-instruct
   model_type: llm
+- metadata: {}
+  model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
+  provider_id: groq-openai-compat
+  provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct
+  provider_id: groq-openai-compat
+  provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct
+  model_type: llm
 - metadata: {}
   model_id: Meta-Llama-3.1-8B-Instruct
   provider_id: sambanova-openai-compat