#!/usr/bin/env bash
set -euo pipefail

# Model and server settings
export INFERENCE_MODEL="inference-llama4-maverick"
export EMBEDDING_MODEL="inference-bge-m3"
export EMBEDDING_DIMENSION="1024"
export LLAMA_STACK_PORT=8321

# Upstream OpenAI-compatible endpoint (replace the key with your own credentials)
export OPENAI_BASE_URL=https://maas.ai-2.kvant.cloud/v1
export OPENAI_API_KEY=sk-ZqAWqBKFXjb6y3tVej2AaA
export VLLM_MAX_TOKENS=125000

# Start the llama-stack server in the foreground, mounting the kvant
# run config read-only and passing the provider settings through --env.
docker run -it \
  -p "$LLAMA_STACK_PORT:$LLAMA_STACK_PORT" \
  -v "$(pwd)/data:/root/.llama" \
  --mount type=bind,source="$(pwd)"/llama_stack/templates/kvant/run.yaml,target=/root/.llama/config.yaml,readonly \
  --entrypoint python \
  distribution-kvant:dev \
  -m llama_stack.distribution.server.server --config /root/.llama/config.yaml \
  --port "$LLAMA_STACK_PORT" \
  --env VLLM_URL="$OPENAI_BASE_URL" \
  --env VLLM_API_TOKEN="$OPENAI_API_KEY" \
  --env PASSTHROUGH_URL="$OPENAI_BASE_URL" \
  --env PASSTHROUGH_API_KEY="$OPENAI_API_KEY" \
  --env INFERENCE_MODEL="$INFERENCE_MODEL" \
  --env EMBEDDING_MODEL="$EMBEDDING_MODEL" \
  --env EMBEDDING_DIMENSION="$EMBEDDING_DIMENSION"
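
# Optional smoke test, a minimal sketch: it assumes the llama-stack server
# exposes its model-listing route at /v1/models on the port mapped above;
# adjust the path if your build serves a different API surface. Run it from
# a second shell while the server is up, since `docker run -it` above stays
# in the foreground:
#
#   curl -s "http://localhost:8321/v1/models"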