From 509ac4a659ae95eeae305154f7be643512127e2f Mon Sep 17 00:00:00 2001
From: Justin
Date: Tue, 7 Oct 2025 03:24:50 -0700
Subject: [PATCH] feat: enable Runpod inference adapter (#3707)

# What does this PR do?

Sorry to @mattf: I thought I could close the other PR and reopen it, but I don't have the option to reopen it now. I just didn't want it to keep notifying maintainers while I made additional commits for testing.

Continuation of: https://github.com/llamastack/llama-stack/pull/3641

This PR fixes the Runpod adapter: https://github.com/llamastack/llama-stack/issues/3517

## What I fixed from before

1. Made everything go through the OpenAI-compatible path.
2. Fixed up the class, since `OpenAIMixin` had a couple of changes related to the Pydantic base model work.
3. Tested that we can dynamically discover models and use the resulting identifier to make requests:

```bash
curl -X GET \
  -H "Content-Type: application/json" \
  "http://localhost:8321/v1/models"
```

## Test Plan

### RunPod Provider Quick Start

#### Prerequisites

- Python 3.10+
- Git
- RunPod API token

#### Setup for Development

```bash
# 1. Clone and enter the repository
cd (into the repo)

# 2. Create and activate a virtual environment
python3 -m venv .venv
source .venv/bin/activate

# 3. Remove any existing llama-stack installation
pip uninstall llama-stack llama-stack-client -y

# 4. Install llama-stack in development mode
pip install -e .

# 5. Build using the local development code (found this through the Discord)
LLAMA_STACK_DIR=. llama stack build

# When prompted during the build:
# - Name: runpod-dev
# - Image type: venv
# - Inference provider: remote::runpod
# - Safety provider: "llama-guard"
# - Other providers: accept the defaults
```

#### Configure the Stack

The RunPod adapter automatically discovers models from your endpoint via the `/v1/models` API. No manual model configuration is required; just set your environment variables.

#### Run the Server

##### Important: Use the Build-Created Virtual Environment

```bash
# Exit the development venv if you're in it
deactivate

# Activate the build-created venv (NOT .venv)
cd (llama-stack folder github repo)
source llamastack-runpod-dev/bin/activate
```

##### For the Qwen3-32B-AWQ Public Endpoint (Recommended)

```bash
# Set environment variables
export RUNPOD_URL="https://api.runpod.ai/v2/qwen3-32b-awq/openai/v1"
export RUNPOD_API_TOKEN="your_runpod_api_key"

# Start the server
llama stack run ~/.llama/distributions/llamastack-runpod-dev/llamastack-runpod-dev-run.yaml
```

#### Quick Test

##### 1. List Available Models (Dynamic Discovery)

First, check which models are available on your RunPod endpoint:

```bash
curl -X GET \
  -H "Content-Type: application/json" \
  "http://localhost:8321/v1/models"
```

**Example Response:**

```json
{
  "data": [
    {
      "identifier": "qwen3-32b-awq",
      "provider_resource_id": "Qwen/Qwen3-32B-AWQ",
      "provider_id": "runpod",
      "type": "model",
      "metadata": {},
      "model_type": "llm"
    }
  ]
}
```

**Note:** Use the `identifier` value from the response above in the requests below.

##### 2. Chat Completion (Non-streaming)

Replace `qwen3-32b-awq` with your model identifier from step 1:

```bash
curl -X POST http://localhost:8321/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "qwen3-32b-awq",
    "messages": [{"role": "user", "content": "Hello, count to 3"}],
    "stream": false
  }'
```
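For reference, the same non-streaming request from Python. This is a minimal sketch, assuming the `openai` package is installed, that the stack's OpenAI-compatible endpoint is served at `http://localhost:8321/v1`, and that the local server does not validate the API key; substitute your own model identifier from step 1.

```python
from openai import OpenAI

# Point the standard OpenAI client at the local Llama Stack server.
# The API key is a placeholder; the local server is assumed not to check it.
client = OpenAI(base_url="http://localhost:8321/v1", api_key="not-needed")

response = client.chat.completions.create(
    model="qwen3-32b-awq",  # use the identifier returned by /v1/models
    messages=[{"role": "user", "content": "Hello, count to 3"}],
    stream=False,
)
print(response.choices[0].message.content)
```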
##### 3. Chat Completion (Streaming)

```bash
curl -X POST http://localhost:8321/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "qwen3-32b-awq",
    "messages": [{"role": "user", "content": "Count to 5"}],
    "stream": true
  }'
```

**Clean streaming output:**

```bash
curl -N -X POST http://localhost:8321/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "qwen3-32b-awq", "messages": [{"role": "user", "content": "Count to 5"}], "stream": true}' \
  2>/dev/null | while read -r line; do
    echo "$line" | grep "^data: " | sed 's/^data: //' | jq -r '.choices[0].delta.content // empty' 2>/dev/null
  done
```

**Expected Output:**

```
1
2
3
4
5
```
---
 .../remote/inference/runpod/__init__.py      |   2 +-
 .../remote/inference/runpod/runpod.py        | 123 ++++++++++--------
 2 files changed, 71 insertions(+), 54 deletions(-)

diff --git a/llama_stack/providers/remote/inference/runpod/__init__.py b/llama_stack/providers/remote/inference/runpod/__init__.py
index 69bf95046..d1fd2b718 100644
--- a/llama_stack/providers/remote/inference/runpod/__init__.py
+++ b/llama_stack/providers/remote/inference/runpod/__init__.py
@@ -11,6 +11,6 @@ async def get_adapter_impl(config: RunpodImplConfig, _deps):
     from .runpod import RunpodInferenceAdapter
 
     assert isinstance(config, RunpodImplConfig), f"Unexpected config type: {type(config)}"
-    impl = RunpodInferenceAdapter(config)
+    impl = RunpodInferenceAdapter(config=config)
     await impl.initialize()
     return impl
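The `config=config` keyword in the hunk above matters because the rewritten adapter declares `config` as a field on `OpenAIMixin`, which (per the PR description) is now built on a Pydantic base model. A minimal sketch of the behavior, using a hypothetical stand-in model rather than the real class hierarchy:

```python
from pydantic import BaseModel


class StandInAdapter(BaseModel):
    """Hypothetical stand-in for RunpodInferenceAdapter; not the real class."""

    config: dict


StandInAdapter(config={"url": "https://example"})  # OK: Pydantic models take keyword arguments
# StandInAdapter({"url": "https://example"})       # would raise TypeError: no positional arguments accepted
```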
diff --git a/llama_stack/providers/remote/inference/runpod/runpod.py b/llama_stack/providers/remote/inference/runpod/runpod.py
index 08652f8c0..f752740e5 100644
--- a/llama_stack/providers/remote/inference/runpod/runpod.py
+++ b/llama_stack/providers/remote/inference/runpod/runpod.py
@@ -4,69 +4,86 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+from typing import Any
-from llama_stack.apis.inference import *  # noqa: F403
-from llama_stack.apis.inference import OpenAIEmbeddingsResponse
-
-# from llama_stack.providers.datatypes import ModelsProtocolPrivate
-from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper, build_hf_repo_model_entry
-from llama_stack.providers.utils.inference.openai_compat import (
-    get_sampling_options,
-)
-from llama_stack.providers.utils.inference.prompt_adapter import (
-    chat_completion_request_to_prompt,
+from llama_stack.apis.inference import (
+    OpenAIMessageParam,
+    OpenAIResponseFormatParam,
 )
+from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
 
 from .config import RunpodImplConfig
 
-# https://docs.runpod.io/serverless/vllm/overview#compatible-models
-# https://github.com/runpod-workers/worker-vllm/blob/main/README.md#compatible-model-architectures
-RUNPOD_SUPPORTED_MODELS = {
-    "Llama3.1-8B": "meta-llama/Llama-3.1-8B",
-    "Llama3.1-70B": "meta-llama/Llama-3.1-70B",
-    "Llama3.1-405B:bf16-mp8": "meta-llama/Llama-3.1-405B",
-    "Llama3.1-405B": "meta-llama/Llama-3.1-405B-FP8",
-    "Llama3.1-405B:bf16-mp16": "meta-llama/Llama-3.1-405B",
-    "Llama3.1-8B-Instruct": "meta-llama/Llama-3.1-8B-Instruct",
-    "Llama3.1-70B-Instruct": "meta-llama/Llama-3.1-70B-Instruct",
-    "Llama3.1-405B-Instruct:bf16-mp8": "meta-llama/Llama-3.1-405B-Instruct",
-    "Llama3.1-405B-Instruct": "meta-llama/Llama-3.1-405B-Instruct-FP8",
-    "Llama3.1-405B-Instruct:bf16-mp16": "meta-llama/Llama-3.1-405B-Instruct",
-    "Llama3.2-1B": "meta-llama/Llama-3.2-1B",
-    "Llama3.2-3B": "meta-llama/Llama-3.2-3B",
-}
-SAFETY_MODELS_ENTRIES = []
 
+class RunpodInferenceAdapter(OpenAIMixin):
+    """
+    Adapter for RunPod's OpenAI-compatible API endpoints.
+    Supports VLLM for serverless endpoint self-hosted or public endpoints.
+    Can work with any runpod endpoints that support OpenAI-compatible API
+    """
 
-# Create MODEL_ENTRIES from RUNPOD_SUPPORTED_MODELS for compatibility with starter template
-MODEL_ENTRIES = [
-    build_hf_repo_model_entry(provider_model_id, model_descriptor)
-    for provider_model_id, model_descriptor in RUNPOD_SUPPORTED_MODELS.items()
-] + SAFETY_MODELS_ENTRIES
+    config: RunpodImplConfig
+
+    def get_api_key(self) -> str:
+        """Get API key for OpenAI client."""
+        return self.config.api_token
 
-class RunpodInferenceAdapter(
-    ModelRegistryHelper,
-    Inference,
-):
-    def __init__(self, config: RunpodImplConfig) -> None:
-        ModelRegistryHelper.__init__(self, stack_to_provider_models_map=RUNPOD_SUPPORTED_MODELS)
-        self.config = config
+    def get_base_url(self) -> str:
+        """Get base URL for OpenAI client."""
+        return self.config.url
 
-    def _get_params(self, request: ChatCompletionRequest) -> dict:
-        return {
-            "model": self.map_to_provider_model(request.model),
-            "prompt": chat_completion_request_to_prompt(request),
-            "stream": request.stream,
-            **get_sampling_options(request.sampling_params),
-        }
-
-    async def openai_embeddings(
+    async def openai_chat_completion(
         self,
         model: str,
-        input: str | list[str],
-        encoding_format: str | None = "float",
-        dimensions: int | None = None,
+        messages: list[OpenAIMessageParam],
+        frequency_penalty: float | None = None,
+        function_call: str | dict[str, Any] | None = None,
+        functions: list[dict[str, Any]] | None = None,
+        logit_bias: dict[str, float] | None = None,
+        logprobs: bool | None = None,
+        max_completion_tokens: int | None = None,
+        max_tokens: int | None = None,
+        n: int | None = None,
+        parallel_tool_calls: bool | None = None,
+        presence_penalty: float | None = None,
+        response_format: OpenAIResponseFormatParam | None = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = None,
+        stream_options: dict[str, Any] | None = None,
+        temperature: float | None = None,
+        tool_choice: str | dict[str, Any] | None = None,
+        tools: list[dict[str, Any]] | None = None,
+        top_logprobs: int | None = None,
+        top_p: float | None = None,
         user: str | None = None,
-    ) -> OpenAIEmbeddingsResponse:
-        raise NotImplementedError()
+    ):
+        """Override to add RunPod-specific stream_options requirement."""
+        if stream and not stream_options:
+            stream_options = {"include_usage": True}
+
+        return await super().openai_chat_completion(
+            model=model,
+            messages=messages,
+            frequency_penalty=frequency_penalty,
+            function_call=function_call,
+            functions=functions,
+            logit_bias=logit_bias,
+            logprobs=logprobs,
+            max_completion_tokens=max_completion_tokens,
+            max_tokens=max_tokens,
+            n=n,
+            parallel_tool_calls=parallel_tool_calls,
+            presence_penalty=presence_penalty,
+            response_format=response_format,
+            seed=seed,
+            stop=stop,
+            stream=stream,
+            stream_options=stream_options,
+            temperature=temperature,
+            tool_choice=tool_choice,
+            tools=tools,
+            top_logprobs=top_logprobs,
+            top_p=top_p,
+            user=user,
+        )
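Usage note on the override above: because the adapter injects `stream_options={"include_usage": True}` whenever a streaming request omits it, the final streamed chunk should carry token usage. A rough sketch of how a client might observe this, assuming the `openai` package, a local server at `http://localhost:8321/v1` that does not validate the API key, and the `qwen3-32b-awq` identifier from the test plan:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="not-needed")

# stream_options is deliberately omitted; the RunPod adapter is expected to
# add {"include_usage": True} on our behalf before forwarding the request.
stream = client.chat.completions.create(
    model="qwen3-32b-awq",
    messages=[{"role": "user", "content": "Count to 5"}],
    stream=True,
)

for chunk in stream:
    # Content deltas arrive in regular chunks; the usage-only chunk has no choices.
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
    if chunk.usage:
        print(f"\n[usage] total_tokens={chunk.usage.total_tokens}")
```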